|
|
import torch
|
|
|
from torch import nn
|
|
|
from transformers import BertTokenizer, BertForSequenceClassification
|
|
|
import pandas as pd
|
|
|
|
|
|
class PredictAspect(nn.Module):
|
|
|
def __init__(self,
|
|
|
model_name='alwanrahmana/aspect-detection-bert-large',
|
|
|
num_labels=6,
|
|
|
threshold=0.3,
|
|
|
aspect_labels=None,
|
|
|
device=None):
|
|
|
super(PredictAspect, self).__init__()
|
|
|
self.tokenizer = BertTokenizer.from_pretrained(model_name)
|
|
|
self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
|
|
|
|
|
|
|
|
|
self.GLOBAL_THRESHOLD = threshold
|
|
|
self.ASPECT_LABELS = aspect_labels if aspect_labels else ["service", "price", "ambience", "food", "cleanliness", "location"]
|
|
|
self.DEVICE = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
self.model.to(self.DEVICE)
|
|
|
self.model.eval()
|
|
|
|
|
|
def _predict_probs(self, texts, max_len=512):
|
|
|
encoded_batch = self.tokenizer(
|
|
|
texts,
|
|
|
padding=True,
|
|
|
truncation=True,
|
|
|
max_length=max_len,
|
|
|
return_tensors='pt',
|
|
|
return_token_type_ids=False
|
|
|
)
|
|
|
|
|
|
input_ids = encoded_batch['input_ids'].to(self.DEVICE)
|
|
|
attention_mask = encoded_batch['attention_mask'].to(self.DEVICE)
|
|
|
|
|
|
with torch.no_grad():
|
|
|
outputs = self.model(
|
|
|
input_ids=input_ids,
|
|
|
attention_mask=attention_mask
|
|
|
)
|
|
|
logits = outputs.logits
|
|
|
probs = torch.sigmoid(logits).cpu().numpy()
|
|
|
|
|
|
return probs
|
|
|
|
|
|
def predict(self, texts, max_len=512):
|
|
|
"""
|
|
|
Full end-to-end pipeline:
|
|
|
input: list of texts
|
|
|
output: dataframe with detected aspects and probabilities
|
|
|
"""
|
|
|
probs = self._predict_probs(texts, max_len=max_len)
|
|
|
|
|
|
results = []
|
|
|
for text, prob in zip(texts, probs):
|
|
|
positive_aspects = []
|
|
|
proba_values = []
|
|
|
|
|
|
for aspect, p in zip(self.ASPECT_LABELS, prob):
|
|
|
if p > self.GLOBAL_THRESHOLD:
|
|
|
positive_aspects.append(aspect)
|
|
|
proba_values.append(round(float(p), 4))
|
|
|
|
|
|
results.append({
|
|
|
'text': text,
|
|
|
'aspects': positive_aspects,
|
|
|
'proba_values': proba_values
|
|
|
})
|
|
|
|
|
|
df = pd.DataFrame(results)
|
|
|
return df
|
|
|
|
|
|
|
|
|
|