File size: 2,562 Bytes
97f37a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import torch
from torch import nn
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

class PredictAspect(nn.Module):
    """Multi-label aspect detector built on a BERT sequence classifier.

    Loads a ``BertTokenizer`` and a ``BertForSequenceClassification`` with
    ``from_pretrained`` and reports, for each input text, the aspect labels
    whose sigmoid probability is strictly greater than a global threshold.

    Attributes:
        tokenizer: Tokenizer loaded from ``model_name``.
        model: Classifier loaded from ``model_name``; moved to ``DEVICE`` and
            switched to eval mode in ``__init__``.
        GLOBAL_THRESHOLD: Probability cut-off applied to every aspect.
        ASPECT_LABELS: Label names, paired positionally with the model's
            output columns.
        DEVICE: Device string the model and input tensors are moved to.
    """

    # Column order of the frame returned by ``predict`` (also used for the
    # empty-input case so callers always see the same schema).
    _RESULT_COLUMNS = ('text', 'aspects', 'proba_values')

    def __init__(self,
                 model_name='alwanrahmana/aspect-detection-bert-large',
                 num_labels=6,
                 threshold=0.3,
                 aspect_labels=None,
                 device=None):
        """Load tokenizer and model and prepare them for inference.

        Args:
            model_name: Name or path handed to ``from_pretrained``.
            num_labels: Number of output labels requested from the classifier.
            threshold: An aspect is reported when its probability is
                strictly greater than this value.
            aspect_labels: Label names in model-output order. Falls back to
                the six built-in labels when omitted or empty.
            device: Target device. Defaults to ``"cuda"`` when available,
                otherwise ``"cpu"``.
        """
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

        # Hyperparams & device config
        self.GLOBAL_THRESHOLD = threshold
        self.ASPECT_LABELS = aspect_labels if aspect_labels else ["service", "price", "ambience", "food", "cleanliness", "location"]
        self.DEVICE = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.DEVICE)
        self.model.eval()

    @staticmethod
    def _normalize_texts(texts):
        """Return ``texts`` as a list; a lone string becomes a one-item list."""
        if texts is None:
            return []
        if isinstance(texts, str):
            # A bare string would otherwise be iterated character by
            # character when rows are paired with their probabilities.
            return [texts]
        return list(texts)

    def _predict_probs(self, texts, max_len=512, batch_size=None):
        """Run the classifier and return per-label sigmoid probabilities.

        Args:
            texts: A string or an iterable of strings.
            max_len: Maximum token length; longer inputs are truncated.
            batch_size: Number of texts per forward pass. ``None`` (default)
                sends everything through the model in a single pass.

        Returns:
            A NumPy array with one row per text and one column per model
            output. For empty input the array has zero rows and
            ``len(ASPECT_LABELS)`` columns.

        Raises:
            ValueError: If ``batch_size`` is given and smaller than 1.
        """
        texts = self._normalize_texts(texts)
        if not texts:
            return torch.empty((0, len(self.ASPECT_LABELS))).numpy()

        if batch_size is None:
            batch_size = len(texts)
        elif batch_size < 1:
            raise ValueError("batch_size must be a positive integer or None")

        chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                encoded_batch = self.tokenizer(
                    texts[start:start + batch_size],
                    padding=True,
                    truncation=True,
                    max_length=max_len,
                    return_tensors='pt',
                    return_token_type_ids=False
                )
                outputs = self.model(
                    input_ids=encoded_batch['input_ids'].to(self.DEVICE),
                    attention_mask=encoded_batch['attention_mask'].to(self.DEVICE)
                )
                # Move each chunk to the CPU right away so results do not
                # accumulate on the inference device.
                chunks.append(torch.sigmoid(outputs.logits).cpu())

        return torch.cat(chunks, dim=0).numpy()

    def predict(self, texts, max_len=512, batch_size=None):
        """Detect aspects for each text (full end-to-end pipeline).

        Args:
            texts: A string or an iterable of strings.
            max_len: Maximum token length; longer inputs are truncated.
            batch_size: Number of texts per forward pass. ``None`` (default)
                processes all texts in a single pass.

        Returns:
            A DataFrame with columns ``text``, ``aspects`` (labels above the
            threshold) and ``proba_values`` (their probabilities rounded to
            four decimals, in the same order as ``aspects``). Empty input
            yields an empty DataFrame with those columns.
        """
        texts = self._normalize_texts(texts)
        columns = list(self._RESULT_COLUMNS)
        if not texts:
            return pd.DataFrame(columns=columns)

        probs = self._predict_probs(texts, max_len=max_len, batch_size=batch_size)

        results = []
        for text, prob in zip(texts, probs):
            # Keep label and probability together so both lists stay aligned.
            hits = [
                (aspect, round(float(p), 4))
                for aspect, p in zip(self.ASPECT_LABELS, prob)
                if p > self.GLOBAL_THRESHOLD
            ]
            results.append({
                'text': text,
                'aspects': [aspect for aspect, _ in hits],
                'proba_values': [value for _, value in hits]
            })

        return pd.DataFrame(results, columns=columns)