"""
File managing the text analysis functionality for the application.
"""
import torch
import torch.nn.functional as F
from datasets import Dataset
from models import load_model, device

def create_chunked_dataset(dataset, tokenizer, max_length=512, stride=256):
    """
    Crée un nouveau dataset avec des chunks à partir du dataset original
    """
    all_chunks = {
        'input_ids': [],
        'attention_mask': [],
        'chunk_id': [],
        'example_id': [],
        'labels': []
    }
    
    for idx, example in enumerate(dataset):
        text = example['text']
        label = int(example['label'])
        
        # First pass: tokenize without truncation to measure the full length.
        tokenized = tokenizer(text, truncation=False, padding=False)
        input_ids = tokenized['input_ids']
        
        if len(input_ids) <= max_length:
            # Short text: re-tokenize with padding so one chunk covers it all.
            tokenized = tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=max_length
            )
            all_chunks['input_ids'].append(tokenized['input_ids'])
            all_chunks['attention_mask'].append(tokenized['attention_mask'])
            all_chunks['chunk_id'].append(0)
            all_chunks['example_id'].append(idx)
            all_chunks['labels'].append(label)
        else:
            # Long text: slide a window of max_length tokens across the
            # sequence; with stride < max_length, consecutive windows overlap.
            chunk_id = 0
            for i in range(0, len(input_ids), stride):
                chunk = input_ids[i:i + max_length]
                # Skip trailing chunks shorter than half a window; their
                # tokens are already covered by the previous chunk.
                if len(chunk) < max_length // 2:
                    continue
                    
                # Padding to max_length
                attention_mask = [1] * len(chunk)
                if len(chunk) < max_length:
                    padding_length = max_length - len(chunk)
                    chunk = chunk + [tokenizer.pad_token_id] * padding_length
                    attention_mask = attention_mask + [0] * padding_length
                
                all_chunks['input_ids'].append(chunk)
                all_chunks['attention_mask'].append(attention_mask)
                all_chunks['chunk_id'].append(chunk_id)
                all_chunks['example_id'].append(idx)
                all_chunks['labels'].append(label)
                chunk_id += 1

    return Dataset.from_dict(all_chunks)
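
# Example (a sketch, not part of the pipeline): chunking a long document with
# a Hugging Face tokenizer. The tokenizer name below is a placeholder; any
# fast tokenizer with a pad token should work.
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
#     ds = Dataset.from_dict({"text": ["word " * 2000], "label": [1]})
#     chunked = create_chunked_dataset(ds, tok, max_length=512, stride=256)
#     # With stride < max_length, consecutive chunks overlap by
#     # max_length - stride tokens; every chunk keeps its parent's
#     # example_id and label for later aggregation.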

def analyze_text(text: str, model_name: str):
    """
    Analyze the text for bias or neutrality using a selected classification model.
    
    Args:
        text (str) : Text to analyze.
        model_name (str) : Name of the model to use for analysis.
        
    Returns:
        tuple (confidence_map, message) : Confidence map and analysis message.
    """
    if not text.strip():
        return {"Empty text": 1.0}, "Please enter text to analyze."
    
    try:
        print("[Checkpoint] Starting classification...")
        model, tokenizer = load_model(model_name)
        
        # Wrap the single input text in a one-row dataset (with a dummy
        # label) so it goes through the same chunking pipeline.
        mini_dataset = Dataset.from_dict({"text": [text], "label": [0]})

        chunked_dataset = create_chunked_dataset(mini_dataset, tokenizer)
        
        print("[Checkpoint] Tokenization complete. Running model...")
        model.eval()
        model.to(device)
        
        all_logits = []
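        # Run each chunk through the model independently and collect the
        # per-chunk logits for later averaging.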
        for i in range(len(chunked_dataset)):
            chunk = chunked_dataset[i]
            inputs = {
                'input_ids': torch.tensor([chunk['input_ids']]).to(device),
                'attention_mask': torch.tensor([chunk['attention_mask']]).to(device),
            }
            
            with torch.no_grad():
                outputs = model(**inputs)
            
            all_logits.append(outputs.logits[0].cpu())

        # Mean-pool the per-chunk logits so each chunk contributes equally
        # to the document-level prediction.
        stacked_logits = torch.stack(all_logits)
        averaged_logits = torch.mean(stacked_logits, dim=0)
        probs = F.softmax(averaged_logits, dim=0)
        predicted_class = torch.argmax(averaged_logits).item()
        confidence = probs[predicted_class].item()
        status = "neutral" if predicted_class == 1 else "biased"
        tag = "✅" if predicted_class == 1 else "⚠️"
        
        message = f"{tag} The text is classified as {status} with a confidence of {confidence:.2%}."
        confidence_map = {"Neutral": probs[1].item(), "Biased": probs[0].item()}
        
        print(f"[Checkpoint] Classification complete. Predicted answer: {status}")
        return confidence_map, message
    
    except ValueError as e:
        return {"Error": 1.0}, f"Configuration error: {str(e)}"
    
    except RuntimeError as e:
        return {"Error": 1.0}, f"Model error: {str(e)}"
    
    except Exception as e:
        return {"Error": 1.0}, f"Error analyzing text: {str(e)}"