"""
File managing the text analysis functionality for the application.
"""
import torch
import torch.nn.functional as F
from datasets import Dataset
from models import load_model, device

def create_chunked_dataset(dataset, tokenizer, max_length=512, stride=256):
    """
    Crée un nouveau dataset avec des chunks à partir du dataset original
    """
    all_chunks = {
        'input_ids': [],
        'attention_mask': [],
        'chunk_id': [],
        'example_id': [],
        'labels': []
    }
    
    for idx, example in enumerate(dataset):
        text = example['text']
        label = int(example['label'])
        
        # First pass: tokenize without truncation to measure the full length.
        tokenized = tokenizer(text, truncation=False, padding=False)
        input_ids = tokenized['input_ids']
        
        if len(input_ids) <= max_length:
            # Short text: re-tokenize with padding so one chunk covers it all.
            tokenized = tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=max_length
            )
            all_chunks['input_ids'].append(tokenized['input_ids'])
            all_chunks['attention_mask'].append(tokenized['attention_mask'])
            all_chunks['chunk_id'].append(0)
            all_chunks['example_id'].append(idx)
            all_chunks['labels'].append(label)
        else:
            # Long text: slide a window of max_length tokens across the
            # sequence; with stride < max_length, consecutive windows overlap.
            chunk_id = 0
            for i in range(0, len(input_ids), stride):
                chunk = input_ids[i:i + max_length]
                # Skip trailing chunks shorter than half a window; their
                # tokens are already covered by the previous chunk.
                if len(chunk) < max_length // 2:
                    continue
                    
                # Padding to max_length
                attention_mask = [1] * len(chunk)
                if len(chunk) < max_length:
                    padding_length = max_length - len(chunk)
                    chunk = chunk + [tokenizer.pad_token_id] * padding_length
                    attention_mask = attention_mask + [0] * padding_length
                
                all_chunks['input_ids'].append(chunk)
                all_chunks['attention_mask'].append(attention_mask)
                all_chunks['chunk_id'].append(chunk_id)
                all_chunks['example_id'].append(idx)
                all_chunks['labels'].append(label)
                chunk_id += 1

    return Dataset.from_dict(all_chunks)
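
# Example (a sketch, not part of the pipeline): chunking a long document with
# a Hugging Face tokenizer. The tokenizer name below is a placeholder; any
# fast tokenizer with a pad token should work.
#
#     from transformers import AutoTokenizer
#     tok = AutoTokenizer.from_pretrained("bert-base-uncased")
#     ds = Dataset.from_dict({"text": ["word " * 2000], "label": [1]})
#     chunked = create_chunked_dataset(ds, tok, max_length=512, stride=256)
#     # With stride < max_length, consecutive chunks overlap by
#     # max_length - stride tokens; every chunk keeps its parent's
#     # example_id and label for later aggregation.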

def analyze_text(text: str, model_name: str):
    """
    Analyze the text for bias or neutrality using a selected classification model.
    
    Args:
        text (str) : Text to analyze.
        model_name (str) : Name of the model to use for analysis.
        
    Returns:
        tuple (confidence_map, message) : Confidence map and analysis message.
    """
    if not text.strip():
        return {"Empty text": 1.0}, "Please enter text to analyze."
    
    try:
        print("[Checkpoint] Starting classification...")
        model, tokenizer = load_model(model_name)
        
        # Wrap the single input text in a one-row dataset (with a dummy
        # label) so it goes through the same chunking pipeline.
        mini_dataset = Dataset.from_dict({"text": [text], "label": [0]})

        chunked_dataset = create_chunked_dataset(mini_dataset, tokenizer)
        
        print("[Checkpoint] Tokenization complete. Running model...")
        model.eval()
        model.to(device)
        
        all_logits = []
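        # Run each chunk through the model independently and collect the
        # per-chunk logits for later averaging.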
        for i in range(len(chunked_dataset)):
            chunk = chunked_dataset[i]
            inputs = {
                'input_ids': torch.tensor([chunk['input_ids']]).to(device),
                'attention_mask': torch.tensor([chunk['attention_mask']]).to(device),
            }
            
            with torch.no_grad():
                outputs = model(**inputs)
            
            all_logits.append(outputs.logits[0].cpu())

        # Mean-pool the per-chunk logits so each chunk contributes equally
        # to the document-level prediction.
        stacked_logits = torch.stack(all_logits)
        averaged_logits = torch.mean(stacked_logits, dim=0)
        probs = F.softmax(averaged_logits, dim=0)
        predicted_class = torch.argmax(averaged_logits).item()
        confidence = probs[predicted_class].item()
        status = "neutral" if predicted_class == 1 else "biased"
        tag = "✅" if predicted_class == 1 else "⚠️"
        
        message = f"{tag} The text is classified as {status} with a confidence of {confidence:.2%}."
        confidence_map = {"Neutral": probs[1].item(), "Biased": probs[0].item()}
        
        print(f"[Checkpoint] Classification complete. Predicted answer: {status}")
        return confidence_map, message
    
    except ValueError as e:
        return {"Error": 1.0}, f"Configuration error: {str(e)}"
    
    except RuntimeError as e:
        return {"Error": 1.0}, f"Model error: {str(e)}"
    
    except Exception as e:
        return {"Error": 1.0}, f"Error analyzing text: {str(e)}"