import os
import re
import sys

import torch

# Make the project root importable so the shared model singleton in
# `all_models` can be reused instead of loading FLAN-T5 per module.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from all_models import models

# Remove these lines since we're using the singleton
# MODEL_NAME = "google/flan-t5-xl"
# model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pick the device once at import time and move the shared model onto it.
# Best-effort: a failure here must not prevent the module from importing.
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    models.flan_model.to(device)
except Exception as e:
    print(f"Warning: Could not move model to device {device}: {e}")


def _extract_score(response):
    """Return the first numeric value in *response*, clamped to [0, 10].

    The previous implementation concatenated every digit in the reply
    (``''.join(filter(str.isdigit, response))``), so a response of "8.5"
    became 85, "9/10" became 910, and any such value was then clamped to
    10 — silently inflating scores. Taking the first standalone number
    (integer or decimal) fixes that.

    Raises:
        ValueError: if the response contains no number at all.
    """
    match = re.search(r"\d+(?:\.\d+)?", response)
    if match is None:
        raise ValueError(f"no numeric score in response: {response!r}")
    return max(0.0, min(10.0, float(match.group())))


def llm_score(correct_answers, answer):
    """Score *answer* against each reference answer using FLAN-T5.

    For every string in *correct_answers*, the model is prompted to grade
    the (OCR-derived) user *answer* on a 0-10 scale; the numeric score is
    parsed out of the generated text.

    Args:
        correct_answers: iterable of reference answer strings.
        answer: the user's answer text (may contain OCR noise).

    Returns:
        A list of floats/ints in [0, 10], one per reference answer.
        Any per-answer failure (generation or parsing) yields 0 for that
        entry; a failure outside the loop yields all zeros.
    """
    try:
        score = []
        for correct_answer in correct_answers:
            try:
                prompt = (
                    "You are an expert evaluator of answers. Your response must be a *single numeric score (0-10), not a range.*\n\n"
                    "The user's answer has been converted from handwriting using OCR, so minor spelling, punctuation, or small word variations may exist. "
                    "Focus on meaning rather than transcription errors.\n\n"
                    "### Evaluation Criteria:\n"
                    "- *Correctness (90% weight):* Does the answer accurately convey the meaning of the correct answer?\n"
                    "- *Completeness (10% weight):* Does it cover all key points?\n\n"
                    "### Handling OCR Errors:\n"
                    "- Ignore minor spelling/punctuation mistakes that don't affect meaning.\n"
                    "- Penalize only if word substitutions change the meaning.\n\n"
                    "### Scoring Guidelines:\n"
                    "- *10:* Fully correct and complete (90-100% accurate).\n"
                    "- *From 9 to 8:* Mostly correct, minor missing details (80-90% accurate).\n"
                    "- *From 7 to 6:* Good but missing some key points (60-80% accurate).\n"
                    "- *From 5 to 4:* Average, with several omissions/errors (40-60% accurate).\n"
                    "- *From 3 to 2:* Poor, major meaning errors (20-40% accurate).\n"
                    "- *From 1 to 0:* Incorrect or irrelevant (less than 20% accurate).\n\n"
                    "Compare the answers and assign a *single numeric score (0-10)* based on correctness and completeness.\n\n"
                    "Correct answer:\n"
                    f"{correct_answer}\n\n"
                    "User's answer:\n"
                    f"{answer}\n\n"
                    "Final Score (numeric only, strictly between 0 and 10):"
                )

                # Tokenize input prompt on the same device as the model.
                inputs = models.flan_tokenizer(prompt, return_tensors="pt").to(device)

                # Generate response. NOTE(review): do_sample + num_beams
                # mixes sampling with beam search, so scores are not
                # deterministic run-to-run — confirm this is intended.
                with torch.no_grad():
                    outputs = models.flan_model.generate(
                        **inputs,
                        max_length=2048,
                        do_sample=True,
                        num_return_sequences=1,
                        num_beams=5,
                        temperature=0.6,
                        top_p=0.9,
                        early_stopping=True,
                        pad_token_id=models.flan_tokenizer.pad_token_id,
                        eos_token_id=models.flan_tokenizer.eos_token_id,
                        bos_token_id=models.flan_tokenizer.bos_token_id,
                    )

                # Decode and parse the numeric score out of the response.
                response = models.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
                try:
                    score.append(_extract_score(response))
                except ValueError:
                    print(f"Warning: Could not extract numeric score from response: {response}")
                    score.append(0)

            except Exception as e:
                print(f"Error processing answer: {str(e)}")
                score.append(0)
        return score

    except Exception as e:
        print(f"Error in llm_score: {str(e)}")
        return [0] * len(correct_answers)