import os
import re
import sys

import torch

# Make the project root importable so the shared model singleton in
# `all_models` can be reused instead of loading FLAN-T5 per module.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from all_models import models

# Remove these lines since we're using the singleton
# MODEL_NAME = "google/flan-t5-xl"
# model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pick the device once at import time and move the shared model onto it.
# Best-effort: a failure here must not prevent the module from importing.
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    models.flan_model.to(device)
except Exception as e:
    print(f"Warning: Could not move model to device {device}: {e}")


def _extract_score(response):
    """Return the first numeric value in *response*, clamped to [0, 10].

    The previous implementation concatenated every digit in the reply
    (``''.join(filter(str.isdigit, response))``), so a response of "8.5"
    became 85, "9/10" became 910, and any such value was then clamped to
    10 — silently inflating scores. Taking the first standalone number
    (integer or decimal) fixes that.

    Raises:
        ValueError: if the response contains no number at all.
    """
    match = re.search(r"\d+(?:\.\d+)?", response)
    if match is None:
        raise ValueError(f"no numeric score in response: {response!r}")
    return max(0.0, min(10.0, float(match.group())))


def llm_score(correct_answers, answer):
    """Score *answer* against each reference answer using FLAN-T5.

    For every string in *correct_answers*, the model is prompted to grade
    the (OCR-derived) user *answer* on a 0-10 scale; the numeric score is
    parsed out of the generated text.

    Args:
        correct_answers: iterable of reference answer strings.
        answer: the user's answer text (may contain OCR noise).

    Returns:
        A list of floats/ints in [0, 10], one per reference answer.
        Any per-answer failure (generation or parsing) yields 0 for that
        entry; a failure outside the loop yields all zeros.
    """
    try:
        score = []
        for correct_answer in correct_answers:
            try:
                prompt = (
                    "You are an expert evaluator of answers. Your response must be a *single numeric score (0-10), not a range.*\n\n"
                    "The user's answer has been converted from handwriting using OCR, so minor spelling, punctuation, or small word variations may exist. "
                    "Focus on meaning rather than transcription errors.\n\n"
                    "### Evaluation Criteria:\n"
                    "- *Correctness (90% weight):* Does the answer accurately convey the meaning of the correct answer?\n"
                    "- *Completeness (10% weight):* Does it cover all key points?\n\n"
                    "### Handling OCR Errors:\n"
                    "- Ignore minor spelling/punctuation mistakes that don't affect meaning.\n"
                    "- Penalize only if word substitutions change the meaning.\n\n"
                    "### Scoring Guidelines:\n"
                    "- *10:* Fully correct and complete (90-100% accurate).\n"
                    "- *From 9 to 8:* Mostly correct, minor missing details (80-90% accurate).\n"
                    "- *From 7 to 6:* Good but missing some key points (60-80% accurate).\n"
                    "- *From 5 to 4:* Average, with several omissions/errors (40-60% accurate).\n"
                    "- *From 3 to 2:* Poor, major meaning errors (20-40% accurate).\n"
                    "- *From 1 to 0:* Incorrect or irrelevant (less than 20% accurate).\n\n"
                    "Compare the answers and assign a *single numeric score (0-10)* based on correctness and completeness.\n\n"
                    "Correct answer:\n"
                    f"{correct_answer}\n\n"
                    "User's answer:\n"
                    f"{answer}\n\n"
                    "Final Score (numeric only, strictly between 0 and 10):"
                )

                # Tokenize input prompt on the same device as the model.
                inputs = models.flan_tokenizer(prompt, return_tensors="pt").to(device)

                # Generate response. NOTE(review): do_sample + num_beams
                # mixes sampling with beam search, so scores are not
                # deterministic run-to-run — confirm this is intended.
                with torch.no_grad():
                    outputs = models.flan_model.generate(
                        **inputs,
                        max_length=2048,
                        do_sample=True,
                        num_return_sequences=1,
                        num_beams=5,
                        temperature=0.6,
                        top_p=0.9,
                        early_stopping=True,
                        pad_token_id=models.flan_tokenizer.pad_token_id,
                        eos_token_id=models.flan_tokenizer.eos_token_id,
                        bos_token_id=models.flan_tokenizer.bos_token_id,
                    )

                # Decode and parse the numeric score out of the response.
                response = models.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
                try:
                    score.append(_extract_score(response))
                except ValueError:
                    print(f"Warning: Could not extract numeric score from response: {response}")
                    score.append(0)

            except Exception as e:
                print(f"Error processing answer: {str(e)}")
                score.append(0)
        return score

    except Exception as e:
        print(f"Error in llm_score: {str(e)}")
        return [0] * len(correct_answers)