# NOTE: removed Hugging Face web-UI header text ("raw / history / blame")
# that was accidentally pasted into this file — it is not valid Python.
import os
import re
import sys

import torch

# The path hack must run before the local import below can resolve.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from all_models import models
# NOTE: the FLAN model and tokenizer come from the shared `models` singleton
# (all_models); they are not loaded directly here.
# Pick the compute device (CUDA when available, CPU otherwise) and try to
# move the shared FLAN model onto it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Best-effort: a failure to relocate the model (e.g. out of GPU memory) is
# reported but intentionally non-fatal, so inference can still run on CPU.
try:
    models.flan_model.to(device)
except Exception as e:
    print(f"Warning: Could not move model to device {device}: {e}")
def _build_prompt(correct_answer, answer):
    """Return the grading prompt comparing *answer* against *correct_answer*."""
    return (
        "You are an expert evaluator of answers. Your response must be a *single numeric score (0-10), not a range.*\n\n"
        "The user's answer has been converted from handwriting using OCR, so minor spelling, punctuation, or small word variations may exist. "
        "Focus on meaning rather than transcription errors.\n\n"
        "### Evaluation Criteria:\n"
        "- *Correctness (90% weight):* Does the answer accurately convey the meaning of the correct answer?\n"
        "- *Completeness (10% weight):* Does it cover all key points?\n\n"
        "### Handling OCR Errors:\n"
        "- Ignore minor spelling/punctuation mistakes that don't affect meaning.\n"
        "- Penalize only if word substitutions change the meaning.\n\n"
        "### Scoring Guidelines:\n"
        "- *10:* Fully correct and complete (90-100% accurate).\n"
        "- *From 9 to 8:* Mostly correct, minor missing details (80-90% accurate).\n"
        "- *From 7 to 6:* Good but missing some key points (60-80% accurate).\n"
        "- *From 5 to 4:* Average, with several omissions/errors (40-60% accurate).\n"
        "- *From 3 to 2:* Poor, major meaning errors (20-40% accurate).\n"
        "- *From 1 to 0:* Incorrect or irrelevant (less than 20% accurate).\n\n"
        "Compare the answers and assign a *single numeric score (0-10)* based on correctness and completeness.\n\n"
        "Correct answer:\n"
        f"{correct_answer}\n\n"
        "User's answer:\n"
        f"{answer}\n\n"
        "Final Score (numeric only, strictly between 0 and 10):")


def _parse_score(response):
    """Extract the first numeric token from *response*, clamped to [0, 10].

    BUGFIX: the previous parser concatenated *every* digit in the response
    (``float(''.join(filter(str.isdigit, response)))``), so "8.5" became 85,
    "7 out of 10" became 710, and any such value was then clamped to 10.
    Matching the first number (with optional decimal part) preserves the
    model's intended score.
    """
    match = re.search(r"\d+(?:\.\d+)?", response)
    if match is None:
        print(f"Warning: Could not extract numeric score from response: {response}")
        return 0
    return max(0, min(10, float(match.group())))


def llm_score(correct_answers, answer):
    """Score a user's answer against each reference answer with the FLAN model.

    Args:
        correct_answers: Iterable of reference answers to compare against.
        answer: The user's (OCR-transcribed) answer text.

    Returns:
        A list of numeric scores in [0, 10], one per reference answer.
        Any per-answer failure (generation or parsing) yields 0 for that
        entry rather than aborting the whole batch.
    """
    try:
        scores = []
        for correct_answer in correct_answers:
            try:
                prompt = _build_prompt(correct_answer, answer)
                # Tokenize and move input tensors to the model's device.
                inputs = models.flan_tokenizer(prompt, return_tensors="pt").to(device)
                # Generate the score; no_grad avoids building autograd state.
                with torch.no_grad():
                    outputs = models.flan_model.generate(
                        **inputs,
                        max_length=2048,
                        do_sample=True,
                        num_return_sequences=1,
                        num_beams=5,
                        temperature=0.6,
                        top_p=0.9,
                        early_stopping=True,
                        pad_token_id=models.flan_tokenizer.pad_token_id,
                        eos_token_id=models.flan_tokenizer.eos_token_id,
                        bos_token_id=models.flan_tokenizer.bos_token_id,
                    )
                response = models.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
                scores.append(_parse_score(response))
            except Exception as e:
                # Best-effort per answer: log and score 0 so the batch continues.
                print(f"Error processing answer: {str(e)}")
                scores.append(0)
        return scores
    except Exception as e:
        print(f"Error in llm_score: {str(e)}")
        return [0] * len(correct_answers)