"""LLM-based scoring: grade an OCR-transcribed answer against reference answers with a FLAN model."""

import os
import re
import sys

import torch

# Make the project root importable so `all_models` resolves.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

from all_models import models

# Run on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    models.flan_model.to(device)
except Exception as e:
    print(f"Warning: Could not move model to device {device}: {e}")

def llm_score(correct_answers, answer):
    """Score `answer` against each reference in `correct_answers`, returning one 0-10 score per reference."""
    try:
        scores = []

        for correct_answer in correct_answers:
            try:
                prompt = (
                    "You are an expert evaluator of answers. Your response must be a "
                    "*single numeric score (0-10), not a range.*\n\n"
                    "The user's answer has been converted from handwriting using OCR, "
                    "so minor spelling, punctuation, or small word variations may exist. "
                    "Focus on meaning rather than transcription errors.\n\n"
                    "### Evaluation Criteria:\n"
                    "- *Correctness (90% weight):* Does the answer accurately convey the meaning of the correct answer?\n"
                    "- *Completeness (10% weight):* Does it cover all key points?\n\n"
                    "### Handling OCR Errors:\n"
                    "- Ignore minor spelling/punctuation mistakes that don't affect meaning.\n"
                    "- Penalize only if word substitutions change the meaning.\n\n"
                    "### Scoring Guidelines:\n"
                    "- *10:* Fully correct and complete (90-100% accurate).\n"
                    "- *9-8:* Mostly correct, minor missing details (80-90% accurate).\n"
                    "- *7-6:* Good but missing some key points (60-80% accurate).\n"
                    "- *5-4:* Average, with several omissions/errors (40-60% accurate).\n"
                    "- *3-2:* Poor, major meaning errors (20-40% accurate).\n"
                    "- *1-0:* Incorrect or irrelevant (less than 20% accurate).\n\n"
                    "Compare the answers and assign a *single numeric score (0-10)* "
                    "based on correctness and completeness.\n\n"
                    "Correct answer:\n"
                    f"{correct_answer}\n\n"
                    "User's answer:\n"
                    f"{answer}\n\n"
                    "Final Score (numeric only, strictly between 0 and 10):"
                )

                # Tokenize the prompt and move the tensors to the model's device.
                inputs = models.flan_tokenizer(prompt, return_tensors="pt").to(device)

                with torch.no_grad():
                    outputs = models.flan_model.generate(
                        **inputs,
                        max_length=2048,
                        do_sample=True,  # combined with num_beams, this samples within beam search
                        num_return_sequences=1,
                        num_beams=5,
                        temperature=0.6,
                        top_p=0.9,
                        early_stopping=True,
                        pad_token_id=models.flan_tokenizer.pad_token_id,
                        eos_token_id=models.flan_tokenizer.eos_token_id,
                        bos_token_id=models.flan_tokenizer.bos_token_id,
                    )

                response = models.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Take the first number in the reply. Concatenating every digit
                # (e.g. float(''.join(filter(str.isdigit, response)))) would turn
                # a response like "8 out of 10" into 810.
                match = re.search(r"\d+(?:\.\d+)?", response)
                if match:
                    score_value = max(0.0, min(10.0, float(match.group())))
                    scores.append(score_value)
                else:
                    print(f"Warning: Could not extract numeric score from response: {response}")
                    scores.append(0)

            except Exception as e:
                print(f"Error processing answer: {e}")
                scores.append(0)

        return scores
    except Exception as e:
        print(f"Error in llm_score: {e}")
        return [0] * len(correct_answers)
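
# Minimal usage sketch with hypothetical data; assumes `all_models.models`
# exposes a loaded FLAN tokenizer/model pair as imported above.
if __name__ == "__main__":
    references = [
        "Photosynthesis converts light energy into chemical energy stored as glucose.",
    ]
    student_answer = "Photosynthesis turns sunlight into chemical energy saved as glucose."
    print(llm_score(references, student_answer))  # e.g. [9.0]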