"""LLM-based scoring: grade an OCR-transcribed answer against reference answers with a FLAN model."""

import os
import re
import sys

import torch

# Make the project root importable so `all_models` resolves.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))

from all_models import models

# Run on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    models.flan_model.to(device)
except Exception as e:
    print(f"Warning: Could not move model to device {device}: {e}")

def llm_score(correct_answers, answer):
    """Score `answer` against each reference in `correct_answers`, returning one 0-10 score per reference."""
    try:
        scores = []

        for correct_answer in correct_answers:
            try:
                prompt = (
                    "You are an expert evaluator of answers. Your response must be a "
                    "*single numeric score (0-10), not a range.*\n\n"
                    "The user's answer has been converted from handwriting using OCR, "
                    "so minor spelling, punctuation, or small word variations may exist. "
                    "Focus on meaning rather than transcription errors.\n\n"
                    "### Evaluation Criteria:\n"
                    "- *Correctness (90% weight):* Does the answer accurately convey the meaning of the correct answer?\n"
                    "- *Completeness (10% weight):* Does it cover all key points?\n\n"
                    "### Handling OCR Errors:\n"
                    "- Ignore minor spelling/punctuation mistakes that don't affect meaning.\n"
                    "- Penalize only if word substitutions change the meaning.\n\n"
                    "### Scoring Guidelines:\n"
                    "- *10:* Fully correct and complete (90-100% accurate).\n"
                    "- *9-8:* Mostly correct, minor missing details (80-90% accurate).\n"
                    "- *7-6:* Good but missing some key points (60-80% accurate).\n"
                    "- *5-4:* Average, with several omissions/errors (40-60% accurate).\n"
                    "- *3-2:* Poor, major meaning errors (20-40% accurate).\n"
                    "- *1-0:* Incorrect or irrelevant (less than 20% accurate).\n\n"
                    "Compare the answers and assign a *single numeric score (0-10)* "
                    "based on correctness and completeness.\n\n"
                    "Correct answer:\n"
                    f"{correct_answer}\n\n"
                    "User's answer:\n"
                    f"{answer}\n\n"
                    "Final Score (numeric only, strictly between 0 and 10):"
                )

                # Tokenize the prompt and move the tensors to the model's device.
                inputs = models.flan_tokenizer(prompt, return_tensors="pt").to(device)

                with torch.no_grad():
                    outputs = models.flan_model.generate(
                        **inputs,
                        max_length=2048,
                        do_sample=True,  # combined with num_beams, this samples within beam search
                        num_return_sequences=1,
                        num_beams=5,
                        temperature=0.6,
                        top_p=0.9,
                        early_stopping=True,
                        pad_token_id=models.flan_tokenizer.pad_token_id,
                        eos_token_id=models.flan_tokenizer.eos_token_id,
                        bos_token_id=models.flan_tokenizer.bos_token_id,
                    )

                response = models.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)

                # Take the first number in the reply. Concatenating every digit
                # (e.g. float(''.join(filter(str.isdigit, response)))) would turn
                # a response like "8 out of 10" into 810.
                match = re.search(r"\d+(?:\.\d+)?", response)
                if match:
                    score_value = max(0.0, min(10.0, float(match.group())))
                    scores.append(score_value)
                else:
                    print(f"Warning: Could not extract numeric score from response: {response}")
                    scores.append(0)

            except Exception as e:
                print(f"Error processing answer: {e}")
                scores.append(0)

        return scores
    except Exception as e:
        print(f"Error in llm_score: {e}")
        return [0] * len(correct_answers)
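
# Minimal usage sketch with hypothetical data; assumes `all_models.models`
# exposes a loaded FLAN tokenizer/model pair as imported above.
if __name__ == "__main__":
    references = [
        "Photosynthesis converts light energy into chemical energy stored as glucose.",
    ]
    student_answer = "Photosynthesis turns sunlight into chemical energy saved as glucose."
    print(llm_score(references, student_answer))  # e.g. [9.0]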