Spaces:

yamanavijayavardhan
/

answer-grading-app

Sleeping

App Files Files Community

answer-grading-app / similarity_check /semantic_meaning_check /semantic.py

yamanavijayavardhan

update_new_new_new_new_new

9556995 4 months ago

raw

history blame

6.5 kB

	from sentence_transformers import util
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	from gensim.models import KeyedVectors
	import numpy as np
	import nltk
	from gensim import corpora
	from gensim.models import FastText
	from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
	from gensim.downloader import load
	import sys
	import os
	import tempfile
	sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
	from all_models import models
	from sentence_transformers import SentenceTransformer
	import torch
	import logging

	# Set up logging
	# basicConfig is called at import time with level INFO; `logger` is the
	# module-level logger used by the functions in this file.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Global model variable
	# Stays None until initialize_model() assigns the loaded SentenceTransformer.
	model = None

	def initialize_model():
	    """Load the sentence-embedding model into the module-level ``model``.

	    Model files are cached in a ``model_cache`` directory located under
	    ``$TMPDIR`` when that variable is set, otherwise under the platform
	    temporary directory reported by ``tempfile.gettempdir()``.

	    Returns:
	        The loaded ``SentenceTransformer`` instance, or ``None`` when loading
	        fails (the failure is logged together with its traceback).
	    """
	    global model
	    try:
	        # Use a smaller, more efficient model
	        model_name = 'paraphrase-MiniLM-L6-v2'  # Only about 80MB
	        # Fall back to the platform temp directory instead of a hard-coded
	        # '/tmp', which does not exist on every operating system.
	        temp_root = os.environ.get('TMPDIR') or tempfile.gettempdir()
	        cache_dir = os.path.join(temp_root, 'model_cache')
	        os.makedirs(cache_dir, exist_ok=True)

	        model = SentenceTransformer(model_name, cache_folder=cache_dir)
	        logger.info(f"Loaded model: {model_name}")
	        return model
	    except Exception as e:
	        # logger.exception keeps the traceback, which logger.error dropped.
	        logger.exception(f"Error loading model: {str(e)}")
	        return None

	def get_sentence_embedding(text):
	    """Encode ``text`` with the shared model and return the resulting tensor.

	    The model is loaded through ``initialize_model()`` the first time it is
	    needed. ``None`` is returned when no model is available or when encoding
	    raises; the error is logged in the latter case.
	    """
	    global model
	    try:
	        # Lazy load: only touch initialize_model() while nothing is loaded yet.
	        if model is None:
	            model = initialize_model()
	        if model is None:
	            return None

	        return model.encode(text, convert_to_tensor=True)
	    except Exception as e:
	        logger.error(f"Error getting embedding: {str(e)}")
	        return None

	def compute_similarity(student_answer, correct_answer):
	    """Compute semantic similarity between two texts.

	    Args:
	        student_answer: Text written by the student.
	        correct_answer: Reference text to compare against.

	    Returns:
	        The cosine similarity of the two embeddings as a float, or 0.0 when
	        either text is missing or blank, when an embedding cannot be
	        produced, or when the computation raises.
	    """
	    # A missing or blank answer has nothing to compare; without this guard it
	    # would still be embedded and receive an arbitrary non-zero score.
	    for text in (student_answer, correct_answer):
	        if text is None or (isinstance(text, str) and not text.strip()):
	            return 0.0

	    try:
	        # Get embeddings
	        student_emb = get_sentence_embedding(student_answer)
	        correct_emb = get_sentence_embedding(correct_answer)

	        if student_emb is None or correct_emb is None:
	            return 0.0

	        # Calculate cosine similarity; unsqueeze adds the leading dimension
	        # so the two vectors are compared as one pair.
	        similarity = torch.nn.functional.cosine_similarity(
	            student_emb.unsqueeze(0), correct_emb.unsqueeze(0)
	        )
	        return float(similarity)
	    except Exception as e:
	        logger.exception(f"Error calculating similarity: {str(e)}")
	        return 0.0

	def question_vector_sentence(student_answer, correct_answer):
	    """Return the sentence-level semantic similarity of the two answers.

	    Delegates to ``compute_similarity``; 0.0 is returned (and the error
	    logged) if that call raises.
	    """
	    try:
	        score = compute_similarity(student_answer, correct_answer)
	    except Exception as e:
	        logger.error(f"Error in question_vector_sentence: {str(e)}")
	        score = 0.0
	    return score

	def question_vector_word(student_answer, correct_answer):
	    """Get semantic similarity score for individual words.

	    Each whitespace-separated, lower-cased word of ``student_answer`` is
	    matched with its most similar word in ``correct_answer`` (cosine
	    similarity of the word embeddings); the score is the mean of those
	    best matches.

	    Args:
	        student_answer: Text written by the student.
	        correct_answer: Reference text to compare against.

	    Returns:
	        The averaged best-match similarity as a float, or 0.0 when either
	        text has no words, the embeddings are unavailable, or an error
	        occurs.
	    """
	    try:
	        # Split into words
	        student_words = student_answer.lower().split()
	        correct_words = correct_answer.lower().split()

	        if not student_words or not correct_words:
	            return 0.0

	        # Encode each word list once as a batch instead of re-encoding both
	        # words for every (student word, correct word) pair.
	        student_emb = get_sentence_embedding(student_words)
	        correct_emb = get_sentence_embedding(correct_words)

	        if student_emb is None or correct_emb is None:
	            return 0.0

	        # Unit-length rows turn the matrix product into pairwise cosine
	        # similarities: rows are student words, columns are correct words.
	        student_unit = torch.nn.functional.normalize(student_emb, dim=1)
	        correct_unit = torch.nn.functional.normalize(correct_emb, dim=1)
	        pairwise = student_unit @ correct_unit.T

	        # Best match per student word, then the average over student words.
	        best_matches = pairwise.max(dim=1).values
	        return float(best_matches.mean())
	    except Exception as e:
	        logger.exception(f"Error in question_vector_word: {str(e)}")
	        return 0.0

	# Use custom directory for gensim data: keep an existing GENSIM_DATA_DIR,
	# otherwise export the system temp directory under that name.
	# NOTE(review): gensim.downloader is imported above this assignment and may
	# read GENSIM_DATA_DIR at import time; confirm the value set here is honoured.
	gensim_data_dir = os.environ.setdefault('GENSIM_DATA_DIR', tempfile.gettempdir())

	# Load fasttext with error handling
	try:
	    print("Loading fasttext model...")
	    fasttext = load('fasttext-wiki-news-subwords-300')
	except Exception as e:
	    print(f"Error loading fasttext model: {e}")

	    # Provide a fallback for similarity calculations
	    class DummyFasttext:
	        """Stand-in used when the real vectors cannot be loaded.

	        Every word is reported as known and maps to an all-zero vector.
	        """

	        # sentence_to_vec() reads model.vector_size, so the stand-in has to
	        # expose it as well.
	        vector_size = 300

	        def __getitem__(self, word):
	            return np.zeros(self.vector_size)  # Zero vector of size 300

	        def __contains__(self, word):
	            return True

	    fasttext = DummyFasttext()

	    # NOTE: the unconditional compute_scm defined further down in this file
	    # replaces this fallback once the module has finished loading.
	    def compute_scm(tokens1, tokens2, model):
	        return 0.5  # Return default similarity score

	# The NLTK resources ('punkt' and 'stopwords') are downloaded in main.py, so
	# the download calls below are intentionally left disabled here.
	# nltk.download('punkt')
	# nltk.download('stopwords')

	def preprocess(sentence):
	    """Lowercase and tokenize ``sentence``, then drop English stop words.

	    No punctuation removal is performed; only tokens found in the NLTK
	    English stop-word list are filtered out.

	    Args:
	        sentence: Text to preprocess.

	    Returns:
	        The remaining tokens as a list, or an empty list if any step raises
	        (the error is printed).
	    """
	    try:
	        # Lowercase
	        lowered = sentence.lower()
	        # Tokenize
	        tokens = word_tokenize(lowered)
	        # Fetch the stop-word list once per call and keep it in a set; the
	        # previous code re-read it for every token and searched it linearly.
	        stop_words = set(stopwords.words('english'))
	        # Remove stop words
	        return [token for token in tokens if token not in stop_words]
	    except Exception as e:
	        print(f"Error in preprocess: {str(e)}")
	        return []

	def sentence_to_vec(tokens, model):
	    """Average the vectors of the tokens that ``model`` knows.

	    Args:
	        tokens: Iterable of word tokens.
	        model: Object supporting ``word in model`` and ``model[word]``. Its
	            ``vector_size`` attribute, when present, sets the length of the
	            zero vector; 300 is assumed otherwise.

	    Returns:
	        The mean of the known tokens' vectors, or a zero vector when no
	        token is known or an error occurs (the error is printed).
	    """
	    # Resolve the dimensionality up front so both zero-vector fallbacks match
	    # the model instead of being hard-coded to 300.
	    dimension = getattr(model, 'vector_size', 300)
	    try:
	        # Keep only the vectors of words present in the model's vocabulary
	        word_vectors = [model[word] for word in tokens if word in model]

	        # If there are no valid words, return a zero vector
	        if not word_vectors:
	            return np.zeros(dimension)

	        # Compute the average vector
	        return np.mean(word_vectors, axis=0)
	    except Exception as e:
	        print(f"Error in sentence_to_vec: {str(e)}")
	        return np.zeros(dimension)  # Return zero vector as fallback

	def compute_scm(tokens1, tokens2, model):
	    """Score two token lists against each other using word-embedding term similarity.

	    Both token lists are converted to bag-of-words vectors over a shared
	    dictionary, and their normalized inner product is taken under a term
	    similarity matrix built from ``model``.

	    Returns:
	        The similarity as a plain float, or 0.5 when any step raises (the
	        error is printed).
	    """
	    try:
	        vocabulary = corpora.Dictionary([tokens1, tokens2])
	        bow_first = vocabulary.doc2bow(tokens1)
	        bow_second = vocabulary.doc2bow(tokens2)

	        similarity_matrix = SparseTermSimilarityMatrix(
	            WordEmbeddingSimilarityIndex(model), vocabulary
	        )
	        score = similarity_matrix.inner_product(
	            bow_first, bow_second, normalized=(True, True)
	        )
	        # Plain float so the value can be JSON-serialized.
	        return float(score)
	    except Exception as e:
	        print(f"Error in compute_scm: {str(e)}")
	        # Default similarity score used on failure.
	        return 0.5

	def fasttext_similarity(student_answer, correct_answer):
	    """Return a similarity score for the two answers.

	    Despite its name, this function delegates to ``compute_similarity``; the
	    module-level ``fasttext`` object is not used here. 0.0 is returned (and
	    the error logged) if the call raises.
	    """
	    try:
	        score = compute_similarity(student_answer, correct_answer)
	    except Exception as e:
	        logger.error(f"Error in fasttext_similarity: {str(e)}")
	        score = 0.0
	    return score