yamanavijayavardhan's picture
update_new_new_new_new_new
9556995
raw
history blame
6.5 kB
from sentence_transformers import util
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import numpy as np
import nltk
from gensim import corpora
from gensim.models import FastText
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.downloader import load
import sys
import os
import tempfile
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from all_models import models
from sentence_transformers import SentenceTransformer
import torch
import logging
# Set up logging
# NOTE: basicConfig is called at import time, so importing this module
# requests INFO-level logging configuration as a side effect.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Global model variable
# Starts as None; initialize_model() assigns the loaded SentenceTransformer
# here, and get_sentence_embedding() triggers that load on first use.
model = None
def initialize_model():
    """Load the sentence-embedding model and publish it as the global ``model``.

    The model files are cached under ``<TMPDIR>/model_cache`` (``/tmp`` is used
    when the ``TMPDIR`` environment variable is not set); the directory is
    created if it does not exist yet.

    Returns:
        The loaded ``SentenceTransformer`` instance, or ``None`` if anything
        went wrong while preparing the cache directory or loading the model
        (the error is logged, not raised).
    """
    global model
    try:
        # Small model chosen by the original author (noted as roughly 80MB).
        chosen_model = 'paraphrase-MiniLM-L6-v2'
        tmp_root = os.environ.get('TMPDIR', '/tmp')
        cache_path = os.path.join(tmp_root, 'model_cache')
        os.makedirs(cache_path, exist_ok=True)
        model = SentenceTransformer(chosen_model, cache_folder=cache_path)
        logger.info(f"Loaded model: {chosen_model}")
        return model
    except Exception as exc:
        # Best-effort loader: callers treat None as "model unavailable".
        logger.error(f"Error loading model: {str(exc)}")
        return None
def get_sentence_embedding(text):
    """Encode ``text`` with the shared sentence-embedding model.

    The global ``model`` is loaded on demand through ``initialize_model()``
    the first time it is needed.

    Args:
        text: Value handed unchanged to ``model.encode``.

    Returns:
        The result of ``model.encode(text, convert_to_tensor=True)``, or
        ``None`` when the model cannot be loaded or encoding raises (the error
        is logged).
    """
    global model
    try:
        if model is None:
            # Lazy load; a failed load leaves us with nothing to encode with.
            model = initialize_model()
            if model is None:
                return None
        return model.encode(text, convert_to_tensor=True)
    except Exception as exc:
        logger.error(f"Error getting embedding: {str(exc)}")
        return None
def compute_similarity(student_answer, correct_answer):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        student_answer: Text to be scored.
        correct_answer: Reference text to compare against.

    Returns:
        float: Cosine similarity of the two embeddings. ``0.0`` is returned
        when either embedding is unavailable or when any step raises (the
        error is logged).
    """
    try:
        # Both texts are always embedded before the availability check.
        embeddings = [
            get_sentence_embedding(answer)
            for answer in (student_answer, correct_answer)
        ]
        if any(emb is None for emb in embeddings):
            return 0.0

        # Add a leading batch axis so the similarity is taken along dim 1.
        student_batch, correct_batch = (emb.unsqueeze(0) for emb in embeddings)
        score = torch.nn.functional.cosine_similarity(student_batch, correct_batch)
        return float(score)
    except Exception as exc:
        logger.error(f"Error calculating similarity: {str(exc)}")
        return 0.0
def question_vector_sentence(student_answer, correct_answer):
    """Score two answers at sentence level.

    Thin wrapper around ``compute_similarity`` applied to the full texts.

    Returns:
        float: The similarity score, or ``0.0`` if the call raises (the error
        is logged).
    """
    try:
        score = compute_similarity(student_answer, correct_answer)
    except Exception as exc:
        logger.error(f"Error in question_vector_sentence: {str(exc)}")
        score = 0.0
    return score
def question_vector_word(student_answer, correct_answer):
    """Score word-level semantic overlap between two answers.

    Both answers are lower-cased and split on whitespace. Every student word
    is scored by its best ``compute_similarity`` match among the reference
    words, and those best scores are averaged over all student words.

    Args:
        student_answer (str): Answer to be scored.
        correct_answer (str): Reference answer.

    Returns:
        float: Mean best-match similarity. ``0.0`` when either answer has no
        words, or when anything raises (the error is logged).
    """
    try:
        student_words = student_answer.lower().split()
        correct_words = correct_answer.lower().split()
        if not student_words or not correct_words:
            return 0.0

        # Duplicate reference words cannot change a maximum, so compare
        # against each distinct reference word once (first-seen order kept).
        reference_words = list(dict.fromkeys(correct_words))

        # Each compute_similarity call runs the embedding model twice, so the
        # best match is computed once per distinct student word and reused.
        best_match = {}
        for s_word in student_words:
            if s_word not in best_match:
                best_match[s_word] = max(
                    compute_similarity(s_word, c_word)
                    for c_word in reference_words
                )

        # Average over every occurrence so repeated student words keep the
        # same weight they had before de-duplication.
        return sum(best_match[word] for word in student_words) / len(student_words)
    except Exception as e:
        logger.error(f"Error in question_vector_word: {str(e)}")
        return 0.0
# Use custom directory for gensim data: keep an externally supplied
# GENSIM_DATA_DIR, otherwise fall back to the system temp directory.
# NOTE(review): gensim.downloader may read GENSIM_DATA_DIR when it is first
# imported; `load` is imported at the top of this file, so this assignment
# could come too late to take effect - confirm against the gensim version used.
gensim_data_dir = os.getenv('GENSIM_DATA_DIR', tempfile.gettempdir())
os.environ['GENSIM_DATA_DIR'] = gensim_data_dir

# Load fasttext with error handling: a failed load must not break the import
# of this module, so a stand-in object is installed instead.
try:
    print("Loading fasttext model...")
    fasttext = load('fasttext-wiki-news-subwords-300')
except Exception as e:
    print(f"Error loading fasttext model: {e}")

    class DummyFasttext:
        """Stand-in for the fastText vectors when they cannot be loaded.

        Claims to contain every word and maps each one to a zero vector.
        """

        # Exposed because sentence_to_vec() reads ``model.vector_size``;
        # without it the fallback always went through an error path there.
        vector_size = 300

        def __getitem__(self, word):
            return np.zeros(self.vector_size)  # Zero vector of size 300

        def __contains__(self, word):
            return True

    fasttext = DummyFasttext()

    # NOTE: the module-level compute_scm defined later in this file rebinds
    # this name unconditionally, so this fallback is not the one callers get.
    def compute_scm(tokens1, tokens2, model):
        return 0.5  # Return default similarity score
# NLTK resource downloads are handled in main.py, so the calls below are intentionally left disabled.
# nltk.download('punkt')
# nltk.download('stopwords')
def preprocess(sentence):
    """Lower-case and tokenize a sentence, then drop English stop words.

    No explicit punctuation filtering is performed: whatever tokens
    ``word_tokenize`` produces are kept unless they are stop words.

    Args:
        sentence (str): Raw text to tokenize.

    Returns:
        list: Remaining tokens in their original order. An empty list is
        returned if anything raises (for example a non-string input or an
        unavailable NLTK resource); the error is printed, not raised.
    """
    try:
        words = word_tokenize(sentence.lower())
        if not words:
            return []
        # Build the stop-word set once per call. The list used to be fetched
        # again for every token and searched linearly.
        stop_words = set(stopwords.words('english'))
        return [word for word in words if word not in stop_words]
    except Exception as e:
        print(f"Error in preprocess: {str(e)}")
        return []
def sentence_to_vec(tokens, model):
    """Average the word vectors of ``tokens`` into one sentence vector.

    Args:
        tokens: Iterable of words.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``. Its ``vector_size`` attribute, when present,
            sets the size of the zero vector used as a fallback.

    Returns:
        numpy.ndarray: Mean of the vectors of the tokens found in ``model``.
        A zero vector is returned when no token is found or when anything
        raises (the error is printed, not raised).
    """
    # Size the fallback from the model so it matches the real vectors; 300 is
    # kept only for models that do not declare a vector_size.
    dim = getattr(model, 'vector_size', 300)
    try:
        # Only words known to the model contribute to the average.
        word_vectors = [model[word] for word in tokens if word in model]
        if not word_vectors:
            return np.zeros(dim)
        return np.mean(word_vectors, axis=0)
    except Exception as e:
        print(f"Error in sentence_to_vec: {str(e)}")
        return np.zeros(dim)  # Zero vector as fallback
def compute_scm(tokens1, tokens2, model):
    """Compute a soft-cosine style similarity between two token lists.

    A gensim dictionary is built from the two token lists, both are converted
    to bag-of-words form, and their normalized inner product is taken under a
    term-similarity matrix derived from ``model``.

    Args:
        tokens1: Tokens of the first document.
        tokens2: Tokens of the second document.
        model: Word-embedding object handed to ``WordEmbeddingSimilarityIndex``.

    Returns:
        float: The similarity as a plain float (JSON-serializable), or the
        default ``0.5`` if anything raises (the error is printed).
    """
    try:
        vocab = corpora.Dictionary([tokens1, tokens2])
        bow_first = vocab.doc2bow(tokens1)
        bow_second = vocab.doc2bow(tokens2)
        similarity_matrix = SparseTermSimilarityMatrix(
            WordEmbeddingSimilarityIndex(model), vocab
        )
        score = similarity_matrix.inner_product(
            bow_first, bow_second, normalized=(True, True)
        )
        return float(score)
    except Exception as err:
        print(f"Error in compute_scm: {str(err)}")
        return 0.5  # Default similarity score
def fasttext_similarity(student_answer, correct_answer):
    """Score two answers; delegates to ``compute_similarity``.

    Despite its name, this function does not consult the ``fasttext`` vectors
    loaded in this module: it simply forwards both texts to
    ``compute_similarity``.

    Returns:
        float: The similarity score, or ``0.0`` if the call raises (the error
        is logged).
    """
    try:
        score = compute_similarity(student_answer, correct_answer)
    except Exception as exc:
        logger.error(f"Error in fasttext_similarity: {str(exc)}")
        score = 0.0
    return score