"""Similarity helpers for comparing a student answer with a correct answer.

The module exposes two families of helpers:

* Sentence-transformer based scoring (``compute_similarity`` and the thin
  wrappers ``question_vector_sentence``, ``question_vector_word`` and
  ``fasttext_similarity``), backed by a lazily loaded ``SentenceTransformer``.
* gensim based utilities (``preprocess``, ``sentence_to_vec``,
  ``compute_scm``) that operate on the word-vector model stored in the
  module-level ``fasttext`` name.

Importing this module configures logging and tries to load the
``fasttext-wiki-news-subwords-300`` vectors through ``gensim.downloader``.
"""
from sentence_transformers import util
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
import numpy as np
import nltk
from gensim import corpora
from gensim.models import FastText
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
from gensim.downloader import load
import sys
import os
import tempfile

# Make the directory three levels above this file importable so that
# ``all_models`` can be resolved.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from all_models import models
from sentence_transformers import SentenceTransformer
import torch
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global model variable; populated on first use by initialize_model().
model = None


def initialize_model():
    """Load the sentence-transformer model into the module-level ``model``.

    The model files are cached under ``$TMPDIR/model_cache`` (``/tmp`` when
    ``TMPDIR`` is unset); the directory is created if needed.

    Returns:
        The loaded ``SentenceTransformer``, or ``None`` if loading failed
        (the error is logged, not raised).
    """
    global model
    try:
        # Use a smaller, more efficient model
        model_name = 'paraphrase-MiniLM-L6-v2'  # Only about 80MB
        cache_dir = os.path.join(os.environ.get('TMPDIR', '/tmp'), 'model_cache')
        os.makedirs(cache_dir, exist_ok=True)

        model = SentenceTransformer(model_name, cache_folder=cache_dir)
        logger.info(f"Loaded model: {model_name}")
        return model
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        return None


def get_sentence_embedding(text):
    """Encode ``text`` with the shared model, loading the model on first use.

    Returns:
        The tensor produced by ``model.encode(text, convert_to_tensor=True)``,
        or ``None`` if the model could not be loaded or encoding failed.
    """
    try:
        global model
        if model is None:
            model = initialize_model()
            if model is None:
                return None

        # Get embeddings
        return model.encode(text, convert_to_tensor=True)
    except Exception as e:
        logger.error(f"Error getting embedding: {str(e)}")
        return None


def _embedding_similarity(student_emb, correct_emb):
    """Return the cosine similarity of two embeddings as a float.

    Returns 0.0 when either embedding is ``None`` or when the computation
    raises (the error is logged).
    """
    if student_emb is None or correct_emb is None:
        return 0.0
    try:
        # unsqueeze(0) adds a leading dimension so cosine_similarity compares
        # the two vectors along its default dim=1.
        similarity = torch.nn.functional.cosine_similarity(
            student_emb.unsqueeze(0), correct_emb.unsqueeze(0)
        )
        return float(similarity)
    except Exception as e:
        logger.error(f"Error calculating similarity: {str(e)}")
        return 0.0


def compute_similarity(student_answer, correct_answer):
    """Compute semantic similarity between two texts.

    Both texts are embedded with ``get_sentence_embedding`` and compared by
    cosine similarity.

    Returns:
        The similarity as a float, or 0.0 if an embedding is unavailable or
        any step fails.
    """
    try:
        # Get embeddings
        student_emb = get_sentence_embedding(student_answer)
        correct_emb = get_sentence_embedding(correct_answer)
        return _embedding_similarity(student_emb, correct_emb)
    except Exception as e:
        logger.error(f"Error calculating similarity: {str(e)}")
        return 0.0


def question_vector_sentence(student_answer, correct_answer):
    """Get semantic similarity score for sentences.

    Thin wrapper around ``compute_similarity``; returns 0.0 on any error.
    """
    try:
        return compute_similarity(student_answer, correct_answer)
    except Exception as e:
        logger.error(f"Error in question_vector_sentence: {str(e)}")
        return 0.0


def question_vector_word(student_answer, correct_answer):
    """Get semantic similarity score for individual words.

    Both answers are lower-cased and split on whitespace. For every student
    word the best cosine similarity against any correct-answer word is taken,
    and the mean of those best scores is returned.

    Returns:
        The mean best-match similarity, or 0.0 if either answer has no words
        or an error occurs.
    """
    try:
        # Split into words
        student_words = student_answer.lower().split()
        correct_words = correct_answer.lower().split()

        if not student_words or not correct_words:
            return 0.0

        # Encode every distinct word once instead of re-encoding both words
        # for each (student, correct) pair.
        embeddings = {
            word: get_sentence_embedding(word)
            for word in dict.fromkeys(student_words + correct_words)
        }

        # Best match against the correct answer for each student word.
        best_matches = [
            max(
                _embedding_similarity(embeddings[s_word], embeddings[c_word])
                for c_word in correct_words
            )
            for s_word in student_words
        ]

        # Return average similarity
        return sum(best_matches) / len(best_matches)
    except Exception as e:
        logger.error(f"Error in question_vector_word: {str(e)}")
        return 0.0


# Use custom directory for gensim data
# NOTE(review): gensim.downloader is imported above, before this variable is
# set. If it reads GENSIM_DATA_DIR at import time, this assignment does not
# affect where `load` stores data -- confirm against the gensim version used.
gensim_data_dir = os.getenv('GENSIM_DATA_DIR', tempfile.gettempdir())
os.environ['GENSIM_DATA_DIR'] = gensim_data_dir


class DummyFasttext:
    """Stand-in used when the fasttext vectors cannot be loaded.

    Claims to contain every word and maps each one to a 300-dimensional
    zero vector.
    """

    # Lets sentence_to_vec size its zero vector without hitting its
    # exception fallback.
    vector_size = 300

    def __getitem__(self, word):
        return np.zeros(300)  # Return zero vector of size 300

    def __contains__(self, word):
        return True


# Load fasttext with error handling
try:
    print("Loading fasttext model...")
    fasttext = load('fasttext-wiki-news-subwords-300')
except Exception as e:
    print(f"Error loading fasttext model: {e}")
    # Provide a fallback for similarity calculations. No separate fallback
    # compute_scm is defined here: the compute_scm below always replaced it
    # and itself returns 0.5 when the computation fails.
    fasttext = DummyFasttext()

# NLTK resources ('punkt', 'stopwords') are not downloaded here; the original
# note says downloads are handled in main.py.


def preprocess(sentence):
    """Lower-case and tokenize ``sentence``, dropping English stop words.

    Punctuation is not stripped; whatever tokens ``word_tokenize`` produces
    are kept unless they are stop words.

    Returns:
        The list of remaining tokens, or ``[]`` if any step fails (the error
        is printed).
    """
    try:
        # Lowercase
        sentence = sentence.lower()
        # Tokenize
        words = word_tokenize(sentence)
        # Build the stop-word set once rather than reloading the list for
        # every token.
        stop_words = set(stopwords.words('english'))
        return [word for word in words if word not in stop_words]
    except Exception as e:
        print(f"Error in preprocess: {str(e)}")
        return []


def sentence_to_vec(tokens, model):
    """Average the vectors of the tokens that ``model`` contains.

    Args:
        tokens: Iterable of word tokens.
        model: Object supporting ``in``, item lookup and ``vector_size``.

    Returns:
        The mean vector, a zero vector of ``model.vector_size`` when no token
        is in the model, or a 300-dimensional zero vector on error.
    """
    try:
        # Filter words that are in the model's vocabulary
        valid_words = [word for word in tokens if word in model]

        # If there are no valid words, return a zero vector
        if not valid_words:
            return np.zeros(model.vector_size)

        # Compute the average vector
        word_vectors = [model[word] for word in valid_words]
        return np.mean(word_vectors, axis=0)
    except Exception as e:
        print(f"Error in sentence_to_vec: {str(e)}")
        return np.zeros(300)  # Return zero vector as fallback


def compute_scm(tokens1, tokens2, model):
    """Compute the normalized soft-cosine inner product of two token lists.

    A gensim dictionary is built from both token lists, each list is turned
    into a bag of words, and the term-similarity matrix derived from
    ``model`` is used for the inner product.

    Returns:
        The similarity as a float, or 0.5 if the computation fails (the
        error is printed).
    """
    try:
        dictionary = corpora.Dictionary([tokens1, tokens2])
        bow1 = dictionary.doc2bow(tokens1)
        bow2 = dictionary.doc2bow(tokens2)

        termsim_index = WordEmbeddingSimilarityIndex(model)
        termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

        similarity = termsim_matrix.inner_product(bow1, bow2, normalized=(True, True))
        return float(similarity)  # Convert to float for JSON serialization
    except Exception as e:
        print(f"Error in compute_scm: {str(e)}")
        return 0.5  # Return default similarity score


def fasttext_similarity(student_answer, correct_answer):
    """Compute similarity between answers.

    Despite the name, this delegates to ``compute_similarity`` (the
    sentence-transformer path) and does not use the ``fasttext`` vectors.
    Returns 0.0 on any error.
    """
    try:
        return compute_similarity(student_answer, correct_answer)
    except Exception as e:
        logger.error(f"Error in fasttext_similarity: {str(e)}")
        return 0.0