|
from sentence_transformers import util |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
from gensim.models import KeyedVectors |
|
import numpy as np |
|
import nltk |
|
from gensim import corpora |
|
from gensim.models import FastText |
|
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex |
|
from gensim.downloader import load |
|
import sys |
|
import os |
|
import tempfile |
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) |
|
from all_models import models |
|
from sentence_transformers import SentenceTransformer |
|
import torch |
|
import logging |
|
|
|
|
|
# Emit INFO-level (and above) records; basicConfig only takes effect if the
# root logger has no handlers configured yet.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared sentence-embedding model. Starts as None and is filled in on first
# use by initialize_model() / get_sentence_embedding().
model = None
|
|
|
def initialize_model():
    """Load the sentence-embedding model into the module-level ``model``.

    The model ``paraphrase-MiniLM-L6-v2`` is loaded through
    ``SentenceTransformer`` with its cache placed in ``<TMPDIR>/model_cache``
    (``/tmp/model_cache`` when ``TMPDIR`` is unset). If a model is already
    loaded it is returned as-is instead of being loaded a second time.

    Returns:
        The loaded model, or ``None`` if loading raised an exception (the
        error is logged, not propagated).
    """
    global model

    # Loading is expensive; reuse the instance that is already in memory.
    if model is not None:
        return model

    model_name = 'paraphrase-MiniLM-L6-v2'
    try:
        cache_dir = os.path.join(os.environ.get('TMPDIR', '/tmp'), 'model_cache')
        os.makedirs(cache_dir, exist_ok=True)
        model = SentenceTransformer(model_name, cache_folder=cache_dir)
    except Exception as e:
        logger.error("Error loading model: %s", e)
        return None

    logger.info("Loaded model: %s", model_name)
    return model
|
|
|
def get_sentence_embedding(text):
    """Encode ``text`` with the shared sentence-embedding model.

    The module-level ``model`` is loaded on first use via
    ``initialize_model()``.

    Args:
        text: Value handed to ``model.encode``.

    Returns:
        The result of ``model.encode(text, convert_to_tensor=True)``, or
        ``None`` when ``text`` is ``None``, the model cannot be loaded, or
        encoding raises (the error is logged, not propagated).
    """
    global model

    # Nothing to encode: avoid triggering a model load just to fail.
    if text is None:
        return None

    try:
        if model is None:
            model = initialize_model()
        if model is None:
            return None
        return model.encode(text, convert_to_tensor=True)
    except Exception as e:
        logger.error("Error getting embedding: %s", e)
        return None
|
|
|
def compute_similarity(student_answer, correct_answer):
    """Compute semantic similarity between two texts.

    Both texts are embedded with ``get_sentence_embedding`` and compared with
    cosine similarity.

    Args:
        student_answer: Text to score.
        correct_answer: Reference text to compare against.

    Returns:
        The cosine similarity as a Python ``float``, or ``0.0`` when either
        embedding is unavailable or any step raises (the error is logged).
    """
    try:
        student_emb = get_sentence_embedding(student_answer)
        if student_emb is None:
            # No point encoding the reference if the first embedding failed.
            return 0.0

        correct_emb = get_sentence_embedding(correct_answer)
        if correct_emb is None:
            return 0.0

        # unsqueeze(0) adds a leading dimension so cosine_similarity, which
        # reduces over dim=1 by default, compares the two embeddings and
        # yields a single-element tensor.
        similarity = torch.nn.functional.cosine_similarity(
            student_emb.unsqueeze(0), correct_emb.unsqueeze(0)
        )
        return float(similarity)
    except Exception as e:
        logger.error("Error calculating similarity: %s", e)
        return 0.0
|
|
|
def question_vector_sentence(student_answer, correct_answer):
    """Get the sentence-level semantic similarity score for two answers.

    Delegates to ``compute_similarity`` on the full texts.

    Args:
        student_answer: Text to score.
        correct_answer: Reference text to compare against.

    Returns:
        The value returned by ``compute_similarity``, or ``0.0`` if that call
        raises (the error is logged).
    """
    try:
        return compute_similarity(student_answer, correct_answer)
    except Exception as e:
        logger.error("Error in question_vector_sentence: %s", e)
        return 0.0
|
|
|
def question_vector_word(student_answer, correct_answer):
    """Get a word-level semantic similarity score for two answers.

    Both answers are lower-cased and split on whitespace. Each student word is
    scored by its best ``compute_similarity`` match among the reference words,
    and the score is the mean of those best matches over all student words
    (repeated words count once per occurrence).

    Args:
        student_answer: Text to score.
        correct_answer: Reference text to compare against.

    Returns:
        The mean best-match similarity as a ``float``; ``0.0`` when either
        answer has no words or when any step raises (the error is logged).
    """
    try:
        student_words = student_answer.lower().split()
        correct_words = correct_answer.lower().split()

        if not student_words or not correct_words:
            return 0.0

        # Duplicates cannot change a max(), so compare against each distinct
        # reference word only once (order preserved).
        unique_correct = list(dict.fromkeys(correct_words))

        # Every compute_similarity call runs the embedding model; remember the
        # best score per distinct student word so repeats are not re-scored.
        best_match = {}
        for s_word in student_words:
            if s_word not in best_match:
                best_match[s_word] = max(
                    compute_similarity(s_word, c_word) for c_word in unique_correct
                )

        # Rebuild the per-occurrence list so the mean weights repeated words
        # exactly as scoring every occurrence would.
        similarities = [best_match[s_word] for s_word in student_words]
        return sum(similarities) / len(similarities)
    except Exception as e:
        logger.error("Error in question_vector_word: %s", e)
        return 0.0
|
|
|
|
|
# Directory for gensim data: honour an existing GENSIM_DATA_DIR, otherwise use
# the system temp directory, and export the choice through the environment.
# NOTE(review): gensim.downloader may read GENSIM_DATA_DIR when it is first
# imported; that import happens at the top of this module, before this
# assignment runs - confirm the setting actually takes effect.
gensim_data_dir = os.getenv('GENSIM_DATA_DIR', tempfile.gettempdir())
os.environ['GENSIM_DATA_DIR'] = gensim_data_dir

try:
    print("Loading fasttext model...")
    fasttext = load('fasttext-wiki-news-subwords-300')
except Exception as e:
    print(f"Error loading fasttext model: {e}")

    class DummyFasttext:
        """Stand-in used when the fasttext vectors cannot be loaded.

        Claims to contain every word and maps each one to an all-zero vector.
        """

        # Dimension of the zero vectors; exposed under the attribute name that
        # sentence_to_vec() reads from its model argument.
        vector_size = 300

        def __getitem__(self, word):
            return np.zeros(self.vector_size)

        def __contains__(self, word):
            return True

    fasttext = DummyFasttext()
|
|
|
def compute_scm(tokens1, tokens2, model):
    """Placeholder similarity score: ignores its arguments and returns 0.5.

    NOTE: a second ``compute_scm`` is defined further down in this module and
    rebinds the name, so this placeholder is not the version callers get once
    the module has finished importing.
    """
    return 0.5
|
|
|
|
|
|
|
|
|
|
|
def preprocess(sentence):
    """Lower-case and tokenize ``sentence``, dropping English stopwords.

    Args:
        sentence: Text to tokenize.

    Returns:
        The list of remaining tokens in their original order, or ``[]`` if
        any step raises (the error is printed, not propagated).
    """
    try:
        tokens = word_tokenize(sentence.lower())
        if not tokens:
            return tokens

        # Fetch the stopword list once per call and use a set, instead of
        # calling stopwords.words() again for every token.
        english_stopwords = set(stopwords.words('english'))
        return [token for token in tokens if token not in english_stopwords]
    except Exception as e:
        print(f"Error in preprocess: {str(e)}")
        return []
|
|
|
def sentence_to_vec(tokens, model):
    """Average the word vectors of ``tokens`` into a single sentence vector.

    Tokens for which ``word in model`` is false are skipped.

    Args:
        tokens: Iterable of words.
        model: Object supporting ``word in model`` and ``model[word]``. Its
            ``vector_size`` attribute, when present, sets the length of the
            zero-vector fallback; otherwise 300 is used.

    Returns:
        The element-wise mean of the known words' vectors. An all-zero vector
        is returned when no token is known to the model or when any step
        raises (the error is printed, not propagated).
    """
    # Keep the fallback length in step with the model so a zero vector can be
    # compared with real sentence vectors produced by the same model.
    dim = getattr(model, 'vector_size', 300)
    try:
        word_vectors = [model[word] for word in tokens if word in model]
        if not word_vectors:
            return np.zeros(dim)
        return np.mean(word_vectors, axis=0)
    except Exception as e:
        print(f"Error in sentence_to_vec: {str(e)}")
        return np.zeros(dim)
|
|
|
def compute_scm(tokens1, tokens2, model):
    """Score two token lists with gensim's term-similarity inner product.

    A dictionary is built from the two token lists, each list is converted to
    a bag-of-words vector, and the vectors are compared through a
    ``SparseTermSimilarityMatrix`` derived from ``model`` with both sides
    normalized.

    Args:
        tokens1: Tokens of the first text.
        tokens2: Tokens of the second text.
        model: Word-embedding model passed to ``WordEmbeddingSimilarityIndex``.

    Returns:
        The normalized inner product as a ``float``, or ``0.5`` if any step
        raises (the error is printed, not propagated).
    """
    try:
        dictionary = corpora.Dictionary([tokens1, tokens2])
        bow1 = dictionary.doc2bow(tokens1)
        bow2 = dictionary.doc2bow(tokens2)

        termsim_index = WordEmbeddingSimilarityIndex(model)
        termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

        similarity = termsim_matrix.inner_product(bow1, bow2, normalized=(True, True))
        return float(similarity)
    except Exception as e:
        print(f"Error in compute_scm: {str(e)}")
        return 0.5
|
|
|
def fasttext_similarity(student_answer, correct_answer):
    """Compute a similarity score between two answers.

    NOTE: despite the name, this does not use the fasttext vectors; it
    delegates to ``compute_similarity`` on the full texts.

    Args:
        student_answer: Text to score.
        correct_answer: Reference text to compare against.

    Returns:
        The value returned by ``compute_similarity``, or ``0.0`` if that call
        raises (the error is logged).
    """
    try:
        return compute_similarity(student_answer, correct_answer)
    except Exception as e:
        logger.error("Error in fasttext_similarity: %s", e)
        return 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|