Commit 8405423
Parent(s): 324bbc9

fix memory overlimit issue

Files changed:
- all_models.py +97 -45
- main.py +37 -0
- similarity_check/llm_based_scoring/llm.py +38 -54
- similarity_check/semantic_meaning_check/semantic.py +61 -18
all_models.py
CHANGED

Before:

@@ -12,6 +12,8 @@ logger = logging.getLogger(__name__)
 class ModelSingleton:
     _instance = None
     _initialized = False

     def __new__(cls):
         if cls._instance is None:

@@ -29,63 +31,113 @@
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Using device: {self.device}")

-            # Sentence transformer model
-            try:
-                logger.info("Loading sentence transformer model...")
-                SENTENCE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-                self.similarity_tokenizer = AutoTokenizer.from_pretrained(
-                    SENTENCE_MODEL,
-                    cache_dir=cache_dir
-                )
-                self.similarity_model = SentenceTransformer(
-                    SENTENCE_MODEL,
-                    cache_folder=cache_dir
-                )
-                self.similarity_model.to(self.device)
-                logger.info("Sentence transformer model loaded successfully")
-            except Exception as e:
-                logger.error(f"Error loading sentence transformer model: {e}")
-                raise
-
-            # Flan-T5-xl model
-            try:
-                logger.info("Loading Flan-T5 model...")
-                FLAN_MODEL = "google/flan-t5-xl"
-                self.flan_tokenizer = AutoTokenizer.from_pretrained(
-                    FLAN_MODEL,
-                    cache_dir=cache_dir
-                )
-                self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(
-                    FLAN_MODEL,
-                    cache_dir=cache_dir,
-                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
-                )
-                self.flan_model.to(self.device)
-                logger.info("Flan-T5 model loaded successfully")
-            except Exception as e:
-                logger.error(f"Error loading Flan-T5 model: {e}")
-                raise

             self._initialized = True

         except Exception as e:
             logger.error(f"Error during model initialization: {e}")
             raise

-    def
         try:
             torch.cuda.empty_cache()
         except Exception as e:
             logger.error(f"Error during cleanup: {e}")

 models = ModelSingleton()

 # Add cleanup function to the global instance
After:

@@ -12,6 +12,8 @@ logger = logging.getLogger(__name__)
 class ModelSingleton:
     _instance = None
     _initialized = False
+    _models = {}
+    _reference_counts = {}

     def __new__(cls):
         if cls._instance is None:

@@ -29,63 +31,113 @@
             self.device = "cuda" if torch.cuda.is_available() else "cpu"
             logger.info(f"Using device: {self.device}")

+            # Initialize with None values
+            self.similarity_tokenizer = None
+            self.similarity_model = None
+            self.flan_tokenizer = None
+            self.flan_model = None
+
+            # Initialize reference counts
+            self._reference_counts['similarity'] = 0
+            self._reference_counts['flan'] = 0

             self._initialized = True
+            logger.info("Model singleton initialized")

         except Exception as e:
             logger.error(f"Error during model initialization: {e}")
             raise

+    def get_similarity_model(self):
+        """Get sentence transformer model with reference counting"""
+        try:
+            if self.similarity_model is None:
+                logger.info("Loading sentence transformer model...")
+                SENTENCE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+                self.similarity_tokenizer = AutoTokenizer.from_pretrained(
+                    SENTENCE_MODEL,
+                    cache_dir=os.getenv('TRANSFORMERS_CACHE')
+                )
+                self.similarity_model = SentenceTransformer(
+                    SENTENCE_MODEL,
+                    cache_folder=os.getenv('TRANSFORMERS_CACHE')
+                )
+                self.similarity_model.to(self.device)
+                logger.info("Sentence transformer model loaded successfully")
+
+            self._reference_counts['similarity'] += 1
+            return self.similarity_model
+        except Exception as e:
+            logger.error(f"Error loading sentence transformer model: {e}")
+            raise
+
+    def get_flan_model(self):
+        """Get Flan-T5 model with reference counting"""
         try:
+            if self.flan_model is None:
+                logger.info("Loading Flan-T5 model...")
+                FLAN_MODEL = "google/flan-t5-xl"
+                self.flan_tokenizer = AutoTokenizer.from_pretrained(
+                    FLAN_MODEL,
+                    cache_dir=os.getenv('TRANSFORMERS_CACHE')
+                )
+                self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(
+                    FLAN_MODEL,
+                    cache_dir=os.getenv('TRANSFORMERS_CACHE'),
+                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                    low_cpu_mem_usage=True
+                )
+                self.flan_model.to(self.device)
+                logger.info("Flan-T5 model loaded successfully")
+
+            self._reference_counts['flan'] += 1
+            return self.flan_model
+        except Exception as e:
+            logger.error(f"Error loading Flan-T5 model: {e}")
+            raise
+
+    def release_similarity_model(self):
+        """Release reference to similarity model"""
+        self._reference_counts['similarity'] -= 1
+        if self._reference_counts['similarity'] <= 0:
+            self._cleanup_similarity_model()
+
+    def release_flan_model(self):
+        """Release reference to Flan-T5 model"""
+        self._reference_counts['flan'] -= 1
+        if self._reference_counts['flan'] <= 0:
+            self._cleanup_flan_model()
+
+    def _cleanup_similarity_model(self):
+        """Clean up similarity model resources"""
+        if self.similarity_model is not None:
+            del self.similarity_model
+            self.similarity_model = None
+            self.similarity_tokenizer = None
+            torch.cuda.empty_cache()
+            logger.info("Similarity model resources cleaned up")
+
+    def _cleanup_flan_model(self):
+        """Clean up Flan-T5 model resources"""
+        if self.flan_model is not None:
+            del self.flan_model
+            self.flan_model = None
+            self.flan_tokenizer = None
             torch.cuda.empty_cache()
+            logger.info("Flan-T5 model resources cleaned up")
+
+    def cleanup(self):
+        """Clean up all model resources"""
+        try:
+            self._cleanup_similarity_model()
+            self._cleanup_flan_model()
+            self._reference_counts['similarity'] = 0
+            self._reference_counts['flan'] = 0
+            logger.info("All model resources cleaned up successfully")
         except Exception as e:
             logger.error(f"Error during cleanup: {e}")

+# Create global instance
 models = ModelSingleton()

 # Add cleanup function to the global instance
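The pattern the new ModelSingleton expects from callers is acquire, use, release: get_*() lazily loads the model and bumps its reference count, release_*() decrements it, and the private _cleanup_*() methods free the weights and empty the CUDA cache once the count reaches zero. A minimal usage sketch, assuming the module is imported as elsewhere in the repo; the score_with_flan helper below is illustrative and not part of this commit:

    import torch
    from all_models import models

    def score_with_flan(prompt: str) -> str:
        # Hypothetical caller: acquire the model (lazy-loads on first use and
        # bumps the reference count), then release it in a finally block so
        # the count drops back and _cleanup_flan_model() can free GPU memory.
        model = models.get_flan_model()
        tokenizer = models.flan_tokenizer
        try:
            inputs = tokenizer(prompt, return_tensors="pt").to(models.device)
            with torch.no_grad():
                outputs = model.generate(**inputs, max_length=50)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        finally:
            models.release_flan_model()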
main.py
CHANGED

Before:

@@ -1,5 +1,7 @@
 import os
 import tempfile

 # Set up Hugging Face cache directory
 os.environ['TRANSFORMERS_CACHE'] = os.path.join(tempfile.gettempdir(), 'huggingface_cache')

@@ -420,6 +422,32 @@

     return Response(generate(), mimetype='text/event-stream')

 @app.route('/compute_marks', methods=['POST'])
 def compute_marks():
     try:

@@ -534,6 +562,9 @@

                 })
                 count += 1

             except Exception as e:
                 logger.error(f"Error processing {image_path}: {str(e)}")
                 results.append({

@@ -552,6 +583,9 @@

         except Exception as e:
             logger.warning(f"Could not clean up temporary files: {e}")

         return jsonify({"results": results}), 200

     except Exception as e:

@@ -561,6 +595,9 @@

             "message": error_msg
         })
         return jsonify({"error": error_msg}), 500

 def marks(answer, sen_vec_answers, word_vec_answers, tf_idf_word_values, max_tfidf, correct_answers):
     try:
After:

@@ -1,5 +1,7 @@
 import os
 import tempfile
+import gc
+import psutil

 # Set up Hugging Face cache directory
 os.environ['TRANSFORMERS_CACHE'] = os.path.join(tempfile.gettempdir(), 'huggingface_cache')

@@ -420,6 +422,32 @@

     return Response(generate(), mimetype='text/event-stream')

+def get_memory_usage():
+    """Get current memory usage"""
+    process = psutil.Process(os.getpid())
+    return process.memory_info().rss / 1024 / 1024  # Convert to MB
+
+def cleanup_memory():
+    """Clean up memory by clearing caches and garbage collection"""
+    try:
+        # Clear PyTorch cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        # Clear Python garbage collection
+        gc.collect()
+
+        # Clear model caches
+        if hasattr(models, 'cleanup'):
+            models.cleanup()
+
+        # Log memory usage
+        memory_usage = get_memory_usage()
+        log_print(f"Memory usage after cleanup: {memory_usage:.2f} MB")
+
+    except Exception as e:
+        log_print(f"Error during memory cleanup: {e}", "ERROR")
+
 @app.route('/compute_marks', methods=['POST'])
 def compute_marks():
     try:

@@ -534,6 +562,9 @@

                 })
                 count += 1

+                # Clean up memory after each student
+                cleanup_memory()
+
             except Exception as e:
                 logger.error(f"Error processing {image_path}: {str(e)}")
                 results.append({

@@ -552,6 +583,9 @@

         except Exception as e:
             logger.warning(f"Could not clean up temporary files: {e}")

+        # Final memory cleanup
+        cleanup_memory()
+
         return jsonify({"results": results}), 200

     except Exception as e:

@@ -561,6 +595,9 @@

             "message": error_msg
         })
         return jsonify({"error": error_msg}), 500
+    finally:
+        # Ensure memory is cleaned up even if there's an error
+        cleanup_memory()

 def marks(answer, sen_vec_answers, word_vec_answers, tf_idf_word_values, max_tfidf, correct_answers):
     try:
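Because cleanup_memory() already logs the post-cleanup RSS via get_memory_usage(), the same two helpers can bracket any heavy block to make the before/after numbers visible in the logs. A small sketch under that assumption; memory_guard is a hypothetical wrapper, not something this commit adds:

    from contextlib import contextmanager

    @contextmanager
    def memory_guard(label):
        # Hypothetical wrapper around the helpers added in this commit:
        # record RSS before the block, clean up afterwards, log the delta.
        before = get_memory_usage()
        try:
            yield
        finally:
            cleanup_memory()
            log_print(f"{label}: {before:.2f} MB -> {get_memory_usage():.2f} MB")

    # Usage inside compute_marks(), for example:
    #   with memory_guard(f"student {count}"):
    #       ...  # process one student's answer images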
similarity_check/llm_based_scoring/llm.py
CHANGED

Before:

@@ -23,69 +23,53 @@ def llm_score(correct_answers, answer):
             correct_answers = [correct_answers]

     score = []
     for correct_answer in correct_answers:
         try:
-            prompt = (
-                "Focus on meaning rather than transcription errors.\n\n"
-                "### Evaluation Criteria:\n"
-                "- *Correctness (90% weight):* Does the answer accurately convey the meaning of the correct answer?\n"
-                "- *Completeness (10% weight):* Does it cover all key points?\n\n"
-                "### Handling OCR Errors:\n"
-                "- Ignore minor spelling/punctuation mistakes that don't affect meaning.\n"
-                "- Penalize only if word substitutions change the meaning.\n\n"
-                "### Scoring Guidelines:\n"
-                "- *10:* Fully correct and complete (90-100% accurate).\n"
-                "- *From 9 to 8:* Mostly correct, minor missing details (80-90% accurate).\n"
-                "- *From 7 to 6:* Good but missing some key points (60-80% accurate).\n"
-                "- *From 5 to 4:* Average, with several omissions/errors (40-60% accurate).\n"
-                "- *From 3 to 2:* Poor, major meaning errors (20-40% accurate).\n"
-                "- *From 1 to 0:* Incorrect or irrelevant (less than 20% accurate).\n\n"
-                "Compare the answers and assign a *single numeric score (0-10)* based on correctness and completeness.\n\n"
-                "Correct answer:\n"
-                f"{correct_answer}\n\n"
-                "User's answer:\n"
-                f"{answer}\n\n"
-                "Final Score (numeric only, strictly between 0 and 10):")
-
-            # Tokenize input prompt
-            inputs = models.flan_tokenizer(prompt, return_tensors="pt").to(device)

-            # Generate
-            with torch.no_grad():
-                outputs = models.flan_model.generate(
                     **inputs,
-                    do_sample=True,
                     num_return_sequences=1,
-                    top_p=0.9,
-                    early_stopping=True,
-                    pad_token_id=models.flan_tokenizer.pad_token_id,
-                    eos_token_id=models.flan_tokenizer.eos_token_id,
-                    bos_token_id=models.flan_tokenizer.bos_token_id,
                 )

         except Exception as e:

     return score

     except Exception as e:
After:

@@ -23,69 +23,53 @@ def llm_score(correct_answers, answer):
             correct_answers = [correct_answers]

     score = []
+
+    # Get model instance
+    model = models.get_flan_model()
+    tokenizer = models.flan_tokenizer
+
+    # Process each correct answer
     for correct_answer in correct_answers:
         try:
+            # Prepare input
+            input_text = f"Compare these answers and give a similarity score between 0 and 1:\nCorrect: {correct_answer}\nStudent: {answer}"
+            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+            inputs = {k: v.to(models.device) for k, v in inputs.items()}

+            # Generate score
+            with torch.no_grad():  # Disable gradient calculation
+                outputs = model.generate(
                     **inputs,
+                    max_length=50,
                     num_return_sequences=1,
+                    temperature=0.7,
+                    do_sample=True
                 )
+
+            # Decode and extract score
+            score_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            try:
+                # Try to extract numeric score
+                score_value = float(score_text.split()[-1])
+                score.append(min(max(score_value, 0.0), 1.0))  # Clamp between 0 and 1
+            except (ValueError, IndexError):
+                # If no numeric score found, use default
+                score.append(0.5)

         except Exception as e:
+            logger.error(f"Error processing answer: {str(e)}")
+            score.append(0.5)  # Use default score on error

+        # Clean up tensors
+        del inputs
+        del outputs
+        torch.cuda.empty_cache()
+
     return score

     except Exception as e:
+        logger.error(f"Error in llm_score: {str(e)}")
+        return [0.5]  # Return default score on error
+    finally:
+        # Release model reference
+        models.release_flan_model()
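The new parser takes float(score_text.split()[-1]), so any trailing punctuation or extra words in the model's reply falls through to the 0.5 default. A slightly more tolerant parser would pull the first number out of the reply before clamping; the sketch below shows that idea and is not code from the commit:

    import re

    def parse_score(score_text: str, default: float = 0.5) -> float:
        # Hypothetical alternative to float(score_text.split()[-1]):
        # grab the first integer or decimal anywhere in the reply, then clamp.
        match = re.search(r"\d+(?:\.\d+)?", score_text)
        if match is None:
            return default
        return min(max(float(match.group()), 0.0), 1.0)

    # parse_score("Score: 0.8")  -> 0.8
    # parse_score("0.8.")        -> 0.8
    # parse_score("no number")   -> 0.5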
similarity_check/semantic_meaning_check/semantic.py
CHANGED

Before:

@@ -41,22 +41,44 @@ except Exception as e:
 def question_vector_sentence(correct_answer):
     """Get sentence embedding using shared model"""
     try:
     except Exception as e:
         logger.error(f"Error in question_vector_sentence: {str(e)}")
         return None

-def similarity_model_score(
-    """Calculate similarity score using
     try:
     except Exception as e:
         logger.error(f"Error in similarity_model_score: {str(e)}")
         return 0.0

 def preprocess(sentence):
     """Preprocess text by tokenizing and removing stopwords"""

@@ -106,23 +128,44 @@ def compute_scm(tokens1, tokens2, model):
         return 0.5  # Return default similarity score

 def question_vector_word(correct_answer):
-    """Get
     try:
     except Exception as e:
         logger.error(f"Error in question_vector_word: {str(e)}")

-def fasttext_similarity(
     try:

     except Exception as e:
         logger.error(f"Error in fasttext_similarity: {str(e)}")
         return 0.0
After:

@@ -41,22 +41,44 @@ except Exception as e:
 def question_vector_sentence(correct_answer):
     """Get sentence embedding using shared model"""
     try:
+        # Get model instance
+        model = models.get_similarity_model()
+        # Convert to tensor and move to correct device
+        embedding = model.encode(correct_answer, convert_to_tensor=True, device=models.device)
+        return embedding
     except Exception as e:
         logger.error(f"Error in question_vector_sentence: {str(e)}")
         return None
+    finally:
+        # Release model reference
+        models.release_similarity_model()

+def similarity_model_score(sentence_vectors, answer):
+    """Calculate similarity score using sentence transformer"""
     try:
+        # Get model instance
+        model = models.get_similarity_model()
+
+        # Get answer embedding
+        answer_embedding = model.encode(answer, convert_to_tensor=True, device=models.device)
+
+        # Calculate similarities
+        similarities = []
+        for vec in sentence_vectors:
+            if vec is not None:
+                similarity = util.pytorch_cos_sim(answer_embedding, vec).item()
+                similarities.append(similarity)
+
+        if not similarities:
+            return 0.0
+
+        return max(similarities)
     except Exception as e:
         logger.error(f"Error in similarity_model_score: {str(e)}")
         return 0.0
+    finally:
+        # Release model reference
+        models.release_similarity_model()

 def preprocess(sentence):
     """Preprocess text by tokenizing and removing stopwords"""

@@ -106,23 +128,44 @@ def compute_scm(tokens1, tokens2, model):
         return 0.5  # Return default similarity score

 def question_vector_word(correct_answer):
+    """Get word embeddings using FastText"""
     try:
+        # Tokenize and remove stopwords
+        stop_words = set(stopwords.words('english'))
+        words = word_tokenize(correct_answer.lower())
+        words = [w for w in words if w not in stop_words]
+
+        # Get word embeddings
+        embeddings = []
+        for word in words:
+            if word in fasttext:
+                embeddings.append(fasttext[word])
+
+        if not embeddings:
+            return np.zeros(300)  # Return zero vector if no valid words
+
+        return np.mean(embeddings, axis=0)
     except Exception as e:
         logger.error(f"Error in question_vector_word: {str(e)}")
+        return np.zeros(300)

+def fasttext_similarity(word_vectors, answer):
+    """Calculate similarity score using FastText word embeddings"""
     try:
+        # Get answer word embedding
+        answer_embedding = question_vector_word(answer)

+        # Calculate similarities
+        similarities = []
+        for vec in word_vectors:
+            if vec is not None:
+                similarity = np.dot(answer_embedding, vec) / (np.linalg.norm(answer_embedding) * np.linalg.norm(vec))
+                similarities.append(similarity)

+        if not similarities:
+            return 0.0
+
+        return max(similarities)
     except Exception as e:
         logger.error(f"Error in fasttext_similarity: {str(e)}")
         return 0.0
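One edge case worth noting: question_vector_word() returns np.zeros(300) when no in-vocabulary words survive preprocessing, and fasttext_similarity() then divides by a zero norm, which produces NaN similarities. A zero-safe cosine helper is a small guard against that; the sketch below is illustrative, not part of the commit:

    import numpy as np

    def safe_cosine(a, b, eps=1e-8):
        # Hypothetical guard: return 0.0 instead of dividing by a zero norm.
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom < eps:
            return 0.0
        return float(np.dot(a, b) / denom)

    # e.g. safe_cosine(np.zeros(300), some_vector) -> 0.0 rather than NaN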