Commit 26f855a
Parent(s): 13cd7b4
update_new_new

Files changed:
- HTR/app.py +22 -12
- HTR/hcr.py +43 -17
- HTR/spell_and_gramer_check.py +39 -18
- HTR/strike.py +59 -27
- HTR/word.py +217 -199
- all_models.py +77 -12
- main.py +18 -5
- similarity_check/llm_based_scoring/llm.py +75 -59
- similarity_check/semantic_meaning_check/semantic.py +69 -37
- similarity_check/tf_idf/tf_idf_score.py +128 -105
HTR/app.py
CHANGED
@@ -1,4 +1,6 @@
import cv2
import os
import tempfile

from HTR.word import convert_image
from HTR.strike import struck_images
@@ -7,17 +9,25 @@ from HTR.spell_and_gramer_check import spell_grammer

# Define a function to extract text from an image
def extract_text_from_image(img_path):
    try:
        # Ensure the image exists
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path}")

        # Read the image
        img = cv2.imread(img_path)
        if img is None:
            raise ValueError(f"Failed to read image: {img_path}")

        # Process the image: segment words, drop struck-through words,
        # run OCR, then spell-correct the result
        imgs = convert_image(img)
        images_path = struck_images(imgs)
        t = text(images_path)
        t = spell_grammer(t)

        return t
    except Exception as e:
        print(f"Error in extract_text_from_image: {str(e)}")
        return ""

# extract_text_from_image("ans_image/1.jpg")
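A minimal sketch of the hardened entry point's new contract, with placeholder paths: failures are caught inside and surface as an empty string rather than an exception.

# Hypothetical smoke test for extract_text_from_image (paths are placeholders).
from HTR.app import extract_text_from_image

for path in ["samples/page1.jpg", "samples/does_not_exist.jpg"]:
    result = extract_text_from_image(path)
    if result:
        print(f"{path}: {result}")
    else:
        # missing or unreadable files are caught inside and yield ""
        print(f"{path}: no text extracted")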
HTR/hcr.py
CHANGED
@@ -1,27 +1,53 @@
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import cv2
import os
import torch

# Initialize model and processor globally
MODEL_NAME = "microsoft/trocr-large-handwritten"
processor = None
model = None

def initialize_model():
    global processor, model
    if processor is None or model is None:
        try:
            processor = TrOCRProcessor.from_pretrained(MODEL_NAME)
            model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
            if torch.cuda.is_available():
                model = model.to('cuda')
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            raise

def text(image_cv):
    try:
        # Initialize model if not already done
        initialize_model()

        t = ""
        for i in image_cv:
            try:
                # Convert BGR to RGB
                img_rgb = cv2.cvtColor(i, cv2.COLOR_BGR2RGB)
                image = Image.fromarray(img_rgb)

                # Process image
                pixel_values = processor(image, return_tensors="pt").pixel_values
                if torch.cuda.is_available():
                    pixel_values = pixel_values.to('cuda')

                generated_ids = model.generate(pixel_values)
                generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                t = t + generated_text.replace(" ", "") + " "

            except Exception as e:
                print(f"Error processing image: {str(e)}")
                continue

        return t.strip()
    except Exception as e:
        print(f"Error in text function: {str(e)}")
        return ""
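The lazy-loading pattern above means the first call to text() pays the TrOCR load cost and later calls reuse the module-level singletons. A usage sketch, assuming a couple of word-crop images exist on disk (hypothetical filenames):

# Sketch: run TrOCR over word crops (BGR numpy arrays, as the pipeline supplies them).
import cv2
from HTR.hcr import text

crops = [cv2.imread(p) for p in ["word_0.png", "word_1.png"]]  # placeholder files
crops = [c for c in crops if c is not None]
recognized = text(crops)  # first call triggers initialize_model()
print(recognized)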
HTR/spell_and_gramer_check.py
CHANGED
@@ -1,22 +1,43 @@
from spellchecker import SpellChecker
import os

# Initialize spell checker globally
spell = None

def initialize_spell_checker():
    global spell
    if spell is None:
        try:
            spell = SpellChecker()
        except Exception as e:
            print(f"Error initializing spell checker: {str(e)}")
            raise

def spell_grammer(text):
    try:
        # Initialize spell checker if not already done
        initialize_spell_checker()

        if not text or not isinstance(text, str):
            return ""

        # Split text into words
        words = text.split()

        # Find misspelled words
        misspelled = spell.unknown(words)

        # Correct misspelled words
        corrected_words = []
        for word in words:
            if word in misspelled:
                corrected_words.append(spell.correction(word))
            else:
                corrected_words.append(word)

        # Join words back into text
        corrected_text = ' '.join(corrected_words)
        return corrected_text.strip()
    except Exception as e:
        print(f"Error in spell_grammer: {str(e)}")
        return text
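Note that despite its name, spell_grammer only spell-corrects; there is no grammar pass. One caveat: in recent pyspellchecker releases, correction() can return None when no candidate is found, which would make the join raise and the function fall back to returning the input unchanged. A sketch of one way to guard that, offered as an alternative rather than what the commit ships:

# Sketch: guard against SpellChecker.correction() returning None
# (possible in pyspellchecker >= 0.7 when no candidate exists).
from spellchecker import SpellChecker

spell = SpellChecker()
words = "Ths is a smple sentnce".split()
misspelled = spell.unknown(words)
corrected = [(spell.correction(w) or w) if w in misspelled else w for w in words]
print(' '.join(corrected))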
HTR/strike.py
CHANGED
@@ -5,40 +5,72 @@ import os
import cv2
from transformers import AutoModelForImageClassification

# Initialize model globally
model = None

def initialize_model():
    global model
    if model is None:
        try:
            model = AutoModelForImageClassification.from_pretrained("models/vit-base-beans")
            if torch.cuda.is_available():
                model = model.to('cuda')
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            raise

def image_preprocessing(image):
    try:
        images = []
        for i in image:
            binary_image = i
            binary_image = cv2.resize(binary_image, (224, 224))
            binary_image = cv2.merge([binary_image, binary_image, binary_image])
            binary_image = binary_image / 255
            binary_image = torch.from_numpy(binary_image)
            images.append(binary_image)
        return images
    except Exception as e:
        print(f"Error in image_preprocessing: {str(e)}")
        return []

def predict_image(image_path, model):
    try:
        preprocessed_img = image_preprocessing(image_path)
        if not preprocessed_img:
            return None

        images = torch.stack(preprocessed_img)
        images = images.permute(0, 3, 1, 2)

        if torch.cuda.is_available():
            images = images.to('cuda')

        with torch.no_grad():
            predictions = model(images).logits.detach().cpu().numpy()
        return predictions
    except Exception as e:
        print(f"Error in predict_image: {str(e)}")
        return None

def struck_images(word__image):
    try:
        # Initialize model if not already done
        initialize_model()

        predictions = predict_image(word__image, model)
        if predictions is None:
            return []

        not_struck = []
        for i in range(len(predictions)):
            if predictions[i].argmax().item() == 0:
                not_struck.append(word__image[i])

        return not_struck
    except Exception as e:
        print(f"Error in struck_images: {str(e)}")
        return word__image  # Return original images if error occurs


# struck_images()
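struck_images keeps the crops the ViT classifier labels 0 (not struck through) and, on any failure, passes everything through so OCR still runs. A usage sketch with a placeholder input image:

# Sketch: drop struck-through words before OCR (input path is a placeholder).
import cv2
from HTR.word import convert_image
from HTR.strike import struck_images

img = cv2.imread("answer_sheet.png")   # placeholder path
word_crops = convert_image(img)
kept = struck_images(word_crops)       # label 0 == keep (not struck through)
print(f"kept {len(kept)} of {len(word_crops)} word images")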
HTR/word.py
CHANGED
@@ -3,6 +3,7 @@ import cv2
# import matplotlib.pyplot as plt
import sys
import os
import tempfile


cordinates =[]
@@ -10,95 +11,107 @@ cordinates =[]

def four_point_transform(image, pts):
    try:
        rect = pts
        (tl, tr, br, bl) = rect

        # Compute the width of the new image
        widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
        widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
        maxWidth = max(int(widthA), int(widthB))

        # Compute the height of the new image
        heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
        heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
        maxHeight = max(int(heightA), int(heightB))

        dst = np.array([
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1]], dtype="float32")

        rect = np.array(rect, dtype="float32")

        M = cv2.getPerspectiveTransform(rect, dst)
        warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))

        return warped
    except Exception as e:
        print(f"Error in four_point_transform: {str(e)}")
        return image


def remove_shadow(image):
    try:
        rgb_planes = cv2.split(image)

        result_planes = []
        result_norm_planes = []
        for plane in rgb_planes:
            dilated_img = cv2.dilate(plane, np.ones((7,7), np.uint8))
            bg_img = cv2.medianBlur(dilated_img, 21)
            diff_img = 255 - cv2.absdiff(plane, bg_img)
            norm_img = cv2.normalize(diff_img, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)
            result_planes.append(diff_img)
            result_norm_planes.append(norm_img)

        result = cv2.merge(result_planes)
        result_norm = cv2.merge(result_norm_planes)

        return result, result_norm
    except Exception as e:
        print(f"Error in remove_shadow: {str(e)}")
        return image, image


def analise(image):
    global line, binary_image1, x_scaling, y_scaling
    try:
        kernel = np.ones((1,250), np.uint8)

        dilation = cv2.dilate(image, kernel, iterations=2)

        contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        for i in reversed(contours):
            x, y, w, h = cv2.boundingRect(i)
            if cv2.contourArea(i) < 20:
                continue
            elif h < 8:
                continue
            else:
                # Scale the contour back to the original image size
                resized_contour = i * [x_scaling, y_scaling]
                resized_contour = resized_contour.astype(int)
                final_image__ = np.zeros_like(binary_image1)
                cv2.drawContours(final_image__, [resized_contour], 0, (255), -1)

                kernel_dil = np.ones((3,3), np.uint8)
                final_image__ = cv2.dilate(final_image__, kernel_dil, iterations=3)

                line_image_final = cv2.bitwise_and(final_image__, binary_image1)
                line.append(line_image_final)
    except Exception as e:
        print(f"Error in analise: {str(e)}")

def image_resize_and_errosion(image):

    height, width = image.shape[:2]
@@ -122,163 +135,168 @@ line_length = 0
count = 0

def convert_image(img):
    try:
        # Clear any word crops left over from a previous run
        folder_path = 'images'
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                if os.path.isfile(file_path):
                    os.remove(file_path)
            except Exception as e:
                print(f"Error deleting file {file_path}: {e}")

        global x_scaling, y_scaling, binary_image1, line, line_lenght, count
        # img = cv2.imread(image_file)
        img_copy = np.copy(img)
        line_lenght = 250
        rect_image = img

        # removing the shadow in the image
        image1, image2_ = remove_shadow(rect_image)

        # converting into grayscale
        gray_ = cv2.cvtColor(image2_, cv2.COLOR_BGR2GRAY)

        # converting into a binary image
        _, binary_image_ = cv2.threshold(gray_, 200, 255, cv2.THRESH_BINARY)

        inverted_binary_image_ = 255 - binary_image_

        binary_image1 = np.copy(inverted_binary_image_)

        y_height, x_width = rect_image.shape[:2]

        # resizing the image
        new_width = 500*5
        new_height = 705*5

        x_scaling = x_width/new_width
        y_scaling = y_height/new_height

        rect_image = cv2.resize(rect_image, (new_width, new_height), interpolation=cv2.INTER_NEAREST)

        # removing the shadow in the resized image
        image1, image2 = remove_shadow(rect_image)

        # converting into grayscale
        gray = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY)

        # converting into a binary image
        _, binary_image = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)

        # inverting the pixels
        inverted_binary_image = 255 - binary_image

        kernel = np.ones((2,2), np.uint8)

        # performing erosion to remove noise
        erosion = cv2.erode(inverted_binary_image, kernel, iterations=1)

        # performing a dilation operation
        dilation = cv2.dilate(erosion, kernel, iterations=1)

        new_image = np.copy(dilation)
        new_image = 255 - new_image

        # defining kernel size
        kernel = np.ones((1,250), np.uint8)

        # a wide dilation merges the words of a line into one blob
        dilation_1 = cv2.dilate(dilation, kernel, iterations=2)

        contours, _ = cv2.findContours(dilation_1, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        line = []
        # line separation
        for i in reversed(contours):
            x, y, w, h = cv2.boundingRect(i)
            if cv2.contourArea(i) < 20:
                continue
            elif h < 10:
                continue
            else:
                cv2.drawContours(new_image, [i], -1, (0), 2)
                final_image_ = np.zeros_like(binary_image)
                cv2.drawContours(final_image_, [i], 0, (255), -1)

                line_image = cv2.bitwise_and(final_image_, dilation)

                analise(line_image)

        count = 0
        kernel1 = np.ones((8,8), np.uint8)
        word__image = []  # newly added
        for line_image in line:

            dilation_2 = cv2.dilate(line_image, kernel1, iterations=2)

            contours1, _ = cv2.findContours(dilation_2, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # sort word contours left to right
            sorted_contours = sorted(contours1, key=lambda c: cv2.boundingRect(c)[0])

            for j in sorted_contours:
                x1, y1, w1, h1 = cv2.boundingRect(j)
                final_image = line_image[y1:y1+h1, x1:x1+w1]
                image_name = "images/" + str(count) + ".png"
                final_image = 255 - final_image
                word__image.append(final_image)  # newly added
                # cv2.imwrite(image_name, final_image)
                count = count + 1

        # cv2.waitKey(0)
        # cv2.destroyAllWindows()
        return word__image

    except Exception as e:
        print(f"Error in convert_image: {str(e)}")
        return []
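The core trick in convert_image and analise is the wide 1x250 dilation kernel: smearing a binary text image horizontally fuses all words on a line into one connected blob, so each external contour corresponds to a line. A standalone sketch of just that step:

# Sketch: wide horizontal dilation turns a line of text into one contour.
import cv2
import numpy as np

binary = np.zeros((100, 600), np.uint8)
cv2.putText(binary, "two words", (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.5, 255, 3)

wide = cv2.dilate(binary, np.ones((1, 250), np.uint8), iterations=2)
contours, _ = cv2.findContours(wide, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
print(len(contours))   # typically 1: the whole line fused into one blob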
all_models.py
CHANGED
@@ -1,5 +1,13 @@
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import os
import tempfile
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ModelSingleton:
    _instance = None
@@ -12,17 +20,74 @@ class ModelSingleton:

    def __init__(self):
        if not self._initialized:
            try:
                # Set cache directory to temporary directory
                cache_dir = os.getenv('TRANSFORMERS_CACHE', tempfile.gettempdir())
                os.environ['TRANSFORMERS_CACHE'] = cache_dir

                # Get device
                self.device = "cuda" if torch.cuda.is_available() else "cpu"
                logger.info(f"Using device: {self.device}")

                # Sentence transformer model
                try:
                    logger.info("Loading sentence transformer model...")
                    SENTENCE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
                    self.similarity_tokenizer = AutoTokenizer.from_pretrained(
                        SENTENCE_MODEL,
                        cache_dir=cache_dir
                    )
                    self.similarity_model = SentenceTransformer(
                        SENTENCE_MODEL,
                        cache_folder=cache_dir
                    )
                    self.similarity_model.to(self.device)
                    logger.info("Sentence transformer model loaded successfully")
                except Exception as e:
                    logger.error(f"Error loading sentence transformer model: {e}")
                    raise

                # Flan-T5-xl model
                try:
                    logger.info("Loading Flan-T5 model...")
                    FLAN_MODEL = "google/flan-t5-xl"
                    self.flan_tokenizer = AutoTokenizer.from_pretrained(
                        FLAN_MODEL,
                        cache_dir=cache_dir
                    )
                    self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(
                        FLAN_MODEL,
                        cache_dir=cache_dir,
                        torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
                    )
                    self.flan_model.to(self.device)
                    logger.info("Flan-T5 model loaded successfully")
                except Exception as e:
                    logger.error(f"Error loading Flan-T5 model: {e}")
                    raise

                self._initialized = True
                logger.info("All models initialized successfully")

            except Exception as e:
                logger.error(f"Error during model initialization: {e}")
                raise

    def cleanup(self):
        """Clean up model resources"""
        try:
            if hasattr(self, 'similarity_model'):
                del self.similarity_model
            if hasattr(self, 'flan_model'):
                del self.flan_model
            torch.cuda.empty_cache()
            logger.info("Model resources cleaned up successfully")
        except Exception as e:
            logger.error(f"Error during cleanup: {e}")

# Create a global instance
models = ModelSingleton()

# Add cleanup function to the global instance
def cleanup_models():
    models.cleanup()
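Because ModelSingleton loads both models at import time, callers only touch the shared models instance. A sketch of wiring cleanup_models to interpreter exit; the atexit hookup is an assumption, not something this commit adds:

# Sketch: release GPU memory on shutdown (atexit registration is hypothetical).
import atexit
from all_models import models, cleanup_models

atexit.register(cleanup_models)
embedding = models.similarity_model.encode("a sample answer")
print(embedding.shape)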
main.py
CHANGED
@@ -291,8 +291,9 @@ def compute_marks():
        student_folder = path_parts[-2]  # Get the parent folder name
        filename = path_parts[-1]  # Get the actual filename

        # Create a clean filename without any path separators
        clean_filename = filename.replace('/', '_').replace('\\', '_')
        htr_filename = f"{student_folder}_{clean_filename}"
        htr_filepath = os.path.join(images_dir, htr_filename)

        # Save the file with full permissions
@@ -304,10 +305,10 @@ def compute_marks():
        log_print(f"Saved file: {htr_filepath}")

        # Add to data structure with absolute path
        if student_folder not in data:
            data[student_folder] = []
        data[student_folder].append(os.path.abspath(htr_filepath))

    if not data:
        log_print("No valid image files were found in the upload", "ERROR")
@@ -334,8 +335,20 @@ def compute_marks():
    count = 0
    for image_path in file_paths:
        try:
            log_print(f"\nProcessing image: {image_path}")
            log_print(f"Checking if file exists: {os.path.exists(image_path)}")

            # Ensure the image path is absolute and exists
            if not os.path.isabs(image_path):
                image_path = os.path.abspath(image_path)

            if not os.path.exists(image_path):
                log_print(f"Error: Image file not found at {image_path}", "ERROR")
                s_marks[student_folder].append(0)
                continue

            # Extract text from image
            s_answer = extract_text_from_image(image_path)
            log_print(f"Extracted answer: {s_answer}")

            if s_answer and count < len(answers):
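The filename flattening above can be checked in isolation; a sketch with hypothetical values:

# Sketch of the new filename flattening in compute_marks (values are placeholders).
import os

student_folder = "student_01"       # hypothetical upload folder
filename = "ans\\1.jpg"             # raw name as received
clean_filename = filename.replace('/', '_').replace('\\', '_')
htr_filename = f"{student_folder}_{clean_filename}"
print(htr_filename)                 # student_01_ans_1.jpg
print(os.path.abspath(os.path.join("images", htr_filename)))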
similarity_check/llm_based_scoring/llm.py
CHANGED
@@ -9,68 +9,84 @@ from all_models import models
# model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Get device and ensure model is on correct device
device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    models.flan_model.to(device)
except Exception as e:
    print(f"Warning: Could not move model to device {device}: {e}")

def llm_score(correct_answers, answer):
    try:
        score = []

        for correct_answer in correct_answers:
            try:
                prompt = (
                    "You are an expert evaluator of answers. Your response must be a *single numeric score (0-10), not a range.*\n\n"
                    "The user's answer has been converted from handwriting using OCR, so minor spelling, punctuation, or small word variations may exist. "
                    "Focus on meaning rather than transcription errors.\n\n"
                    "### Evaluation Criteria:\n"
                    "- *Correctness (90% weight):* Does the answer accurately convey the meaning of the correct answer?\n"
                    "- *Completeness (10% weight):* Does it cover all key points?\n\n"
                    "### Handling OCR Errors:\n"
                    "- Ignore minor spelling/punctuation mistakes that don't affect meaning.\n"
                    "- Penalize only if word substitutions change the meaning.\n\n"
                    "### Scoring Guidelines:\n"
                    "- *10:* Fully correct and complete (90-100% accurate).\n"
                    "- *From 9 to 8:* Mostly correct, minor missing details (80-90% accurate).\n"
                    "- *From 7 to 6:* Good but missing some key points (60-80% accurate).\n"
                    "- *From 5 to 4:* Average, with several omissions/errors (40-60% accurate).\n"
                    "- *From 3 to 2:* Poor, major meaning errors (20-40% accurate).\n"
                    "- *From 1 to 0:* Incorrect or irrelevant (less than 20% accurate).\n\n"
                    "Compare the answers and assign a *single numeric score (0-10)* based on correctness and completeness.\n\n"
                    "Correct answer:\n"
                    f"{correct_answer}\n\n"
                    "User's answer:\n"
                    f"{answer}\n\n"
                    "Final Score (numeric only, strictly between 0 and 10):")

                # Tokenize input prompt
                inputs = models.flan_tokenizer(prompt, return_tensors="pt").to(device)

                # Generate response
                with torch.no_grad():
                    outputs = models.flan_model.generate(
                        **inputs,
                        max_length=2048,
                        do_sample=True,
                        num_return_sequences=1,
                        num_beams=5,
                        temperature=0.6,
                        top_p=0.9,
                        early_stopping=True,
                        pad_token_id=models.flan_tokenizer.pad_token_id,
                        eos_token_id=models.flan_tokenizer.eos_token_id,
                        bos_token_id=models.flan_tokenizer.bos_token_id,
                    )

                # Decode and clean response
                response = models.flan_tokenizer.decode(outputs[0], skip_special_tokens=True)
                try:
                    # Extract numeric score from response
                    score_value = float(''.join(filter(str.isdigit, response)))
                    score_value = max(0, min(10, score_value))  # Clamp between 0 and 10
                    score.append(score_value)
                except ValueError:
                    print(f"Warning: Could not extract numeric score from response: {response}")
                    score.append(0)

            except Exception as e:
                print(f"Error processing answer: {str(e)}")
                score.append(0)

        return score
    except Exception as e:
        print(f"Error in llm_score: {str(e)}")
        return [0] * len(correct_answers)
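One caveat worth flagging in the score extraction above: concatenating every digit in the response means an output like "8/10" becomes 810 before being clamped to 10. A sketch of a stricter parse that takes the first standalone number; this is an alternative, not what the commit ships:

# Sketch: parse the first number in the model's response instead of
# concatenating all digits.
import re

def parse_score(response: str) -> float:
    m = re.search(r"\d+(?:\.\d+)?", response)
    if m is None:
        return 0.0
    return max(0.0, min(10.0, float(m.group())))

print(parse_score("Final Score: 8/10"))   # 8.0 rather than 10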
similarity_check/semantic_meaning_check/semantic.py
CHANGED
@@ -16,6 +16,7 @@ from all_models import models

# Use custom directory for gensim data
gensim_data_dir = os.getenv('GENSIM_DATA_DIR', tempfile.gettempdir())
os.environ['GENSIM_DATA_DIR'] = gensim_data_dir

# Load fasttext with error handling
try:
@@ -39,58 +40,89 @@ except Exception as e:
# nltk.download('stopwords')

def question_vector_sentence(correct_answer):
    try:
        return models.similarity_model.encode(correct_answer, convert_to_tensor=True)
    except Exception as e:
        print(f"Error in question_vector_sentence: {str(e)}")
        return None

def similarity_model_score(correct_answer_vector, answer):
    try:
        if correct_answer_vector is None:
            return 0.0

        answer_embedding = models.similarity_model.encode(answer, convert_to_tensor=True)
        cosine_score = float('-inf')
        for i in correct_answer_vector:
            cosine_score = max(cosine_score, util.pytorch_cos_sim(i, answer_embedding))
        return float(cosine_score)  # Convert to float for JSON serialization
    except Exception as e:
        print(f"Error in similarity_model_score: {str(e)}")
        return 0.0

def preprocess(sentence):
    try:
        # Lowercase and remove punctuation
        sentence = sentence.lower()
        # Tokenize
        words = word_tokenize(sentence)
        # Remove stop words
        words = [word for word in words if word not in stopwords.words('english')]
        return words
    except Exception as e:
        print(f"Error in preprocess: {str(e)}")
        return []

def sentence_to_vec(tokens, model):
    try:
        # Filter words that are in the Word2Vec vocabulary
        valid_words = [word for word in tokens if word in model]

        # If there are no valid words, return a zero vector
        if not valid_words:
            return np.zeros(model.vector_size)

        # Compute the average vector
        word_vectors = [model[word] for word in valid_words]
        sentence_vector = np.mean(word_vectors, axis=0)

        return sentence_vector
    except Exception as e:
        print(f"Error in sentence_to_vec: {str(e)}")
        return np.zeros(300)  # Return zero vector as fallback

def compute_scm(tokens1, tokens2, model):
    try:
        dictionary = corpora.Dictionary([tokens1, tokens2])
        tokens1 = dictionary.doc2bow(tokens1)
        tokens2 = dictionary.doc2bow(tokens2)
        termsim_index = WordEmbeddingSimilarityIndex(model)
        termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)
        similarity = termsim_matrix.inner_product(tokens1, tokens2, normalized=(True, True))
        return float(similarity)  # Convert to float for JSON serialization
    except Exception as e:
        print(f"Error in compute_scm: {str(e)}")
        return 0.5  # Return default similarity score

def question_vector_word(correct_answer):
    try:
        return preprocess(correct_answer)
    except Exception as e:
        print(f"Error in question_vector_word: {str(e)}")
        return []

def fasttext_similarity(correct_answer_vector, answer):
    try:
        preprocess_answer = preprocess(answer)
        soft_cosine = float('-inf')

        for i in correct_answer_vector:
            soft_cosine = max(compute_scm(i, preprocess_answer, fasttext), soft_cosine)

        return float(soft_cosine)  # Convert to float for JSON serialization
    except Exception as e:
        print(f"Error in fasttext_similarity: {str(e)}")
        return 0.0
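A sketch of how the two recovered similarity paths are meant to be called together, with placeholder answers; both scoring helpers expect a list of per-reference vectors, and the fasttext path relies on the gensim model loaded at import time:

# Sketch: sentence-embedding and soft-cosine paths side by side.
from similarity_check.semantic_meaning_check import semantic

refs = ["Photosynthesis converts light energy into chemical energy."]
student = "Plants turn sunlight into chemical energy."

sent_vecs = [semantic.question_vector_sentence(r) for r in refs]
print(semantic.similarity_model_score(sent_vecs, student))

word_vecs = [semantic.question_vector_word(r) for r in refs]
print(semantic.fasttext_similarity(word_vecs, student))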
similarity_check/tf_idf/tf_idf_score.py
CHANGED
@@ -4,132 +4,155 @@ from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from collections import Counter
import string
import os
import tempfile

# Set NLTK data path to temporary directory
nltk_data_dir = os.getenv('NLTK_DATA', tempfile.gettempdir())
os.environ['NLTK_DATA'] = nltk_data_dir

def remove_stopwords(sentence):
    try:
        # converting into words
        words = word_tokenize(sentence)

        # Get the set of English stop words
        stop_words = set(stopwords.words('english'))

        # Remove stop words from the list of words
        filtered_words = [word for word in words if word.lower() not in stop_words]

        # Keep lowercase alphabetic words longer than one character
        words = [word.lower() for word in filtered_words if word.isalpha() and len(word) > 1]

        return words
    except Exception as e:
        print(f"Error in remove_stopwords: {str(e)}")
        return []

def get_synonyms(word):
    try:
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().lower())
        return synonyms
    except Exception as e:
        print(f"Error in get_synonyms: {str(e)}")
        return {word.lower()}

def process_sentence(words):
    try:
        # Find synonyms for each word
        synonym_map = {}
        for word in words:
            synonyms = get_synonyms(word)
            synonyms.add(word)  # Ensure the word itself is included if no synonyms are found
            synonym_map[word] = list(synonyms)

        return synonym_map
    except Exception as e:
        print(f"Error in process_sentence: {str(e)}")
        return {word: [word] for word in words}

def tf(dict1):
    try:
        no_of_terms_in_document = len(dict1)
        word_frequency = {}
        for i in dict1:
            count = 0
            for j in dict1:
                if i in dict1[j]:
                    count += 1
            word_frequency[i] = count

        for i in word_frequency:
            word_frequency[i] = word_frequency[i] / no_of_terms_in_document

        return word_frequency
    except Exception as e:
        print(f"Error in tf: {str(e)}")
        return {}

def idf(di):
    try:
        no_of_documents = len(di)
        new_dict = {}
        for d in range(len(di)):
            for i in di[d]:
                if i not in new_dict:
                    new_dict[i] = set()
                new_dict[i].add(d)

        r = {}
        for i in new_dict:
            r[i] = len(new_dict[i]) / no_of_documents
        return r
    except Exception as e:
        print(f"Error in idf: {str(e)}")
        return {}

def total_tf_idf_value(tf_idf_word_values, synonyms_words):
    try:
        value = 0
        for i in synonyms_words:
            for j in synonyms_words[i]:
                if j in tf_idf_word_values:
                    value += tf_idf_word_values[j]
                    break
        return value
    except Exception as e:
        print(f"Error in total_tf_idf_value: {str(e)}")
        return 0

def create_tfidf_values(correct_answer):
    try:
        correct_answer_words = []
        for i in correct_answer:
            correct_answer_words.append(remove_stopwords(i))

        correct_synonyms_words = []

        for i in correct_answer_words:
            correct_synonyms_words.append(process_sentence(i))

        tf_ = []
        for i in correct_synonyms_words:
            tf_.append(tf(i))

        idf_values = idf(correct_synonyms_words)

        tf_idf_word_values = {}
        count = 0
        for correct_synonyms_word in correct_synonyms_words:
            for i in correct_synonyms_word:
                value = tf_[count][i] * idf_values[i]
                if i in tf_idf_word_values:
                    tf_idf_word_values[i] = max(tf_idf_word_values[i], value)
                else:
                    tf_idf_word_values[i] = value
            count += 1
        for i in tf_idf_word_values:
            tf_idf_word_values[i] = round(tf_idf_word_values[i], 4)

        tfidf_correct_ans = float('inf')
        for i in correct_synonyms_words:
            tfidf_correct_ans = min(total_tf_idf_value(tf_idf_word_values, i), tfidf_correct_ans)

        return tf_idf_word_values, tfidf_correct_ans
    except Exception as e:
        print(f"Error in create_tfidf_values: {str(e)}")
        return {}, 0.0

def tfidf_answer_score(answer, tf_idf_word_values, max_tfidf, marks=10):
    try:
        answer = remove_stopwords(answer)
        answer_synonyms_words = process_sentence(answer)
        value = total_tf_idf_value(tf_idf_word_values, answer_synonyms_words)
        score = (value / max_tfidf) * marks if max_tfidf > 0 else 0
        return min(10, max(0, score))  # Clamp between 0 and 10
    except Exception as e:
        print(f"Error in tfidf_answer_score: {str(e)}")
        return 0
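A sketch of the TF-IDF scorer end to end, with placeholder answers; it assumes the NLTK punkt, stopwords, and WordNet data are already downloaded:

# Sketch: build the reference TF-IDF table once, then score a student answer.
from similarity_check.tf_idf.tf_idf_score import create_tfidf_values, tfidf_answer_score

references = ["Gravity pulls objects toward the earth",
              "Objects fall because gravity attracts them toward the earth"]
tf_idf_word_values, max_tfidf = create_tfidf_values(references)

score = tfidf_answer_score("gravity attracts falling objects",
                           tf_idf_word_values, max_tfidf, marks=10)
print(round(score, 2))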