yamanavijayavardhan committed
Commit 8434b5d
1 Parent(s): 44fb620

update_new_new_new_new_new
HTR/strike.py CHANGED
@@ -149,22 +149,70 @@ def process_without_model(image_paths):
         logger.error(f"Error in process_without_model: {str(e)}")
         return np.zeros(len(image_paths)) # Return all as not struck
 
+def process_single_image(img):
+    try:
+        # Convert to grayscale if needed
+        if len(img.shape) == 3:
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+        # Enhance contrast
+        img = cv2.equalizeHist(img)
+
+        # Denoise
+        img = cv2.fastNlMeansDenoising(img)
+
+        # Apply adaptive thresholding
+        binary = cv2.adaptiveThreshold(
+            img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 21, 15
+        )
+
+        # Remove noise and smooth edges
+        kernel = np.ones((3,3), np.uint8)
+        binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
+        binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
+
+        return binary
+
+    except Exception as e:
+        logger.error(f"Error in process_single_image: {str(e)}")
+        return None
+
+def check_strike_through(img):
+    """Check if an image contains strike-through lines"""
+    try:
+        # Convert to binary
+        _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+
+        # Look for horizontal lines
+        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))
+        horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)
+
+        # Count pixels in horizontal lines
+        line_pixels = np.sum(horizontal_lines == 255)
+        total_pixels = img.shape[0] * img.shape[1]
+
+        # If more than 5% of pixels are part of horizontal lines, consider it struck through
+        return (line_pixels / total_pixels) > 0.05
+
+    except Exception as e:
+        logger.error(f"Error checking strike-through: {str(e)}")
+        return False
+
 def struck_images(image_paths):
+    """Process images and detect which ones are not struck through"""
     try:
         if not image_paths:
             logger.error("No image paths provided")
             return []
 
         logger.info(f"Processing {len(image_paths)} images")
-        processed_paths = []
-
-        # Initialize model
-        model = initialize_model()
+        not_struck = []
 
         for i, img_path in enumerate(image_paths):
             try:
-                # Read the image from the path
-                img = cv2.imread(img_path)
+                # Read the image
+                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                 if img is None:
                     logger.error(f"Failed to read image: {img_path}")
                     continue
@@ -173,29 +221,15 @@ def struck_images(image_paths):
                 processed = process_single_image(img)
                 if processed is None:
                     continue
-
-                # Save the processed image
-                output_path = os.path.join(TEMP_IMAGES_DIR, f'processed_{i}.png')
-                cv2.imwrite(output_path, processed)
-                processed_paths.append(output_path)
 
+                # Check if image is struck through
+                if not check_strike_through(processed):
+                    not_struck.append(img_path)
+
             except Exception as e:
                 logger.error(f"Error processing image {img_path}: {str(e)}")
                 continue
 
-        # Get predictions
-        predictions = predict_image(processed_paths, model)
-
-        # Filter based on predictions
-        not_struck = []
-        for i, pred in enumerate(predictions):
-            if isinstance(pred, np.ndarray):
-                if pred.argmax() == 0: # Not struck
-                    not_struck.append(processed_paths[i])
-            else:
-                if pred == 0: # Not struck
-                    not_struck.append(processed_paths[i])
-
         logger.info(f"Found {len(not_struck)} non-struck images")
         return not_struck
 
@@ -203,33 +237,4 @@ def struck_images(image_paths):
         logger.error(f"Error in struck_images: {str(e)}")
         return []
 
-def process_single_image(img):
-    try:
-        # Convert to grayscale if needed
-        if len(img.shape) == 3:
-            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-
-        # Enhance contrast
-        img = cv2.equalizeHist(img)
-
-        # Denoise
-        img = cv2.fastNlMeansDenoising(img)
-
-        # Apply adaptive thresholding
-        binary = cv2.adaptiveThreshold(
-            img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-            cv2.THRESH_BINARY, 21, 15
-        )
-
-        # Remove noise and smooth edges
-        kernel = np.ones((3,3), np.uint8)
-        binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
-        binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
-
-        return binary
-
-    except Exception as e:
-        logger.error(f"Error in process_single_image: {str(e)}")
-        return None
-
 # struck_images()
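
The new pipeline in HTR/strike.py no longer depends on a trained classifier: process_single_image binarizes each crop, check_strike_through opens it with a wide 25×1 rectangular kernel so that only long horizontal strokes survive, and a crop counts as struck through when those strokes cover more than 5% of its area. The sketch below exercises the same heuristic on synthetic images; the has_strike_through helper, its parameter names, and the test images are illustrative assumptions, not code from the repository.

# Standalone sketch of the horizontal-line heuristic above (illustrative, not repo code).
import cv2
import numpy as np

def has_strike_through(binary_img, kernel_width=25, ratio_threshold=0.05):
    # Opening with a wide, 1-pixel-tall kernel keeps only runs of white pixels at
    # least kernel_width long, i.e. long horizontal strokes.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))
    horizontal_lines = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, horizontal_kernel)
    line_pixels = np.sum(horizontal_lines == 255)
    total_pixels = binary_img.shape[0] * binary_img.shape[1]
    return (line_pixels / total_pixels) > ratio_threshold

# Synthetic word images: plain text vs. the same text with a thick bar drawn through it.
clean = np.zeros((50, 200), dtype=np.uint8)
cv2.putText(clean, "answer", (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1, 255, 2)
struck = clean.copy()
cv2.line(struck, (0, 25), (199, 25), 255, 5)

print(has_strike_through(clean))   # expected False: letter strokes are shorter than the kernel
print(has_strike_through(struck))  # expected True: the bar alone covers about 10% of the image

Keeping the kernel wider than any single letter stroke is what lets ordinary handwriting pass the opening step largely untouched.
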
similarity_check/semantic_meaning_check/semantic.py CHANGED
@@ -13,6 +13,93 @@ import os
 import tempfile
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
 from all_models import models
+from sentence_transformers import SentenceTransformer
+import torch
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Global model variable
+model = None
+
+def initialize_model():
+    global model
+    try:
+        # Use a smaller, more efficient model
+        model_name = 'paraphrase-MiniLM-L6-v2' # Only about 80MB
+        cache_dir = os.path.join(os.environ.get('TMPDIR', '/tmp'), 'model_cache')
+        os.makedirs(cache_dir, exist_ok=True)
+
+        model = SentenceTransformer(model_name, cache_folder=cache_dir)
+        logger.info(f"Loaded model: {model_name}")
+        return model
+    except Exception as e:
+        logger.error(f"Error loading model: {str(e)}")
+        return None
+
+def get_sentence_embedding(text):
+    try:
+        global model
+        if model is None:
+            model = initialize_model()
+            if model is None:
+                return None
+
+        # Get embeddings
+        embedding = model.encode(text, convert_to_tensor=True)
+        return embedding
+    except Exception as e:
+        logger.error(f"Error getting embedding: {str(e)}")
+        return None
+
+def similarity_model_score(student_answer, correct_answer):
+    try:
+        # Get embeddings
+        student_emb = get_sentence_embedding(student_answer)
+        correct_emb = get_sentence_embedding(correct_answer)
+
+        if student_emb is None or correct_emb is None:
+            return 0.0
+
+        # Calculate cosine similarity
+        similarity = torch.nn.functional.cosine_similarity(student_emb, correct_emb, dim=0)
+        return float(similarity)
+    except Exception as e:
+        logger.error(f"Error calculating similarity: {str(e)}")
+        return 0.0
+
+def question_vector_sentence(student_answer, correct_answer):
+    try:
+        return similarity_model_score(student_answer, correct_answer)
+    except Exception as e:
+        logger.error(f"Error in question_vector_sentence: {str(e)}")
+        return 0.0
+
+def question_vector_word(student_answer, correct_answer):
+    try:
+        # Split into words
+        student_words = student_answer.lower().split()
+        correct_words = correct_answer.lower().split()
+
+        # Calculate similarities for each word pair
+        similarities = []
+        for s_word in student_words:
+            word_sims = []
+            for c_word in correct_words:
+                sim = similarity_model_score(s_word, c_word)
+                word_sims.append(sim)
+            if word_sims:
+                similarities.append(max(word_sims))
+
+        # Return average similarity
+        if similarities:
+            return sum(similarities) / len(similarities)
+        return 0.0
+    except Exception as e:
+        logger.error(f"Error in question_vector_word: {str(e)}")
+        return 0.0
 
 # Use custom directory for gensim data
 gensim_data_dir = os.getenv('GENSIM_DATA_DIR', tempfile.gettempdir())
@@ -39,27 +126,6 @@ except Exception as e:
 # nltk.download('punkt')
 # nltk.download('stopwords')
 
-def question_vector_sentence(correct_answer):
-    try:
-        return models.similarity_model.encode(correct_answer, convert_to_tensor=True)
-    except Exception as e:
-        print(f"Error in question_vector_sentence: {str(e)}")
-        return None
-
-def similarity_model_score(correct_answer_vector, answer):
-    try:
-        if correct_answer_vector is None:
-            return 0.0
-
-        answer_embedding = models.similarity_model.encode(answer, convert_to_tensor=True)
-        cosine_score = float('-inf')
-        for i in correct_answer_vector:
-            cosine_score = max(cosine_score, util.pytorch_cos_sim(i, answer_embedding))
-        return float(cosine_score) # Convert to float for JSON serialization
-    except Exception as e:
-        print(f"Error in similarity_model_score: {str(e)}")
-        return 0.0
-
 def preprocess(sentence):
     try:
         # Lowercase and remove punctuation
@@ -104,13 +170,6 @@ def compute_scm(tokens1, tokens2, model):
         print(f"Error in compute_scm: {str(e)}")
         return 0.5 # Return default similarity score
 
-def question_vector_word(correct_answer):
-    try:
-        return preprocess(correct_answer)
-    except Exception as e:
-        print(f"Error in question_vector_word: {str(e)}")
-        return []
-
 def fasttext_similarity(correct_answer_vector, answer):
     try:
         preprocess_answer = preprocess(answer)
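
In semantic.py the scoring now loads its own paraphrase-MiniLM-L6-v2 SentenceTransformer and compares a student answer with the correct answer by cosine similarity of their embeddings, instead of going through models.similarity_model. Below is a minimal sketch of that scoring path; only the model name comes from the diff, while the score_answer wrapper and the example sentences are illustrative.

# Minimal sketch of the new sentence-level scoring (illustrative, not repo code).
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def score_answer(student_answer: str, correct_answer: str) -> float:
    # Same shape as similarity_model_score: embed both texts, return their cosine similarity.
    student_emb = model.encode(student_answer, convert_to_tensor=True)
    correct_emb = model.encode(correct_answer, convert_to_tensor=True)
    return float(torch.nn.functional.cosine_similarity(student_emb, correct_emb, dim=0))

print(score_answer("Plants make food using sunlight.",
                   "Photosynthesis lets plants produce food from sunlight."))
# Close paraphrases score noticeably higher than unrelated answers.
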