samyak152002 committed on
Commit
82c3ba5
·
verified ·
1 Parent(s): de1c169

Update main_analyzer.py

Browse files
Files changed (1) hide show
  1. main_analyzer.py +44 -23
main_analyzer.py CHANGED
@@ -1,7 +1,7 @@
1
  # main_analyzer.py
2
  import fitz # PyMuPDF
3
  import os
4
- import tempfile
5
  import re
6
  import traceback
7
  from typing import Tuple, Dict, Any, List
@@ -14,7 +14,7 @@ from pdf_processing import (
14
  )
15
  from content_analysis import (
16
  check_metadata, check_disclosures, check_figures_and_tables,
17
- check_references_summary, check_structure,
18
  check_figure_order, check_reference_order
19
  )
20
  from language_checker import perform_language_checks
@@ -22,30 +22,37 @@ from regex_checker import perform_regex_checks
22
 
23
  def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
24
  original_pdf_access_path = None
25
- temp_file_for_stream_path = None
 
 
 
26
  doc_for_mapping = None
27
 
28
  try:
29
- if isinstance(filepath_or_stream, str):
30
  original_pdf_access_path = filepath_or_stream
31
  print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
 
 
 
 
32
 
33
  if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
34
  return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
35
 
36
  print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
37
  raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
38
-
39
  pdf_size = os.path.getsize(original_pdf_access_path)
40
  if not raw_unfiltered_plain_text and pdf_size > 0 :
41
  print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")
42
-
43
  cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()
44
-
45
  # 2. Font-Filtered Markdown (for LanguageTool checks)
46
  print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
47
  markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
48
- print("markdown font print kar raha hun", markdown_text_from_filtered_pdf)
49
  if not markdown_text_from_filtered_pdf and pdf_size > 0 :
50
  print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
51
 
@@ -59,7 +66,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
59
  "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
60
  "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
61
  "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
62
- "readability_issues_detected": False,
63
  }
64
 
65
  print("Analyzer: Performing regex checks...")
@@ -73,13 +80,15 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
73
  lt_issues = lt_report.get("issues_list", [])
74
 
75
  detailed_issues_for_mapping = regex_issues + lt_issues
76
-
77
  # 4. Coordinate Mapping (against the original PDF)
78
  if detailed_issues_for_mapping:
79
  try:
 
80
  doc_for_mapping = fitz.open(original_pdf_access_path)
81
  if doc_for_mapping.page_count > 0:
82
  print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
 
83
  for page_idx in range(doc_for_mapping.page_count):
84
  page = doc_for_mapping[page_idx]
85
  current_page_num_1_based = page_idx + 1
@@ -104,13 +113,13 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
104
  print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
105
  except Exception as e_map:
106
  print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
107
- finally:
108
- if doc_for_mapping: doc_for_mapping.close()
109
  else:
110
  print("Analyzer: No detailed issues from regex or language checks to map.")
111
 
112
  # 5. Format final list of issues
113
  final_formatted_issues_list = []
 
114
  for issue_data in detailed_issues_for_mapping:
115
  coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
116
  coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
@@ -118,29 +127,41 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
118
 
119
  final_formatted_issues_list.append({
120
  "message": issue_data.get('message', 'N/A'),
121
- "context": issue_data.get('context_text', 'N/A'),
122
  "suggestions": issue_data.get('replacements_suggestion', []),
123
  "category": issue_data.get('category_name', 'Unknown'),
124
  "rule_id": issue_data.get('ruleId', 'N/A'),
125
- "offset": issue_data.get('offset_in_text', -1),
126
- "length": issue_data.get('error_length', 0),
127
  "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
128
  "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
129
  "source_check_type": issue_data.get('source_check_type', 'N/A')
130
  })
131
-
132
  results = {
133
  "issues": final_formatted_issues_list,
134
  "document_checks": document_check_results
135
  }
136
- return results, None
137
  except Exception as e:
138
  print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
139
  return {"error": f"Overall analysis error: {str(e)}"}, None
140
  finally:
141
- if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
142
- try:
143
- os.remove(temp_file_for_stream_path)
144
- print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
145
- except Exception as e_clean:
146
- print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # main_analyzer.py
2
  import fitz # PyMuPDF
3
  import os
4
+ import tempfile # Not strictly needed by analyze_pdf for input if app.py handles it
5
  import re
6
  import traceback
7
  from typing import Tuple, Dict, Any, List
 
14
  )
15
  from content_analysis import (
16
  check_metadata, check_disclosures, check_figures_and_tables,
17
+ check_references_summary, check_structure,
18
  check_figure_order, check_reference_order
19
  )
20
  from language_checker import perform_language_checks
 
22
 
23
  def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
24
  original_pdf_access_path = None
25
+ # temp_file_for_stream_path is for a scenario where analyze_pdf itself
26
+ # might convert a stream input to a temp file. In the Gradio flow,
27
+ # app.py provides a path, so this remains None.
28
+ temp_file_for_stream_path = None
29
  doc_for_mapping = None
30
 
31
  try:
32
+ if isinstance(filepath_or_stream, str):
33
  original_pdf_access_path = filepath_or_stream
34
  print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
35
+ # NOTE: If filepath_or_stream is NOT a string (e.g., a byte stream was passed directly
36
+ # to analyze_pdf without app.py's temp file step), then original_pdf_access_path
37
+ # would remain None here, and the check below would fail.
38
+ # The fix in app.py ensures original_pdf_access_path gets the temp file path.
39
 
40
  if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
41
  return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
42
 
43
  print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
44
  raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
45
+
46
  pdf_size = os.path.getsize(original_pdf_access_path)
47
  if not raw_unfiltered_plain_text and pdf_size > 0 :
48
  print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")
49
+
50
  cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()
51
+
52
  # 2. Font-Filtered Markdown (for LanguageTool checks)
53
  print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
54
  markdown_text_from_filtered_pdf = extract_majority_font_text_directly(original_pdf_access_path)
55
+ print("markdown font print kar raha hun", markdown_text_from_filtered_pdf) # User's debug print
56
  if not markdown_text_from_filtered_pdf and pdf_size > 0 :
57
  print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
58
 
 
66
  "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
67
  "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
68
  "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
69
+ "readability_issues_detected": False,
70
  }
71
 
72
  print("Analyzer: Performing regex checks...")
 
80
  lt_issues = lt_report.get("issues_list", [])
81
 
82
  detailed_issues_for_mapping = regex_issues + lt_issues
83
+
84
  # 4. Coordinate Mapping (against the original PDF)
85
  if detailed_issues_for_mapping:
86
  try:
87
+ # Use original_pdf_access_path which now holds the path to the (potentially temporary) PDF
88
  doc_for_mapping = fitz.open(original_pdf_access_path)
89
  if doc_for_mapping.page_count > 0:
90
  print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
91
+ # ... (rest of mapping logic as before) ...
92
  for page_idx in range(doc_for_mapping.page_count):
93
  page = doc_for_mapping[page_idx]
94
  current_page_num_1_based = page_idx + 1
 
113
  print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
114
  except Exception as e_map:
115
  print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
116
+ # ensure doc_for_mapping is closed in the main finally block
 
117
  else:
118
  print("Analyzer: No detailed issues from regex or language checks to map.")
119
 
120
  # 5. Format final list of issues
121
  final_formatted_issues_list = []
122
+ # ... (rest of formatting logic as before) ...
123
  for issue_data in detailed_issues_for_mapping:
124
  coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
125
  coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
 
127
 
128
  final_formatted_issues_list.append({
129
  "message": issue_data.get('message', 'N/A'),
130
+ "context": issue_data.get('context_text', 'N/A'),
131
  "suggestions": issue_data.get('replacements_suggestion', []),
132
  "category": issue_data.get('category_name', 'Unknown'),
133
  "rule_id": issue_data.get('ruleId', 'N/A'),
134
+ "offset": issue_data.get('offset_in_text', -1),
135
+ "length": issue_data.get('error_length', 0),
136
  "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
137
  "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
138
  "source_check_type": issue_data.get('source_check_type', 'N/A')
139
  })
140
+
141
  results = {
142
  "issues": final_formatted_issues_list,
143
  "document_checks": document_check_results
144
  }
145
+ return results, None
146
  except Exception as e:
147
  print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
148
  return {"error": f"Overall analysis error: {str(e)}"}, None
149
  finally:
150
+ # This finally block is for resources opened *within* analyze_pdf.
151
+ # The temp file created by app.py is managed by app.py.
152
+ # The temp_file_for_stream_path logic was for a temp file created by analyze_pdf
153
+ # itself if it received a stream; this isn't happening in the Gradio flow.
154
+ if doc_for_mapping: # Ensure the fitz document for mapping is closed
155
+ doc_for_mapping.close()
156
+ print(f"Analyzer: Closed fitz document used for mapping.")
157
+
158
+ # The original finally block for temp_file_for_stream_path:
159
+ # if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
160
+ # try:
161
+ # os.remove(temp_file_for_stream_path)
162
+ # print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
163
+ # except Exception as e_clean:
164
+ # print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")
165
+ # This part is removed because temp_file_for_stream_path is never assigned a value
166
+ # in the current structure of analyze_pdf. If analyze_pdf were to handle streams
167
+ # by creating its own temp file, then this cleanup would be relevant for that temp file.