samyak152002 committed on
Commit
070b77e
·
verified ·
1 Parent(s): fab5be2

Update main_analyzer.py

Browse files
Files changed (1) hide show
  1. main_analyzer.py +120 -92
main_analyzer.py CHANGED
@@ -4,127 +4,155 @@ import os
4
  import tempfile
5
  import re
6
  import traceback
7
- from typing import Tuple, Dict, Any, List, Optional
8
  from collections import defaultdict
9
 
10
- # Import functions from our refactored modules
11
- from pdf_processing import extract_pdf_text, try_map_issues_to_page_rects # convert_rect_to_dict is used by try_map_issues
12
- from text_utils import convert_markdown_to_plain_text
 
 
13
  from content_analysis import (
14
  check_metadata, check_disclosures, check_figures_and_tables,
15
- check_references_summary, check_structure, check_language_issues_and_regex,
16
  check_figure_order, check_reference_order
17
  )
18
-
 
 
 
19
 
20
  def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
 
 
21
  doc_for_mapping = None
22
- temp_fitz_file_path = None
23
 
24
  try:
25
- markdown_text = extract_pdf_text(filepath_or_stream)
26
- if not markdown_text:
27
- return {"error": "Failed to extract text (Markdown) from PDF."}, None
28
-
29
- plain_text_for_general_checks = convert_markdown_to_plain_text(markdown_text)
30
- cleaned_plain_text_for_regex = re.sub(r'\s+', ' ', plain_text_for_general_checks.replace('\n', ' ')).strip()
 
 
 
 
 
 
 
 
31
 
32
- language_and_regex_issue_report = check_language_issues_and_regex(markdown_text)
 
 
 
 
 
 
33
 
34
- if "error" in language_and_regex_issue_report:
35
- return {"error": f"Language/Regex check error: {language_and_regex_issue_report['error']}"}, None
36
 
37
- detailed_issues_for_mapping = language_and_regex_issue_report.get("issues_list", [])
 
 
 
 
38
 
39
- if detailed_issues_for_mapping:
40
- if isinstance(filepath_or_stream, str):
41
- pdf_path_for_fitz = filepath_or_stream
42
- elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
43
- filepath_or_stream.seek(0)
44
- temp_fitz_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
45
- temp_fitz_file_path = temp_fitz_file.name
46
- temp_fitz_file.write(filepath_or_stream.read())
47
- temp_fitz_file.close()
48
- pdf_path_for_fitz = temp_fitz_file_path
49
- else:
50
- return {"error": "Invalid PDF input for coordinate mapping."}, None
 
 
 
 
 
 
 
 
 
 
51
 
 
 
 
 
52
  try:
53
- doc_for_mapping = fitz.open(pdf_path_for_fitz)
54
  if doc_for_mapping.page_count > 0:
55
- print(f"\n--- Mapping {len(detailed_issues_for_mapping)} Issues (filtered) to PDF Coordinates ---")
56
- if detailed_issues_for_mapping:
57
- for page_idx in range(doc_for_mapping.page_count):
58
- page = doc_for_mapping[page_idx]
59
- current_page_num_1_based = page_idx + 1
60
-
61
- unmapped_issues_on_this_page_by_context = defaultdict(list)
62
- for issue_dict in detailed_issues_for_mapping:
63
- if not issue_dict['is_mapped_to_pdf']:
64
- unmapped_issues_on_this_page_by_context[issue_dict['context_text']].append(issue_dict)
65
 
66
- if not unmapped_issues_on_this_page_by_context:
67
- if all(iss['is_mapped_to_pdf'] for iss in detailed_issues_for_mapping): break
68
- continue
69
 
70
- for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
71
- if not ctx_str.strip(): continue
72
- try:
73
- pdf_rects = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
74
- if pdf_rects:
75
- try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
76
- except Exception as search_exc:
77
- print(f"Warning: Error searching for context '{ctx_str[:30]}' on page {current_page_num_1_based}: {search_exc}")
78
- total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
79
- print(f"Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
80
- else:
81
- print("No language/regex issues found within the defined content boundaries to map.")
82
  except Exception as e_map:
83
- print(f"Error during PDF coordinate mapping: {e_map}")
84
- traceback.print_exc()
85
  finally:
86
  if doc_for_mapping: doc_for_mapping.close()
87
- if temp_fitz_file_path and os.path.exists(temp_fitz_file_path):
88
- os.unlink(temp_fitz_file_path)
89
-
 
90
  final_formatted_issues_list = []
91
  for issue_data in detailed_issues_for_mapping:
92
- page_num_for_json = 0
93
- coords_for_json = []
94
- if issue_data['is_mapped_to_pdf'] and issue_data['pdf_coordinates_list']:
95
- coord_dict = issue_data['pdf_coordinates_list'][0]
96
- coords_for_json = [coord_dict['x0'], coord_dict['y0'], coord_dict['x1'], coord_dict['y1']]
97
- page_num_for_json = issue_data['mapped_page_number']
98
-
99
  final_formatted_issues_list.append({
100
- "message": issue_data['message'], "context": issue_data['context_text'],
101
- "suggestions": issue_data['replacements_suggestion'], "category": issue_data['category_name'],
102
- "rule_id": issue_data['ruleId'], "offset": issue_data['offset_in_text'],
103
- "length": issue_data['error_length'], "coordinates": coords_for_json,
104
- "page": page_num_for_json
 
 
 
 
 
105
  })
106
-
107
  results = {
108
  "issues": final_formatted_issues_list,
109
- "document_checks": {
110
- "metadata": check_metadata(cleaned_plain_text_for_regex),
111
- "disclosures": check_disclosures(cleaned_plain_text_for_regex),
112
- "figures_and_tables": check_figures_and_tables(cleaned_plain_text_for_regex),
113
- "references_summary": check_references_summary(cleaned_plain_text_for_regex),
114
- "structure": check_structure(cleaned_plain_text_for_regex),
115
- "figure_order_analysis": check_figure_order(cleaned_plain_text_for_regex),
116
- "reference_order_analysis": check_reference_order(cleaned_plain_text_for_regex),
117
- "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_plain_text_for_regex, re.IGNORECASE)),
118
- "readability_issues_detected": False,
119
- }
120
  }
121
-
122
  return results, None
123
-
124
  except Exception as e:
125
- print(f"Overall analysis error in analyze_pdf: {e}")
126
- traceback.print_exc()
127
- if doc_for_mapping: doc_for_mapping.close()
128
- if temp_fitz_file_path and os.path.exists(temp_fitz_file_path):
129
- os.unlink(temp_fitz_file_path)
130
- return {"error": str(e)}, None
 
 
 
 
4
  import tempfile
5
  import re
6
  import traceback
7
+ from typing import Tuple, Dict, Any, List
8
  from collections import defaultdict
9
 
10
+ from pdf_processing import (
11
+ extract_font_filtered_markdown,
12
+ extract_plain_text_from_original_pdf,
13
+ try_map_issues_to_page_rects
14
+ )
15
  from content_analysis import (
16
  check_metadata, check_disclosures, check_figures_and_tables,
17
+ check_references_summary, check_structure,
18
  check_figure_order, check_reference_order
19
  )
20
+ from language_checker import perform_language_checks
21
+ from regex_checker import perform_regex_checks
22
+ # text_utils.convert_markdown_to_plain_text is used by language_checker
23
+ # config.py is imported in app.py
24
 
25
  def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
26
+ original_pdf_access_path = None
27
+ temp_file_for_stream_path = None
28
  doc_for_mapping = None
 
29
 
30
  try:
31
+ if isinstance(filepath_or_stream, str):
32
+ original_pdf_access_path = filepath_or_stream
33
+ elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
34
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
35
+ temp_file_for_stream_path = temp_file_obj.name
36
+ filepath_or_stream.seek(0)
37
+ temp_file_obj.write(filepath_or_stream.read())
38
+ original_pdf_access_path = temp_file_for_stream_path
39
+ print(f"Analyzer: Original PDF stream saved to temp file: {original_pdf_access_path}")
40
+ else:
41
+ return {"error": "Invalid PDF input type. Must be path or file-like object."}, None
42
+
43
+ if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
44
+ return {"error": f"PDF path '{original_pdf_access_path}' does not exist or is invalid."}, None
45
 
46
+ # 1. Unfiltered Plain Text (for general and regex checks)
47
+ print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
48
+ raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
49
+
50
+ pdf_size = os.path.getsize(original_pdf_access_path)
51
+ if not raw_unfiltered_plain_text and pdf_size > 0 :
52
+ print("Analyzer: Warning: Raw unfiltered plain text extraction yielded empty result. PDF might be image-based or have extraction issues.")
53
 
54
+ cleaned_unfiltered_plain_text = re.sub(r'\s+', ' ', raw_unfiltered_plain_text.replace('\n', ' ')).strip()
 
55
 
56
+ # 2. Font-Filtered Markdown (for LanguageTool checks)
57
+ print(f"Analyzer: Extracting font-filtered markdown from: {original_pdf_access_path}")
58
+ markdown_text_from_filtered_pdf = extract_font_filtered_markdown(original_pdf_access_path)
59
+ if not markdown_text_from_filtered_pdf and pdf_size > 0 :
60
+ print("Analyzer: Warning: Font-filtered Markdown extraction yielded empty result.")
61
 
62
+ # 3. Perform all checks
63
+ document_check_results = {
64
+ "metadata": check_metadata(cleaned_unfiltered_plain_text),
65
+ "disclosures": check_disclosures(cleaned_unfiltered_plain_text),
66
+ "figures_and_tables": check_figures_and_tables(cleaned_unfiltered_plain_text),
67
+ "references_summary": check_references_summary(cleaned_unfiltered_plain_text),
68
+ "structure": check_structure(cleaned_unfiltered_plain_text),
69
+ "figure_order_analysis": check_figure_order(cleaned_unfiltered_plain_text),
70
+ "reference_order_analysis": check_reference_order(cleaned_unfiltered_plain_text),
71
+ "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_unfiltered_plain_text, re.IGNORECASE)),
72
+ "readability_issues_detected": False,
73
+ }
74
+
75
+ print("Analyzer: Performing regex checks...")
76
+ regex_report = perform_regex_checks(cleaned_unfiltered_plain_text)
77
+ if "error" in regex_report: print(f"Analyzer: Error in regex checks: {regex_report['error']}")
78
+ regex_issues = regex_report.get("issues_list", [])
79
+
80
+ print("Analyzer: Performing language checks...")
81
+ lt_report = perform_language_checks(markdown_text_from_filtered_pdf)
82
+ if "error" in lt_report: print(f"Analyzer: Error in LanguageTool checks: {lt_report['error']}")
83
+ lt_issues = lt_report.get("issues_list", [])
84
 
85
+ detailed_issues_for_mapping = regex_issues + lt_issues
86
+
87
+ # 4. Coordinate Mapping (against the original PDF)
88
+ if detailed_issues_for_mapping:
89
  try:
90
+ doc_for_mapping = fitz.open(original_pdf_access_path)
91
  if doc_for_mapping.page_count > 0:
92
+ print(f"Analyzer: Mapping {len(detailed_issues_for_mapping)} issues to PDF coordinates...")
93
+ for page_idx in range(doc_for_mapping.page_count):
94
+ page = doc_for_mapping[page_idx]
95
+ current_page_num_1_based = page_idx + 1
96
+ unmapped_issues_on_this_page_by_context = defaultdict(list)
97
+ for issue_dict in detailed_issues_for_mapping:
98
+ if not issue_dict['is_mapped_to_pdf']:
99
+ unmapped_issues_on_this_page_by_context[issue_dict['context_text']].append(issue_dict)
 
 
100
 
101
+ if not unmapped_issues_on_this_page_by_context:
102
+ if all(iss['is_mapped_to_pdf'] for iss in detailed_issues_for_mapping): break
103
+ continue
104
 
105
+ for ctx_str, issues_for_ctx in unmapped_issues_on_this_page_by_context.items():
106
+ if not ctx_str or not ctx_str.strip(): continue
107
+ try:
108
+ pdf_rects = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
109
+ if pdf_rects:
110
+ try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
111
+ except Exception as search_exc:
112
+ print(f"Analyzer: Warning: Error searching for context '{ctx_str[:30].replace(chr(10),' ')}' on page {current_page_num_1_based}: {search_exc}")
113
+ total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
114
+ print(f"Analyzer: Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
 
 
115
  except Exception as e_map:
116
+ print(f"Analyzer: Error during PDF coordinate mapping: {e_map}\n{traceback.format_exc()}")
 
117
  finally:
118
  if doc_for_mapping: doc_for_mapping.close()
119
+ else:
120
+ print("Analyzer: No detailed issues from regex or language checks to map.")
121
+
122
+ # 5. Format final list of issues
123
  final_formatted_issues_list = []
124
  for issue_data in detailed_issues_for_mapping:
125
+ coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
126
+ coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
127
+ # Filter out None coordinates that might arise from empty coords dict
128
+ coords_for_json = [c for c in coords_for_json if c is not None]
129
+
130
+
 
131
  final_formatted_issues_list.append({
132
+ "message": issue_data.get('message', 'N/A'),
133
+ "context": issue_data.get('context_text', 'N/A'),
134
+ "suggestions": issue_data.get('replacements_suggestion', []),
135
+ "category": issue_data.get('category_name', 'Unknown'),
136
+ "rule_id": issue_data.get('ruleId', 'N/A'),
137
+ "offset": issue_data.get('offset_in_text', -1),
138
+ "length": issue_data.get('error_length', 0),
139
+ "coordinates": coords_for_json if len(coords_for_json) == 4 else [], # Ensure 4 coords or empty
140
+ "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
141
+ "source_check_type": issue_data.get('source_check_type', 'N/A')
142
  })
143
+
144
  results = {
145
  "issues": final_formatted_issues_list,
146
+ "document_checks": document_check_results
 
 
 
 
 
 
 
 
 
 
147
  }
 
148
  return results, None
 
149
  except Exception as e:
150
+ print(f"Overall analysis error in analyze_pdf: {e}\n{traceback.format_exc()}")
151
+ return {"error": f"Overall analysis error: {str(e)}"}, None
152
+ finally:
153
+ if temp_file_for_stream_path and os.path.exists(temp_file_for_stream_path):
154
+ try:
155
+ os.remove(temp_file_for_stream_path)
156
+ print(f"Analyzer: Cleaned up main temporary PDF file: {temp_file_for_stream_path}")
157
+ except Exception as e_clean:
158
+ print(f"Analyzer: Error cleaning up main temporary PDF file {temp_file_for_stream_path}: {e_clean}")