samyak152002 committed on
Commit
b690306
·
verified ·
1 Parent(s): d37148b

Update main_analyzer.py

Browse files
Files changed (1) hide show
  1. main_analyzer.py +17 -10
main_analyzer.py CHANGED
@@ -19,8 +19,6 @@ from content_analysis import (
19
  )
20
  from language_checker import perform_language_checks
21
  from regex_checker import perform_regex_checks
22
- # text_utils.convert_markdown_to_plain_text is used by language_checker
23
- # config.py is imported in app.py
24
 
25
  def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
26
  original_pdf_access_path = None
@@ -28,21 +26,32 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
28
  doc_for_mapping = None
29
 
30
  try:
31
- if isinstance(filepath_or_stream, str):
32
  original_pdf_access_path = filepath_or_stream
 
 
 
 
 
 
 
 
 
33
  elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
34
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
35
  temp_file_for_stream_path = temp_file_obj.name
36
- filepath_or_stream.seek(0)
 
37
  temp_file_obj.write(filepath_or_stream.read())
38
  original_pdf_access_path = temp_file_for_stream_path
39
- print(f"Analyzer: Original PDF stream saved to temp file: {original_pdf_access_path}")
40
  else:
41
- return {"error": "Invalid PDF input type. Must be path or file-like object."}, None
42
 
43
  if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
44
- return {"error": f"PDF path '{original_pdf_access_path}' does not exist or is invalid."}, None
45
 
 
46
  # 1. Unfiltered Plain Text (for general and regex checks)
47
  print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
48
  raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
@@ -124,10 +133,8 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
124
  for issue_data in detailed_issues_for_mapping:
125
  coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
126
  coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
127
- # Filter out None coordinates that might arise from empty coords dict
128
  coords_for_json = [c for c in coords_for_json if c is not None]
129
 
130
-
131
  final_formatted_issues_list.append({
132
  "message": issue_data.get('message', 'N/A'),
133
  "context": issue_data.get('context_text', 'N/A'),
@@ -136,7 +143,7 @@ def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
136
  "rule_id": issue_data.get('ruleId', 'N/A'),
137
  "offset": issue_data.get('offset_in_text', -1),
138
  "length": issue_data.get('error_length', 0),
139
- "coordinates": coords_for_json if len(coords_for_json) == 4 else [], # Ensure 4 coords or empty
140
  "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
141
  "source_check_type": issue_data.get('source_check_type', 'N/A')
142
  })
 
19
  )
20
  from language_checker import perform_language_checks
21
  from regex_checker import perform_regex_checks
 
 
22
 
23
  def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
24
  original_pdf_access_path = None
 
26
  doc_for_mapping = None
27
 
28
  try:
29
+ if isinstance(filepath_or_stream, str):
30
  original_pdf_access_path = filepath_or_stream
31
+ print(f"Analyzer: Input is a string path: {original_pdf_access_path}")
32
+ # Check for objects like Gradio's NamedString or TemporaryFileWrapper's .name attribute
33
+ elif hasattr(filepath_or_stream, 'name') and isinstance(getattr(filepath_or_stream, 'name'), str) and \
34
+ os.path.exists(getattr(filepath_or_stream, 'name')): # Ensure the .name path is valid
35
+ original_pdf_access_path = filepath_or_stream.name
36
+ print(f"Analyzer: Input is an object with .name attribute, using path: {original_pdf_access_path}")
37
+ # If this object also has a .read method, it might be a TemporaryFileWrapper.
38
+ # The next elif would handle it if we prefer processing it as a stream,
39
+ # but using its .name path is usually fine and simpler.
40
  elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
41
  with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file_obj:
42
  temp_file_for_stream_path = temp_file_obj.name
43
+ if hasattr(filepath_or_stream, 'seek') and callable(filepath_or_stream.seek):
44
+ filepath_or_stream.seek(0)
45
  temp_file_obj.write(filepath_or_stream.read())
46
  original_pdf_access_path = temp_file_for_stream_path
47
+ print(f"Analyzer: Input stream saved to temp file: {original_pdf_access_path}")
48
  else:
49
+ return {"error": f"Invalid PDF input type: {type(filepath_or_stream)}. Must be path string, an object with a .name attribute as path, or file-like stream object."}, None
50
 
51
  if not original_pdf_access_path or not os.path.exists(original_pdf_access_path):
52
+ return {"error": f"PDF path '{original_pdf_access_path}' (derived from input) does not exist or is invalid."}, None
53
 
54
+ # --- The rest of the function remains the same as the previous complete listing ---
55
  # 1. Unfiltered Plain Text (for general and regex checks)
56
  print(f"Analyzer: Extracting plain text from original PDF: {original_pdf_access_path}")
57
  raw_unfiltered_plain_text = extract_plain_text_from_original_pdf(original_pdf_access_path)
 
133
  for issue_data in detailed_issues_for_mapping:
134
  coords = issue_data.get('pdf_coordinates_list', [{}])[0] if issue_data.get('is_mapped_to_pdf') else {}
135
  coords_for_json = [coords.get("x0"), coords.get("y0"), coords.get("x1"), coords.get("y1")] if coords else []
 
136
  coords_for_json = [c for c in coords_for_json if c is not None]
137
 
 
138
  final_formatted_issues_list.append({
139
  "message": issue_data.get('message', 'N/A'),
140
  "context": issue_data.get('context_text', 'N/A'),
 
143
  "rule_id": issue_data.get('ruleId', 'N/A'),
144
  "offset": issue_data.get('offset_in_text', -1),
145
  "length": issue_data.get('error_length', 0),
146
+ "coordinates": coords_for_json if len(coords_for_json) == 4 else [],
147
  "page": issue_data.get('mapped_page_number', 0) if issue_data.get('is_mapped_to_pdf') else 0,
148
  "source_check_type": issue_data.get('source_check_type', 'N/A')
149
  })