#!/usr/bin/env python3
"""
Hybrid TOC + PDFPlumber Parser

Combines the best of both approaches:
1. TOC-guided navigation for reliable chapter/section mapping
2. PDFPlumber's precise content extraction with formatting awareness
3. Aggressive trash content filtering while preserving actual content

This hybrid approach provides:
- Reliable structure detection (TOC)
- High-quality content extraction (PDFPlumber)
- Optimal chunk sizing and quality
- Fast processing with precise results

Author: Arthur Passuello
Date: 2025-07-01
"""

import re
import pdfplumber
from pathlib import Path
from typing import Dict, List, Optional, Any

from .toc_guided_parser import TOCGuidedParser, TOCEntry
from .pdfplumber_parser import PDFPlumberParser


class HybridParser:
    """
    Hybrid parser combining TOC navigation with PDFPlumber extraction.
    
    Architecture:
    1. Use TOC to identify chapter/section boundaries and pages
    2. Use PDFPlumber to extract clean content from those specific pages
    3. Apply aggressive content filtering to remove trash
    4. Create optimal chunks with preserved structure
    """
    
    def __init__(self, target_chunk_size: int = 1400, min_chunk_size: int = 800, 
                 max_chunk_size: int = 2000):
        """Initialize hybrid parser."""
        self.target_chunk_size = target_chunk_size
        self.min_chunk_size = min_chunk_size
        self.max_chunk_size = max_chunk_size
        
        # Initialize component parsers
        self.toc_parser = TOCGuidedParser(target_chunk_size, min_chunk_size, max_chunk_size)
        self.plumber_parser = PDFPlumberParser(target_chunk_size, min_chunk_size, max_chunk_size)
        
        # Content filtering patterns (aggressive trash removal)
        self.trash_patterns = [
            # License and legal text
            r'Creative Commons.*?License',
            r'International License.*?authors',
            r'released under.*?license',
            r'derivative of.*?License',
            r'Document Version \d+',
            
            # Table of contents artifacts
            r'\.{3,}',  # Multiple dots
            r'^\s*\d+\s*$',  # Standalone page numbers
            r'Contents\s*$',
            r'Preface\s*$',
            
            # PDF formatting artifacts
            r'Volume\s+[IVX]+:.*?V\d+',
            r'^\s*[ivx]+\s*$',  # Roman numerals alone
            r'^\s*[\d\w\s]{1,3}\s*$',  # Very short meaningless lines
            
            # Redundant headers and footers
            r'RISC-V.*?ISA.*?V\d+',
            r'Volume I:.*?Unprivileged',
            
            # Editor and publication info
            r'Editors?:.*?[A-Z][a-z]+',
            r'[A-Z][a-z]+\s+\d{1,2},\s+\d{4}',  # Dates
            r'@[a-z]+\.[a-z]+',  # Email addresses
            
            # Boilerplate text
            r'please contact editors to suggest corrections',
            r'alphabetical order.*?corrections',
            r'contributors to all versions',
        ]
        
        # Content quality patterns (currently informational only; the filtering
        # methods below rely on their own keyword lists rather than these patterns)
        self.preserve_patterns = [
            r'RISC-V.*?instruction',
            r'register.*?file',
            r'memory.*?operation',
            r'processor.*?implementation',
            r'architecture.*?design',
        ]
        
        # TOC-specific patterns to exclude from searchable content
        self.toc_exclusion_patterns = [
            r'^\s*Contents\s*$',
            r'^\s*Table\s+of\s+Contents\s*$',
            r'^\s*\d+(?:\.\d+)*\s*$',  # Standalone section numbers
            r'^\s*\d+(?:\.\d+)*\s+[A-Z]',  # "1.1 INTRODUCTION" style
            r'\.{3,}',  # Multiple dots (TOC formatting)
            r'^\s*Chapter\s+\d+\s*$',  # Standalone "Chapter N"
            r'^\s*Section\s+\d+(?:\.\d+)*\s*$',  # Standalone "Section N.M"
            r'^\s*Appendix\s+[A-Z]\s*$',  # Standalone "Appendix A"
            r'^\s*[ivxlcdm]+\s*$',  # Roman numerals alone
            r'^\s*Preface\s*$',
            r'^\s*Introduction\s*$',
            r'^\s*Conclusion\s*$',
            r'^\s*Bibliography\s*$',
            r'^\s*Index\s*$',
        ]
    
    def parse_document(self, pdf_path: Path, pdf_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Parse document using hybrid approach.
        
        Args:
            pdf_path: Path to PDF file
            pdf_data: PDF data from extract_text_with_metadata()
            
        Returns:
            List of high-quality chunks with preserved structure
        """
        print("🔗 Starting Hybrid TOC + PDFPlumber parsing...")
        
        # Step 1: Use TOC to identify structure
        print("📋 Step 1: Extracting TOC structure...")
        toc_entries = self.toc_parser.parse_toc(pdf_data['pages'])
        print(f"   Found {len(toc_entries)} TOC entries")
        
        # Check if TOC is reliable (multiple entries or quality single entry)
        toc_is_reliable = (
            len(toc_entries) > 1 or  # Multiple entries = likely real TOC
            (len(toc_entries) == 1 and len(toc_entries[0].title) > 10)  # Quality single entry
        )
        
        if not toc_entries or not toc_is_reliable:
            if not toc_entries:
                print("   ⚠️ No TOC found, using full page coverage parsing")
            else:
                print(f"   ⚠️ TOC quality poor (title: '{toc_entries[0].title}'), using full page coverage")
            return self.plumber_parser.parse_document(pdf_path, pdf_data)
        
        # Step 2: Use PDFPlumber for precise extraction
        print("🔬 Step 2: PDFPlumber extraction of TOC sections...")
        chunks = []
        chunk_id = 0
        
        with pdfplumber.open(str(pdf_path)) as pdf:
            for i, toc_entry in enumerate(toc_entries):
                next_entry = toc_entries[i + 1] if i + 1 < len(toc_entries) else None
                
                # Extract content using PDFPlumber
                section_content = self._extract_section_with_plumber(
                    pdf, toc_entry, next_entry
                )
                
                if section_content:
                    # Apply aggressive content filtering
                    cleaned_content = self._filter_trash_content(section_content)
                    
                    if cleaned_content and len(cleaned_content) >= 200:  # Minimum meaningful content
                        # Create chunks from cleaned content
                        section_chunks = self._create_chunks_from_clean_content(
                            cleaned_content, chunk_id, toc_entry
                        )
                        chunks.extend(section_chunks)
                        chunk_id += len(section_chunks)
        
        print(f"   Created {len(chunks)} high-quality chunks")
        return chunks
    
    def _extract_section_with_plumber(self, pdf, toc_entry: TOCEntry, 
                                     next_entry: Optional[TOCEntry]) -> str:
        """
        Extract section content using PDFPlumber's precise extraction.
        
        Args:
            pdf: PDFPlumber PDF object
            toc_entry: Current TOC entry
            next_entry: Next TOC entry (for boundary detection)
            
        Returns:
            Clean extracted content for this section
        """
        start_page = max(0, toc_entry.page - 1)  # Convert 1-indexed TOC page to 0-indexed
        
        if next_entry:
            # End page is exclusive; guarantee at least one page so a section that
            # starts and ends on the same page is not silently skipped
            end_page = min(len(pdf.pages), max(next_entry.page - 1, start_page + 1))
        else:
            end_page = len(pdf.pages)
        
        content_parts = []
        
        for page_idx in range(start_page, end_page):
            if page_idx < len(pdf.pages):
                page = pdf.pages[page_idx]
                
                # Extract text with PDFPlumber (preserves formatting)
                page_text = page.extract_text()
                
                if page_text:
                    # Clean page content while preserving structure
                    cleaned_text = self._clean_page_content_precise(page_text)
                    if cleaned_text.strip():
                        content_parts.append(cleaned_text)
        
        return ' '.join(content_parts)
    
    def _clean_page_content_precise(self, page_text: str) -> str:
        """
        Clean page content with precision, removing artifacts but preserving content.
        
        Args:
            page_text: Raw page text from PDFPlumber
            
        Returns:
            Cleaned text with artifacts removed
        """
        lines = page_text.split('\n')
        cleaned_lines = []
        
        for line in lines:
            line = line.strip()
            
            # Skip empty lines
            if not line:
                continue
            
            # Skip obvious artifacts but be conservative
            if (len(line) < 3 or  # Very short lines
                re.match(r'^\d+$', line) or  # Standalone numbers
                re.match(r'^[ivx]+$', line.lower()) or  # Roman numerals alone
                '.' * 5 in line):  # TOC dots
                continue
            
            # Preserve technical content even if it looks like an artifact
            has_technical_content = any(term in line.lower() for term in [
                'risc', 'register', 'instruction', 'memory', 'processor', 
                'architecture', 'implementation', 'specification'
            ])
            
            if has_technical_content or len(line) >= 10:
                cleaned_lines.append(line)
        
        return ' '.join(cleaned_lines)
    
    def _filter_trash_content(self, content: str) -> str:
        """
        Apply aggressive trash filtering while preserving actual content.
        
        Args:
            content: Raw content to filter
            
        Returns:
            Content with trash removed but technical content preserved
        """
        if not content.strip():
            return ""
        
        # First, identify and preserve important technical sentences
        sentences = re.split(r'[.!?]+\s*', content)
        preserved_sentences = []
        
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            
            # Check if sentence contains important technical content
            is_technical = any(term in sentence.lower() for term in [
                'risc-v', 'register', 'instruction', 'memory', 'processor',
                'architecture', 'implementation', 'specification', 'encoding',
                'bit', 'byte', 'address', 'data', 'control', 'operand'
            ])
            
            # Check if sentence is trash (including general trash and TOC content)
            is_trash = any(re.search(pattern, sentence, re.IGNORECASE) 
                          for pattern in self.trash_patterns)
            
            # Check if sentence is TOC content (should be excluded)
            is_toc_content = any(re.search(pattern, sentence, re.IGNORECASE) 
                               for pattern in self.toc_exclusion_patterns)
            
            # Preserve if technical and not trash/TOC, or if substantial and not clearly trash/TOC
            if ((is_technical and not is_trash and not is_toc_content) or 
                (len(sentence) > 50 and not is_trash and not is_toc_content)):
                preserved_sentences.append(sentence)
        
        # Reconstruct content from preserved sentences
        filtered_content = '. '.join(preserved_sentences)
        
        # Final cleanup
        filtered_content = re.sub(r'\s+', ' ', filtered_content)  # Normalize whitespace
        filtered_content = re.sub(r'\.+', '.', filtered_content)  # Remove multiple dots
        
        # Ensure proper sentence ending
        if filtered_content and not filtered_content.rstrip().endswith(('.', '!', '?', ':', ';')):
            filtered_content = filtered_content.rstrip() + '.'
        
        return filtered_content.strip()
    
    def _create_chunks_from_clean_content(self, content: str, start_chunk_id: int, 
                                         toc_entry: TOCEntry) -> List[Dict[str, Any]]:
        """
        Create optimally-sized chunks from clean content.
        
        Args:
            content: Clean, filtered content
            start_chunk_id: Starting chunk ID
            toc_entry: TOC entry metadata
            
        Returns:
            List of chunk dictionaries
        """
        if not content or len(content) < 100:
            return []
        
        chunks = []
        
        # If content fits in one chunk, create single chunk
        if self.min_chunk_size <= len(content) <= self.max_chunk_size:
            chunk = self._create_chunk(content, start_chunk_id, toc_entry)
            chunks.append(chunk)
        
        # If too large, split intelligently at sentence boundaries
        elif len(content) > self.max_chunk_size:
            sub_chunks = self._split_large_content_smart(content, start_chunk_id, toc_entry)
            chunks.extend(sub_chunks)
        
        # If too small but substantial, keep it
        elif len(content) >= 200:  # Lower threshold for cleaned content
            chunk = self._create_chunk(content, start_chunk_id, toc_entry)
            chunks.append(chunk)
        
        return chunks
    
    def _split_large_content_smart(self, content: str, start_chunk_id: int, 
                                  toc_entry: TOCEntry) -> List[Dict[str, Any]]:
        """
        Split large content intelligently at natural boundaries.
        
        Args:
            content: Content to split
            start_chunk_id: Starting chunk ID
            toc_entry: TOC entry metadata
            
        Returns:
            List of chunk dictionaries
        """
        chunks = []
        
        # Split at sentence boundaries
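        # The capturing group keeps each delimiter as its own list element, so
        # even indices hold sentence text and odd indices hold the punctuation
        # that is re-attached below.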
        sentences = re.split(r'([.!?:;]+\s*)', content)
        
        current_chunk = ""
        chunk_id = start_chunk_id
        
        for i in range(0, len(sentences), 2):
            sentence = sentences[i].strip()
            if not sentence:
                continue
            
            # Add punctuation if available
            punctuation = sentences[i + 1] if i + 1 < len(sentences) else '.'
            full_sentence = sentence + punctuation
            
            # Check if adding this sentence exceeds max size
            potential_chunk = current_chunk + (" " if current_chunk else "") + full_sentence
            
            if len(potential_chunk) <= self.max_chunk_size:
                current_chunk = potential_chunk
            else:
                # Save current chunk if it meets minimum size; anything smaller is
                # discarded at this boundary rather than carried into the next chunk
                if current_chunk and len(current_chunk) >= self.min_chunk_size:
                    chunk = self._create_chunk(current_chunk, chunk_id, toc_entry)
                    chunks.append(chunk)
                    chunk_id += 1
                
                # Start new chunk
                current_chunk = full_sentence
        
        # Add final chunk if substantial
        if current_chunk and len(current_chunk) >= 200:
            chunk = self._create_chunk(current_chunk, chunk_id, toc_entry)
            chunks.append(chunk)
        
        return chunks
    
    def _create_chunk(self, content: str, chunk_id: int, toc_entry: TOCEntry) -> Dict[str, Any]:
        """Create a chunk dictionary with hybrid metadata."""
        return {
            "text": content,
            "chunk_id": chunk_id,
            "title": toc_entry.title,
            "parent_title": toc_entry.parent_title,
            "level": toc_entry.level,
            "page": toc_entry.page,
            "size": len(content),
            "metadata": {
                "parsing_method": "hybrid_toc_pdfplumber",
                "has_context": True,
                "content_type": "filtered_structured_content",
                "quality_score": self._calculate_quality_score(content),
                "trash_filtered": True
            }
        }
    
    def _calculate_quality_score(self, content: str) -> float:
        """Calculate quality score for filtered content."""
        if not content.strip():
            return 0.0
        
        words = content.split()
        score = 0.0
        
        # Length score (25%)
        if self.min_chunk_size <= len(content) <= self.max_chunk_size:
            score += 0.25
        elif len(content) >= 200:  # At least some content
            score += 0.15
        
        # Content richness (25%)
        substantial_words = sum(1 for word in words if len(word) > 3)
        richness_score = min(substantial_words / 30, 1.0)  # Lower threshold for filtered content
        score += richness_score * 0.25
        
        # Technical content (30%)
        technical_terms = ['risc', 'register', 'instruction', 'cpu', 'memory', 'processor', 'architecture']
        technical_count = sum(1 for word in words if any(term in word.lower() for term in technical_terms))
        technical_score = min(technical_count / 3, 1.0)  # Lower threshold
        score += technical_score * 0.30
        
        # Completeness (20%)
        completeness_score = 0.0
        if content[0].isupper() or content.startswith(('The ', 'A ', 'An ', 'RISC')):
            completeness_score += 0.5
        if content.rstrip().endswith(('.', '!', '?', ':', ';')):
            completeness_score += 0.5
        score += completeness_score * 0.20
        
        return min(score, 1.0)


def parse_pdf_with_hybrid_approach(pdf_path: Path, pdf_data: Dict[str, Any],
                                  target_chunk_size: int = 1400, min_chunk_size: int = 800,
                                  max_chunk_size: int = 2000) -> List[Dict[str, Any]]:
    """
    Parse PDF using hybrid TOC + PDFPlumber approach.
    
    This function combines:
    1. TOC-guided structure detection for reliable navigation
    2. PDFPlumber's precise content extraction
    3. Aggressive trash filtering while preserving technical content
    
    Args:
        pdf_path: Path to PDF file
        pdf_data: PDF data from extract_text_with_metadata()
        target_chunk_size: Preferred chunk size
        min_chunk_size: Minimum chunk size
        max_chunk_size: Maximum chunk size
        
    Returns:
        List of high-quality, filtered chunks ready for RAG indexing
        
    Example:
        >>> from shared_utils.document_processing.pdf_parser import extract_text_with_metadata
        >>> from shared_utils.document_processing.hybrid_parser import parse_pdf_with_hybrid_approach
        >>> 
        >>> pdf_data = extract_text_with_metadata("document.pdf")
        >>> chunks = parse_pdf_with_hybrid_approach(Path("document.pdf"), pdf_data)
        >>> print(f"Created {len(chunks)} hybrid-parsed chunks")
    """
    parser = HybridParser(target_chunk_size, min_chunk_size, max_chunk_size)
    return parser.parse_document(pdf_path, pdf_data)


# Example usage
if __name__ == "__main__":
    print("Hybrid TOC + PDFPlumber Parser")
    print("Combines TOC navigation with PDFPlumber precision and aggressive trash filtering")