#!/usr/bin/env python3
"""
TOC-Guided PDF Parser
Uses the Table of Contents to guide intelligent chunking that respects
document structure and hierarchy.
Author: Arthur Passuello
"""
import re
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
@dataclass
class TOCEntry:
"""Represents a table of contents entry."""
title: str
page: int
level: int # 0 for chapters, 1 for sections, 2 for subsections
parent: Optional[str] = None
parent_title: Optional[str] = None # Added for hybrid parser compatibility
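
# Illustrative only (not taken from real output): a subsection entry might be
# TOCEntry(title="Cache Coherence", page=42, level=2,
#          parent="Memory Hierarchy", parent_title="Memory Hierarchy")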
class TOCGuidedParser:
"""Parser that uses TOC to create structure-aware chunks."""
def __init__(self, target_chunk_size: int = 1400, min_chunk_size: int = 800,
max_chunk_size: int = 2000):
"""Initialize TOC-guided parser."""
self.target_chunk_size = target_chunk_size
self.min_chunk_size = min_chunk_size
self.max_chunk_size = max_chunk_size
def parse_toc(self, pages: List[Dict]) -> List[TOCEntry]:
"""Parse table of contents from pages."""
toc_entries = []
# Find TOC pages (usually early in document)
toc_pages = []
for i, page in enumerate(pages[:20]): # Check first 20 pages
            page_text = page.get('text', '').lower()
            # a plain 'contents' check also covers 'table of contents'
            if 'contents' in page_text:
                toc_pages.append((i, page))
if not toc_pages:
print("No TOC found, using fallback structure detection")
return self._detect_structure_without_toc(pages)
# Parse TOC entries
for page_idx, page in toc_pages:
text = page.get('text', '')
lines = text.split('\n')
i = 0
while i < len(lines):
line = lines[i].strip()
                # Skip empty lines and the TOC header itself (a bare
                # "Contents" / "Table of Contents" line, not entries that
                # merely contain the word)
                if not line or line.lower() in ('contents', 'table of contents'):
i += 1
continue
# Pattern 1: "1.1 Title .... 23"
match1 = re.match(r'^(\d+(?:\.\d+)*)\s+(.+?)\s*\.{2,}\s*(\d+)$', line)
if match1:
number, title, page_num = match1.groups()
level = len(number.split('.')) - 1
toc_entries.append(TOCEntry(
title=title.strip(),
page=int(page_num),
level=level
))
i += 1
continue
# Pattern 2: Multi-line format
# "1.1"
# "Title"
# ". . . . 23"
if re.match(r'^(\d+(?:\.\d+)*)$', line):
number = line
if i + 1 < len(lines):
title_line = lines[i + 1].strip()
if i + 2 < len(lines):
dots_line = lines[i + 2].strip()
page_match = re.search(r'(\d+)\s*$', dots_line)
if page_match and '.' in dots_line:
title = title_line
page_num = int(page_match.group(1))
level = len(number.split('.')) - 1
toc_entries.append(TOCEntry(
title=title,
page=page_num,
level=level
))
i += 3
continue
# Pattern 3: "Chapter 1: Title ... 23"
match3 = re.match(r'^(Chapter|Section|Part)\s+(\d+):?\s+(.+?)\s*\.{2,}\s*(\d+)$', line, re.IGNORECASE)
if match3:
prefix, number, title, page_num = match3.groups()
                    # Chapters and parts are top-level; everything else is one level down
                    level = 0 if prefix.lower() in ('chapter', 'part') else 1
toc_entries.append(TOCEntry(
title=f"{prefix} {number}: {title}",
page=int(page_num),
level=level
))
i += 1
continue
i += 1
# Add parent relationships
for i, entry in enumerate(toc_entries):
if entry.level > 0:
# Find parent (previous entry with lower level)
for j in range(i - 1, -1, -1):
if toc_entries[j].level < entry.level:
entry.parent = toc_entries[j].title
entry.parent_title = toc_entries[j].title # Set both for compatibility
break
return toc_entries
def _detect_structure_without_toc(self, pages: List[Dict]) -> List[TOCEntry]:
"""Fallback: detect structure from content patterns across ALL pages."""
entries = []
# Expanded patterns for better structure detection
chapter_patterns = [
re.compile(r'^(Chapter|CHAPTER)\s+(\d+|[IVX]+)(?:\s*[:\-]\s*(.+))?', re.MULTILINE),
re.compile(r'^(\d+)\s+([A-Z][^.]*?)(?:\s*\.{2,}\s*\d+)?$', re.MULTILINE), # "1 Introduction"
re.compile(r'^([A-Z][A-Z\s]{10,})$', re.MULTILINE), # ALL CAPS titles
]
section_patterns = [
re.compile(r'^(\d+\.\d+)\s+(.+?)(?:\s*\.{2,}\s*\d+)?$', re.MULTILINE), # "1.1 Section"
re.compile(r'^(\d+\.\d+\.\d+)\s+(.+?)(?:\s*\.{2,}\s*\d+)?$', re.MULTILINE), # "1.1.1 Subsection"
]
# Process ALL pages, not just first 20
for i, page in enumerate(pages):
text = page.get('text', '')
if not text.strip():
continue
            # Find chapters with the chapter-level patterns above
            for pattern in chapter_patterns:
                for match in pattern.finditer(text):
                    groups = match.groups()
                    # Use the most specific captured group as the title so that
                    # single-group patterns (e.g. ALL CAPS headings) also yield entries
                    if len(groups) >= 3 and groups[2]:
                        title = groups[2].strip()
                    elif len(groups) >= 2 and groups[1]:
                        title = groups[1].strip()
                    else:
                        title = groups[0].strip()
                    # Skip very short titles and bare numbers (likely false positives)
                    if len(title) >= 3 and not re.match(r'^\d+$', title):
                        entries.append(TOCEntry(
                            title=title,
                            page=i + 1,
                            level=0
                        ))
# Find sections
for pattern in section_patterns:
for match in pattern.finditer(text):
section_num = match.group(1)
title = match.group(2).strip() if len(match.groups()) >= 2 else f"Section {section_num}"
# Determine level by number of dots
level = section_num.count('.')
# Skip very short titles or obvious artifacts
if len(title) >= 3 and not re.match(r'^\d+$', title):
entries.append(TOCEntry(
title=title,
page=i + 1,
level=level
))
# If still no entries found, create page-based entries for full coverage
if not entries:
print("No structure patterns found, creating page-based sections for full coverage")
# Create sections every 10 pages to ensure full document coverage
for i in range(0, len(pages), 10):
start_page = i + 1
end_page = min(i + 10, len(pages))
title = f"Pages {start_page}-{end_page}"
entries.append(TOCEntry(
title=title,
page=start_page,
level=0
))
return entries
def create_chunks_from_toc(self, pdf_data: Dict, toc_entries: List[TOCEntry]) -> List[Dict]:
"""Create chunks based on TOC structure."""
chunks = []
pages = pdf_data.get('pages', [])
for i, entry in enumerate(toc_entries):
            # Determine the page range for this entry; this assumes the TOC's
            # printed page numbers line up with the physical page order of the
            # extracted pages
            start_page = entry.page - 1  # Convert to 0-indexed
# Find end page (start of next entry at same or higher level)
end_page = len(pages)
for j in range(i + 1, len(toc_entries)):
if toc_entries[j].level <= entry.level:
end_page = toc_entries[j].page - 1
break
# Extract text for this section
section_text = []
for page_idx in range(max(0, start_page), min(end_page, len(pages))):
page_text = pages[page_idx].get('text', '')
if page_text.strip():
section_text.append(page_text)
if not section_text:
continue
full_text = '\n\n'.join(section_text)
# Create chunks from section text
if len(full_text) <= self.max_chunk_size:
# Single chunk for small sections
chunks.append({
'text': full_text.strip(),
'title': entry.title,
'parent_title': entry.parent_title or entry.parent or '',
'level': entry.level,
'page': entry.page,
'context': f"From {entry.title}",
'metadata': {
'parsing_method': 'toc_guided',
'section_title': entry.title,
'hierarchy_level': entry.level
}
})
else:
# Split large sections into chunks
section_chunks = self._split_text_into_chunks(full_text)
for j, chunk_text in enumerate(section_chunks):
chunks.append({
'text': chunk_text.strip(),
'title': f"{entry.title} (Part {j+1})",
'parent_title': entry.parent_title or entry.parent or '',
'level': entry.level,
'page': entry.page,
'context': f"Part {j+1} of {entry.title}",
'metadata': {
'parsing_method': 'toc_guided',
'section_title': entry.title,
'hierarchy_level': entry.level,
'part_number': j + 1,
'total_parts': len(section_chunks)
}
})
return chunks
def _split_text_into_chunks(self, text: str) -> List[str]:
"""Split text into chunks while preserving sentence boundaries."""
sentences = re.split(r'(?<=[.!?])\s+', text)
chunks = []
current_chunk = []
current_size = 0
for sentence in sentences:
sentence_size = len(sentence)
if current_size + sentence_size > self.target_chunk_size and current_chunk:
# Save current chunk
chunks.append(' '.join(current_chunk))
current_chunk = [sentence]
current_size = sentence_size
else:
current_chunk.append(sentence)
current_size += sentence_size + 1 # +1 for space
        if current_chunk:
            tail = ' '.join(current_chunk)
            # Fold an undersized trailing chunk into its predecessor so no
            # chunk falls below min_chunk_size
            if chunks and len(tail) < self.min_chunk_size:
                chunks[-1] = chunks[-1] + ' ' + tail
            else:
                chunks.append(tail)
        return chunks
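
    # Worked example (illustrative figures, assuming target_chunk_size=1400):
    # a 3,000-character section with ~100-character sentences splits greedily
    # into chunks of roughly 1,400 and 1,600 characters; sentence boundaries
    # are never broken.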
def parse_pdf_with_toc_guidance(pdf_data: Dict, **kwargs) -> List[Dict]:
"""Main entry point for TOC-guided parsing."""
parser = TOCGuidedParser(**kwargs)
# Parse TOC
pages = pdf_data.get('pages', [])
toc_entries = parser.parse_toc(pages)
print(f"Found {len(toc_entries)} TOC entries")
if not toc_entries:
print("No TOC entries found, falling back to basic chunking")
from .chunker import chunk_technical_text
return chunk_technical_text(pdf_data.get('text', ''))
# Create chunks based on TOC
chunks = parser.create_chunks_from_toc(pdf_data, toc_entries)
print(f"Created {len(chunks)} chunks from TOC structure")
return chunks
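

if __name__ == "__main__":
    # Minimal smoke test with synthetic input; the dict below merely mimics
    # the {'pages': [{'text': ...}], 'text': ...} shape this module expects
    # and is not real extractor output.
    sample = {
        'pages': [
            {'text': 'Table of Contents\n1 Introduction .... 2\n1.1 Scope .... 2'},
            {'text': 'Introduction body text. ' * 40},
        ],
        'text': '',
    }
    for chunk in parse_pdf_with_toc_guidance(sample):
        print(f"{chunk['title']} (level {chunk['level']}): {len(chunk['text'])} chars")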