Spaces:
Sleeping
Sleeping
import fitz | |
import os | |
import re | |
from typing import List, Dict, Any | |
from dataclasses import dataclass | |
@dataclass
class DocumentChunk:
    """A piece of extracted document text together with its provenance.

    The ``@dataclass`` decorator is required: instances are created with
    keyword arguments (``DocumentChunk(content=..., metadata=..., ...)``),
    which only works when the generated ``__init__`` exists.
    """

    # Text of the chunk.
    content: str
    # Free-form descriptive data about the chunk (e.g. filename, page, length).
    metadata: Dict[str, Any]
    # Page the chunk was taken from.
    page_number: int
    # Name of the file the chunk was taken from.
    source_file: str
class PDFProcessor:
    """Extract text from a PDF and split it into overlapping chunks.

    Every page is cleaned and chunked on its own, so a chunk never spans
    two pages.

    Attributes:
        chunk_size: Character threshold at which the accumulated text is
            emitted as a chunk. It is a soft limit: a single sentence longer
            than this (or overlap plus one sentence) is still kept whole.
        chunk_overlap: Number of trailing characters of a finished chunk
            that are repeated at the start of the next one. Zero or a
            negative value disables overlap.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> List["DocumentChunk"]:
        """Extract text from PDF and return chunks with metadata.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            Chunks for all pages that contain non-whitespace text, in page
            order. Page numbers in the chunks are 1-based.

        Raises:
            Exception: If opening or processing the document fails; the
                original error is attached as ``__cause__``.
        """
        chunks: List["DocumentChunk"] = []
        try:
            doc = fitz.open(pdf_path)
            try:
                filename = os.path.basename(pdf_path)
                for page_num in range(len(doc)):
                    text = doc.load_page(page_num).get_text()
                    if not text.strip():
                        # Nothing extractable on this page.
                        continue
                    cleaned_text = self._clean_text(text)
                    chunks.extend(
                        self._create_chunks(cleaned_text, page_num + 1, filename)
                    )
            finally:
                # Release the document even when a page fails to process.
                doc.close()
        except Exception as e:
            raise Exception(f"Error processing PDF {pdf_path}: {str(e)}") from e
        return chunks

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Collapses whitespace runs to a single space, drops every character
        that is not a word character, whitespace, or one of
        ``. , ! ? ; : ( ) [ ] { } " -``, and reduces runs of sentence
        punctuation to the last character of the run.
        """
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:()\[\]{}"-]', '', text)
        text = re.sub(r'([.,!?;:]){2,}', r'\1', text)
        return text.strip()

    def _create_chunks(self, text: str, page_number: int, filename: str) -> List["DocumentChunk"]:
        """Split text into overlapping chunks.

        Sentences (split after ``.``, ``!`` or ``?``) are accumulated until
        adding the next one would exceed ``chunk_size``; the accumulated text
        is then emitted and the next chunk starts with the overlap tail of
        the previous one followed by that sentence.

        Args:
            text: Cleaned text of a single page.
            page_number: Page number recorded on each chunk.
            filename: File name recorded on each chunk.

        Returns:
            The chunks for this page; empty if ``text`` is blank.
        """
        chunks: List["DocumentChunk"] = []
        current_chunk = ""
        for sentence in re.split(r'(?<=[.!?])\s+', text):
            if current_chunk and len(current_chunk) + len(sentence) > self.chunk_size:
                chunks.append(self._make_chunk(current_chunk, page_number, filename))
                overlap_text = self._get_overlap_text(current_chunk)
                # With overlap disabled, start cleanly instead of with " ".
                current_chunk = (
                    overlap_text + " " + sentence if overlap_text else sentence
                )
            elif current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
        if current_chunk.strip():
            chunks.append(self._make_chunk(current_chunk, page_number, filename))
        return chunks

    def _make_chunk(self, text: str, page_number: int, filename: str) -> "DocumentChunk":
        """Build one DocumentChunk from accumulated text."""
        content = text.strip()
        return DocumentChunk(
            content=content,
            metadata={
                'filename': filename,
                'page_number': page_number,
                # Measured on the stored content so the two always agree.
                'chunk_length': len(content)
            },
            page_number=page_number,
            source_file=filename
        )

    def _get_overlap_text(self, text: str) -> str:
        """Get overlap text from the end of current chunk.

        Returns the last ``chunk_overlap`` characters of ``text`` (all of it
        when it is shorter), or an empty string when overlap is disabled.
        """
        if self.chunk_overlap <= 0:
            # text[-0:] would be the whole string, not an empty tail.
            return ""
        return text[-self.chunk_overlap:]