Spaces:
Running
Running
import io
import re
import warnings
from typing import List, Optional, Tuple

import gradio as gr
import PyPDF2
import torch
from transformers import pipeline, AutoTokenizer

warnings.filterwarnings("ignore")
class PDFSummarizer: | |
def __init__(self): | |
# Use a much faster, lighter model for summarization | |
self.model_name = "sshleifer/distilbart-cnn-12-6" # Much faster than BART-large | |
self.device = "cuda" if torch.cuda.is_available() else "cpu" | |
print(f"Using device: {self.device}") | |
try: | |
# Initialize the summarization pipeline with optimizations | |
self.summarizer = pipeline( | |
"summarization", | |
model=self.model_name, | |
device=0 if self.device == "cuda" else -1, | |
framework="pt", | |
model_kwargs={"torch_dtype": torch.float16 if self.device == "cuda" else torch.float32} | |
) | |
# Initialize tokenizer for length calculations | |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) | |
print("Model loaded successfully") | |
except Exception as e: | |
print(f"Error loading model: {e}") | |
# Fallback to an even faster model | |
self.model_name = "facebook/bart-large-cnn" | |
self.summarizer = pipeline("summarization", model=self.model_name, device=-1) | |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) | |
print("Fallback model loaded") | |
def extract_text_from_pdf(self, pdf_file) -> str: | |
"""Extract text content from PDF file""" | |
try: | |
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file)) | |
text = "" | |
for page_num, page in enumerate(pdf_reader.pages): | |
page_text = page.extract_text() | |
if page_text.strip(): | |
text += f"\n--- Page {page_num + 1} ---\n" | |
text += page_text | |
return text.strip() | |
except Exception as e: | |
raise Exception(f"Error extracting text from PDF: {str(e)}") | |
def clean_text(self, text: str) -> str: | |
"""Clean and preprocess text""" | |
# Remove extra whitespaces and newlines | |
text = re.sub(r'\s+', ' ', text) | |
# Remove special characters but keep punctuation | |
text = re.sub(r'[^\w\s.,!?;:()\-"]', ' ', text) | |
# Remove page markers | |
text = re.sub(r'--- Page \d+ ---', '', text) | |
return text.strip() | |
def chunk_text(self, text: str, max_chunk_length: int = 512) -> List[str]: | |
"""Split text into smaller, more manageable chunks for faster processing""" | |
sentences = text.split('. ') | |
chunks = [] | |
current_chunk = "" | |
for sentence in sentences: | |
# Check if adding this sentence would exceed the limit | |
potential_chunk = current_chunk + sentence + ". " | |
# Use faster length estimation | |
if len(potential_chunk.split()) <= max_chunk_length: | |
current_chunk = potential_chunk | |
else: | |
if current_chunk: | |
chunks.append(current_chunk.strip()) | |
current_chunk = sentence + ". " | |
if current_chunk: | |
chunks.append(current_chunk.strip()) | |
# Limit number of chunks for speed | |
return chunks[:5] # Process max 5 chunks for speed | |
def summarize_chunk(self, chunk: str, max_length: int = 100, min_length: int = 30) -> str: | |
"""Summarize a single chunk of text with speed optimizations""" | |
try: | |
# Speed optimizations | |
summary = self.summarizer( | |
chunk, | |
max_length=max_length, | |
min_length=min_length, | |
do_sample=False, | |
truncation=True, | |
early_stopping=True, | |
num_beams=2 # Reduced from default 4 for speed | |
) | |
return summary[0]['summary_text'] | |
except Exception as e: | |
return f"Error summarizing chunk: {str(e)}" | |
def process_pdf(self, pdf_file, summary_type: str) -> Tuple[str, str, str]: | |
"""Main function to process PDF and generate summary""" | |
try: | |
# Extract text from PDF | |
raw_text = self.extract_text_from_pdf(pdf_file) | |
if not raw_text.strip(): | |
return "β Error: No text could be extracted from the PDF.", "", "" | |
# Clean the text | |
cleaned_text = self.clean_text(raw_text) | |
# Calculate text statistics | |
word_count = len(cleaned_text.split()) | |
char_count = len(cleaned_text) | |
if word_count < 50: | |
return "β Error: PDF contains too little text to summarize.", "", "" | |
# Chunk the text for processing | |
chunks = self.chunk_text(cleaned_text) | |
# Determine summary parameters based on type (optimized for speed) | |
if summary_type == "Brief (Quick)": | |
max_len, min_len = 60, 20 | |
elif summary_type == "Detailed": | |
max_len, min_len = 100, 40 | |
else: # Comprehensive | |
max_len, min_len = 150, 60 | |
# Summarize each chunk (with progress tracking) | |
chunk_summaries = [] | |
for i, chunk in enumerate(chunks): | |
print(f"Processing chunk {i+1}/{len(chunks)}") | |
summary = self.summarize_chunk(chunk, max_len, min_len) | |
chunk_summaries.append(summary) | |
# Combine summaries | |
combined_summary = " ".join(chunk_summaries) | |
# Skip final summarization for speed if we have few chunks | |
if len(chunks) <= 2: | |
final_summary = combined_summary | |
else: | |
# Quick final summary for multiple chunks | |
final_summary = self.summarize_chunk( | |
combined_summary, | |
max_length=min(200, max_len * 1.5), | |
min_length=min_len | |
) | |
# Create statistics | |
summary_stats = f""" | |
π **Document Statistics:** | |
- Original word count: {word_count:,} | |
- Original character count: {char_count:,} | |
- Pages processed: {len(chunks)} | |
- Summary word count: {len(final_summary.split()):,} | |
- Compression ratio: {word_count / len(final_summary.split()):.1f}:1 | |
""" | |
return final_summary, summary_stats, "β Summary generated successfully!" | |
except Exception as e: | |
return f"β Error processing PDF: {str(e)}", "", "" | |
# Initialize the summarizer
# Created once at module level so every call to summarize_pdf_interface reuses
# the same instance; PDFSummarizer.__init__ loads the summarization pipeline,
# so this line runs that loading as a side effect of importing the module.
pdf_summarizer = PDFSummarizer()
def summarize_pdf_interface(pdf_file, summary_type):
    """Gradio callback: read the uploaded PDF and summarize it.

    Args:
        pdf_file: Path of the uploaded file, or ``None`` when nothing is
            uploaded.
        summary_type: Summary length option forwarded to the summarizer.

    Returns:
        A ``(summary, statistics, status)`` tuple; on failure the first
        element carries the error message and the other two are empty.
    """
    # Guard clause: nothing uploaded yet (or the upload was cleared).
    if pdf_file is None:
        return "β Please upload a PDF file.", "", ""

    try:
        # pdf_file is already the file path, so read its raw bytes from disk.
        with open(pdf_file, 'rb') as handle:
            payload = handle.read()
        summary_text, stats_text, status_text = pdf_summarizer.process_pdf(
            payload, summary_type
        )
    except Exception as exc:
        return f"β Error: {str(exc)}", "", ""
    else:
        return summary_text, stats_text, status_text
# Create Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI (the caller launches it).

    Layout: an intro section, then a row with the upload/options/status
    column next to the summary/statistics column, then a tips section.
    Both the button click and a change of the uploaded file run
    ``summarize_pdf_interface``.
    """
    custom_css = """
    .gradio-container {
    max-width: 1200px !important;
    }
    .summary-box {
    border-left: 4px solid #2196F3;
    padding: 16px;
    background-color: #f8f9fa;
    }
    """

    with gr.Blocks(
        title="π AI PDF Summarizer",
        theme=gr.themes.Soft(),
        css=custom_css
    ) as interface:
        # Intro / feature overview
        gr.Markdown("""
        # β¨ Your AI-Powered PDF Assistant
        ### Transform lengthy documents into clear, concise summaries in seconds.
        Upload any PDF file below and let our intelligent model do the work for you. Perfect for students, researchers, and professionals who need to quickly grasp the core ideas of any document.
        ---
        ## Key Features
        - β‘ **Lightning-Fast Summaries**: Powered by a state-of-the-art AI model.
        - π§ **Intelligent Text Chunking**: Handles long documents with ease by breaking them down into manageable pieces.
        - π **Detailed Document Stats**: Get instant insights on word count, compression ratio, and more.
        - π― **Customizable Summary Length**: Choose the level of detail that works best for you.
        """)

        with gr.Row():
            # Left column: inputs, trigger button and status line
            with gr.Column(scale=1):
                pdf_input = gr.File(
                    label="π Upload PDF File",
                    file_types=[".pdf"],
                    type="filepath"
                )
                summary_type = gr.Radio(
                    choices=["Brief (Quick)", "Detailed", "Comprehensive"],
                    value="Detailed",
                    label="π Summary Length",
                    info="Choose how detailed you want the summary to be"
                )
                summarize_btn = gr.Button(
                    "π Generate Summary",
                    variant="primary",
                    size="lg"
                )
                status_output = gr.Textbox(
                    label="π Status",
                    interactive=False,
                    max_lines=2
                )

            # Right column: generated summary and document statistics
            with gr.Column(scale=2):
                summary_output = gr.Textbox(
                    label="π Generated Summary",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["summary-box"]
                )
                stats_output = gr.Markdown(
                    label="π Document Statistics",
                    value="Upload a PDF to see statistics"
                )

        # Usage tips and a short description of the processing steps
        gr.Markdown("""
        ---
        ## π‘ Tips for Optimal Results:
        - **Quality Matters**: Ensure your PDF has selectable text (not just scanned images).
        - **Length**: Works best with documents between 500-10,000 words.
        - **Language**: Optimized for English content.
        - **Format**: Clean, well-formatted PDFs produce the best summaries.
        ## π§ How It Works:
        This application uses a fine-tuned **BART** model for summarization. The process involves:
        1. Extracting text from your PDF.
        2. Cleaning and preprocessing the text.
        3. Intelligently chunking the document to handle long texts.
        4. Generating a summary for each chunk and then combining them for a final, coherent result.
        """)

        # Same handler and wiring for both triggers: the explicit button
        # click first, then auto-processing when the uploaded file changes.
        handler_spec = dict(
            fn=summarize_pdf_interface,
            inputs=[pdf_input, summary_type],
            outputs=[summary_output, stats_output, status_output],
        )
        for register in (summarize_btn.click, pdf_input.change):
            register(**handler_spec)

    return interface
# Launch the application
# Runs only when this file is executed as a script: builds the Blocks UI via
# create_interface() and starts serving it with launch().
if __name__ == "__main__":
    interface = create_interface()
    interface.launch()