somya-27-04-03's picture
Upload 2 files
17404d5 verified
import gradio as gr
import re
import nltk
import pdfplumber
import docx
import textstat
from io import BytesIO
from newspaper import Article
from collections import Counter
from transformers import pipeline
# Fetch the NLTK 'punkt' resource once, when the module is imported.
nltk.download('punkt')

# Load summarization models
# UI label -> checkpoint name handed to the "summarization" pipeline.
_SUMMARIZER_CHECKPOINTS = {
    "T5 (t5-small)": "t5-small",
    "BART (bart-large-cnn)": "facebook/bart-large-cnn",
    "Pegasus (xsum)": "google/pegasus-xsum",
}
summarizers = {
    label: pipeline("summarization", model=checkpoint)
    for label, checkpoint in _SUMMARIZER_CHECKPOINTS.items()
}

# Load QA models
# UI label -> checkpoint name handed to the "question-answering" pipeline.
_QA_CHECKPOINTS = {
    "DistilBERT QA": "distilbert-base-uncased-distilled-squad",
    "BERT QA": "deepset/bert-base-cased-squad2",
}
qa_models = {
    label: pipeline("question-answering", model=checkpoint)
    for label, checkpoint in _QA_CHECKPOINTS.items()
}
# Utility functions
def extract_text_from_file(file):
    """Return the plain text held in an uploaded .txt, .pdf or .docx file.

    Args:
        file: ``None``, a filesystem path string, or a file-like object
            exposing a ``name`` attribute (and, optionally, ``read``).

    Returns:
        The extracted text. An empty string is returned when ``file`` is
        ``None`` or its extension is not one of the supported types.
    """
    if file is None:
        return ""

    # NOTE(review): depending on the Gradio version / `type` setting the
    # upload may arrive as a path string rather than an open handle, so both
    # shapes are accepted here -- confirm against the deployed Gradio.
    name = file if isinstance(file, str) else getattr(file, "name", "")
    # Compare case-insensitively so "REPORT.PDF" is not silently ignored.
    ext = str(name).rsplit('.', 1)[-1].lower()

    if ext == 'txt':
        if hasattr(file, "read"):
            data = file.read()
        else:
            with open(name, "rb") as handle:
                data = handle.read()
        # A text-mode handle already yields str; only bytes need decoding.
        if isinstance(data, bytes):
            return data.decode("utf-8", errors="replace")
        return data

    if ext == 'pdf':
        with pdfplumber.open(file) as pdf:
            # Extract each page once; pages without text yield a falsy value.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "\n".join(page_text for page_text in page_texts if page_text)

    if ext == 'docx':
        doc = docx.Document(file)
        return "\n".join(p.text for p in doc.paragraphs)

    return ""
def fetch_url_text(url):
    """Download the page at ``url`` and return the article's parsed text."""
    page = Article(url)
    # Fetch first, then parse; the text attribute is read after both steps.
    page.download()
    page.parse()
    body = page.text
    return body
def get_keywords(text, n=5):
    """Return the ``n`` most frequent words of ``text`` joined by "; ".

    The text is lower-cased first and only runs of four or more word
    characters count as words. Ordering follows ``Counter.most_common``.
    """
    tokens = re.findall(r'\b\w{4,}\b', text.lower())
    frequencies = Counter(tokens)
    top_terms = [term for term, _count in frequencies.most_common(n)]
    return "; ".join(top_terms)
def _split_into_chunks(text, size=1024):
    """Pack the whitespace-separated words of ``text`` into chunks of at most ``size`` characters."""
    chunks = []
    current = []
    current_len = 0
    for word in text.split():
        # A single token longer than the limit has to be hard-split.
        for start in range(0, len(word), size):
            piece = word[start:start + size]
            needed = len(piece) + (1 if current else 0)
            if current and current_len + needed > size:
                chunks.append(" ".join(current))
                current = []
                current_len = 0
                needed = len(piece)
            current.append(piece)
            current_len += needed
    if current:
        chunks.append(" ".join(current))
    return chunks


def summarize_text(text, model_name, min_len, max_len, format_type):
    """Summarize ``text`` chunk by chunk with the selected summarization model.

    Args:
        text: The document to summarize.
        model_name: Key into the module-level ``summarizers`` mapping.
        min_len: Requested ``min_length`` for each chunk summary.
        max_len: Requested ``max_length`` for each chunk summary.
        format_type: ``"Bullet Points"`` renders one bullet per sentence;
            any other value returns the summaries joined into one paragraph.

    Returns:
        The combined summary, or an empty string for empty/blank input.

    Raises:
        KeyError: If ``model_name`` is not a key of ``summarizers``.
    """
    if not text or not text.strip():
        return ""

    summarizer = summarizers[model_name]
    # The values may come from UI sliders; coerce them to ints and make sure
    # the lower bound never exceeds the upper bound.
    max_len = int(max_len)
    min_len = min(int(min_len), max_len)

    # Chunks break on whitespace so no word is cut in half at a boundary.
    summary_chunks = [
        summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
        for chunk in _split_into_chunks(text, 1024)
    ]
    summary = " ".join(summary_chunks)

    if format_type == "Bullet Points":
        # Split after sentence-ending punctuation followed by spaces.
        bullets = re.split(r'(?<=[.!?]) +', summary)
        return "\n".join(f"• {point}" for point in bullets if point.strip())
    return summary
def qa_answers(text, questions, model_name):
    """Answer each question against ``text`` with the selected QA model.

    Args:
        text: Context passed to the model for every question.
        questions: One question per line; blank lines are skipped.
        model_name: Key into the module-level ``qa_models`` mapping.

    Returns:
        One ``"<question>: <answer> (score: <x.xx>)"`` line per question,
        joined with newlines (an empty string when there are no questions).

    Raises:
        KeyError: If ``model_name`` is not a key of ``qa_models``.
    """
    model = qa_models[model_name]
    answers = []
    for line in questions.splitlines():
        # Strip so stray spaces or carriage returns reach neither the model
        # nor the formatted output.
        question = line.strip()
        if not question:
            continue
        ans = model(question=question, context=text)
        answers.append(f"{question}: {ans['answer']} (score: {ans['score']:.2f})")
    return "\n".join(answers)
def get_metrics(original, summary):
    """Build a dict of size and readability statistics for a summary.

    Word counts come from whitespace splitting. The compression rate is the
    percentage of words removed, rounded to two decimals (0 when the
    original has no words). Readability is ``textstat.flesch_reading_ease``
    of the summary, or 0 when the summary is empty.
    """
    source_words = len(original.split())
    summary_words = len(summary.split())

    if source_words:
        compression = round(100 - (summary_words / source_words * 100), 2)
    else:
        compression = 0

    if summary:
        readability = textstat.flesch_reading_ease(summary)
    else:
        readability = 0

    stats = {}
    stats['Input Word Count'] = source_words
    stats['Summary Word Count'] = summary_words
    stats['Compression Rate (%)'] = compression
    stats['Readability (Flesch)'] = readability
    return stats
# Gradio main function
def process_text(input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions):
    """Run the full pipeline behind the "Process" button.

    The input source is chosen in priority order: uploaded file, then URL,
    then pasted text. The text is summarized, its top keywords extracted,
    the optional questions answered, and summary metrics formatted.

    Args:
        input_text: Text pasted by the user (may be empty or ``None``).
        file: Uploaded file, or ``None``.
        url: Article URL (may be empty, blank or ``None``).
        summarizer_model: Key of the summarizer to use.
        qa_model: Key of the QA model to use.
        min_tokens: Minimum summary length passed to the summarizer.
        max_tokens: Maximum summary length passed to the summarizer.
        format_type: ``"Paragraph"`` or ``"Bullet Points"``.
        questions: Newline-separated questions (may be empty or ``None``).

    Returns:
        A 5-tuple ``(summary, keywords, answers, metrics_str, text)``. When
        no usable input is found the tuple is
        ``("No input provided.", "", "", "", "")``.
    """
    # A URL box containing only whitespace must not trigger a download.
    url = (url or "").strip()

    if file is not None:
        text = extract_text_from_file(file)
    elif url:
        text = fetch_url_text(url)
    else:
        text = input_text

    # Blank or whitespace-only text counts as "nothing to process".
    if not text or not text.strip():
        return "No input provided.", "", "", "", ""

    summary = summarize_text(text, summarizer_model, min_tokens, max_tokens, format_type)
    keywords = get_keywords(text)

    if questions and questions.strip():
        answers = qa_answers(text, questions, qa_model)
    else:
        answers = "No questions provided."

    metrics = get_metrics(text, summary)
    # Same layout as before: a leading and a trailing newline around 4 lines.
    metrics_str = "\n".join([
        "",
        f"Input Word Count: {metrics['Input Word Count']}",
        f"Summary Word Count: {metrics['Summary Word Count']}",
        f"Compression Rate: {metrics['Compression Rate (%)']}%",
        f"Readability Score (Flesch): {metrics['Readability (Flesch)']}",
        "",
    ])
    return summary, keywords, answers, metrics_str, text
# Gradio interface
# NOTE(review): the row nesting below is reconstructed from the order of the
# statements; confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# 📚 Advanced Text Summarizer & Q&A App\nUpload text/file/url, summarize, extract keywords, and ask questions.")

    # Input sources: pasted text, uploaded file, or URL.
    with gr.Row():
        input_text = gr.Textbox(label="Paste Text Here", placeholder="Enter text...", lines=6)
        file = gr.File(label="Upload File (.txt, .pdf, .docx)")
        url = gr.Textbox(label="URL", placeholder="https://...")

    # Model selection; the choices mirror the keys of the loaded model tables.
    with gr.Row():
        summarizer_model = gr.Dropdown(choices=list(summarizers.keys()), value="BART (bart-large-cnn)", label="Summarizer Model")
        qa_model = gr.Dropdown(choices=list(qa_models.keys()), value="DistilBERT QA", label="QA Model")

    # Summary length bounds and output format.
    with gr.Row():
        min_tokens = gr.Slider(5, 300, value=30, step=1, label="Min Tokens")
        max_tokens = gr.Slider(50, 1024, value=120, step=1, label="Max Tokens")
        format_type = gr.Radio(choices=['Paragraph', 'Bullet Points'], value='Paragraph', label="Output Format")

    questions = gr.Textbox(label="Questions (one per line)", placeholder="Type questions...", lines=3)
    process_btn = gr.Button("Process")

    # Output widgets, in the same order as process_text's return tuple.
    summary_out = gr.Textbox(label="Summarized Text", lines=6)
    keywords_out = gr.Textbox(label="Top Keywords")
    answers_out = gr.Textbox(label="QA Answers", lines=4)
    metrics_out = gr.Textbox(label="Metrics")
    original_out = gr.Textbox(label="Original Text", lines=6)

    click_inputs = [
        input_text,
        file,
        url,
        summarizer_model,
        qa_model,
        min_tokens,
        max_tokens,
        format_type,
        questions,
    ]
    click_outputs = [summary_out, keywords_out, answers_out, metrics_out, original_out]
    process_btn.click(fn=process_text, inputs=click_inputs, outputs=click_outputs)

if __name__ == "__main__":
    demo.launch()