import gradio as gr
import re
import pdfplumber
import docx
import textstat
from newspaper import Article
from collections import Counter
from transformers import pipeline
# Load summarization models
summarizers = {
    "T5 (t5-small)": pipeline("summarization", model="t5-small"),
    "BART (bart-large-cnn)": pipeline("summarization", model="facebook/bart-large-cnn"),
    "Pegasus (xsum)": pipeline("summarization", model="google/pegasus-xsum"),
}

# Load QA models
qa_models = {
    "DistilBERT QA": pipeline("question-answering", model="distilbert-base-uncased-distilled-squad"),
    "BERT QA": pipeline("question-answering", model="deepset/bert-base-cased-squad2"),
}
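
# Note: all five pipelines above are built eagerly at import time, so the first
# run downloads every checkpoint and startup is slow. A lazy variant (building
# each pipeline on first use) would move that cost to the first request instead.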
# Utility functions
def extract_text_from_file(file):
    """Extract plain text from an uploaded .txt, .pdf, or .docx file."""
    if file is None:
        return ""
    # Depending on the Gradio version, the uploaded value is either a tempfile
    # wrapper with a .name attribute or a plain filepath string; read from the
    # path rather than the (possibly already-consumed) file object.
    path = file.name if hasattr(file, "name") else file
    ext = path.split('.')[-1].lower()
    if ext == 'txt':
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read()
    elif ext == 'pdf':
        with pdfplumber.open(path) as pdf:
            return "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
    elif ext == 'docx':
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs)
    return ""
def fetch_url_text(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.text
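
# Caveat: newspaper raises an exception (e.g., ArticleException) when the
# download fails or the page blocks scraping, which surfaces as a Gradio error;
# wrapping the call in try/except and returning "" is one way to fail soft.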
def get_keywords(text, n=5):
    # Naive keyword extraction: the n most frequent words of 4+ characters.
    words = re.findall(r'\b\w{4,}\b', text.lower())
    common = Counter(words).most_common(n)
    return "; ".join(word for word, _ in common)
def summarize_text(text, model_name, min_len, max_len, format_type):
    # Chunk by characters as a rough proxy for token limits: 1024 characters is
    # roughly 250 tokens, which fits within each model's input window.
    min_len, max_len = int(min_len), int(max_len)  # Gradio sliders may return floats
    summary_chunks = []
    for i in range(0, len(text), 1024):
        chunk = text[i:i + 1024]
        result = summarizers[model_name](chunk, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
        summary_chunks.append(result)
    summary = " ".join(summary_chunks)
    if format_type == "Bullet Points":
        # Split on sentence boundaries and render one bullet per sentence.
        bullets = re.split(r'(?<=[.!?]) +', summary)
        return "\n".join(f"• {point}" for point in bullets if point.strip())
    return summary
def qa_answers(text, questions, model_name):
    model = qa_models[model_name]
    answers = []
    for q in questions.split('\n'):
        if q.strip():
            ans = model(question=q, context=text)
            answers.append(f"{q}: {ans['answer']} (score: {ans['score']:.2f})")
    return "\n".join(answers)
def get_metrics(original, summary):
    orig_words = len(original.split())
    sum_words = len(summary.split())
    return {
        'Input Word Count': orig_words,
        'Summary Word Count': sum_words,
        'Compression Rate (%)': round(100 - (sum_words / orig_words * 100), 2) if orig_words else 0,
        'Readability (Flesch)': textstat.flesch_reading_ease(summary) if summary else 0,
    }
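
# Example: a 500-word input summarized to 100 words gives
# 100 - (100 / 500 * 100) = 80.0% compression.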
# Gradio main function
def process_text(input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions):
    # Input priority: uploaded file > URL > pasted text.
    if file is not None:
        text = extract_text_from_file(file)
    elif url:
        text = fetch_url_text(url)
    else:
        text = input_text
    if not text:
        return "No input provided.", "", "", "", ""
    summary = summarize_text(text, summarizer_model, min_tokens, max_tokens, format_type)
    keywords = get_keywords(text)
    answers = qa_answers(text, questions, qa_model) if questions else "No questions provided."
    metrics = get_metrics(text, summary)
    metrics_str = (
        f"Input Word Count: {metrics['Input Word Count']}\n"
        f"Summary Word Count: {metrics['Summary Word Count']}\n"
        f"Compression Rate: {metrics['Compression Rate (%)']}%\n"
        f"Readability Score (Flesch): {metrics['Readability (Flesch)']}"
    )
    return summary, keywords, answers, metrics_str, text
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 📚 Advanced Text Summarizer & Q&A App\nUpload text/file/url, summarize, extract keywords, and ask questions.")
    with gr.Row():
        input_text = gr.Textbox(label="Paste Text Here", placeholder="Enter text...", lines=6)
        file = gr.File(label="Upload File (.txt, .pdf, .docx)")
        url = gr.Textbox(label="URL", placeholder="https://...")
    with gr.Row():
        summarizer_model = gr.Dropdown(choices=list(summarizers.keys()), value="BART (bart-large-cnn)", label="Summarizer Model")
        qa_model = gr.Dropdown(choices=list(qa_models.keys()), value="DistilBERT QA", label="QA Model")
    with gr.Row():
        min_tokens = gr.Slider(5, 300, value=30, step=1, label="Min Tokens")
        max_tokens = gr.Slider(50, 1024, value=120, step=1, label="Max Tokens")
    format_type = gr.Radio(choices=['Paragraph', 'Bullet Points'], value='Paragraph', label="Output Format")
    questions = gr.Textbox(label="Questions (one per line)", placeholder="Type questions...", lines=3)
    process_btn = gr.Button("Process")
    summary_out = gr.Textbox(label="Summarized Text", lines=6)
    keywords_out = gr.Textbox(label="Top Keywords")
    answers_out = gr.Textbox(label="QA Answers", lines=4)
    metrics_out = gr.Textbox(label="Metrics")
    original_out = gr.Textbox(label="Original Text", lines=6)
    process_btn.click(
        fn=process_text,
        inputs=[input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions],
        outputs=[summary_out, keywords_out, answers_out, metrics_out, original_out],
    )
if __name__ == "__main__":
    demo.launch()
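    # Passing share=True to demo.launch() would also create a temporary public link.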