File size: 5,550 Bytes
17404d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import re
import textwrap
from collections import Counter
from io import BytesIO

import docx
import gradio as gr
import nltk
import pdfplumber
import textstat
from newspaper import Article
from transformers import pipeline

# Download the NLTK "punkt" resource when this module is imported.
nltk.download('punkt')

# Summarization pipelines, keyed by the label shown in the UI dropdown.
# Each (label, checkpoint) pair is loaded eagerly, in the order listed.
summarizers = {
    label: pipeline("summarization", model=checkpoint)
    for label, checkpoint in (
        ("T5 (t5-small)", "t5-small"),
        ("BART (bart-large-cnn)", "facebook/bart-large-cnn"),
        ("Pegasus (xsum)", "google/pegasus-xsum"),
    )
}

# Question-answering pipelines, keyed by the label shown in the UI dropdown.
qa_models = {
    label: pipeline("question-answering", model=checkpoint)
    for label, checkpoint in (
        ("DistilBERT QA", "distilbert-base-uncased-distilled-squad"),
        ("BERT QA", "deepset/bert-base-cased-squad2"),
    )
}

# Utility functions
def extract_text_from_file(file):
    """Return the plain text of an uploaded .txt, .pdf or .docx file.

    Args:
        file: The upload. Either a file-like object exposing ``name`` (and,
            for .txt, ``read()``) or a filesystem path. ``None`` means
            nothing was uploaded.

    Returns:
        The extracted text, or ``""`` when ``file`` is ``None`` or its
        extension is not one of txt / pdf / docx.
    """
    if file is None:
        return ""

    # Uploads may arrive as objects carrying ``name`` or as bare paths.
    name = str(getattr(file, "name", file))
    # Lower-case the extension so "REPORT.PDF" is treated like "report.pdf".
    ext = name.rsplit('.', 1)[-1].lower()

    if ext == 'txt':
        if hasattr(file, "read"):
            data = file.read()
        else:
            # Path-style upload: read the bytes from disk ourselves.
            with open(file, "rb") as handle:
                data = handle.read()
        # Text-mode handles already yield str; only bytes need decoding.
        # Undecodable bytes are replaced instead of aborting the request.
        if isinstance(data, bytes):
            return data.decode("utf-8", errors="replace")
        return data

    if ext == 'pdf':
        with pdfplumber.open(file) as pdf:
            # Extract each page once; pages without text yield None/"" and
            # are skipped.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "\n".join(content for content in page_texts if content)

    if ext == 'docx':
        document = docx.Document(file)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    return ""

def fetch_url_text(url):
    """Download the page at *url* via newspaper's ``Article`` and return its parsed text."""
    page = Article(url)
    # parse() works on the downloaded content, so download() must run first.
    page.download()
    page.parse()
    return page.text

def get_keywords(text, n=5):
    """Return the *n* most frequent words of four or more characters.

    Matching is case-insensitive (the text is lower-cased first). The result
    is a single string with the words joined by "; ", most frequent first.
    """
    lowered = text.lower()
    frequency = Counter(
        match.group(0) for match in re.finditer(r'\b\w{4,}\b', lowered)
    )
    top_words = [token for token, _count in frequency.most_common(n)]
    return "; ".join(top_words)

def summarize_text(text, model_name, min_len, max_len, format_type, chunk_size=1024):
    """Summarize *text* piece by piece with the selected pipeline.

    The text is cut into pieces of at most *chunk_size* characters, breaking
    on whitespace so words are not split across pieces (newlines and tabs
    become spaces; only a single word longer than *chunk_size* is hard-split).
    Each piece is summarized independently and the partial summaries are
    joined with single spaces.

    Args:
        text: Text to summarize. ``None``, empty or whitespace-only text
            yields ``""`` without invoking the model.
        model_name: Key into ``summarizers`` selecting the pipeline.
        min_len: Minimum summary length passed to the pipeline as
            ``min_length``; capped at *max_len* so the pair is never inverted.
        max_len: Maximum summary length passed as ``max_length``.
        format_type: ``"Bullet Points"`` puts every sentence of the summary
            on its own line prefixed with a bullet; any other value returns
            the summary unchanged.
        chunk_size: Maximum number of characters per piece.

    Returns:
        The combined summary as a string.
    """
    chunks = textwrap.wrap(text or "", width=chunk_size, break_on_hyphens=False)
    if not chunks:
        return ""

    # UI sliders may deliver floats; the generation arguments must be ints.
    max_len = int(max_len)
    min_len = min(int(min_len), max_len)

    summarizer = summarizers[model_name]
    partial_summaries = [
        summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            # Guard against a chunk that still exceeds the model's input limit.
            truncation=True,
        )[0]['summary_text']
        for chunk in chunks
    ]
    summary = " ".join(partial_summaries)

    if format_type == "Bullet Points":
        # Split after sentence-ending punctuation followed by spaces.
        sentences = re.split(r'(?<=[.!?]) +', summary)
        return "\n".join(f"• {point}" for point in sentences if point.strip())
    return summary

def qa_answers(text, questions, model_name):
    """Answer every non-blank line of *questions* against *text*.

    Args:
        text: Context passage handed to the QA pipeline.
        questions: Questions, one per line. Blank lines are skipped and
            surrounding whitespace is removed from each question.
        model_name: Key into ``qa_models`` selecting the pipeline.

    Returns:
        One ``"<question>: <answer> (score: <x.xx>)"`` line per question,
        joined with newlines; ``""`` when there are no questions.
    """
    # splitlines() also handles "\r\n" pastes; stripping keeps stray
    # whitespace out of both the model input and the echoed question.
    stripped_lines = (line.strip() for line in (questions or "").splitlines())
    prompts = [line for line in stripped_lines if line]
    if not prompts:
        return ""

    model = qa_models[model_name]
    answers = []
    for q in prompts:
        ans = model(question=q, context=text)
        answers.append(f"{q}: {ans['answer']} (score: {ans['score']:.2f})")
    return "\n".join(answers)

def get_metrics(original, summary):
    """Compute size and readability statistics for a summary.

    Returns a dict with the whitespace-delimited word counts of *original*
    and *summary*, the compression rate in percent (0 when *original* has no
    words) and the Flesch reading-ease score of *summary* (0 when *summary*
    is empty).
    """
    source_words = len(original.split())
    summary_words = len(summary.split())

    if source_words:
        compression = round(100 - (summary_words / source_words * 100), 2)
    else:
        compression = 0

    if summary:
        readability = textstat.flesch_reading_ease(summary)
    else:
        readability = 0

    return {
        'Input Word Count': source_words,
        'Summary Word Count': summary_words,
        'Compression Rate (%)': compression,
        'Readability (Flesch)': readability,
    }

# Gradio main function
def process_text(input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions):
    """Gradio callback: summarize the input, extract keywords and answer questions.

    The text source is chosen in this order: the uploaded *file* if one is
    given, otherwise *url* if non-blank, otherwise *input_text*.

    Args:
        input_text: Text pasted by the user.
        file: Uploaded file, or ``None``.
        url: Address of an article to fetch; blank means "not provided".
        summarizer_model: Key into ``summarizers``.
        qa_model: Key into ``qa_models``.
        min_tokens: Minimum summary length forwarded to ``summarize_text``.
        max_tokens: Maximum summary length forwarded to ``summarize_text``.
        format_type: Output format forwarded to ``summarize_text``.
        questions: Newline-separated questions; blank means none.

    Returns:
        A 5-tuple ``(summary, keywords, answers, metrics_text, source_text)``.
        When no usable text is available the tuple is
        ``("No input provided.", "", "", "", "")``.
    """
    # Treat whitespace-only fields as "not provided" so they cannot slip past
    # the guards below and reach the models or the downloader.
    url = (url or "").strip()
    questions = (questions or "").strip()

    if file is not None:
        text = extract_text_from_file(file)
    elif url:
        text = fetch_url_text(url)
    else:
        text = input_text

    if not text or not text.strip():
        return "No input provided.", "", "", "", ""

    summary = summarize_text(text, summarizer_model, min_tokens, max_tokens, format_type)
    keywords = get_keywords(text)
    answers = qa_answers(text, questions, qa_model) if questions else "No questions provided."
    metrics = get_metrics(text, summary)

    metrics_str = f"""

    Input Word Count: {metrics['Input Word Count']}

    Summary Word Count: {metrics['Summary Word Count']}

    Compression Rate: {metrics['Compression Rate (%)']}%

    Readability Score (Flesch): {metrics['Readability (Flesch)']}

    """

    return summary, keywords, answers, metrics_str, text

# Gradio interface: declares the input widgets, the output widgets and the
# button that runs process_text. Components are created inside the Blocks
# context in the order they are declared here.
with gr.Blocks() as demo:
    gr.Markdown("# 📚 Advanced Text Summarizer & Q&A App\nUpload text/file/url, summarize, extract keywords, and ask questions.")

    # Three alternative text sources; process_text uses the file if present,
    # otherwise the URL, otherwise the pasted text.
    with gr.Row():
        input_text = gr.Textbox(label="Paste Text Here", placeholder="Enter text...", lines=6)
        file = gr.File(label="Upload File (.txt, .pdf, .docx)")
        url = gr.Textbox(label="URL", placeholder="https://...")

    # Model pickers; the choices are the keys of the pipeline dicts defined
    # at module level, so the selected value can be used as a dict key.
    with gr.Row():
        summarizer_model = gr.Dropdown(choices=list(summarizers.keys()), value="BART (bart-large-cnn)", label="Summarizer Model")
        qa_model = gr.Dropdown(choices=list(qa_models.keys()), value="DistilBERT QA", label="QA Model")

    # Summary length bounds. The two ranges overlap (5-300 vs 50-1024), so
    # the UI itself does not prevent min from exceeding max.
    with gr.Row():
        min_tokens = gr.Slider(5, 300, value=30, step=1, label="Min Tokens")
        max_tokens = gr.Slider(50, 1024, value=120, step=1, label="Max Tokens")

    format_type = gr.Radio(choices=['Paragraph', 'Bullet Points'], value='Paragraph', label="Output Format")
    questions = gr.Textbox(label="Questions (one per line)", placeholder="Type questions...", lines=3)

    process_btn = gr.Button("Process")

    # Output widgets, one per element of process_text's 5-tuple result.
    summary_out = gr.Textbox(label="Summarized Text", lines=6)
    keywords_out = gr.Textbox(label="Top Keywords")
    answers_out = gr.Textbox(label="QA Answers", lines=4)
    metrics_out = gr.Textbox(label="Metrics")
    original_out = gr.Textbox(label="Original Text", lines=6)

    # The inputs list follows process_text's parameter order and the outputs
    # list follows the order of its returned tuple.
    process_btn.click(
        fn=process_text,
        inputs=[input_text, file, url, summarizer_model, qa_model, min_tokens, max_tokens, format_type, questions],
        outputs=[summary_out, keywords_out, answers_out, metrics_out, original_out]
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()