|
import gradio as gr |
|
from gtts import gTTS |
|
import pdfplumber |
|
from sumy.parsers.plaintext import PlaintextParser |
|
from sumy.nlp.tokenizers import Tokenizer |
|
from sumy.summarizers.lsa import LsaSummarizer |
|
import nltk |
|
import os |
|
|
|
|
|
# Download the NLTK tokenizer data at import time (presumably required by the
# sumy English tokenizer used further down -- confirm against sumy's docs).
# Any exception is reported on stdout and does not stop the module loading.
try:
    for _resource in ('punkt', 'punkt_tab'):
        nltk.download(_resource)
except Exception as exc:
    print(f"Error downloading NLTK data: {str(exc)}")
|
|
|
def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page of a PDF file using pdfplumber.

    Failures are reported through the return value instead of being raised,
    so the result can be shown to the user as-is.

    Args:
        pdf_file: Uploaded PDF file; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The page texts joined by single spaces with surrounding
            whitespace stripped; "No text could be extracted from the PDF."
            when no page yields any non-whitespace text; or a message
            starting with "Error extracting text:" if anything raises.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # Read all pages while the document is still open.
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages whose extraction result is falsy (None or "") are skipped.
        text = " ".join(chunk for chunk in page_texts if chunk).strip()
        # Test for emptiness *after* stripping so whitespace-only pages give
        # the explanatory message rather than an empty string.
        return text if text else "No text could be extracted from the PDF."
    except Exception as exc:
        return f"Error extracting text: {exc}"
|
|
|
def summarize_text(text, sentences_count=12, sentences_per_paragraph=3):
    """
    Summarize text with the sumy LSA summarizer and group it into paragraphs.

    Failures are reported through the return value instead of being raised.

    Args:
        text (str): Text to summarize.
        sentences_count (int): Number of sentences requested from the
            summarizer (with the defaults, approx. four paragraphs).
        sentences_per_paragraph (int): Number of summary sentences placed in
            each paragraph; must be at least 1.

    Returns:
        str: Paragraphs separated by a blank line, or one of the messages
            "Text is too short to summarize." (fewer than 50
            whitespace-separated words), "No summary generated." (the
            summarizer produced nothing), or a message starting with
            "Error summarizing text:" if anything raises.
    """
    try:
        if len(text.split()) < 50:
            return "Text is too short to summarize."
        if sentences_per_paragraph < 1:
            raise ValueError("sentences_per_paragraph must be at least 1")

        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        summarizer = LsaSummarizer()
        sentences = [
            str(sentence)
            for sentence in summarizer(parser.document, sentences_count)
        ]

        # Slice the sentences into fixed-size groups; joining avoids the
        # trailing space that per-sentence concatenation leaves before each
        # paragraph break.
        paragraphs = [
            " ".join(sentences[start:start + sentences_per_paragraph])
            for start in range(0, len(sentences), sentences_per_paragraph)
        ]
        summary_text = "\n\n".join(paragraphs).strip()
        return summary_text if summary_text else "No summary generated."
    except Exception as exc:
        return f"Error summarizing text: {exc}"
|
|
|
def pdf_to_speech(pdf_file, lang="en"):
    """
    Convert text from a PDF to summarized speech using gTTS.

    The PDF text is extracted, summarized with ``sentences_count=12`` and the
    summary is saved as speech to "output.mp3".

    Args:
        pdf_file: Uploaded PDF file, or None when nothing was uploaded.
        lang (str): Language code passed to gTTS (default is 'en' for English).

    Returns:
        tuple: (path to the audio file, summarized text) on success, or
            (None, message) when no file was given, extraction or
            summarization reported a failure, or an exception was raised.
    """
    if pdf_file is None:
        return None, "Please upload a PDF file."

    # extract_text_from_pdf and summarize_text report failures as message
    # strings. Match those exact messages by prefix: a plain substring test
    # such as `"Error" in text` would reject any document that merely
    # contains the word "Error".
    extraction_failures = (
        "Error extracting text:",
        "No text could be extracted from the PDF.",
    )
    summary_failures = (
        "Error summarizing text:",
        "Text is too short to summarize.",
        "No summary generated.",
    )

    try:
        text = extract_text_from_pdf(pdf_file)
        if text.startswith(extraction_failures):
            return None, text

        summarized_text = summarize_text(text, sentences_count=12)
        if summarized_text.startswith(summary_failures):
            return None, summarized_text

        tts = gTTS(text=summarized_text, lang=lang, slow=False)

        # Fixed relative file name: each call saves to the same path.
        output_file = "output.mp3"
        tts.save(output_file)

        return output_file, summarized_text
    except Exception as exc:
        return None, f"An error occurred: {exc}"
|
|
|
|
|
# Input widgets: the PDF upload and the speech-language selector.
_pdf_upload = gr.File(label="Upload a PDF file", file_types=[".pdf"])
_language_choice = gr.Dropdown(
    choices=["en", "es", "fr"],
    label="Select Language",
    value="en",
)

# Output widgets: the generated audio and the summary it was made from.
_speech_output = gr.Audio(label="Generated Speech")
_summary_output = gr.Textbox(label="Summarized Text")

# Gradio UI that wires the widgets above to pdf_to_speech.
demo = gr.Interface(
    fn=pdf_to_speech,
    inputs=[_pdf_upload, _language_choice],
    outputs=[_speech_output, _summary_output],
    title="PDF Summary to Speech",
    description=(
        "Upload an English PDF file, select a language, and generate speech "
        "from a summarized version (approx. 4 paragraphs). "
        "The summarized text is also displayed."
    ),
)


# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()