"""Gradio app that summarizes an uploaded PDF and reads the summary aloud."""

import os
import tempfile

import gradio as gr
import nltk
import pdfplumber
from gtts import gTTS
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer

# Download NLTK data for sumy.  Best effort: a failure is only reported so
# that the app can still start.
try:
    nltk.download('punkt')
    nltk.download('punkt_tab')
except Exception as e:
    print(f"Error downloading NLTK data: {str(e)}")

# Texts with fewer words than this are not summarized.
_MIN_WORDS = 50
# Number of summary sentences grouped into one paragraph.
_SENTENCES_PER_PARAGRAPH = 3

_NO_FILE_MESSAGE = "Please upload a PDF file."
_NO_TEXT_MESSAGE = "No text could be extracted from the PDF."
_TOO_SHORT_MESSAGE = "Text is too short to summarize."
_NO_SUMMARY_MESSAGE = "No summary generated."


def _read_pdf_text(pdf_file):
    """Return the text of all pages joined by single spaces.

    Unlike extract_text_from_pdf, failures propagate as exceptions and an
    empty string means that no page yielded any text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages without extractable text yield None/"" and are skipped.
        return " ".join(chunk for chunk in page_texts if chunk).strip()


def _summarize(text, sentences_count):
    """Return an LSA summary of *text* formatted as blank-line separated paragraphs.

    Failures propagate as exceptions; an empty string means the summarizer
    produced no sentences.
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    sentences = [str(s) for s in summarizer(parser.document, sentences_count)]
    paragraphs = [
        " ".join(sentences[start:start + _SENTENCES_PER_PARAGRAPH])
        for start in range(0, len(sentences), _SENTENCES_PER_PARAGRAPH)
    ]
    return "\n\n".join(paragraphs)


def extract_text_from_pdf(pdf_file):
    """
    Extract text from a PDF file using pdfplumber.

    Args:
        pdf_file: Uploaded PDF file; passed unchanged to pdfplumber.open.

    Returns:
        str: Extracted text, a "no text" notice, or an error message.
    """
    try:
        text = _read_pdf_text(pdf_file)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
    return text or _NO_TEXT_MESSAGE


def summarize_text(text, sentences_count=12):
    """
    Summarize text to approximately four paragraphs using sumy LSA summarizer.

    Args:
        text (str): Text to summarize.
        sentences_count (int): Number of sentences in summary
            (approx. 3 sentences per paragraph).

    Returns:
        str: Summarized text or error message.
    """
    try:
        if len(text.split()) < _MIN_WORDS:
            return _TOO_SHORT_MESSAGE
        summary = _summarize(text, sentences_count)
    except Exception as e:
        return f"Error summarizing text: {str(e)}"
    return summary or _NO_SUMMARY_MESSAGE


def pdf_to_speech(pdf_file, lang="en"):
    """
    Convert text from a PDF to summarized speech using gTTS.

    Args:
        pdf_file: Uploaded PDF file, or None when nothing was uploaded.
        lang (str): Language code (default is 'en' for English).

    Returns:
        tuple: (Path to audio file or None, summarized text or error message).
    """
    if pdf_file is None:
        return None, _NO_FILE_MESSAGE

    # Failures are detected through exceptions and empty results rather than
    # by searching the returned strings for words like "Error": the document
    # itself may legitimately contain those words.
    try:
        text = _read_pdf_text(pdf_file)
    except Exception as e:
        return None, f"Error extracting text: {str(e)}"
    if not text:
        return None, _NO_TEXT_MESSAGE
    if len(text.split()) < _MIN_WORDS:
        return None, _TOO_SHORT_MESSAGE

    try:
        summarized_text = _summarize(text, sentences_count=12)
    except Exception as e:
        return None, f"Error summarizing text: {str(e)}"
    if not summarized_text:
        return None, _NO_SUMMARY_MESSAGE

    output_file = None
    try:
        tts = gTTS(text=summarized_text, lang=lang, slow=False)
        # One unique file per request: a fixed name such as "output.mp3"
        # would be overwritten whenever two requests overlap.
        fd, output_file = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)  # gTTS reopens the path itself when saving
        tts.save(output_file)
    except Exception as e:
        # Do not leave an empty or partial audio file behind.
        if output_file is not None and os.path.exists(output_file):
            os.remove(output_file)
        return None, f"An error occurred: {str(e)}"
    return output_file, summarized_text


# Define Gradio interface
demo = gr.Interface(
    fn=pdf_to_speech,
    inputs=[
        gr.File(label="Upload a PDF file", file_types=[".pdf"]),
        gr.Dropdown(choices=["en", "es", "fr"], label="Select Language", value="en"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Textbox(label="Summarized Text"),
    ],
    title="PDF Summary to Speech",
    description="Upload an English PDF file, select a language, and generate speech from a summarized version (approx. 4 paragraphs). The summarized text is also displayed.",
)

# Launch the app
if __name__ == "__main__":
    demo.launch()