testingspace / app.py
mrsk1883's picture
Update app.py
e2641a0
raw
history blame
1.56 kB
import tempfile
from io import BytesIO

import gradio as gr
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Summarization checkpoint; the tokenizer and the seq2seq model are both
# loaded once, at import time, and shared by every request.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def summarize_pdf_abstract(pdf_data):
    """Summarize the abstract page of a PDF and synthesize the summary as speech.

    The first page whose text contains "Abstract" or "Introduction" is passed
    to the module-level summarization model. If no page matches, the first
    page with any extractable text is used instead.

    Args:
        pdf_data: The uploaded PDF as raw bytes, as a filesystem path, or as a
            file-like wrapper that exposes its path via ``.name``.

    Returns:
        A ``(summary, audio_path)`` tuple, one value per output component of
        the interface. ``audio_path`` is the path of a temporary MP3 file, or
        ``None`` when the model produced an empty summary.

    Raises:
        ValueError: If no file was provided, or the PDF contains no
            extractable text.
    """
    if pdf_data is None:
        raise ValueError("No PDF file was provided.")

    # PdfReader accepts either a path or a binary stream, so only raw bytes
    # need wrapping; upload wrappers are reduced to their on-disk path.
    if isinstance(pdf_data, (bytes, bytearray)):
        source = BytesIO(pdf_data)
    else:
        source = getattr(pdf_data, "name", pdf_data)
    reader = PdfReader(source)

    abstract_text = ""
    first_text = ""
    for page in reader.pages:
        # Extract once per page; text extraction is the expensive step.
        page_text = page.extract_text() or ""
        if not first_text:
            first_text = page_text
        if "Abstract" in page_text or "Introduction" in page_text:
            abstract_text = page_text
            break

    abstract_text = abstract_text or first_text
    if not abstract_text.strip():
        raise ValueError("No extractable text was found in the PDF.")

    # Truncate to the tokenizer's configured maximum so an unusually long page
    # cannot exceed the model's input limit.
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    if not summary:
        # gTTS refuses empty text, so skip speech synthesis entirely.
        return summary, None

    # gTTS produces MP3 data; write it to a file that outlives this call so
    # the audio component can serve it by path.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        gTTS(summary, lang="en").write_to_fp(audio_file)
    return summary, audio_file.name
# Build the Gradio UI: one PDF upload in, summary text plus spoken audio out.
interface = gr.Interface(
    fn=summarize_pdf_abstract,
    # `file_types` is the gr.File option that restricts uploads by extension;
    # type="binary" hands the callback the raw PDF bytes it is documented to take.
    inputs=[gr.File(label="Upload PDF", file_types=[".pdf"], type="binary")],
    outputs=[
        gr.Text(label="One-sentence summary"),
        gr.Audio(label="Summary audio"),
    ],
    # The page title is an Interface option; launch() does not accept one.
    title="PDF Abstract Summarizer",
)

# Launch the Hugging Face Space
interface.launch()