mrsk1883 committed on
Commit
e2641a0
·
1 Parent(s): 425b6b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -34
app.py CHANGED
@@ -1,63 +1,49 @@
 
 
1
  from PyPDF2 import PdfReader
2
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
  from gtts import gTTS
4
- from IPython.display import Audio
5
 
6
- # Download the model and tokenizer
7
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
8
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
 
11
def summarize_pdf_abstract(pdf_path):
    """
    Summarize the abstract of a PDF document.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is passed, in full, to the summarization model; the
    text is not trimmed down to the abstract section itself.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A string containing the model-generated summary. If no page
        contains either keyword, the model is run on an empty string.
    """
    abstract_text = ""

    # Read pages inside the context manager so the file handle is released
    # even if text extraction raises.
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # Extract once per page instead of up to three times.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

    # Encode the page text, generate the summary, and decode it to a string.
    inputs = tokenizer(abstract_text, return_tensors="pt")
    outputs = model.generate(**inputs)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return summary
44
-
45
# Notebook driver: summarize one paper, print the result, and voice it.
pdf_path = "/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"
summary = summarize_pdf_abstract(pdf_path)

# Heading and summary each go on their own line.
print("One-sentence summary of the abstract:", summary, sep="\n")

# Language code handed to gTTS for the spoken version of the summary.
language = "en"

# Synthesize the speech and write it next to the notebook.
speech = gTTS(text=summary, lang=language)
speech.save("summary.mp3")

# Kept as the final expression so the notebook renders the audio player.
Audio("summary.mp3")
 
1
+ import gradio as gr
2
+
3
  from PyPDF2 import PdfReader
4
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
5
  from gtts import gTTS
6
+ from io import BytesIO
7
 
8
+ # Define model and tokenizer
9
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
10
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
11
  tokenizer = AutoTokenizer.from_pretrained(model_name)
12
 
13
def summarize_pdf_abstract(pdf_data):
    """
    Summarize the abstract of an uploaded PDF and synthesize it as speech.

    The first page whose extracted text contains "Abstract" or
    "Introduction" is passed, in full, to the summarization model; the
    resulting summary is then converted to speech with gTTS.

    Args:
        pdf_data: A byte string containing the PDF data. A filesystem path,
            or an object exposing its path as ``.name`` (what ``gr.File``
            passes when it is not configured with ``type="binary"``), is
            also accepted.

    Returns:
        A ``(summary, audio_path)`` tuple, ordered to match the two output
        components of the Gradio interface: the summary text and the path
        of an MP3 file containing the spoken summary.

    Raises:
        gr.Error: If no file was provided, or if no page contains
            "Abstract" or "Introduction".
    """
    import tempfile  # local import: only this function needs it

    if pdf_data is None:
        raise gr.Error("Please upload a PDF file.")

    # PdfReader accepts either a binary stream or a path, so raw bytes are
    # wrapped and anything else is reduced to its path.
    if isinstance(pdf_data, (bytes, bytearray)):
        source = BytesIO(pdf_data)
    else:
        source = getattr(pdf_data, "name", pdf_data)
    reader = PdfReader(source)

    abstract_text = ""
    for page in reader.pages:
        # Extract once per page instead of up to three times.
        page_text = page.extract_text() or ""
        if "Abstract" in page_text or "Introduction" in page_text:
            abstract_text = page_text
            break

    if not abstract_text:
        # Summarizing an empty string would only produce meaningless output.
        raise gr.Error("No page containing 'Abstract' or 'Introduction' was found.")

    inputs = tokenizer(abstract_text, return_tensors="pt")
    outputs = model.generate(**inputs)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # gTTS produces MP3 data and has no get_wav_data(); write the audio to a
    # temporary file and hand its path to the gr.Audio output component.
    # delete=False keeps the file on disk after the handle is closed so it
    # can still be read once this function returns.
    speech = gTTS(summary, lang="en")
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        speech.write_to_fp(audio_file)

    # gr.Interface maps a tuple positionally onto its output components; a
    # dict keyed by strings is not a valid return value for it.
    return summary, audio_file.name
 
40
 
41
# Create Gradio interface
interface = gr.Interface(
    fn=summarize_pdf_abstract,
    # type="binary" hands the callback the raw bytes its docstring expects.
    # file_types is the supported way to restrict uploads; gr.File has no
    # "mimetypes" argument.
    inputs=[gr.File(label="Upload PDF", file_types=[".pdf"], type="binary")],
    outputs=[gr.Text(label="One-sentence summary"), gr.Audio(label="Summary audio")],
    # The title is an Interface option; launch() does not accept it.
    title="PDF Abstract Summarizer",
)

# Launch the Hugging Face Space
interface.launch()