File size: 8,744 Bytes
af464a0
35287b3
af464a0
35287b3
af464a0
 
35287b3
 
af464a0
 
 
 
35287b3
 
 
 
 
 
 
af464a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf84551
 
efe5ee8
 
bf84551
 
 
97a3fab
bf84551
97a3fab
bf84551
97a3fab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af464a0
6d55c1a
 
af464a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abc0274
 
af464a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

# https://huggingface.co/spaces/Mi-Ni/PDFtoAudio

# Unfortunately I wasn't able to create a running Space. I couldn't adapt my code into a working app for a Hugging Face Space: after solving a lot of issues I ended up with a problem I wasn't able to solve. Nevertheless, you'll find my code below. Sorry about that.


# Imports used here; for the other imports and modules, see the requirements.

import gradio as gr

import numpy as np





# Here is the code.


# Create a function to extract text

def text_extraction(element):
    """Return the text of a layout element together with the formats used in it.

    The result is a ``(text, formats)`` tuple. ``text`` is whatever
    ``element.get_text()`` returns. ``formats`` is a de-duplicated list, in
    arbitrary order, that mixes the font names and the font sizes of every
    ``LTChar`` found inside the ``LTTextContainer`` children of ``element``.
    """
    text = element.get_text()

    def char_attributes():
        # Walk children -> characters, yielding font name then font size.
        for child in element:
            if not isinstance(child, LTTextContainer):
                continue
            for glyph in child:
                if isinstance(glyph, LTChar):
                    yield glyph.fontname
                    yield glyph.size

    # Collapse repeated names/sizes; a set does not keep any particular order.
    seen_formats = list(char_attributes())
    unique_formats = list(set(seen_formats))

    return (text, unique_formats)

def read_pdf(pdf_path):
    """Extract the text of every page of a PDF, in top-to-bottom order.

    Args:
        pdf_path: Either a filesystem path to the PDF, or an object that
            exposes that path through a ``name`` attribute (such as an
            uploaded-file wrapper).

    Returns:
        A dict keyed ``'Page_<n>'`` (``n`` starts at 0, in page order). Each
        value is the list
        ``[page_text, line_format, text_from_images, text_from_tables,
        page_content]`` where

        * ``page_text`` holds the text of each text element on the page,
        * ``line_format`` holds, per text element, the list of font names
          and sizes returned by ``text_extraction``,
        * ``text_from_images`` and ``text_from_tables`` are always empty
          lists; they are kept so the shape of the value does not change,
        * ``page_content`` holds the same strings as ``page_text``.
    """
    # Accept both a plain path and a wrapper object carrying the path in
    # `.name`, and hand the same path to every consumer.
    path = getattr(pdf_path, "name", pdf_path)

    text_per_page = {}
    for pagenum, page in enumerate(extract_pages(path)):
        print("Elaborating Page_" + str(pagenum))
        page_text = []
        line_format = []
        page_content = []

        # Order the layout elements by their top edge, highest first, so the
        # text is collected in reading order. The sort is stable, so elements
        # sharing the same y1 keep their original relative order.
        ordered_elements = sorted(
            page, key=lambda element: element.y1, reverse=True
        )

        for element in ordered_elements:
            # Only text elements contribute; everything else is ignored.
            if not isinstance(element, LTTextContainer):
                continue
            line_text, format_per_line = text_extraction(element)
            page_text.append(line_text)
            line_format.append(format_per_line)
            page_content.append(line_text)

        text_per_page['Page_' + str(pagenum)] = [
            page_text,
            line_format,
            [],  # text_from_images: nothing is extracted from images
            [],  # text_from_tables: nothing is extracted from tables
            page_content,
        ]

    return text_per_page

# Module-level pipeline: these statements run once, when the file is executed
# or imported. They read a PDF, pull out the abstract, summarise it and
# synthesise speech.
# NOTE(review): the same sequence of steps is repeated inside PDF_abstract
# further down; presumably this top-level copy is left over from a notebook --
# confirm whether it is still needed.

#pdf_path = 'Article 11 Hidden Technical Debt in Machine Learning Systems'
# NOTE(review): a freshly constructed gr.File() is handed to read_pdf here,
# not an uploaded file -- confirm this is intended.
pdf_path = gr.File()

text_per_page = read_pdf(pdf_path)
# The result of .keys() is discarded, so this line has no effect here.
text_per_page.keys()
# Pages are looked up by fixed keys Page_0..Page_8; a PDF that produced fewer
# than nine pages raises KeyError on the first missing key.
page_0 = text_per_page['Page_0']
page_1 = text_per_page['Page_1']
page_2 = text_per_page['Page_2']
page_3 = text_per_page['Page_3']
page_4 = text_per_page['Page_4']
page_5 = text_per_page['Page_5']
page_6 = text_per_page['Page_6']
page_7 = text_per_page['Page_7']
page_8 = text_per_page['Page_8']
# Each page value is a list of five lists (see read_pdf), so `+` concatenates
# them into one long list of lists.
page_all = page_0 + page_1 +page_2 + page_3 +page_4 + page_5 +page_6 + page_7 + page_8

# Flatten the nested lists by one level. The items are then a mix of text
# strings and per-line format lists.
flattened_page_all = list(chain.from_iterable(page_all))

# Convert the flattened list to a string; str() is applied to every item, so
# the format lists end up in the string as their printed representation.
page_all_string = ''.join(map(str, flattened_page_all))

# Use a regular expression to capture (lazily) everything after "Abstract\n"
# up to, but not including, digits + newline + "Introduction".
# re.DOTALL lets "." match newlines as well.
match = re.search(r'Abstract\n(.*?)(?=\d+\nIntroduction)', page_all_string, re.DOTALL)

# Check if a match is found
if match:
    abstract_text = match.group(1)
    #print(abstract_text)
else:
    # NOTE(review): abstract_text is never assigned on this path, so the
    # summarizer call below raises NameError after this message is printed.
    print("Abstract not found.")

# Initialize summarization pipeline
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")

# Get the summary
summary_result = summarizer(abstract_text, max_length=100, min_length=30, do_sample=False)

# Extract the summary text from the result
summary_text = summary_result[0]['summary_text']

# Replace the first ". " (the dot between two sentences) with " and "
merged_summary = summary_text.replace('. ', ' and ', 1)

# Find the index of the first "and" in the merged summary.
# NOTE(review): find() returns the first occurrence of the substring anywhere
# in the string, which is not necessarily the " and " inserted above.
and_index = merged_summary.find('and')

# Lowercase the character four positions after the start of "and"
# (i.e. after "and" plus one more character), if that position exists.
if and_index != -1 and and_index + 4 < len(merged_summary):
    merged_summary = merged_summary[:and_index + 4] + merged_summary[and_index + 4].lower() + merged_summary[and_index + 5:]

# Print the merged summary
#print(merged_summary)
# NOTE(review): the literal "A" is what gets synthesised below; merged_summary
# is computed above but not used afterwards in this block.
merged_summary_1 = "A"
synthesiser = pipeline("text-to-speech", "suno/bark")
speech = synthesiser(merged_summary_1, forward_params={"do_sample": True})
# The Audio object is created but not assigned or returned.
# NOTE(review): presumably IPython's notebook Audio display -- confirm.
Audio(speech["audio"], rate=speech["sampling_rate"])

# Loaded pipelines, keyed by (task, model name), so that each model is loaded
# once instead of on every call of PDF_abstract.
_PIPELINE_CACHE = {}


def _get_pipeline(task, model):
    """Return the pipeline for ``(task, model)``, loading it on first use."""
    key = (task, model)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task, model=model)
    return _PIPELINE_CACHE[key]


def _join_first_two_sentences(summary_text):
    """Join the first two sentences of ``summary_text`` with ' and '."""
    head, separator, tail = summary_text.partition('. ')
    if not separator or not tail:
        # Zero or one sentence: there is nothing to join.
        return summary_text
    # Lowercase the first letter of the second sentence, which now follows
    # "and" in the middle of a sentence.
    return head + ' and ' + tail[0].lower() + tail[1:]


def PDF_abstract(audio):
    """Turn the abstract of an uploaded PDF into a spoken summary.

    Args:
        audio: The value of the interface's ``"file"`` input, i.e. the
            uploaded PDF (the parameter name is kept for compatibility).
            It is passed unchanged to ``read_pdf``. ``None`` means that no
            file is selected.

    Returns:
        ``None`` when ``audio`` is ``None``; otherwise a
        ``(sampling_rate, samples)`` tuple for the ``"audio"`` output, where
        ``samples`` is the synthesised waveform with length-one axes removed.

    Raises:
        gr.Error: If no abstract can be located in the PDF text.
    """
    if audio is None:
        return None

    text_per_page = read_pdf(audio)

    # Use only the extracted text (first entry of every page value), for
    # however many pages the PDF has, in page order.
    page_all_string = ''.join(
        ''.join(page[0]) for page in text_per_page.values()
    )

    # Capture everything after "Abstract\n" up to, but not including,
    # digits + newline + "Introduction". DOTALL lets "." span newlines.
    match = re.search(r'Abstract\n(.*?)(?=\d+\nIntroduction)', page_all_string, re.DOTALL)
    if match is None:
        raise gr.Error("Abstract not found.")
    abstract_text = match.group(1)

    summarizer = _get_pipeline("summarization", "knkarthick/MEETING_SUMMARY")
    summary_result = summarizer(abstract_text, max_length=100, min_length=30, do_sample=False)
    summary_text = summary_result[0]['summary_text']
    merged_summary = _join_first_two_sentences(summary_text)

    synthesiser = _get_pipeline("text-to-speech", "suno/bark")
    speech = synthesiser(merged_summary, forward_params={"do_sample": True})

    # Hand the waveform back as (sample rate, numpy array); squeeze drops a
    # leading batch axis if the pipeline returned one.
    return (speech["sampling_rate"], np.squeeze(speech["audio"]))

# Wire the UI together: a file input is passed to PDF_abstract and its return
# value is shown through an audio output, with live mode switched on.
demo = gr.Interface(fn=PDF_abstract, inputs="file", outputs="audio", live=True)

# Start the app.
demo.launch()