# NOTE(review): scraper residue (Space status banner, file size, commit
# hashes and a column of line numbers) was captured here along with the
# code; commented out so the module is syntactically valid Python.
# https://huggingface.co/spaces/Mi-Ni/PDFtoAudio
# Unfortunately I wasn't able to create a running Space: I couldn't adapt my code to run as a Hugging Face Space app. After solving a lot of issues I ended up with a problem I wasn't able to solve. Nevertheless, you'll find my code below. Sorry about that.
# Imports below; further modules are listed in requirements.txt
import gradio as gr
import numpy as np
#here is the code
# Create a function to extract text
def text_extraction(element):
    """Return ``(text, formats)`` for an in-line layout text element.

    ``text`` is the raw text of the element; ``formats`` is the
    de-duplicated list of font names and font sizes of every character
    found on the element's text lines.
    """
    # Raw text of the whole in-line element.
    extracted_text = element.get_text()

    # Collect the font name and size of every character we can see.
    seen_formats = []
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                seen_formats.append(character.fontname)
                seen_formats.append(character.size)

    # De-duplicate the formats observed on this line.
    return (extracted_text, list(set(seen_formats)))
def read_pdf(pdf_path):
    """Extract the text content of every page of a PDF.

    Parameters
    ----------
    pdf_path : str or file-like
        Either a filesystem path, or an object with a ``.name``
        attribute (e.g. the value produced by ``gr.File``).

    Returns
    -------
    dict
        Maps ``'Page_<n>'`` to ``[page_text, line_format,
        text_from_images, text_from_tables, page_content]``.
        The image/table slots are kept for schema compatibility but are
        never populated by this function.
    """
    # Accept both plain paths and gr.File-style objects (the original
    # mixed `pdf_path.name` and `pdf_path` inconsistently).
    path = getattr(pdf_path, "name", pdf_path)

    text_per_page = {}
    # Open pdfplumber ONCE, outside the page loop, and close it
    # deterministically (the original reopened it for every page and
    # never closed it).  The unused PyPDF2 reader was removed.
    with pdfplumber.open(path) as pdf:
        for pagenum, page in enumerate(extract_pages(path)):
            print("Elaborating Page_" + str(pagenum))
            page_text = []
            line_format = []
            text_from_images = []   # never populated here
            text_from_tables = []   # never populated here
            page_content = []
            # Flag meant to suppress text already captured from a table;
            # nothing ever sets it to True in the visible code.
            table_extraction_flag = False

            # Tables found on the examined page (currently unused beyond
            # detection; kept to preserve the original's behavior).
            page_tables = pdf.pages[pagenum]
            tables = page_tables.find_tables()

            # Sort the layout elements top-to-bottom, as they appear.
            page_elements = [(element.y1, element) for element in page._objs]
            page_elements.sort(key=lambda a: a[0], reverse=True)

            for _pos, element in page_elements:
                # Only in-line text elements contribute to the page text.
                if isinstance(element, LTTextContainer) and not table_extraction_flag:
                    (line_text, format_per_line) = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)

            text_per_page['Page_' + str(pagenum)] = [
                page_text, line_format, text_from_images,
                text_from_tables, page_content,
            ]
    return text_per_page
#pdf_path = 'Article 11 Hidden Technical Debt in Machine Learning Systems'
# --- One-shot exploratory pipeline (previously ran at import time) ---
# NOTE(review): these statements originally executed at module import
# with ``pdf_path = gr.File()``.  ``gr.File()`` is only a UI component
# with no actual file behind it, so ``read_pdf`` raised at import and
# the Space crashed before launching ("Runtime error").  The same steps
# are preserved below as a function that must be called explicitly.
def run_extraction_demo(pdf_path):
    """Summarize the abstract of a 9-page PDF and synthesize speech.

    Parameters
    ----------
    pdf_path : str or file-like
        Path (or gr.File value) of the PDF article to process.

    Returns
    -------
    IPython.display.Audio or None
        Playable audio, or None when no ``Abstract ... Introduction``
        section could be located.
    """
    text_per_page = read_pdf(pdf_path)
    # Concatenate the per-page content lists of the nine pages this
    # article is known to have.
    page_all = []
    for i in range(9):
        page_all += text_per_page['Page_' + str(i)]
    # Flatten the nested lists and join everything into one string.
    flattened_page_all = list(chain.from_iterable(page_all))
    page_all_string = ''.join(map(str, flattened_page_all))
    # Grab the text between 'Abstract' and the 'Introduction' heading.
    match = re.search(r'Abstract\n(.*?)(?=\d+\nIntroduction)', page_all_string, re.DOTALL)
    if not match:
        # The original printed a message and then crashed with a
        # NameError on abstract_text; bail out cleanly instead.
        print("Abstract not found.")
        return None
    abstract_text = match.group(1)
    # Summarize the abstract.
    summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
    summary_result = summarizer(abstract_text, max_length=100, min_length=30, do_sample=False)
    summary_text = summary_result[0]['summary_text']
    # Join the first two sentences with ' and ' and lowercase the word
    # that follows the inserted 'and'.
    merged_summary = summary_text.replace('. ', ' and ', 1)
    and_index = merged_summary.find('and')
    if and_index != -1 and and_index + 4 < len(merged_summary):
        merged_summary = (merged_summary[:and_index + 4]
                          + merged_summary[and_index + 4].lower()
                          + merged_summary[and_index + 5:])
    # NOTE(review): the original stubbed the spoken text to "A",
    # presumably to keep Bark fast while debugging -- switch to
    # merged_summary to speak the real summary.
    merged_summary_1 = "A"
    synthesiser = pipeline("text-to-speech", "suno/bark")
    speech = synthesiser(merged_summary_1, forward_params={"do_sample": True})
    return Audio(speech["audio"], rate=speech["sampling_rate"])
def PDF_abstract(audio):
    """Gradio handler: summarize a PDF's abstract and return spoken audio.

    Parameters
    ----------
    audio : gr.File value or None
        The uploaded PDF (gradio routes the "file" input here despite
        the parameter's name, which is kept for interface stability).

    Returns
    -------
    tuple or None
        ``(sampling_rate, waveform)`` as expected by a gradio "audio"
        output component, or None when no abstract is found.
    """
    # Use the uploaded file when present; fall back to the article the
    # demo was originally developed against (the original ignored the
    # upload entirely and always used the hard-coded path).
    pdf_path = audio if audio is not None else 'Article 11 Hidden Technical Debt in Machine Learning Systems'
    text_per_page = read_pdf(pdf_path)
    # Concatenate the per-page content lists of the nine known pages.
    page_all = []
    for i in range(9):
        page_all += text_per_page['Page_' + str(i)]
    # Flatten the nested lists and join everything into one string.
    flattened_page_all = list(chain.from_iterable(page_all))
    page_all_string = ''.join(map(str, flattened_page_all))
    # Grab the text between 'Abstract' and the 'Introduction' heading.
    match = re.search(r'Abstract\n(.*?)(?=\d+\nIntroduction)', page_all_string, re.DOTALL)
    if not match:
        # Original printed and then crashed with a NameError on
        # abstract_text; return cleanly instead.
        print("Abstract not found.")
        return None
    abstract_text = match.group(1)
    # Summarize the abstract.
    summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
    summary_result = summarizer(abstract_text, max_length=100, min_length=30, do_sample=False)
    summary_text = summary_result[0]['summary_text']
    # Join the first two sentences with ' and ' and lowercase the word
    # that follows the inserted 'and'.
    merged_summary = summary_text.replace('. ', ' and ', 1)
    and_index = merged_summary.find('and')
    if and_index != -1 and and_index + 4 < len(merged_summary):
        merged_summary = (merged_summary[:and_index + 4]
                          + merged_summary[and_index + 4].lower()
                          + merged_summary[and_index + 5:])
    # NOTE(review): the original stubbed the spoken text to "A",
    # presumably to keep Bark fast while debugging -- switch to
    # merged_summary to speak the real summary.
    merged_summary_1 = "A"
    synthesiser = pipeline("text-to-speech", "suno/bark")
    speech = synthesiser(merged_summary_1, forward_params={"do_sample": True})
    # A gradio "audio" output accepts (sample_rate, numpy_waveform).
    # The original `return PDF_abstract()` was an infinite recursion
    # that also threw away the synthesized speech; BytesIO(np.ndarray)
    # was likewise invalid.
    return (speech["sampling_rate"], np.squeeze(speech["audio"]))
# Wire the handler into a simple Gradio app: upload a PDF, get audio back.
demo = gr.Interface(
    fn=PDF_abstract,
    inputs="file",
    outputs="audio",
    live=True,
)
demo.launch()