# Spaces: Runtime error
import os | |
from pdf2image import convert_from_path | |
import pytesseract | |
from PIL import Image | |
import numpy as np | |
import pandas as pd | |
import gradio as gr | |
from typing import Tuple | |
class TableExtracter:
    """OCR a PDF page and report the newest entry of its 'Planindexliste' table.

    Pipeline (see ``main``): PDF -> one PNG per page -> crop -> OCR ->
    DataFrame -> locate the 'Planindexliste' and 'Index'/'INDEX' rows ->
    return the row with the highest single-character label next to the
    'Index' row.
    """

    # Single-character labels recognised as index entries in column 0.
    # NOTE(review): the "above" search only accepts a-k while the "below"
    # search accepts a-z; kept as in the original, confirm whether intended.
    _ALPHA_LABELS_ABOVE = "abcdefghijk"
    _ALPHA_LABELS_BELOW = "abcdefghijklmnopqrstuvwxyz"
    _DIGIT_LABELS = "0123456789"
    # Number of rows next to the 'Index' row that are searched for labels.
    _WINDOW_ROWS = 6

    def __init__(self):
        # Directory that receives one PNG per converted PDF page.
        self.image_output_dir = 'output_images'

    @staticmethod
    def _resolve_pdf_path(pdf_file) -> str:
        """Return the filesystem path for *pdf_file*.

        Accepts a plain path (``str`` / ``os.PathLike``) or an upload object
        that exposes its path through a ``name`` attribute.

        Raises:
            AttributeError: If *pdf_file* is neither a path nor has ``name``.
        """
        if isinstance(pdf_file, (str, os.PathLike)):
            return os.fspath(pdf_file)
        return pdf_file.name

    def pdf_file_path(self, pdf_file: str) -> str:
        """
        Return the path of the uploaded PDF.

        The PDF is no longer converted here: the conversion result was
        discarded, ``main`` converts again anyway, and a failing conversion
        used to break the upload handler.

        Args:
            pdf_file (str): Path of the PDF, or an upload object exposing the
                path as ``.name``.
        Returns:
            str: The path of the input PDF file.
        """
        return self._resolve_pdf_path(pdf_file)

    def pdf_to_image_convert(self, pdf_file: str) -> Tuple[str, str]:
        """
        Convert every page of a PDF to a PNG inside ``self.image_output_dir``.

        Args:
            pdf_file (str): Path of the PDF, or an upload object exposing the
                path as ``.name``.
        Returns:
            Tuple[str, str]: The PDF path and the path of the image written
            for the LAST page (earlier pages are saved as well).
        Raises:
            ValueError: If the PDF yields no pages.
        """
        pdf_path = self._resolve_pdf_path(pdf_file)
        os.makedirs(self.image_output_dir, exist_ok=True)

        output_image_path = None
        for page_number, page in enumerate(convert_from_path(pdf_path), start=1):
            output_image_path = os.path.join(
                self.image_output_dir, f"page_{page_number}.png"
            )
            page.save(output_image_path, "PNG")

        if output_image_path is None:
            raise ValueError(f"No pages could be converted from {pdf_path!r}")
        return pdf_path, output_image_path

    def crop_image(self, output_image_path: str, crop_width: int = 1900) -> str:
        """
        Keep the right-hand strip of an image and save it as 'cropped_image.png'.

        Args:
            output_image_path (str): The path to the image to be cropped.
            crop_width (int): Width in pixels of the strip kept on the right
                side. Images narrower than this are kept whole.
        Returns:
            str: The path of the cropped image.
        """
        crop_image_path = "cropped_image.png"
        with Image.open(output_image_path) as image:
            width, height = image.size
            # Clamp so a narrow image does not produce a negative left edge.
            left = max(width - crop_width, 0)
            image.crop((left, 0, width, height)).save(crop_image_path)
        return crop_image_path

    @staticmethod
    def _table_from_text(ocr_text: str) -> pd.DataFrame:
        """Split OCR text into whitespace-separated cells, one row per line.

        Blank lines are dropped and short rows are padded with ``None`` so
        all rows have the same number of columns. Text without any content
        yields an empty DataFrame.
        """
        rows = [line.split() for line in ocr_text.split('\n') if line.strip()]
        if not rows:
            return pd.DataFrame()
        num_columns = max(len(row) for row in rows)
        for row in rows:
            row.extend([None] * (num_columns - len(row)))
        return pd.DataFrame(rows)

    def extract_table_from_image(self, crop_image_path: str) -> pd.DataFrame:
        """
        Extract a table from an image using OCR and return it as a DataFrame.

        Args:
            crop_image_path (str): The path to the cropped image containing the table.
        Returns:
            pd.DataFrame: One row per non-blank OCR line, one cell per
            whitespace-separated token; empty if OCR produced no text.
        """
        with Image.open(crop_image_path) as image:
            ocr_text = pytesseract.image_to_string(image)
        return self._table_from_text(ocr_text)

    def find_index_locations(self, table_data_frame: pd.DataFrame) -> tuple:
        """
        Locate the 'Planindexliste' row and the 'Index'/'INDEX' header row.

        Args:
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.
        Returns:
            tuple: ``(plan_index_location, index_row_index)`` - the index
            labels of the first matching rows.
        Raises:
            IndexError: If either marker is missing from the first column.
        """
        first_column = table_data_frame.iloc[:, 0]
        plan_rows = first_column[first_column == 'Planindexliste'].index
        header_rows = first_column[first_column.isin(['Index', 'INDEX'])].index
        if len(plan_rows) == 0:
            raise IndexError("No 'Planindexliste' row found in the first column")
        if len(header_rows) == 0:
            raise IndexError("No 'Index'/'INDEX' row found in the first column")
        return int(plan_rows[0]), int(header_rows[0])

    def _format_last_entry(self, window: pd.Series, table_data_frame: pd.DataFrame,
                           alpha_labels: str) -> str:
        """Return the row of the highest label found in *window*.

        Letters from *alpha_labels* take precedence; single digits are the
        fallback. The row is looked up INSIDE the window, so an identical
        label elsewhere in the table cannot be picked by mistake.

        Raises:
            ValueError: If the window contains no recognised label.
        """
        present = set(window.tolist())
        for candidates in (alpha_labels, self._DIGIT_LABELS):
            found = [label for label in candidates if label in present]
            if not found:
                continue
            row_label = window[window == max(found)].index[0]
            row = table_data_frame.loc[row_label]
            # Each non-null cell is prefixed with a space (leading space kept
            # for compatibility with the previous output format).
            return "".join(f" {cell}" for cell in row[row.notnull()])
        raise ValueError("No index entry found next to the 'Index' row")

    def extract_last_index_value_above(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
        """
        Extract the newest index entry from the rows directly above the 'Index' row.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row (unused).
            index_row_index (int): Position of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.
        Returns:
            str: The cells of the selected row, each prefixed with a space.
        Raises:
            ValueError: If no label (a-k or 0-9) is found in the searched rows.
        """
        start = max(index_row_index - self._WINDOW_ROWS, 0)
        window = table_data_frame.iloc[:, 0].iloc[start:index_row_index]
        return self._format_last_entry(window, table_data_frame, self._ALPHA_LABELS_ABOVE)

    def extract_last_index_value_below(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
        """
        Extract the newest index entry from the rows directly below the 'Index' row.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row (unused).
            index_row_index (int): Position of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.
        Returns:
            str: The cells of the selected row, each prefixed with a space.
        Raises:
            ValueError: If no label (a-z or 0-9) is found in the searched rows.
        """
        start = index_row_index + 1
        window = table_data_frame.iloc[:, 0].iloc[start:start + self._WINDOW_ROWS]
        return self._format_last_entry(window, table_data_frame, self._ALPHA_LABELS_BELOW)

    def main(self, pdf_file):
        """Run the whole pipeline for one upload and return an HTML snippet.

        Any failure is logged and reported to the user as a fixed message
        instead of propagating into the UI.
        """
        # Local imports: these names are not part of the file's import block.
        import logging
        from html import escape

        try:
            _, output_image_path = self.pdf_to_image_convert(pdf_file)
            crop_image_path = self.crop_image(output_image_path)
            table_data_frame = self.extract_table_from_image(crop_image_path)
            plan_index_location, index_row_index = self.find_index_locations(table_data_frame)
            # Entries are listed on the side of the 'Index' row that faces
            # away from the 'Planindexliste' row.
            if plan_index_location < index_row_index:
                final_index = self.extract_last_index_value_below(
                    plan_index_location, index_row_index, table_data_frame)
            else:
                final_index = self.extract_last_index_value_above(
                    plan_index_location, index_row_index, table_data_frame)
        except Exception:
            logging.getLogger(__name__).exception("Table extraction failed")
            return "Unable Get Value... Please Try Again"
        # OCR text comes from an uploaded file: escape it before embedding.
        return ("<h2>Last Value</h2><br><br><center><b>"
                + escape(final_index)
                + "</b></center><br><br>")

    def gradio_interface(self):
        """Build the Gradio UI, wire the upload/submit handlers and launch it."""
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            gr.HTML("""<center><h1>Table Extracter</h1></center>""")
            with gr.Column(elem_id="col-container"):
                # Row/Column ``scale`` is kept to documented integer values;
                # ``gr.Row`` is created without ``scale``.
                with gr.Row(elem_id="row-flex"):
                    with gr.Column(scale=1, min_width=160):
                        upload_button = gr.UploadButton(
                            "Browse File",
                            file_types=[".txt", ".pdf", ".doc", ".docx", ".json", ".csv"])
                        file_output = gr.File(elem_classes="filenameshow")
                with gr.Row(elem_id="row-flex"):
                    with gr.Column(scale=1, min_width=0):
                        btn = gr.Button("Submit")
                with gr.Row(elem_id="row-flex"):
                    with gr.Column(scale=1, min_width=0):
                        # ``gr.HTML`` has no ``html`` argument; it always
                        # renders its value as HTML.
                        answer = gr.HTML()
            upload_button.upload(self.pdf_file_path, upload_button, [file_output])
            btn.click(self.main, upload_button, [answer])
        demo.queue().launch(debug=True)
# Script entry point: build the extractor and start the Gradio app.
if __name__ == "__main__":
    TableExtracter().gradio_interface()