# TableExtractor / app.py
# naveenvenkatesh's picture
# Update app.py
# 02862d4
import html
import logging
import os
from typing import Tuple

import gradio as gr
import numpy as np
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
class TableExtracter:
    """Extract the last index entry from a plan-index table inside a PDF.

    Pipeline (see ``main``): render the PDF pages to PNG files, crop the
    right-hand strip of the last rendered page, OCR that strip, arrange the
    OCR tokens in a DataFrame, locate the 'Planindexliste' and 'Index' rows,
    and return the text of the highest-labelled row found next to 'Index'.
    """

    # Width, in pixels, of the right-hand strip kept by ``crop_image``.
    CROP_WIDTH_PX = 1900
    # Number of first-column cells inspected on either side of the 'Index' row.
    SEARCH_WINDOW_ROWS = 6
    # Single-letter index labels, lowest to highest.
    # NOTE(review): the "above" search only considers a-k while the "below"
    # search considers a-z; kept as found -- confirm whether this asymmetry
    # is intentional.
    _ALPHA_LABELS_ABOVE = tuple(chr(code) for code in range(97, 108))
    _ALPHA_LABELS_BELOW = tuple(chr(code) for code in range(97, 123))
    # Single-digit index labels, lowest to highest.
    _DIGIT_LABELS = tuple(str(number) for number in range(10))

    def __init__(self):
        """Set the directory that receives the rendered page images."""
        self.image_output_dir = 'output_images'

    def pdf_file_path(self, pdf_file) -> str:
        """
        Convert the uploaded PDF to page images and return the PDF's path.

        Args:
            pdf_file: A path (``str`` / ``os.PathLike``) or an object exposing
                the path through a ``name`` attribute.

        Returns:
            str: The path of the input PDF file.
        """
        # NOTE(review): the conversion is repeated by ``main`` on submit; it
        # is kept here so the upload callback behaves as before.
        input_pdf_path, _ = self.pdf_to_image_convert(pdf_file)
        return input_pdf_path

    def pdf_to_image_convert(self, pdf_file) -> Tuple[str, str]:
        """
        Render every page of a PDF to ``<image_output_dir>/page_<n>.png``.

        Args:
            pdf_file: A path (``str`` / ``os.PathLike``) or an object exposing
                the path through a ``name`` attribute.

        Returns:
            Tuple[str, str]: The PDF path and the path of the LAST page image
            that was written.

        Raises:
            ValueError: If the PDF renders to zero pages.
        """
        # Plain paths are used directly; anything else is expected to carry
        # its path in ``.name`` (``Path.name`` would only be the basename, so
        # path-like objects must not take that route).
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        os.makedirs(self.image_output_dir, exist_ok=True)

        images = convert_from_path(pdf_path)
        if not images:
            raise ValueError(f"no pages could be rendered from {pdf_path!r}")

        output_image_path = ""
        for page_number, image in enumerate(images, start=1):
            output_image_path = os.path.join(
                self.image_output_dir, f"page_{page_number}.png"
            )
            image.save(output_image_path, "PNG")

        return pdf_path, output_image_path

    def crop_image(self, output_image_path: str) -> str:
        """
        Keep the right-most ``CROP_WIDTH_PX`` pixels of an image.

        Args:
            output_image_path (str): The path to the image to be cropped.

        Returns:
            str: The path of the cropped image ("cropped_image.png" in the
            current working directory).
        """
        crop_image_path = "cropped_image.png"
        # The context manager closes the source file once the crop is saved.
        with Image.open(output_image_path) as image:
            width, height = image.size
            # Clamp at 0 so images narrower than the strip are kept whole
            # instead of being cropped with a negative left edge.
            left = max(width - self.CROP_WIDTH_PX, 0)
            image.crop((left, 0, width, height)).save(crop_image_path)
        return crop_image_path

    def extract_table_from_image(self, crop_image_path: str) -> pd.DataFrame:
        """
        OCR an image and arrange the recognised tokens as a table.

        Args:
            crop_image_path (str): The path to the cropped image containing the table.

        Returns:
            pd.DataFrame: One row per non-blank OCR line, one cell per
            whitespace-separated token; an empty DataFrame if OCR finds no text.
        """
        with Image.open(crop_image_path) as image:
            ocr_text = pytesseract.image_to_string(image)
        return self._ocr_text_to_table(ocr_text)

    @staticmethod
    def _ocr_text_to_table(ocr_text: str) -> pd.DataFrame:
        """Split OCR text into token rows, padding short rows with None."""
        rows = [line.split() for line in ocr_text.split('\n') if line.strip()]
        if not rows:
            # Nothing recognised: return an empty table rather than failing
            # on max() of an empty sequence.
            return pd.DataFrame()
        num_columns = max(len(row) for row in rows)
        padded_rows = [row + [None] * (num_columns - len(row)) for row in rows]
        return pd.DataFrame(padded_rows)

    def find_index_locations(self, table_data_frame: pd.DataFrame) -> tuple:
        """
        Locate the 'Planindexliste' row and the 'Index'/'INDEX' row.

        Args:
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            tuple: ``(plan_index_location, index_row_index)``, the index label
            of the first match for each, looked up in the first column.

        Raises:
            IndexError: If the table is empty or either row is missing.
        """
        if table_data_frame.empty:
            raise IndexError("the table is empty; nothing to locate")

        plan_matches = table_data_frame.loc[table_data_frame[0] == 'Planindexliste'].index
        if len(plan_matches) == 0:
            raise IndexError("no 'Planindexliste' row found in the first column")

        header_mask = table_data_frame.iloc[:, 0].isin(['Index', 'INDEX'])
        header_matches = table_data_frame[header_mask].index
        if len(header_matches) == 0:
            raise IndexError("no 'Index' row found in the first column")

        return int(plan_matches[0]), int(header_matches[0])

    @staticmethod
    def _row_to_text(row: pd.Series) -> str:
        """Join the non-null cells of a row, each preceded by one space."""
        return "".join(f" {cell}" for cell in row[row.notnull()])

    def _last_index_row_text(self, table_data_frame: pd.DataFrame, window: pd.Series, label_groups) -> str:
        """Return the text of the row in *window* carrying the highest label.

        The label groups are tried in order (letters before digits); within a
        group the highest label present in the window wins, and the row is
        taken from the window itself so that an identical token elsewhere in
        the table cannot be picked by mistake.
        """
        for labels in label_groups:
            for label in reversed(labels):
                matches = window[window == label]
                if not matches.empty:
                    return self._row_to_text(table_data_frame.loc[matches.index[0]])
        raise ValueError("no index entry found next to the 'Index' row")

    def extract_last_index_value_above(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
        """
        Extract the last index entry from the rows just above the 'Index' row.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row (unused).
            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            str: The cells of the selected row, each preceded by a space.

        Raises:
            ValueError: If no letter (a-k) or digit (0-9) label is found in
                the ``SEARCH_WINDOW_ROWS`` rows above the 'Index' row.
        """
        start = max(index_row_index - self.SEARCH_WINDOW_ROWS, 0)
        window = table_data_frame[0].iloc[start:index_row_index]
        return self._last_index_row_text(
            table_data_frame, window, (self._ALPHA_LABELS_ABOVE, self._DIGIT_LABELS)
        )

    def extract_last_index_value_below(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
        """
        Extract the last index entry from the rows just below the 'Index' row.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row (unused).
            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            str: The cells of the selected row, each preceded by a space.

        Raises:
            ValueError: If no letter (a-z) or digit (0-9) label is found in
                the ``SEARCH_WINDOW_ROWS`` rows below the 'Index' row.
        """
        start = index_row_index + 1
        window = table_data_frame[0].iloc[start:start + self.SEARCH_WINDOW_ROWS]
        return self._last_index_row_text(
            table_data_frame, window, (self._ALPHA_LABELS_BELOW, self._DIGIT_LABELS)
        )

    def main(self, pdf_file):
        """
        Run the whole pipeline for one PDF and return an HTML snippet.

        Args:
            pdf_file: The uploaded file, as accepted by ``pdf_to_image_convert``.

        Returns:
            str: HTML showing the extracted index row, or a plain fallback
            message if any step fails.
        """
        try:
            _, output_image_path = self.pdf_to_image_convert(pdf_file)
            crop_image_path = self.crop_image(output_image_path)
            table_data_frame = self.extract_table_from_image(crop_image_path)
            plan_index_location, index_row_index = self.find_index_locations(table_data_frame)
            # The two markers are different values of the same column, so
            # they can never share a row: the list is either above or below.
            if plan_index_location < index_row_index:
                final_index = self.extract_last_index_value_below(plan_index_location, index_row_index, table_data_frame)
            else:
                final_index = self.extract_last_index_value_above(plan_index_location, index_row_index, table_data_frame)
        except Exception:
            # UI boundary: keep the user-facing message, but record the cause
            # instead of discarding it.
            logging.getLogger(__name__).exception("Table extraction failed")
            return "Unable Get Value... Please Try Again"
        # The text comes from OCR of an uploaded file, so escape it before
        # embedding it in HTML.
        return "<h2>Last Value</h2><br><br><center><b>" + html.escape(final_index) + "</b></center><br><br>"

    def gradio_interface(self):
        """Build the Gradio UI, wire the upload/submit callbacks and launch it."""
        # NOTE(review): the nesting below was reconstructed from a source
        # whose indentation had been stripped -- confirm the layout.
        with gr.Blocks(css="style.css",theme=gr.themes.Soft()) as demo:
            gr.HTML("""<center><h1>Table Extracter</h1></center>""")
            with gr.Column(elem_id="col-container"):
                with gr.Row(elem_id="row-flex"):
                    with gr.Column(scale=0.85, min_width=160):
                        upload_button = gr.UploadButton(
                            "Browse File",file_types=[".txt", ".pdf", ".doc", ".docx",".json",".csv"])
                        file_output = gr.File(elem_classes="filenameshow")
                with gr.Row(scale=0.85,elem_id="row-flex"):
                    with gr.Column(scale=0.85, min_width=0):
                        btn = gr.Button("Submit")
                with gr.Row(scale=0.85,elem_id="row-flex"):
                    with gr.Column(scale=0.85, min_width=0):
                        answer = gr.HTML(html=True)
            # Show the uploaded file, then run the extraction on submit.
            upload_button.upload(self.pdf_file_path, upload_button, [file_output])
            btn.click(self.main,upload_button,[answer])
        demo.queue().launch(debug=True)
if __name__ == "__main__":
    # Script entry point: build the extractor and start its Gradio interface.
    pdf_to_table = TableExtracter()
    pdf_to_table.gradio_interface()