Spaces:

ADOPLE
/

TableExtractor

Runtime error

App Files Files Community

TableExtractor / app.py

naveenvenkatesh

Update app.py

02862d4 almost 2 years ago

raw

history blame contribute delete

12.7 kB

	import os
	from pdf2image import convert_from_path
	import pytesseract
	from PIL import Image
	import numpy as np
	import pandas as pd
	import gradio as gr
	from typing import Tuple


	class TableExtracter:
	    """Extract the latest revision entry from the index table of a PDF plan.

	    Pipeline (see ``main``): render the PDF pages to PNG files, crop the
	    right-hand strip of the rendered page, OCR the crop with pytesseract,
	    load the whitespace-separated OCR tokens into a DataFrame, locate the
	    'Planindexliste' and 'Index'/'INDEX' rows in the first column, and return
	    the text of the row holding the highest single-letter (or, failing that,
	    single-digit) index next to the 'Index' row.
	    """

	    # Width in pixels of the right-hand strip kept by ``crop_image``.
	    _CROP_STRIP_WIDTH = 1900
	    # Number of rows inspected on either side of the 'Index' header row.
	    _WINDOW_ROWS = 6
	    # Letters accepted as revision indices above the 'Index' row.
	    # NOTE(review): the original code stops at 'k' here but accepts 'a'-'z'
	    # below the header; kept as-is because the intent is unclear -- confirm.
	    _LETTERS_ABOVE = "abcdefghijk"
	    # Letters accepted as revision indices below the 'Index' row.
	    _LETTERS_BELOW = "abcdefghijklmnopqrstuvwxyz"
	    # Single-digit indices used when no letter index is present.
	    _DIGITS = "0123456789"

	    def __init__(self):
	        # Directory that receives one PNG per rendered PDF page.
	        self.image_output_dir = 'output_images'

	    def pdf_file_path(self, pdf_file: str) -> str:
	        """
	        Converts a PDF file to images and returns the path of the input PDF.

	        Args:
	            pdf_file (str): Path of the PDF, or an object whose ``name``
	                attribute holds that path.

	        Returns:
	            str: The path of the input PDF file.
	        """
	        # The rendered page images are written as a side effect; only the
	        # PDF path is handed back to the caller.
	        input_pdf_path, _ = self.pdf_to_image_convert(pdf_file)
	        return input_pdf_path

	    def pdf_to_image_convert(self, pdf_file: str) -> Tuple[str, str]:
	        """
	        Converts a PDF file to PNG images, one per page.

	        Args:
	            pdf_file (str): Path of the PDF, or an object whose ``name``
	                attribute holds that path (e.g. an uploaded temp file).

	        Returns:
	            Tuple[str, str]: The path of the input PDF and the path of the
	            image written for the LAST page.

	        Raises:
	            ValueError: If the PDF renders to zero pages.
	        """
	        # Accept both a plain path and a file-like wrapper exposing `.name`.
	        pdf_path = getattr(pdf_file, "name", pdf_file)

	        # Convert the PDF to images
	        images = convert_from_path(pdf_path)
	        if not images:
	            raise ValueError(f"no pages could be rendered from {pdf_path!r}")

	        # Create the output directory if it doesn't exist
	        os.makedirs(self.image_output_dir, exist_ok=True)

	        for page_number, image in enumerate(images, start=1):
	            output_image_path = os.path.join(
	                self.image_output_dir, f"page_{page_number}.png"
	            )
	            # Save the image as a PNG file
	            image.save(output_image_path, "PNG")

	        # After the loop this is the image of the last page.
	        return pdf_path, output_image_path

	    def crop_image(self, output_image_path: str) -> str:
	        """
	        Crop the right-hand strip of an image and save it.

	        Args:
	            output_image_path (str): The path to the image to be cropped.

	        Returns:
	            str: The path of the cropped image ("cropped_image.png").
	        """
	        crop_image_path = "cropped_image.png"

	        with Image.open(output_image_path) as image:
	            width, height = image.size

	            # Keep the rightmost strip at full height; clamp at 0 so images
	            # narrower than the strip are kept whole instead of using a
	            # negative left edge.
	            left = max(width - self._CROP_STRIP_WIDTH, 0)
	            cropped_image = image.crop((left, 0, width, height))

	            # Save the cropped image
	            cropped_image.save(crop_image_path)

	        return crop_image_path

	    def extract_table_from_image(self, crop_image_path: str) -> pd.DataFrame:
	        """
	        Extract a table from an image using OCR and return it as a DataFrame.

	        Each non-blank OCR line becomes one row; its whitespace-separated
	        tokens become the cells. Short rows are padded with None.

	        Args:
	            crop_image_path (str): The path to the cropped image containing the table.

	        Returns:
	            pd.DataFrame: The extracted table; empty if OCR produced no text.
	        """
	        # Perform OCR on the image
	        with Image.open(crop_image_path) as image:
	            ocr_text = pytesseract.image_to_string(image)

	        # Process the OCR text to create a table structure
	        rows = [line.split() for line in ocr_text.split('\n') if line.strip()]
	        if not rows:
	            # Nothing recognised: avoid max() on an empty sequence.
	            return pd.DataFrame()

	        # Pad every row to the widest row so the frame is rectangular
	        num_columns = max(len(row) for row in rows)
	        padded_rows = [row + [None] * (num_columns - len(row)) for row in rows]

	        return pd.DataFrame(padded_rows)

	    def find_index_locations(self, table_data_frame: pd.DataFrame) -> tuple:
	        """
	        Find the 'Planindexliste' row and the 'Index'/'INDEX' row.

	        Only the first column is searched and the first match of each is used.

	        Args:
	            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

	        Returns:
	            tuple: (plan index location, index row index), both ints.

	        Raises:
	            IndexError: If the table is empty or either marker row is missing.
	        """
	        if table_data_frame.empty:
	            raise IndexError("the table is empty; no index rows can be located")

	        first_column = table_data_frame.iloc[:, 0]

	        plan_rows = table_data_frame.index[first_column == 'Planindexliste']
	        if len(plan_rows) == 0:
	            raise IndexError("no 'Planindexliste' row found in the first column")

	        header_rows = table_data_frame.index[first_column.isin(['Index', 'INDEX'])]
	        if len(header_rows) == 0:
	            raise IndexError("no 'Index'/'INDEX' row found in the first column")

	        return int(plan_rows[0]), int(header_rows[0])

	    @staticmethod
	    def _join_row(row: pd.Series) -> str:
	        """Concatenate the non-null cells of *row*, each preceded by a space."""
	        return "".join(" " + str(cell) for cell in row[row.notnull()])

	    def _latest_index_row_text(self, window: pd.Series, table_data_frame: pd.DataFrame, letters: str) -> str:
	        """Return the joined text of the row in *window* with the highest index.

	        *window* is a slice of the first column. Single-letter indices from
	        *letters* take precedence; single digits are the fallback. The row is
	        looked up inside the window only, so an identical token elsewhere in
	        the table cannot be picked by mistake.

	        Raises:
	            ValueError: If the window contains neither kind of index.
	        """
	        present = window.tolist()

	        candidates = [letter for letter in letters if letter in present]
	        if not candidates:
	            # OCR cells are strings, so digits are compared as strings too.
	            candidates = [digit for digit in self._DIGITS if digit in present]
	        if not candidates:
	            raise ValueError("no single-letter or single-digit index found next to the 'Index' row")

	        latest = max(candidates)
	        row_label = window[window == latest].index[0]
	        return self._join_row(table_data_frame.loc[row_label])

	    def extract_last_index_value_above(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
	        """
	        Extract the last index value above the 'Index' row.

	        Looks at up to six rows directly above the 'Index' row.

	        Args:
	            plan_index_location (int): The location of the 'Planindexliste' row (unused).
	            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
	            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

	        Returns:
	            str: The cells of the selected row, each preceded by a space.

	        Raises:
	            ValueError: If no letter or digit index is found in the window.
	        """
	        start = max(index_row_index - self._WINDOW_ROWS, 0)
	        values_before_index = table_data_frame[0].iloc[start:index_row_index]
	        return self._latest_index_row_text(values_before_index, table_data_frame, self._LETTERS_ABOVE)

	    def extract_last_index_value_below(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
	        """
	        Extract the last index value below the 'Index' row.

	        Looks at up to six rows directly below the 'Index' row.

	        Args:
	            plan_index_location (int): The location of the 'Planindexliste' row (unused).
	            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
	            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

	        Returns:
	            str: The cells of the selected row, each preceded by a space.

	        Raises:
	            ValueError: If no letter or digit index is found in the window.
	        """
	        first_row = index_row_index + 1
	        values_after_index = table_data_frame[0].iloc[first_row:first_row + self._WINDOW_ROWS]
	        return self._latest_index_row_text(values_after_index, table_data_frame, self._LETTERS_BELOW)

	    def main(self, pdf_file):
	        """
	        Run the full pipeline for one PDF and return an HTML snippet.

	        Args:
	            pdf_file: Path of the PDF, or an object whose ``name`` attribute
	                holds that path.

	        Returns:
	            str: HTML showing the extracted index row, or a fixed error
	            message if any step fails.
	        """
	        # Local import keeps the block self-contained without touching the
	        # file's import section.
	        from html import escape

	        try:
	            _, output_image_path = self.pdf_to_image_convert(pdf_file)
	            crop_image_path = self.crop_image(output_image_path)
	            table_data_frame = self.extract_table_from_image(crop_image_path)
	            plan_index_location, index_row_index = self.find_index_locations(table_data_frame)

	            # The revision rows lie on the far side of the 'Index' header
	            # relative to the 'Planindexliste' title.
	            if plan_index_location < index_row_index:
	                final_index = self.extract_last_index_value_below(plan_index_location, index_row_index, table_data_frame)
	            elif plan_index_location > index_row_index:
	                final_index = self.extract_last_index_value_above(plan_index_location, index_row_index, table_data_frame)
	            else:
	                raise ValueError("'Planindexliste' and 'Index' resolve to the same row")

	            # OCR text comes from an uploaded file: escape it before it is
	            # embedded in HTML.
	            return "<h2>Last Value</h2><br><br><center><b>" + escape(final_index) + "</b></center><br><br>"
	        except Exception:
	            # Best-effort UI handler: any failure is reported with one message.
	            return "Unable Get Value... Please Try Again"

	    def gradio_interface(self):
	        """Build the Gradio Blocks UI, wire the callbacks and launch the app."""
	        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
	            gr.HTML("""<center><h1>Table Extracter</h1></center>""")
	            with gr.Column(elem_id="col-container"):
	                with gr.Row(elem_id="row-flex"):
	                    with gr.Column(scale=0.85, min_width=160):
	                        upload_button = gr.UploadButton(
	                            "Browse File", file_types=[".txt", ".pdf", ".doc", ".docx", ".json", ".csv"])
	                        file_output = gr.File(elem_classes="filenameshow")
	                with gr.Row(scale=0.85, elem_id="row-flex"):
	                    with gr.Column(scale=0.85, min_width=0):
	                        btn = gr.Button("Submit")
	                with gr.Row(scale=0.85, elem_id="row-flex"):
	                    with gr.Column(scale=0.85, min_width=0):
	                        # The original passed html=True, which does not appear
	                        # to be a gr.HTML parameter (to be confirmed against
	                        # the Gradio docs), so it is dropped.
	                        answer = gr.HTML()

	            # Show the uploaded file, then run the extraction on "Submit".
	            upload_button.upload(self.pdf_file_path, upload_button, [file_output])
	            btn.click(self.main, upload_button, [answer])

	        demo.queue().launch(debug=True)

	if __name__ == "__main__":
	    # Script entry point: create the extractor and launch its Gradio UI.
	    pdf_to_table = TableExtracter()
	    pdf_to_table.gradio_interface()