import os from pdf2image import convert_from_path import pytesseract from PIL import Image import numpy as np import pandas as pd import gradio as gr from typing import Tuple class TableExtracter: def __init__(self): self.image_output_dir = 'output_images' def pdf_file_path(self, pdf_file: str) -> str: """ Converts a PDF file to an image and returns the path of the input PDF. Args: pdf_file (str): The path to the PDF file to be converted. Returns: str: The path of the input PDF file. """ # Call the method to convert PDF to image and get paths input_pdf_path,output_image_path = self.pdf_to_image_convert(pdf_file) # Return the path of the input PDF return input_pdf_path def pdf_to_image_convert(self, pdf_file: str) -> Tuple[str, str]: """ Converts a PDF file to images and returns paths of the input PDF and output images. Args: pdf_file (str): The path to the PDF file to be converted. Returns: Tuple[str, str]: A tuple containing paths of input PDF and output image. """ # Extract the file name from the path pdf_file = pdf_file.name # Create the output directory if it doesn't exist if not os.path.exists(self.image_output_dir): os.makedirs(self.image_output_dir) # Convert the PDF to images images = convert_from_path(pdf_file) for idx, image in enumerate(images): # Create an output path for the image output_image_path = os.path.join(self.image_output_dir, f"page_{idx + 1}.png") # Save the image as a PNG file image.save(output_image_path, "PNG") # Return paths of the input PDF and the first output image return pdf_file,output_image_path def crop_image(self, output_image_path: str) -> str: """ Crop an image and return the path of the cropped image. Args: output_image_path (str): The path to the image to be cropped. Returns: str: The path of the cropped image. """ # Open the image image = Image.open(output_image_path) width, height = image.size # Define the cropping dimensions (width, height) crop_dimensions = (width, height) # Crop the image cropped_image = image.crop((width-1900, 0, crop_dimensions[0], crop_dimensions[1])) # Save the cropped image crop_image_path = "cropped_image.png" cropped_image.save(crop_image_path) return crop_image_path def extract_table_from_image(self, crop_image_path: str) -> pd.DataFrame: """ Extract a table from an image using OCR and return it as a Pandas DataFrame. Args: crop_image_path (str): The path to the cropped image containing the table. Returns: pd.DataFrame: The extracted table as a Pandas DataFrame. """ # Load the image image = Image.open(crop_image_path) # Perform OCR on the image ocr_text = pytesseract.image_to_string(image) # Process the OCR text to create a table structure lines = ocr_text.split('\n') rows = [line.strip().split() for line in lines if line.strip()] # Find the maximum number of columns in the rows num_columns = max(len(row) for row in rows) # Fill empty cells with None for row in rows: row.extend([None] * (num_columns - len(row))) # Convert the data into a Pandas DataFrame table_data_frame = pd.DataFrame(rows) return table_data_frame def find_index_locations(self, table_data_frame: pd.DataFrame) -> tuple: """ Find the locations of the plan index and index row in the given table DataFrame. Args: table_data_frame (pd.DataFrame): The DataFrame containing the table data. Returns: tuple: A tuple containing the plan index location and index row index. """ # Find the location of the 'Planindexliste' row plan_index_location = int(table_data_frame.loc[table_data_frame[0] == 'Planindexliste'].index[0]) # Find the index of the row containing 'Index' or 'INDEX' index_row = table_data_frame[(table_data_frame.iloc[:, 0] == 'Index') | (table_data_frame.iloc[:, 0] == 'INDEX')] index_row_index = index_row.index[0] return plan_index_location,index_row_index def extract_last_index_value_above(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str: """ Extract the last index value above the specified index rows from the table DataFrame. Args: plan_index_location (int): The location of the 'Planindexliste' row. index_row_index (int): The index of the row containing 'Index' or 'INDEX'. table_data_frame (pd.DataFrame): The DataFrame containing the table data. Returns: str: The extracted index value. """ try: # Get values before the index row values_before_index = table_data_frame[0][max(index_row_index - 6, 0):index_row_index] values_before_index_list = values_before_index.tolist() alpha_indices = [] # Find the highest ASCII value index (character) present in values_before_index_list for ascii_value in range(97, 108): if chr(ascii_value) in values_before_index_list: alpha_indices.append(ascii_value) max_alpha_index = max(alpha_indices) # Find the row index corresponding to the last alphabetical index last_alpha_index = int(table_data_frame.loc[table_data_frame[0] == chr(max_alpha_index)].index[0]) alpha_row_data = table_data_frame.loc[last_alpha_index] alpha_row_data_clean = alpha_row_data[alpha_row_data.notnull()] final_alpha_index = "" # Concatenate non-None elements in a row-wise format for index, value in alpha_row_data_clean.items(): final_alpha_index = final_alpha_index + " " + value return final_alpha_index except: # Get values before the index row values_before_index = table_data_frame[0][max(index_row_index - 6, 0):index_row_index] values_before_index_list = values_before_index.tolist() num_indices = [] # Find the highest numerical index present in values_before_index_list for num in range(10): if str(num) in values_before_index_list: num_indices.append(num) max_num_index = max(num_indices) for _ in range(4): last_num_index = int(table_data_frame.loc[table_data_frame[0] == str(max_num_index)].index[_]) # Create a pandas Series with the provided data data = values_before_index series = pd.Series(data) # Extract the indices as a NumPy array indices = np.array(series.index) if last_num_index in indices: num_row_data = table_data_frame.loc[last_num_index] break num_row_data_clear = num_row_data[num_row_data.notnull()] final_num_index = "" # Concatenate non-None elements in a row-wise format for index, value in num_row_data_clear.items(): final_num_index = final_num_index + " " + value return final_num_index def extract_last_index_value_below(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str: """ Extract the last index value below the specified index rows from the table DataFrame. Args: plan_index_location (int): The location of the 'Planindexliste' row. index_row_index (int): The index of the row containing 'Index' or 'INDEX'. table_data_frame (pd.DataFrame): The DataFrame containing the table data. Returns: str: The extracted index value. """ try: # Get values after the index row values_after_index = table_data_frame[0][index_row_index + 1: index_row_index + 7] values_after_index_list = values_after_index.tolist() found_alpha_ascii_values = [] # Find ASCII values (characters) present after the index row for ascii_value in range(97, 123): if chr(ascii_value) in values_after_index_list: found_alpha_ascii_values.append(ascii_value) # Find the highest ASCII value index (character) present in found_alpha_ascii_values max_alpha_index = max(found_alpha_ascii_values) last_alpha_index = int(table_data_frame.loc[table_data_frame[0] == chr(max_alpha_index)].index[0]) alpha_row_data = table_data_frame.loc[last_alpha_index] alpha_row_data_clear = alpha_row_data[alpha_row_data.notnull()] final_alpha_index = "" # Concatenate non-None elements in a row-wise format for index, value in alpha_row_data_clear.items(): final_alpha_index = final_alpha_index + " " + value return final_alpha_index except: # Get values after the index row values_after_index = table_data_frame[0][index_row_index + 1: index_row_index + 7] values_after_index_list = values_after_index.tolist() found_num_indices = [] # Find numerical values present after the index row for num in range(10): if num in values_after_index_list: found_num_indices.append(num) max_found_num_index = max(found_num_indices) # Find the row index corresponding to the last numerical index last_num_index = int(table_data_frame.loc[table_data_frame[0] == str(max_found_num_index)].index[0]) num_row_data = table_data_frame.loc[last_num_index] num_row_data_clear = num_row_data[num_row_data.notnull()] final_num_index = "" # Concatenate non-None elements in a row-wise format for index, value in num_row_data_clear.items(): final_num_index = final_num_index + " " + value return final_num_index def main(self,pdf_file): try: pdf_path,output_image_path = self.pdf_to_image_convert(pdf_file) crop_image_path = self.crop_image(output_image_path) table_data_frame = self.extract_table_from_image(crop_image_path) plan_index_location,index_row_index = self.find_index_locations(table_data_frame) if plan_index_location < index_row_index: final_index = self.extract_last_index_value_below(plan_index_location,index_row_index,table_data_frame) elif plan_index_location > index_row_index: final_index = self.extract_last_index_value_above(plan_index_location,index_row_index,table_data_frame) Answer = "