# Hugging Face Space: ADOPLE / TableExtractor
# Space status: Runtime error
# File size: 12,719 Bytes

import os
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import numpy as np
import pandas as pd
import gradio as gr
from typing import Tuple


class TableExtracter:

    def __init__(self):

        self.image_output_dir = 'output_images'

    def pdf_file_path(self, pdf_file: str) -> str:

        """
        Converts a PDF file to an image and returns the path of the input PDF.

        Args:
            pdf_file (str): The path to the PDF file to be converted.

        Returns:
            str: The path of the input PDF file.
        """

        # Call the method to convert PDF to image and get paths
        input_pdf_path,output_image_path = self.pdf_to_image_convert(pdf_file)

        # Return the path of the input PDF
        return input_pdf_path

    def pdf_to_image_convert(self, pdf_file: str) -> Tuple[str, str]:
        """
        Render the first page of a PDF to a PNG file.

        Args:
            pdf_file (str): Either a filesystem path (``str`` / ``os.PathLike``)
                or an uploaded-file object that exposes its path as ``.name``.

        Returns:
            Tuple[str, str]: ``(pdf_path, image_path)`` where ``image_path`` is
                ``<image_output_dir>/page_1.png``.

        Raises:
            ValueError: If no page could be rendered from the PDF. (Previously
                this case fell off the end of the loop and returned ``None``.)
        """
        # Accept a plain path as the annotation promises, as well as the
        # file-like wrapper whose path lives in ``.name``.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.image_output_dir, exist_ok=True)

        # Only the first page is ever used, so do not rasterise the rest.
        pages = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not pages:
            raise ValueError(f"No pages could be rendered from {pdf_path!r}")

        output_image_path = os.path.join(self.image_output_dir, "page_1.png")
        pages[0].save(output_image_path, "PNG")

        return pdf_path, output_image_path

    def crop_image(self, output_image_path: str, crop_width: int = 1900) -> str:
        """
        Crop the right-hand strip of an image and save it as ``cropped_image.png``.

        Args:
            output_image_path (str): The path to the image to be cropped.
            crop_width (int): Width in pixels of the strip kept from the right
                edge. Defaults to 1900, the value that used to be hard-coded.

        Returns:
            str: The path of the cropped image (``"cropped_image.png"`` in the
                current working directory).
        """
        crop_image_path = "cropped_image.png"

        # The context manager closes the underlying file handle, which the
        # previous implementation leaked.
        with Image.open(output_image_path) as image:
            width, height = image.size

            # Clamp at 0: for images narrower than crop_width a negative left
            # edge would extend the crop box past the image and pad it.
            left = max(width - crop_width, 0)

            cropped_image = image.crop((left, 0, width, height))
            cropped_image.save(crop_image_path)

        return crop_image_path

    def extract_table_from_image(self, crop_image_path: str) -> pd.DataFrame:
        """
        OCR an image and arrange the recognised text as a table.

        Each non-blank OCR line becomes one row and each whitespace-separated
        token becomes one cell; shorter rows are padded with ``None`` so all
        rows have the same number of columns.

        Args:
            crop_image_path (str): The path to the cropped image containing the table.

        Returns:
            pd.DataFrame: The extracted table, or an empty DataFrame when OCR
                produced no text at all.
        """
        # Close the image file once OCR is done instead of leaking the handle.
        with Image.open(crop_image_path) as image:
            ocr_text = pytesseract.image_to_string(image)

        # str.split() with no argument already ignores surrounding whitespace.
        rows = [line.split() for line in ocr_text.split('\n') if line.strip()]

        # max() on an empty sequence raises; report "no table" explicitly.
        if not rows:
            return pd.DataFrame()

        num_columns = max(len(row) for row in rows)

        # Pad ragged rows so every row has num_columns cells.
        for row in rows:
            row.extend([None] * (num_columns - len(row)))

        return pd.DataFrame(rows)

    def find_index_locations(self, table_data_frame: pd.DataFrame) -> tuple:

        """
        Find the locations of the plan index and index row in the given table DataFrame.

        Args:
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            tuple: A tuple containing the plan index location and index row index.
        """
        
        # Find the location of the 'Planindexliste' row
        plan_index_location = int(table_data_frame.loc[table_data_frame[0] == 'Planindexliste'].index[0])

        # Find the index of the row containing 'Index' or 'INDEX'
        index_row = table_data_frame[(table_data_frame.iloc[:, 0] == 'Index') | (table_data_frame.iloc[:, 0] == 'INDEX')]
        index_row_index  = index_row.index[0]

        return plan_index_location,index_row_index

    def extract_last_index_value_above(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:

        """
        Extract the last index value above the specified index rows from the table DataFrame.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row.
            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            str: The extracted index value.
        """

        try:
            # Get values before the index row
            values_before_index = table_data_frame[0][max(index_row_index - 6, 0):index_row_index]
            values_before_index_list = values_before_index.tolist()
            alpha_indices = []

            # Find the highest ASCII value index (character) present in values_before_index_list
            for ascii_value in range(97, 108):
                if chr(ascii_value) in values_before_index_list:
                    alpha_indices.append(ascii_value)
                    max_alpha_index = max(alpha_indices)
            
            # Find the row index corresponding to the last alphabetical index
            last_alpha_index = int(table_data_frame.loc[table_data_frame[0] == chr(max_alpha_index)].index[0])
            alpha_row_data = table_data_frame.loc[last_alpha_index]
            alpha_row_data_clean = alpha_row_data[alpha_row_data.notnull()]
            final_alpha_index = ""

            # Concatenate non-None elements in a row-wise format
            for index, value in alpha_row_data_clean.items():
                final_alpha_index = final_alpha_index + " " + value

            return final_alpha_index

        except:
            # Get values before the index row
            values_before_index = table_data_frame[0][max(index_row_index - 6, 0):index_row_index]
            values_before_index_list = values_before_index.tolist()
            num_indices = []

            # Find the highest numerical index present in values_before_index_list
            for num in range(10):
                if str(num) in values_before_index_list:
                    num_indices.append(num)
                    max_num_index = max(num_indices)
                    for _ in range(4):
                        last_num_index = int(table_data_frame.loc[table_data_frame[0] == str(max_num_index)].index[_])

                        # Create a pandas Series with the provided data
                        data = values_before_index
                        series = pd.Series(data)

                        # Extract the indices as a NumPy array
                        indices = np.array(series.index)
                        if last_num_index in indices:
                            num_row_data = table_data_frame.loc[last_num_index]
                            break

            num_row_data_clear = num_row_data[num_row_data.notnull()]
            final_num_index = ""

            # Concatenate non-None elements in a row-wise format
            for index, value in num_row_data_clear.items():
                final_num_index = final_num_index + " " + value

            return final_num_index

    def extract_last_index_value_below(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
        
        """
        Extract the last index value below the specified index rows from the table DataFrame.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row.
            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            str: The extracted index value.
        """

        try:
          # Get values after the index row
          values_after_index = table_data_frame[0][index_row_index + 1: index_row_index + 7]
          values_after_index_list = values_after_index.tolist()
          found_alpha_ascii_values = []

          # Find ASCII values (characters) present after the index row
          for ascii_value in range(97, 123):
              if chr(ascii_value) in values_after_index_list:
                  found_alpha_ascii_values.append(ascii_value)

          # Find the highest ASCII value index (character) present in found_alpha_ascii_values
          max_alpha_index = max(found_alpha_ascii_values)
          last_alpha_index = int(table_data_frame.loc[table_data_frame[0] == chr(max_alpha_index)].index[0])
          alpha_row_data = table_data_frame.loc[last_alpha_index]
          alpha_row_data_clear = alpha_row_data[alpha_row_data.notnull()]
          final_alpha_index = ""

          # Concatenate non-None elements in a row-wise format
          for index, value in alpha_row_data_clear.items():
              final_alpha_index = final_alpha_index + " " + value

          return final_alpha_index

        except:
          # Get values after the index row
          values_after_index = table_data_frame[0][index_row_index + 1: index_row_index + 7]
          values_after_index_list = values_after_index.tolist()
          found_num_indices = []

          # Find numerical values present after the index row
          for num in range(10):
              if num in values_after_index_list:
                  found_num_indices.append(num)
                  max_found_num_index = max(found_num_indices)
          
          # Find the row index corresponding to the last numerical index
          last_num_index = int(table_data_frame.loc[table_data_frame[0] == str(max_found_num_index)].index[0])
          num_row_data = table_data_frame.loc[last_num_index]
          num_row_data_clear = num_row_data[num_row_data.notnull()]
          final_num_index = ""

          # Concatenate non-None elements in a row-wise format
          for index, value in num_row_data_clear.items():
              final_num_index = final_num_index + " " + value

          return final_num_index

    def main(self,pdf_file):

        try:
          pdf_path,output_image_path = self.pdf_to_image_convert(pdf_file)
          crop_image_path = self.crop_image(output_image_path)
          table_data_frame = self.extract_table_from_image(crop_image_path)
          plan_index_location,index_row_index = self.find_index_locations(table_data_frame)
          if plan_index_location < index_row_index:
            final_index = self.extract_last_index_value_below(plan_index_location,index_row_index,table_data_frame)
          elif plan_index_location > index_row_index:
            final_index = self.extract_last_index_value_above(plan_index_location,index_row_index,table_data_frame)
          Answer = "<h2>Last Value</h2><br><br><center><b>"+final_index+"</b></center><br><br>"

          return Answer
        except:
          return "Unable Get Value... Please Try Again"

    def gradio_interface(self) -> None:
      """
      Build the Gradio Blocks UI, wire it to this instance, and launch it.

      The layout holds an upload button with a file display, a "Submit"
      button, and an HTML area for the result. Uploading calls
      ``pdf_file_path``; clicking "Submit" calls ``main``.
      """

      # Page shell: external stylesheet "style.css" plus the Soft theme.
      with gr.Blocks(css="style.css",theme=gr.themes.Soft()) as demo:
        gr.HTML("""<center><h1>Table Extracter</h1></center>""")
        with gr.Column(elem_id="col-container"):
            # Row 1: file picker and a display of the uploaded file.
            with gr.Row(elem_id="row-flex"):
                with gr.Column(scale=0.85, min_width=160):
                  # NOTE(review): several non-PDF extensions are accepted here,
                  # but the handlers pass the file to a PDF converter - confirm
                  # whether anything other than ".pdf" is meant to work.
                  upload_button = gr.UploadButton(
                    "Browse File",file_types=[".txt", ".pdf", ".doc", ".docx",".json",".csv"])
                  file_output = gr.File(elem_classes="filenameshow")
            # Row 2: the button that starts the extraction.
            # NOTE(review): float `scale` values and `scale` on gr.Row depend
            # on the installed gradio version - TODO confirm.
            with gr.Row(scale=0.85,elem_id="row-flex"):
              with gr.Column(scale=0.85, min_width=0):
                    btn = gr.Button("Submit")
            # Row 3: where the HTML returned by main() is shown.
            with gr.Row(scale=0.85,elem_id="row-flex"):
              with gr.Column(scale=0.85, min_width=0):
                # NOTE(review): verify that gr.HTML accepts an `html` keyword
                # in the installed gradio version.
                answer = gr.HTML(html=True)

        # Upload event: run pdf_file_path on the uploaded file, show result in file_output.
        upload_button.upload(self.pdf_file_path, upload_button, [file_output])
        # Click event: run the full pipeline and render its HTML in `answer`.
        btn.click(self.main,upload_button,[answer])

      # Enable the request queue and start the app with debug output on.
      demo.queue().launch(debug=True)

if __name__ == "__main__":
    # Script entry point: build the extractor and launch its Gradio interface.
    TableExtracter().gradio_interface()