File size: 12,719 Bytes
16b57c4
6efad09
688e6c3
61b90fc
40a8116
 
 
6f3b975
 
ff93e7a
40a8116
6f3b975
6efad09
 
6f3b975
 
 
 
02862d4
6f3b975
 
 
 
 
 
 
02862d4
6f3b975
02862d4
 
40a8116
02862d4
 
6f3b975
 
 
 
 
 
 
 
 
 
 
 
 
 
6efad09
40a8116
6f3b975
 
 
 
 
40a8116
 
 
6f3b975
 
 
 
 
 
 
 
 
 
40a8116
02862d4
 
6efad09
02862d4
 
6f3b975
02862d4
 
 
6f3b975
02862d4
 
6efad09
02862d4
6efad09
02862d4
 
6efad09
02862d4
 
40a8116
02862d4
 
 
6f3b975
02862d4
6efad09
6f3b975
 
 
 
 
 
 
 
 
 
 
 
40a8116
6f3b975
40a8116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f3b975
 
 
 
 
 
02862d4
 
6f3b975
02862d4
 
6f3b975
02862d4
 
 
 
 
 
6f3b975
02862d4
 
 
40a8116
02862d4
40a8116
6f3b975
40a8116
6f3b975
 
 
 
 
 
 
 
 
 
 
40a8116
 
6f3b975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6efad09
6f3b975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6efad09
6f3b975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6efad09
6f3b975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6efad09
 
6f3b975
02862d4
 
 
 
 
 
 
 
 
 
 
 
 
 
40a8116
 
 
 
 
 
 
 
6efad09
 
40a8116
 
 
 
 
 
 
 
6efad09
 
40a8116
61b90fc
40a8116
 
61b90fc
02862d4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
import os
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import numpy as np
import pandas as pd
import gradio as gr
from typing import Tuple


class TableExtracter:

    def __init__(self):

        self.image_output_dir = 'output_images'

    def pdf_file_path(self, pdf_file: str) -> str:

        """
        Converts a PDF file to an image and returns the path of the input PDF.

        Args:
            pdf_file (str): The path to the PDF file to be converted.

        Returns:
            str: The path of the input PDF file.
        """

        # Call the method to convert PDF to image and get paths
        input_pdf_path,output_image_path = self.pdf_to_image_convert(pdf_file)

        # Return the path of the input PDF
        return input_pdf_path

    def pdf_to_image_convert(self, pdf_file: str) -> Tuple[str, str]:
        """
        Render the first page of a PDF to a PNG and return both paths.

        Args:
            pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
                object that exposes the path through a ``name`` attribute, such
                as a temporary-file wrapper.

        Returns:
            Tuple[str, str]: ``(pdf_path, image_path)`` where ``image_path`` is
            ``page_1.png`` inside ``self.image_output_dir``.

        Raises:
            ValueError: If no page could be rendered from the PDF.
        """

        # Accept a plain path as well as a file-like object carrying ``.name``.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name

        # ``exist_ok`` avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.image_output_dir, exist_ok=True)

        # Only the first page is ever saved and returned, so the remaining
        # pages are not rasterised at all.
        pages = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not pages:
            raise ValueError(f"No pages could be rendered from {pdf_path!r}")

        output_image_path = os.path.join(self.image_output_dir, "page_1.png")
        pages[0].save(output_image_path, "PNG")

        return pdf_path, output_image_path

    def crop_image(self, output_image_path: str, crop_width: int = 1900) -> str:
        """
        Keep the right-hand strip of an image and save it as a new file.

        Args:
            output_image_path (str): The path to the image to be cropped.
            crop_width (int): Width in pixels of the strip kept from the right
                edge. Defaults to 1900, the value that used to be hard-coded.

        Returns:
            str: The path of the cropped image (``cropped_image.png``, relative
            to the current working directory).
        """

        crop_image_path = "cropped_image.png"

        # The context manager closes the underlying file handle once done.
        with Image.open(output_image_path) as image:
            width, height = image.size

            # Clamp at 0: for an image narrower than ``crop_width`` the left
            # edge would otherwise be negative and the box would extend past
            # the image.
            left = max(width - crop_width, 0)

            # Full height is kept; only the horizontal extent is reduced.
            cropped_image = image.crop((left, 0, width, height))
            cropped_image.save(crop_image_path)

        return crop_image_path

    def extract_table_from_image(self, crop_image_path: str) -> pd.DataFrame:
        """
        OCR an image and arrange the recognised words as a table.

        Every non-blank OCR line becomes one row, split on whitespace. Rows
        shorter than the widest one are padded with ``None`` so the frame is
        rectangular.

        Args:
            crop_image_path (str): The path to the cropped image containing the table.

        Returns:
            pd.DataFrame: One row per text line with integer column labels; an
            empty DataFrame when OCR produced no text.
        """

        # Close the image file as soon as OCR has consumed it.
        with Image.open(crop_image_path) as image:
            ocr_text = pytesseract.image_to_string(image)

        # ``str.split()`` without arguments already ignores surrounding
        # whitespace, so blank lines simply yield empty lists and are dropped.
        candidate_rows = [line.split() for line in ocr_text.split('\n')]
        rows = [row for row in candidate_rows if row]

        # ``default=0`` keeps an image without recognisable text from raising.
        num_columns = max((len(row) for row in rows), default=0)

        # Pad short rows so every row has the same number of cells.
        for row in rows:
            row.extend([None] * (num_columns - len(row)))

        return pd.DataFrame(rows)

    def find_index_locations(self, table_data_frame: pd.DataFrame) -> tuple:

        """
        Find the locations of the plan index and index row in the given table DataFrame.

        Args:
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            tuple: A tuple containing the plan index location and index row index.
        """
        
        # Find the location of the 'Planindexliste' row
        plan_index_location = int(table_data_frame.loc[table_data_frame[0] == 'Planindexliste'].index[0])

        # Find the index of the row containing 'Index' or 'INDEX'
        index_row = table_data_frame[(table_data_frame.iloc[:, 0] == 'Index') | (table_data_frame.iloc[:, 0] == 'INDEX')]
        index_row_index  = index_row.index[0]

        return plan_index_location,index_row_index

    def extract_last_index_value_above(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:

        """
        Extract the last index value above the specified index rows from the table DataFrame.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row.
            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            str: The extracted index value.
        """

        try:
            # Get values before the index row
            values_before_index = table_data_frame[0][max(index_row_index - 6, 0):index_row_index]
            values_before_index_list = values_before_index.tolist()
            alpha_indices = []

            # Find the highest ASCII value index (character) present in values_before_index_list
            for ascii_value in range(97, 108):
                if chr(ascii_value) in values_before_index_list:
                    alpha_indices.append(ascii_value)
                    max_alpha_index = max(alpha_indices)
            
            # Find the row index corresponding to the last alphabetical index
            last_alpha_index = int(table_data_frame.loc[table_data_frame[0] == chr(max_alpha_index)].index[0])
            alpha_row_data = table_data_frame.loc[last_alpha_index]
            alpha_row_data_clean = alpha_row_data[alpha_row_data.notnull()]
            final_alpha_index = ""

            # Concatenate non-None elements in a row-wise format
            for index, value in alpha_row_data_clean.items():
                final_alpha_index = final_alpha_index + " " + value

            return final_alpha_index

        except:
            # Get values before the index row
            values_before_index = table_data_frame[0][max(index_row_index - 6, 0):index_row_index]
            values_before_index_list = values_before_index.tolist()
            num_indices = []

            # Find the highest numerical index present in values_before_index_list
            for num in range(10):
                if str(num) in values_before_index_list:
                    num_indices.append(num)
                    max_num_index = max(num_indices)
                    for _ in range(4):
                        last_num_index = int(table_data_frame.loc[table_data_frame[0] == str(max_num_index)].index[_])

                        # Create a pandas Series with the provided data
                        data = values_before_index
                        series = pd.Series(data)

                        # Extract the indices as a NumPy array
                        indices = np.array(series.index)
                        if last_num_index in indices:
                            num_row_data = table_data_frame.loc[last_num_index]
                            break

            num_row_data_clear = num_row_data[num_row_data.notnull()]
            final_num_index = ""

            # Concatenate non-None elements in a row-wise format
            for index, value in num_row_data_clear.items():
                final_num_index = final_num_index + " " + value

            return final_num_index

    def extract_last_index_value_below(self, plan_index_location: int, index_row_index: int, table_data_frame: pd.DataFrame) -> str:
        
        """
        Extract the last index value below the specified index rows from the table DataFrame.

        Args:
            plan_index_location (int): The location of the 'Planindexliste' row.
            index_row_index (int): The index of the row containing 'Index' or 'INDEX'.
            table_data_frame (pd.DataFrame): The DataFrame containing the table data.

        Returns:
            str: The extracted index value.
        """

        try:
          # Get values after the index row
          values_after_index = table_data_frame[0][index_row_index + 1: index_row_index + 7]
          values_after_index_list = values_after_index.tolist()
          found_alpha_ascii_values = []

          # Find ASCII values (characters) present after the index row
          for ascii_value in range(97, 123):
              if chr(ascii_value) in values_after_index_list:
                  found_alpha_ascii_values.append(ascii_value)

          # Find the highest ASCII value index (character) present in found_alpha_ascii_values
          max_alpha_index = max(found_alpha_ascii_values)
          last_alpha_index = int(table_data_frame.loc[table_data_frame[0] == chr(max_alpha_index)].index[0])
          alpha_row_data = table_data_frame.loc[last_alpha_index]
          alpha_row_data_clear = alpha_row_data[alpha_row_data.notnull()]
          final_alpha_index = ""

          # Concatenate non-None elements in a row-wise format
          for index, value in alpha_row_data_clear.items():
              final_alpha_index = final_alpha_index + " " + value

          return final_alpha_index

        except:
          # Get values after the index row
          values_after_index = table_data_frame[0][index_row_index + 1: index_row_index + 7]
          values_after_index_list = values_after_index.tolist()
          found_num_indices = []

          # Find numerical values present after the index row
          for num in range(10):
              if num in values_after_index_list:
                  found_num_indices.append(num)
                  max_found_num_index = max(found_num_indices)
          
          # Find the row index corresponding to the last numerical index
          last_num_index = int(table_data_frame.loc[table_data_frame[0] == str(max_found_num_index)].index[0])
          num_row_data = table_data_frame.loc[last_num_index]
          num_row_data_clear = num_row_data[num_row_data.notnull()]
          final_num_index = ""

          # Concatenate non-None elements in a row-wise format
          for index, value in num_row_data_clear.items():
              final_num_index = final_num_index + " " + value

          return final_num_index

    def main(self,pdf_file):

        try:
          pdf_path,output_image_path = self.pdf_to_image_convert(pdf_file)
          crop_image_path = self.crop_image(output_image_path)
          table_data_frame = self.extract_table_from_image(crop_image_path)
          plan_index_location,index_row_index = self.find_index_locations(table_data_frame)
          if plan_index_location < index_row_index:
            final_index = self.extract_last_index_value_below(plan_index_location,index_row_index,table_data_frame)
          elif plan_index_location > index_row_index:
            final_index = self.extract_last_index_value_above(plan_index_location,index_row_index,table_data_frame)
          Answer = "<h2>Last Value</h2><br><br><center><b>"+final_index+"</b></center><br><br>"

          return Answer
        except:
          return "Unable Get Value... Please Try Again"

    def gradio_interface(self) -> None:
      """
      Build the Gradio Blocks UI and launch it.

      The layout holds an upload button with a file display, a "Submit"
      button and an HTML output area. Uploading a file calls
      ``self.pdf_file_path``; clicking "Submit" calls ``self.main``.
      The call ends by launching the queued app with ``debug=True``.
      """

      # NOTE(review): float ``scale`` values and ``gr.HTML(html=True)`` are passed
      # exactly as written - verify them against the installed gradio version.
      with gr.Blocks(css="style.css",theme=gr.themes.Soft()) as demo:
        gr.HTML("""<center><h1>Table Extracter</h1></center>""")
        with gr.Column(elem_id="col-container"):
            with gr.Row(elem_id="row-flex"):
                with gr.Column(scale=0.85, min_width=160):
                  # NOTE(review): several non-PDF extensions are offered here,
                  # although both handlers feed the file to the PDF converter.
                  upload_button = gr.UploadButton(
                    "Browse File",file_types=[".txt", ".pdf", ".doc", ".docx",".json",".csv"])
                  file_output = gr.File(elem_classes="filenameshow")
            with gr.Row(scale=0.85,elem_id="row-flex"):
              with gr.Column(scale=0.85, min_width=0):
                    btn = gr.Button("Submit")
            with gr.Row(scale=0.85,elem_id="row-flex"):
              with gr.Column(scale=0.85, min_width=0):
                answer = gr.HTML(html=True)

        # On upload: pass the uploaded file to pdf_file_path and show the result
        # in the file component.
        upload_button.upload(self.pdf_file_path, upload_button, [file_output])
        # On submit: run the extraction on the uploaded file and show the
        # returned HTML in the answer component.
        btn.click(self.main,upload_button,[answer])

      # Enable the request queue, then start the server with debug output.
      demo.queue().launch(debug=True)

if __name__ == "__main__":
    # Script entry point: build the extractor and start its Gradio UI.
    TableExtracter().gradio_interface()