File size: 7,458 Bytes
c857e53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fe1db5
c857e53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import glob

import gradio as gr
import gspread
import numpy as np
import pandas as pd
import tqdm
from google.cloud import documentai_v1 as documentai
from google.oauth2 import service_account

# Google service-account credentials shared across the module: assigned by
# process_document_form_parser() and read by upload_to_google_sheets().
# Stays None until a credentials file has been loaded.
credentials = None


# Form fields collected from every document, in output-column order.
_CONNECT_CARD_FIELDS = (
    "Name",
    "Phone",
    "Email",
    "Cadet",
    "Greek or Going Greek",
    "Transfer Student",
    "Military Veteran",
    "International Student",
    "Res Hall",
    "Room #",
    "Off Campus",
    "Fr",
    "So",
    "Jr",
    "Sr",
    "Grad Student",
    "Male",
    "Female",
    "Non Binary",
)

# Checkbox glyphs as they appear in the parsed field text. Written as escapes
# so the literals cannot be corrupted by a re-encoding of this file.
_CHECKED_BOX = "\u2611"  # ballot box with check
_UNCHECKED_BOX = "\u2610"  # empty ballot box


def _clean_field_value(field_name, field_value):
    """Return the stripped text of a recognised field in its stored form."""
    if field_name == "Name" and "\n" in field_value:
        # Multi-line name value: drop the first line, join the rest with spaces.
        field_value = " ".join(field_value.split("\n")[1:])
    elif field_name == "Email":
        # Lowercase and remove spaces *before* the domain substitution so that
        # variants such as "UT.EDU" or "ut. edu" are corrected as well.
        # NOTE(review): presumably "ut.edu" is an OCR misread of "vt.edu".
        field_value = field_value.lower().replace(" ", "")
        field_value = field_value.replace("ut.edu", "vt.edu")
    elif field_name == "Phone":
        # Keep digits only.
        field_value = "".join(filter(str.isdigit, field_value))

    # A ticked checkbox is stored as "Yes"; unticked glyphs are left in place
    # because condense_year() counts them.
    if field_value == _CHECKED_BOX:
        field_value = "Yes"
    return field_value


def process_document_form_parser(credentials_file, images_folder, max_files=3):
    """Run the Form Parser over PDFs in a folder and tabulate the known fields.

    Loads service-account credentials from ``credentials_file`` (also storing
    them in the module-level ``credentials`` for the later upload step), sends
    each ``*.pdf`` in ``images_folder`` to the Document AI processor, and
    collects the recognised form fields into one row per file.

    Args:
        credentials_file: Path to a service-account JSON key file.
        images_folder: Folder whose ``*.pdf`` files are processed.
        max_files: Maximum number of PDFs to process, taken from the sorted
            path list. Defaults to 3, the cap this function has always
            applied; pass ``None`` to process every file.

    Returns:
        pandas.DataFrame with one row per processed file. The five class-year
        checkbox columns are condensed into a single ``Year`` column, and
        unchecked-box glyphs and missing values are replaced by "".
    """

    global credentials

    credentials = service_account.Credentials.from_service_account_file(
        credentials_file,
        scopes=[
            "https://www.googleapis.com/auth/cloud-platform",
            "https://www.googleapis.com/auth/spreadsheets",
            "https://www.googleapis.com/auth/drive",
        ],
    )

    project_id = "cru-ocr"
    location = "us"
    processor_id = "26630ebbc76345a1"

    # glob order is OS-dependent; sort so rows (and the capped subset) are
    # reproducible between runs.
    image_paths = sorted(glob.glob(f"{images_folder}/*.pdf"))
    if max_files is not None:
        image_paths = image_paths[:max_files]

    # One object array per field, pre-filled with None, indexed by file.
    all_fields = {
        field: np.empty(len(image_paths), dtype=object)
        for field in _CONNECT_CARD_FIELDS
    }

    client = documentai.DocumentProcessorServiceClient(credentials=credentials)
    name = client.processor_path(project_id, location, processor_id)

    for file_idx, file_path in enumerate(tqdm.tqdm(image_paths)):
        with open(file_path, "rb") as image_file:
            image_content = image_file.read()

        raw_document = documentai.RawDocument(
            content=image_content, mime_type="application/pdf"
        )
        request = documentai.ProcessRequest(name=name, raw_document=raw_document)
        document = client.process_document(request=request).document

        for page in document.pages:
            for form_field in page.form_fields:
                field_name = (
                    form_field.field_name.text_anchor.content
                    if form_field.field_name
                    else "Unnamed Field"
                )
                field_value = (
                    form_field.field_value.text_anchor.content
                    if form_field.field_value
                    else "No Value"
                )

                field_name = field_name.strip().replace(":", "")
                field_value = field_value.strip().replace(":", "")

                if field_name not in all_fields:
                    print(
                        f"Unused field name: {field_name}, field value: {field_value}"
                    )
                    continue

                # If a field name repeats within one file, the last value wins.
                all_fields[field_name][file_idx] = _clean_field_value(
                    field_name, field_value
                )

    df = pd.DataFrame(all_fields, columns=list(all_fields))
    # Built row by row rather than with df.apply(axis=1): on a zero-row frame
    # apply returns a DataFrame, which cannot be assigned to a single column.
    df["Year"] = [condense_year(row) for _, row in df.iterrows()]
    df = df.drop(columns=["Fr", "So", "Jr", "Sr", "Grad Student"])
    df = df.replace({_UNCHECKED_BOX: "", None: ""})

    return df


def condense_year(row):
    """
    Collapse the five class-year checkbox values of one row into a single label.

    Returns the first year whose value is "Yes". Otherwise, when exactly four
    values are the unchecked glyph "☐", returns the remaining year provided its
    value is empty or does not contain "☐". Returns "" in every other case.
    """
    labels = ("Fr", "So", "Jr", "Sr", "Grad Student")
    marks = [row[label] for label in labels]

    # An explicitly ticked box wins; the first one in column order is used.
    for label, mark in zip(labels, marks):
        if mark == "Yes":
            return label

    # With exactly four boxes read as unchecked, the odd one out is taken to be
    # a tick that was not recognised.
    unchecked = sum(1 for mark in marks if mark == "☐")
    if unchecked != 4:
        return ""

    for label, mark in zip(labels, marks):
        if not mark or "☐" not in mark:
            return label
    return ""


def upload_to_google_sheets(df):
    """Replace the contents of the target Google Sheet with ``df``.

    Authorises gspread with the module-level ``credentials``, opens (or
    creates) the "Cru Connect Cards" spreadsheet and its "Sheet1" worksheet,
    clears the worksheet, and writes the column headers followed by the rows.

    Args:
        df: pandas.DataFrame to upload.

    Returns:
        A human-readable status string naming the spreadsheet and worksheet.

    Raises:
        RuntimeError: If no credentials have been loaded yet, i.e. the
            module-level ``credentials`` is still None.
    """
    # Fail with an actionable message instead of handing None to gspread.
    if credentials is None:
        raise RuntimeError(
            "No Google credentials loaded; process the documents before uploading."
        )

    spreadsheet_name = "Cru Connect Cards"
    worksheet_name = "Sheet1"

    # Authenticate with Google Sheets
    gc = gspread.authorize(credentials)

    # Open the spreadsheet, creating it on first use
    try:
        spreadsheet = gc.open(spreadsheet_name)
    except gspread.SpreadsheetNotFound:
        spreadsheet = gc.create(spreadsheet_name)

    # Select the worksheet, creating it on first use
    try:
        worksheet = spreadsheet.worksheet(worksheet_name)
    except gspread.WorksheetNotFound:
        worksheet = spreadsheet.add_worksheet(title=worksheet_name, rows=100, cols=20)

    # Blank out missing cells (NaN/None): NaN is not representable in strict
    # JSON, so leaving it in would likely make the API request fail.
    cells = df.astype(object).where(df.notna(), "")

    # Clear the worksheet before uploading new data
    worksheet.clear()

    # Header row first, then one list per DataFrame row
    worksheet.update([cells.columns.values.tolist()] + cells.values.tolist())

    return f"Data uploaded successfully to {spreadsheet_name} - {worksheet_name}."


# Gradio Interface
def gradio_interface(credentials_file, images_folder):
    """Gradio callback for the "Process Documents" button.

    Args:
        credentials_file: Value of the credentials file component: an object
            exposing the file path as ``.name``, or a plain path string.
            None when nothing has been uploaded.
        images_folder: Folder path typed into the images textbox.

    Returns:
        The DataFrame produced by process_document_form_parser().

    Raises:
        ValueError: If no credentials file has been provided.
    """
    if credentials_file is None:
        raise ValueError(
            "Upload a service-account credentials file before processing."
        )

    # Accept both file-like objects carrying ``.name`` and plain path strings.
    credentials_path = getattr(credentials_file, "name", credentials_file)
    return process_document_form_parser(credentials_path, images_folder)


def upload_handler(df_data):
    """Gradio callback: wrap the grid data in a DataFrame, upload it, and return the status text."""
    return upload_to_google_sheets(pd.DataFrame(df_data))


# Build the UI: components are declared in the order written here, followed
# by the two button callbacks.
with gr.Blocks() as iface:
    gr.Markdown("# Document Processing and Upload to Google Sheets")

    # Step 1 inputs: service-account key file and the folder of PDFs.
    credentials_file = gr.File(
        label="Credentials File"  # default disabled: value="cru-ocr-ee936d111292.json"
    )
    images_folder_path = gr.Textbox(label="Path to Images Folder", value="images_v2")
    process_button = gr.Button("Process Documents")

    # Step 2: parsed results shown in an editable grid, then uploaded.
    output_dataframe = gr.Dataframe(label="Output", interactive=True)
    upload_button = gr.Button("Upload to Google Sheets")
    upload_status = gr.Textbox(label="Upload Status")

    # "Process Documents" fills the grid with the parsed DataFrame.
    process_button.click(
        fn=gradio_interface,
        inputs=[credentials_file, images_folder_path],
        outputs=[output_dataframe],
    )

    # "Upload to Google Sheets" sends the grid's current contents and shows
    # the returned status message.
    upload_button.click(
        fn=upload_handler, inputs=[output_dataframe], outputs=[upload_status]
    )

# Runs at module top level, so the app starts whenever this file is executed
# or imported.
iface.launch()