import glob import gradio as gr import gspread import numpy as np import pandas as pd import tqdm from google.cloud import documentai_v1 as documentai from google.oauth2 import service_account credentials = None def process_document_form_parser(credentials_file, images_folder): """Processes a document using the Form Parser.""" global credentials credentials = service_account.Credentials.from_service_account_file( credentials_file, scopes=[ "https://www.googleapis.com/auth/cloud-platform", "https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive", ], ) project_id = "cru-ocr" location = "us" processor_id = "26630ebbc76345a1" image_paths = glob.glob(f"{images_folder}/*.pdf")[:3] all_fields = { "Name": np.empty(len(image_paths), dtype=object), "Phone": np.empty(len(image_paths), dtype=object), "Email": np.empty(len(image_paths), dtype=object), "Cadet": np.empty(len(image_paths), dtype=object), "Greek or Going Greek": np.empty(len(image_paths), dtype=object), "Transfer Student": np.empty(len(image_paths), dtype=object), "Military Veteran": np.empty(len(image_paths), dtype=object), "International Student": np.empty(len(image_paths), dtype=object), "Res Hall": np.empty(len(image_paths), dtype=object), "Room #": np.empty(len(image_paths), dtype=object), "Off Campus": np.empty(len(image_paths), dtype=object), "Fr": np.empty(len(image_paths), dtype=object), "So": np.empty(len(image_paths), dtype=object), "Jr": np.empty(len(image_paths), dtype=object), "Sr": np.empty(len(image_paths), dtype=object), "Grad Student": np.empty(len(image_paths), dtype=object), "Male": np.empty(len(image_paths), dtype=object), "Female": np.empty(len(image_paths), dtype=object), "Non Binary": np.empty(len(image_paths), dtype=object), } client = documentai.DocumentProcessorServiceClient(credentials=credentials) name = client.processor_path(project_id, location, processor_id) for file_idx, file_path in enumerate(tqdm.tqdm(image_paths)): with open(file_path, "rb") as image_file: image_content = image_file.read() raw_document = documentai.RawDocument( content=image_content, mime_type="application/pdf" ) request = documentai.ProcessRequest(name=name, raw_document=raw_document) result = client.process_document(request=request) document = result.document for page in document.pages: for form_field in page.form_fields: field_name = ( form_field.field_name.text_anchor.content if form_field.field_name else "Unnamed Field" ) field_value = ( form_field.field_value.text_anchor.content if form_field.field_value else "No Value" ) field_name = field_name.strip().replace(":", "") field_value = field_value.strip().replace(":", "") if field_name == "Name" and "\n" in field_value: field_value = " ".join(field_value.split("\n")[1:]) # Check if the field is a checkbox and set its value accordingly if field_name in all_fields.keys(): if field_name == "Email": # Replace "ut.edu" with "vt.edu" in email addresses field_value = field_value.replace("ut.edu", "vt.edu") # Make email addresses lowercase field_value = field_value.lower() # Remove spaces from email addresses field_value = field_value.replace(" ", "") if field_name == "Phone": # Remove non-numeric characters from phone numbers field_value = "".join(filter(str.isdigit, field_value)) # Parse checkboxes if field_value == "☑": field_value = "Yes" # elif field_value == "☐": # field_value = "" all_fields[field_name][file_idx] = field_value else: print( f"Unused field name: {field_name}, field value: {field_value}" ) df = pd.DataFrame(all_fields, columns=all_fields.keys()) df["Year"] = df.apply(condense_year, axis=1) df = df.drop(columns=["Fr", "So", "Jr", "Sr", "Grad Student"]) df = df.replace({"☐": "", None: ""}) return df def condense_year(row): """ Handles logic to condense year values in the DataFrame row. """ years = ["Fr", "So", "Jr", "Sr", "Grad Student"] year_vals = [row[year] for year in years] # Box was checked if "Yes" in year_vals: return years[year_vals.index("Yes")] # Four boxes were unchecked (meaning one box was likely checked and not recognized) elif year_vals.count("☐") == 4: for val in year_vals: if not val or "☐" not in val: return years[year_vals.index(val)] return "" def upload_to_google_sheets(df): """Uploads the edited DataFrame to a Google Sheet.""" global credentials spreadsheet_name = "Cru Connect Cards" worksheet_name = "Sheet1" # Authenticate with Google Sheets gc = gspread.authorize(credentials) # Open the Google Sheet try: spreadsheet = gc.open(spreadsheet_name) except gspread.SpreadsheetNotFound: spreadsheet = gc.create(spreadsheet_name) # Select the worksheet try: worksheet = spreadsheet.worksheet(worksheet_name) except gspread.WorksheetNotFound: worksheet = spreadsheet.add_worksheet( title=worksheet_name, rows="100", cols="20" ) # Clear the worksheet before uploading new data worksheet.clear() # Upload the DataFrame to the worksheet worksheet.update([df.columns.values.tolist()] + df.values.tolist()) return f"Data uploaded successfully to {spreadsheet_name} - {worksheet_name}." # Gradio Interface def gradio_interface(credentials_file, images_folder): return process_document_form_parser(credentials_file.name, images_folder) def upload_handler(df_data): df = pd.DataFrame(df_data) result = upload_to_google_sheets(df) return result with gr.Blocks() as iface: gr.Markdown("# Document Processing and Upload to Google Sheets") credentials_file = gr.File( label="Credentials File"#, value="cru-ocr-ee936d111292.json" ) images_folder_path = gr.Textbox(label="Path to Images Folder", value="images_v2") process_button = gr.Button("Process Documents") output_dataframe = gr.Dataframe(label="Output", interactive=True) upload_button = gr.Button("Upload to Google Sheets") upload_status = gr.Textbox(label="Upload Status") process_button.click( fn=gradio_interface, inputs=[credentials_file, images_folder_path], outputs=[output_dataframe], ) upload_button.click( fn=upload_handler, inputs=[output_dataframe], outputs=[upload_status] ) iface.launch()