"""Gradio front end for the DAR harmonisation tool.

After a simple username/password login, the user uploads a PDF. The text is
extracted, structured audit paras are parsed from it, harmonised titles are
generated for the para headings, and the result is shown as an HTML table and
offered as an Excel download. A per-day request counter is kept in Firestore.
"""
import hmac
import json
import os
from datetime import datetime, timezone
from html import escape
from io import BytesIO

import firebase_admin
import gradio as gr
import pandas as pd
from firebase_admin import credentials, firestore

from dar_processor import preprocess_pdf_text
from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
from models import ParsedDARReport, HarmonisedPara

# Maximum number of processing requests accepted per UTC day.
DAILY_REQUEST_LIMIT = 400

# Firebase setup
FIREBASE_CREDENTIALS = os.environ.get("FIREBASE_CREDENTIALS")
if FIREBASE_CREDENTIALS:
    # Load credentials from environment variable (preferred for security)
    cred = credentials.Certificate(json.loads(FIREBASE_CREDENTIALS))
else:
    # Fallback to reading from firebase.json file
    if not os.path.exists("firebase.json"):
        raise ValueError("firebase.json not found and FIREBASE_CREDENTIALS not set.")
    cred = credentials.Certificate("firebase.json")

firebase_admin.initialize_app(cred)
db = firestore.client()
request_counts = db.collection('request_counts')

# NOTE(review): minimal table styling; adjust to match the intended look.
_REPORT_STYLE = """
<style>
    .dar-report { width: 100%; border-collapse: collapse; }
    .dar-report th, .dar-report td {
        border: 1px solid #ddd; padding: 8px; text-align: left;
    }
    .dar-report th { background-color: #f2f2f2; }
</style>
"""

_REPORT_HEADER = (
    '<table class="dar-report"><tr>'
    "<th>Para No.</th>"
    "<th>Original Audit Para Heading</th>"
    "<th>Harmonised Audit Para Heading</th>"
    "<th>Amount Involved (in Lakhs)</th>"
    "</tr>"
)


def _today_key() -> str:
    """Return today's UTC date as ``YYYY-MM-DD``, used as the counter document id."""
    return datetime.now(timezone.utc).strftime('%Y-%m-%d')


def _request_count_label() -> str:
    """Return the text shown in the "Requests Made Today" box."""
    return f"Requests today: {get_request_count()}/{DAILY_REQUEST_LIMIT}"


def get_request_count():
    """Retrieve the current request count for today (0 if no counter exists yet)."""
    doc = request_counts.document(_today_key()).get()
    return doc.to_dict().get('count', 0) if doc.exists else 0


def check_request_limit():
    """Check the daily limit and, if it is not reached, count one more request.

    Returns:
        ``(True, None)`` when the request may proceed (the counter has been
        incremented), otherwise ``(False, message)``.
    """
    doc_ref = request_counts.document(_today_key())
    doc = doc_ref.get()
    if not doc.exists:
        # Initialize counter for the new day
        doc_ref.set({'count': 0})
        count = 0
    else:
        count = doc.to_dict().get('count', 0)

    if count >= DAILY_REQUEST_LIMIT:
        return False, f"Daily request limit of {DAILY_REQUEST_LIMIT} reached. Try again tomorrow."

    # NOTE(review): the read above and this increment are separate operations,
    # so concurrent requests near the limit can slightly overshoot it.
    doc_ref.update({'count': firestore.Increment(1)})
    return True, None


def _html_cell(value) -> str:
    """Return *value* as HTML-escaped text, using 'N/A' when it is None."""
    return escape(str('N/A' if value is None else value))


def _format_lakhs(value) -> str:
    """Format an amount in lakhs as ``₹1,234.50 L``; missing or bad values count as 0."""
    try:
        amount = float(value) if value is not None else 0.0
    except (TypeError, ValueError):
        amount = 0.0
    return f"₹{amount:,.2f} L"


def create_html_report(results_with_harmonised: list[dict]) -> str:
    """Generates an HTML string to display the results in a styled table.

    Args:
        results_with_harmonised: One dict per audit para. The keys read are
            ``audit_para_number``, ``audit_para_heading``,
            ``harmonised_audit_para_heading`` and ``revenue_involved_lakhs_rs``.

    Returns:
        An HTML fragment: a placeholder paragraph when the list is empty,
        otherwise a ``<style>`` block followed by a table with one row per para.
    """
    if not results_with_harmonised:
        return "<p>No audit paras found or processed.</p>"

    rows = []
    for item in results_with_harmonised:
        # Cell text originates from the uploaded document, so it is escaped
        # before being placed into markup.
        para_num = _html_cell(item.get('audit_para_number', 'N/A'))
        original_heading = _html_cell(item.get('audit_para_heading', 'N/A'))
        harmonised_heading = _html_cell(item.get('harmonised_audit_para_heading', 'N/A'))
        amount = escape(_format_lakhs(item.get('revenue_involved_lakhs_rs', 0.0)))
        rows.append(
            f"<tr><td>{para_num}</td><td>{original_heading}</td>"
            f"<td>{harmonised_heading}</td><td>{amount}</td></tr>"
        )

    return f"{_REPORT_STYLE}{_REPORT_HEADER}{''.join(rows)}</table>"


def process_dar_pdf(pdf_file):
    """The main processing function, called after successful login.

    Args:
        pdf_file: The value of the upload component; either an object with a
            ``name`` attribute holding the path, or the path itself.

    Returns:
        A 4-tuple ``(status message, HTML report or None, Excel file component
        or None, request-count label)`` matching the outputs wired to the
        "Process Report" button.
    """

    def _fail(message):
        return message, None, None, _request_count_label()

    # Validate inputs first so that an unusable request does not use up quota.
    if not pdf_file:
        return _fail("Please upload a PDF file.")
    gemini_api_key = os.environ.get("GEMINI_API_KEY")
    if not gemini_api_key:
        return _fail("Error: GEMINI_API_KEY secret not found in Space settings.")

    # Check request limit before processing
    can_process, error_msg = check_request_limit()
    if not can_process:
        return _fail(error_msg)

    # Step 1: Process PDF to text
    pdf_path = getattr(pdf_file, "name", pdf_file)
    full_text = preprocess_pdf_text(pdf_path)
    if full_text.startswith("Error"):
        return _fail(f"Failed to process PDF: {full_text}")

    # Step 2: Extract structured data
    parsed_report = get_structured_data_with_gemini(gemini_api_key, full_text)
    if parsed_report.parsing_errors or not parsed_report.audit_paras:
        return _fail(parsed_report.parsing_errors or "Could not find any audit paras.")

    # Step 3: Get harmonised titles
    original_headings = [
        p.audit_para_heading for p in parsed_report.audit_paras if p.audit_para_heading
    ]
    if not original_headings:
        return _fail("Found paras but no headings to harmonise.")

    harmonised_results = get_harmonised_titles(gemini_api_key, full_text, original_headings)
    if not harmonised_results:
        return _fail("Failed to generate harmonised titles.")

    # Step 4: Combine and prepare outputs
    harmonised_map = {
        item.original_heading: item.harmonised_heading for item in harmonised_results
    }
    header_info = parsed_report.header.dict() if parsed_report.header else {}
    final_data_list = []
    for para in parsed_report.audit_paras:
        combined_info = header_info | para.dict()
        combined_info['harmonised_audit_para_heading'] = harmonised_map.get(
            para.audit_para_heading, "N/A"
        )
        final_data_list.append(combined_info)

    report_html = create_html_report(final_data_list)

    # Step 5: Create Excel file for download
    excel_columns = [
        'gstin', 'trade_name', 'category', 'audit_group_number', 'audit_para_number',
        'audit_para_heading', 'harmonised_audit_para_heading', 'revenue_involved_lakhs_rs',
        'revenue_recovered_lakhs_rs', 'status_of_para', 'total_amount_detected_overall_rs',
        'total_amount_recovered_overall_rs',
    ]
    df = pd.DataFrame(final_data_list).reindex(columns=excel_columns).fillna('N/A')

    output_excel = BytesIO()
    df.to_excel(output_excel, index=False, sheet_name='DAR_Extraction')

    excel_file_name = "dar_extraction_report.xlsx"
    with open(excel_file_name, "wb") as f:
        f.write(output_excel.getvalue())

    return (
        "Processing complete.",
        report_html,
        gr.File(value=excel_file_name),
        _request_count_label(),
    )


def _credential_matches(supplied, expected) -> bool:
    """Compare a submitted credential with the configured one in constant time.

    An unset or empty configured value never matches.
    """
    if not expected:
        return False
    return hmac.compare_digest((supplied or "").encode("utf-8"), expected.encode("utf-8"))


# --- Gradio Interface Definition ---
with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:

    # --- Login UI (visible initially) ---
    with gr.Column(visible=True) as login_ui:
        gr.Markdown("# Mumbai CGST Audit Officer Login")
        gr.Markdown("Please enter the credentials to access the tool.")
        with gr.Row():
            username_input = gr.Textbox(label="Username", placeholder="Enter your username")
            password_input = gr.Textbox(
                label="Password", type="password", placeholder="Enter your password"
            )
        login_button = gr.Button("Login", variant="primary")
        login_error_msg = gr.Markdown(visible=False)

    # --- Main App UI (hidden initially) ---
    with gr.Column(visible=False) as main_app_ui:
        gr.Markdown("# DAR Draft Audit Report Harmonisation Tool")
        gr.Markdown("## Initiative by Mumbai Audit 1 Commissionerate")
        gr.Markdown(
            "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles for Audit paras in accordance with GST law."
        )
        request_count_output = gr.Textbox(
            label="Requests Made Today",
            interactive=False,
            value=f"Requests today: 0/{DAILY_REQUEST_LIMIT}",
        )
        with gr.Row():
            with gr.Column(scale=1):
                pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
                submit_btn = gr.Button("Process Report", variant="primary")
            with gr.Column(scale=2):
                status_output = gr.Textbox(label="Processing Status", interactive=False)
                excel_output = gr.File(label="Download Excel Report")

        gr.Markdown("## Harmonised Audit Para Titles")
        html_output = gr.HTML()

        submit_btn.click(
            fn=process_dar_pdf,
            inputs=[pdf_input],
            outputs=[status_output, html_output, excel_output, request_count_output],
        )

    # --- Login Functionality ---
    def login(username, password):
        """Check the submitted credentials against APP_USERNAME / APP_PASSWORD.

        Returns:
            A dict of component updates: on success the login panel is hidden,
            the main panel is shown and the request count is refreshed; on
            failure the login panel stays visible with an error message.
        """
        # Evaluate both comparisons unconditionally (no short-circuit).
        user_ok = _credential_matches(username, os.environ.get("APP_USERNAME"))
        password_ok = _credential_matches(password, os.environ.get("APP_PASSWORD"))

        if user_ok and password_ok:
            # Login successful: hide login UI, show main app, display request count
            return {
                login_ui: gr.update(visible=False),
                main_app_ui: gr.update(visible=True),
                login_error_msg: gr.update(visible=False),
                request_count_output: gr.update(value=_request_count_label()),
            }

        # Login failed: keep login UI visible, show error message
        return {
            login_ui: gr.update(visible=True),
            main_app_ui: gr.update(visible=False),
            login_error_msg: gr.update(
                value="<p style='color:red;'>Invalid username or password.</p>",
                visible=True,
            ),
            request_count_output: gr.update(value=f"Requests today: 0/{DAILY_REQUEST_LIMIT}"),
        }

    login_button.click(
        login,
        inputs=[username_input, password_input],
        outputs=[login_ui, main_app_ui, login_error_msg, request_count_output],
    )

if __name__ == "__main__":
    demo.launch(debug=True)