# DAR_standardiser / gemini_utils.py
# Last commit: "Update gemini_utils.py" (bccd8b2, verified) by rameshmoorthy
import json
import time
import google.generativeai as genai
from models import ParsedDARReport, HarmonisedPara
from typing import List
def get_structured_data_with_gemini(api_key: str, text_content: str, max_retries=2) -> ParsedDARReport:
    """Extract the main DAR information (header, para details) from the PDF text.

    The full report text is embedded in a prompt that asks Gemini for a JSON
    object with ``header``, ``audit_paras`` and ``parsing_errors`` keys. The
    reply is stripped of any Markdown code fence, parsed with ``json.loads``
    and passed as keyword arguments to ``ParsedDARReport``.

    Args:
        api_key: Gemini API key passed to ``genai.configure``.
        text_content: Full text of the Departmental Audit Report. A value
            starting with ``"Error"`` is treated as an error message and is
            returned unchanged in ``parsing_errors`` (presumably produced by
            the upstream text-extraction step -- confirm against the caller).
        max_retries: Number of extra attempts made after the first one fails,
            so at most ``max_retries + 1`` model calls are issued.

    Returns:
        A ``ParsedDARReport`` built from the model's JSON on success.
        Otherwise a ``ParsedDARReport`` carrying only ``parsing_errors``:
        when the API key or the text is missing, when the text is itself an
        error message, or when every attempt failed.
    """
    if not api_key:
        return ParsedDARReport(parsing_errors="Gemini API Key not configured.")
    # Nothing to analyse: avoid a crash on None and a pointless model call on
    # blank text (which could only yield empty or invented data).
    if not text_content or not text_content.strip():
        return ParsedDARReport(parsing_errors="No DAR text content provided.")
    if text_content.startswith("Error"):
        return ParsedDARReport(parsing_errors=text_content)

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    prompt = f"""
You are an expert GST audit report analyst. Based on the following FULL text from a Departmental Audit Report (DAR),
extract the specified information and structure it as a JSON object.
The JSON object should follow this structure precisely:
{{
"header": {{
"audit_group_number": "integer or null",
"gstin": "string or null",
"trade_name": "string or null",
"category": "string ('Large', 'Medium', 'Small') or null",
"total_amount_detected_overall_rs": "float or null",
"total_amount_recovered_overall_rs": "float or null"
}},
"audit_paras": [
{{
"audit_para_number": "integer or null",
"audit_para_heading": "string or null",
"revenue_involved_lakhs_rs": "float or null",
"revenue_recovered_lakhs_rs": "float or null",
"status_of_para": "string or null (e.g., 'Agreed and Paid')"
}}
],
"parsing_errors": "string or null"
}}
Key Instructions:
1. Extract header info like `gstin`, `trade_name`, and overall totals.
2. Identify each distinct audit para and extract its number, heading, revenue involved (in Lakhs), and status.
3. Use null for any missing values. Ensure monetary values are floats.
DAR Text Content:
--- START OF DAR TEXT ---
{text_content}
--- END OF DAR TEXT ---
Provide ONLY the JSON object as your response.
"""

    total_attempts = max_retries + 1
    last_exception = None
    for attempt in range(1, total_attempts + 1):
        try:
            response = model.generate_content(prompt)

            # Strip an optional Markdown code fence. The opening fence may be
            # "```json", "```JSON" or a bare "```"; handle all of them rather
            # than only the exact "```json" spelling.
            cleaned_response_text = response.text.strip()
            if cleaned_response_text.startswith("```"):
                cleaned_response_text = cleaned_response_text[3:]
                if cleaned_response_text[:4].lower() == "json":
                    cleaned_response_text = cleaned_response_text[4:]
            cleaned_response_text = cleaned_response_text.removesuffix("```").strip()

            json_data = json.loads(cleaned_response_text)
            return ParsedDARReport(**json_data)
        except Exception as e:
            # Any failure (API error, blocked response, bad JSON, validation
            # error) counts as a failed attempt and triggers a retry.
            last_exception = e
            print(f"Attempt {attempt} failed: {e}")
            # Back off only when another attempt will actually follow.
            if attempt < total_attempts:
                time.sleep(1)

    return ParsedDARReport(
        parsing_errors=(
            f"Gemini call failed after {max_retries + 1} attempts. "
            f"Last error: {last_exception}"
        )
    )
def get_harmonised_titles(api_key: str, full_dar_text: str, original_headings: List[str]) -> List[HarmonisedPara]:
    """Rewrite audit para headings into the standard two-part form via Gemini.

    The original headings are listed in a prompt together with the full DAR
    text (for context) and the harmonisation rules. The model is asked for a
    JSON array of ``{"original_heading", "harmonised_heading"}`` objects,
    each of which is turned into a ``HarmonisedPara``.

    Args:
        api_key: Gemini API key passed to ``genai.configure``.
        full_dar_text: Full text of the audit report, embedded in the prompt
            as context.
        original_headings: Headings to harmonise, one prompt bullet each.

    Returns:
        One ``HarmonisedPara`` per object in the model's JSON array. An empty
        list is returned when the API key is missing, when there are no
        headings to harmonise, when the model returns an empty reply, or when
        the call, JSON decoding or model validation fails (the error is
        printed, not raised).
    """
    if not api_key:
        print("Error: Gemini API key not provided for harmonisation.")
        return []
    # With no headings there is nothing to harmonise; skip the model call
    # instead of asking it to rewrite an empty list.
    if not original_headings:
        return []

    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    # Convert the list of headings into a formatted string for the prompt
    headings_text = "\n".join(f"- {h}" for h in original_headings)
    prompt = f"""
You are an expert in standardizing GST audit para headings. Your task is to rewrite a list of given audit para headings to conform to a strict two-part structure, using the full context of the audit report provided below.
**Full Audit Report Text (for context):**
--- START OF DAR TEXT ---
{full_dar_text}
--- END OF DAR TEXT ---
---
**Rules for Harmonisation:**
1. **STRUCTURE:** All audit para headings must be:
`[Part I: Type of Non-compliance] + [Connector] + [Part II: Reason/Cause/Legal Provision]`
- **Connectors:** Use 'due to', 'arising from', or 'on account of'.
2. **PART I - STANDARDIZED CATEGORIES (Must start with one of these):**
- Non-payment of GST/Interest/Late Fee
- Short payment of GST
- Ineligible/Excess ITC
- Irregular ITC availed
- Short-reversal of ITC
- General Penalty
- Penalty for passing of ineligible ITC
3. **PART II - REASON/CAUSE:** Based on the context from the full report, specify the exact reason, cause, or legal provision (e.g., Section 17(5), Rule 42) that caused the non-compliance.
4. If u find the audit para is matching with any of the following audit para , use the exact heading from below with minor additions as the case applicable.
- 1. NON PAYMENT OF GST /INTEREST/LATE FEE RELATED(as the case applicable)
1.1 Non-payment of late fee due to late filing of GSTR-1/GSTR-2B/GSTR-9/GSTR-9C return (Choose one return as applicable to Audit para )
1.2 Non-payment of interest on late payment of GST due to delayed filing of GSTR-3B return
1.3 Non-payment of interest on late payment of GST payable under Reverse Charge Mechanism (RCM)
1.4 Non-payment of interest on Input Tax Credit availed on invoices where payment to suppliers was made after 180 days from the date of invoice in contravention of Section 50 of the CGST Act, 2017.
1.5 Non-payment of GST on sale of fixed assets/sale of scrap
2. SHORT PAYMENT OF GST
2.1 Short payment of GST in GSTR-3B returns due to discrepancy with GST payable as per GSTR-1 returns filed (GSTR-3B vis-à-vis GSTR-1).
2.2 Short payment of GST in GSTR-3B returns due to discrepancy with GSTR-9 annual return.
2.3 Short payment of GST under Reverse Charge Mechanism on [specify category of supplies].
2.4 Short payment of GST under RCM in GSTR-3B due to discrepancy with RCM Input Tax Credit availed.
2.5 Short payment of GST arising due to difference between turnover declared in Profit & Loss account and GSTR-3B returns.
3. INPUT TAX CREDIT IRREGULARITIES
3.1 Ineligible excess Input Tax Credit availed in GSTR-3B return in comparison to ITC available in GSTR-2A (GSTR-3B vis-à-vis GSTR-2A).
3.2 Ineligible excess Input Tax Credit availed in GSTR-3B return in comparison to ITC available in ITC register/purchase register.
3.3 Irregular availment of ineligible Input Tax Credit in contravention of Section 17(5)/Section 16 of the CGST Act, 2017.
3.4 Irregular Input Tax Credit availed on invoices issued by suppliers whose registration was cancelled prior to the date of invoice issuance
4. NON-REVERSAL OF INPUT TAX CREDIT
4.1 Non-reversal of Input Tax Credit on invoices where payment to suppliers was not made within 180 days from the date of invoice
4.2 Non-reversal of Input Tax Credit under Rule 42/Rule 43 of the CGST Rules, 2017.
4.3 Non-reversal of Input Tax Credit on account of expenses written off
5. PENALTY PROVISIONS
5.1 General penalty for non-filing of ITC-04 returns
5.2 General penalty for non-amendment of Principal Place of Business/non-declaration of additional Place of Business/non-maintenance of records at Principal Place of Business
5.3 Penalty for passing of ineligible Input Tax Credit
---
**Your Task:**
Based on the full report context, for each of the original headings provided below, generate a new, harmonised heading that strictly follows the rules.
**Output Format:**
Return your response as a single JSON array of objects. Each object must contain the original heading and the new harmonised heading. Do NOT include any other text or explanations outside the JSON.
Example JSON structure:
```json
[
{{
"original_heading": "The original heading from the list",
"harmonised_heading": "Your new, perfectly formatted heading"
}}
]
```
**Original Headings to Harmonise:**
{headings_text}
Provide ONLY the JSON array as your response.
"""

    # Kept outside the try so the JSON error handler can always print it,
    # even if a decode error is raised before the reply text is available.
    raw_response_text = ""
    try:
        response = model.generate_content(prompt)
        raw_response_text = response.text

        # Strip an optional Markdown code fence. The opening fence may be
        # "```json", "```JSON" or a bare "```"; handle all of them rather
        # than only the exact "```json" spelling.
        cleaned_response_text = raw_response_text.strip()
        if cleaned_response_text.startswith("```"):
            cleaned_response_text = cleaned_response_text[3:]
            if cleaned_response_text[:4].lower() == "json":
                cleaned_response_text = cleaned_response_text[4:]
        cleaned_response_text = cleaned_response_text.removesuffix("```").strip()

        if not cleaned_response_text:
            print("Harmonisation call returned an empty response.")
            return []

        json_data = json.loads(cleaned_response_text)
        # Validate with Pydantic
        return [HarmonisedPara(**item) for item in json_data]
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from harmonisation response: {e}")
        print(f"Raw Response: {raw_response_text}")
        return []
    except Exception as e:
        # Best-effort API: any other failure (API error, blocked response,
        # unexpected JSON shape, validation error) yields an empty result.
        print(f"An error occurred during title harmonisation: {e}")
        return []