import json
import logging
import os
import re

import numpy as np
import pandas as pd
from flask import Flask, render_template, request
import google.generativeai as genai

app = Flask(__name__)

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# --- Configure Google Gemini API ---
try:
    # SECURITY FIX: the original committed a literal API key to source.
    # Read the key from the environment instead, as the error messages
    # below already instruct the operator to do.
    gemini_api_key = os.environ.get("GOOGLE_API_KEY")
    if not gemini_api_key:
        raise ValueError("GOOGLE_API_KEY environment variable not set or hardcoded key is empty.")
    genai.configure(api_key=gemini_api_key)
    logging.info("Gemini API configured successfully.")
except ValueError as e:
    logging.error(
        f"Error configuring Gemini API: {e}. Please set your GOOGLE_API_KEY environment variable or provide a valid key.")
    gemini_api_key = None  # downstream code treats None as "API unavailable"
except Exception as e:
    logging.error(f"Unexpected error during Gemini API configuration: {e}")
    gemini_api_key = None


# --- Helper functions for aggregation ---
def aggregate_unique_strings(series):
    """Aggregate unique non-null string values from a series, joined by '; '.

    Args:
        series: a pandas Series of arbitrary values.

    Returns:
        str: the unique non-null values (stringified) joined with '; ',
        or '' when the series has no non-null values.
    """
    non_null_unique = series.dropna().astype(str).unique()
    if non_null_unique.size == 0:
        return ''
    return '; '.join(non_null_unique)


def aggregate_unique_numerical_values_or_strings(series):
    """Aggregate unique non-null numerical values from a series.

    Args:
        series: a pandas Series; non-numeric entries are discarded
            (coerced to NaN and dropped).

    Returns:
        float: the single value when exactly one unique numeric value exists.
        str: a '; '-joined string of the unique values when several exist,
            or '' when the series contains no valid numerical values.
    """
    # Coerce to numeric, silently dropping anything unparseable.
    numeric_series = pd.to_numeric(series.dropna(), errors='coerce').dropna()
    if numeric_series.empty:
        return ''
    unique_values = numeric_series.unique()
    if len(unique_values) == 1:
        return float(unique_values[0])
    return '; '.join(unique_values.astype(str))


# --- Improved JSON Cleaning Function ---
def clean_llm_json_string(json_string):
    """
    Attempts to clean common LLM-generated JSON formatting issues,
    with improved handling to avoid corrupting valid JSON.
    """
    # 1. Strip whitespace and markdown fences
    json_string = json_string.strip()
    # Remove markdown code blocks if present
    if json_string.startswith('```'):
        lines = json_string.split('\n')
        # Find first line that doesn't start with ``` and isn't empty
        start_idx = 0
        for i, line in enumerate(lines):
            if not line.strip().startswith('```') and line.strip():
                start_idx = i
                break
        # Find last line that doesn't start with ``` and isn't empty
        end_idx = len(lines) - 1
        for i in range(len(lines) - 1, -1, -1):
            if not lines[i].strip().startswith('```') and lines[i].strip():
                end_idx = i
                break
        json_string = '\n'.join(lines[start_idx:end_idx + 1])

    # 2. Find JSON boundaries more carefully
    first_brace = json_string.find('{')
    last_brace = json_string.rfind('}')
    if first_brace == -1 or last_brace == -1 or first_brace > last_brace:
        # Try to find array boundaries
        first_bracket = json_string.find('[')
        last_bracket = json_string.rfind(']')
        if first_bracket == -1 or last_bracket == -1 or first_bracket > last_bracket:
            logging.warning("Cannot find valid JSON boundaries")
            return ""
        json_string = json_string[first_bracket:last_bracket + 1]
    else:
        json_string = json_string[first_brace:last_brace + 1]

    # 3. Try to parse as-is first (most LLM responses are actually valid JSON)
    try:
        json.loads(json_string)
        logging.info("JSON is already valid, no cleaning needed")
        return json_string
    except json.JSONDecodeError as e:
        logging.info(f"JSON needs cleaning: {e}")

    # 4.
Only apply minimal cleaning if parsing failed original_string = json_string # Remove comments (// and /* */) json_string = re.sub(r'//.*?(?=\n|$)', '', json_string) json_string = re.sub(r'/\*[\s\S]*?\*/', '', json_string) # Remove trailing commas (most common issue) json_string = re.sub(r',(\s*[}\]])', r'\1', json_string) # Fix unquoted keys (but be very careful not to break quoted strings) # Only match word boundaries that are followed by colon and not inside quotes def fix_unquoted_keys(match_obj): full_match = match_obj.group(0) # Check if we're inside a quoted string by counting quotes before this position before_match = json_string[:match_obj.start()] quote_count = before_match.count('"') - before_match.count('\\"') if quote_count % 2 == 0: # Even number of quotes = we're outside quoted strings key = match_obj.group(1) return f'"{key}":' return full_match json_string = re.sub(r'\b(\w+):', fix_unquoted_keys, json_string) # Replace single quotes with double quotes (but avoid apostrophes) # Only replace single quotes that appear to be string delimiters json_string = re.sub(r"(?