"""PharmaCircle AI Data Analyst.

A Gradio app that (1) converts natural-language questions into Solr JSON
Facet API queries via Gemini, (2) runs them against a Solr core reached
through an SSH tunnel, (3) asks the LLM for Matplotlib/Seaborn code to
visualize the facet results, and (4) streams an analytical report with
follow-up suggestions.
"""

import gradio as gr
import json
import re
import datetime
import pandas as pd
import pysolr
import google.generativeai as genai
from sshtunnel import SSHTunnelForwarder
import matplotlib.pyplot as plt
import seaborn as sns
import io
import os
import logging
import concurrent.futures
from IPython.display import display, Markdown

# --- Suppress Matplotlib Debug Logs ---
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# --- SSH Tunnel Configuration ---
# It's recommended to load secrets securely, e.g., from environment variables
SSH_HOST = os.environ.get('SSH_HOST')
SSH_PORT = 5322
SSH_USER = os.environ.get('SSH_USER')
SSH_PASS = os.environ.get('SSH_PASS')

# --- Solr Configuration ---
REMOTE_SOLR_HOST = '69.167.186.48'
REMOTE_SOLR_PORT = 8983
LOCAL_BIND_PORT = 8983
SOLR_CORE_NAME = 'news'
SOLR_USER = os.environ.get('SOLR_USER')
SOLR_PASS = os.environ.get('SOLR_PASS')

# --- Google Gemini Configuration ---
try:
    genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
except Exception as e:
    print(f"āŒ Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.")

# --- Global Variables ---
ssh_tunnel_server = None
solr_client = None
llm_model = None
is_initialized = False

try:
    # 1. Start the SSH Tunnel
    ssh_tunnel_server = SSHTunnelForwarder(
        (SSH_HOST, SSH_PORT),
        ssh_username=SSH_USER,
        ssh_password=SSH_PASS,
        remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT),
        local_bind_address=('127.0.0.1', LOCAL_BIND_PORT),
    )
    ssh_tunnel_server.start()
    print(f"šŸš€ SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.")

    # 2. Initialize the pysolr client (goes through the local end of the tunnel)
    solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}'
    solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True)
    solr_client.ping()
    print(f"āœ… Solr connection successful on core '{SOLR_CORE_NAME}'.")

    # 3. Initialize the LLM (temperature=0 for deterministic query generation)
    llm_model = genai.GenerativeModel(
        'gemini-1.5-flash',
        generation_config=genai.types.GenerationConfig(temperature=0),
    )
    print(f"āœ… LLM Model '{llm_model.model_name}' initialized.")
    print("āœ… System Initialized Successfully.")
    is_initialized = True
except Exception as e:
    print(f"\nāŒ An error occurred during setup: {e}")
    # Don't leave a half-open tunnel behind on failure.
    if ssh_tunnel_server and ssh_tunnel_server.is_active:
        ssh_tunnel_server.stop()

# Schema metadata handed to the LLM so it picks the right Solr field for
# searching (multi-valued `*_s` fields) vs. faceting (canonical fields).
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report."
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached."
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes."
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type."
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
    }
]


# Helper function to format the metadata for the prompt
def format_metadata_for_prompt(metadata):
    """Render the field metadata list as a Markdown bullet list for LLM prompts."""
    formatted_string = ""
    for field in metadata:
        formatted_string += f"- **{field['field_name']}**\n"
        formatted_string += f" - **Type**: {field['type']}\n"
        formatted_string += f" - **Definition**: {field['definition']}\n"
        formatted_string += f" - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
    return formatted_string


formatted_field_info = format_metadata_for_prompt(field_metadata)


def parse_suggestions_from_report(report_text):
    """Extracts numbered suggestions from the report's markdown text."""
    # The report ends with a "Deeper Dive"/"Suggestions" section; grab everything after it.
    suggestions_match = re.search(
        r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$",
        report_text,
        re.DOTALL | re.IGNORECASE,
    )
    if not suggestions_match:
        return []
    suggestions_text = suggestions_match.group(1)
    # Each suggestion is a numbered markdown list item ("1. ...").
    suggestions = re.findall(r"^\s*\d+\.\s*(.*)", suggestions_text, re.MULTILINE)
    return [s.strip() for s in suggestions]


def llm_generate_solr_query_with_history(natural_language_query, field_metadata, chat_history):
    """Generates a Solr query and facet JSON from a natural language query, considering the conversation history.

    Returns a dict with 'query' and 'json.facet' keys, or None on failure.
    Note: the prompt uses the module-level `formatted_field_info`; the
    `field_metadata` parameter is kept for interface compatibility.
    """
    # Format the chat history for the prompt
    formatted_history = ""
    for user_msg, bot_msg in chat_history:
        # We only need the user's queries for context, not the bot's detailed responses.
        if user_msg:
            formatted_history += f"- User: \"{user_msg}\"\n"

    prompt = f"""
You are an expert Solr query engineer who converts natural language questions into precise Solr JSON Facet API query objects. Your primary goal is to create a valid JSON object with `query` and `json.facet` keys.

---
### CONVERSATIONAL CONTEXT & RULES

1. **Today's Date for Calculations**: 2025-07-16
2. **Allowed Facet Types**: The `type` key for any facet MUST be one of the following: `terms`, `query`, or `range`. **Do not use `date_histogram`**. For time-series analysis, use a `range` facet on a date field.
3. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
4. **Facet vs. Query Field Distinction**: This is critical.
    * For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results.
    * For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping.
5. **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
6. **Allowed Aggregations**: For statistical facets, only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`. The aggregation MUST be a simple string like `"sum(total_deal_value_in_million)"` and not a nested JSON object.
7. **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
8. **Output Format**: Your final output must be a single, raw JSON object and nothing else. Do not add comments, explanations, or markdown formatting like ```json.

---
### FIELD DEFINITIONS (Your Source of Truth)
`{formatted_field_info}`

---
### CHAT HISTORY
`{formatted_history}`

---
### EXAMPLE OF A FOLLOW-UP QUERY

**Initial User Query:** "What are the infections news in this year?"
```json
{{
  "query": "date_year:2025 AND therapeutic_category_s:infections",
  "json.facet": {{
    "infections_news_by_type": {{
      "type": "terms",
      "field": "news_type",
      "limit": 10
    }}
  }}
}}
```
**Follow-up User Query:** "Compare deal values for injection vs oral."
**Correct JSON Output for the Follow-up:**
```json
{{
  "query": "therapeutic_category_s:infections AND date_year:2025 AND total_deal_value_in_million:[0 TO *]",
  "json.facet": {{
    "injection_deals": {{
      "type": "query",
      "q": "route_branch:injection",
      "facet": {{ "total_deal_value": "sum(total_deal_value_in_million)" }}
    }},
    "oral_deals": {{
      "type": "query",
      "q": "route_branch:oral",
      "facet": {{ "total_deal_value": "sum(total_deal_value_in_million)" }}
    }}
  }}
}}
```

---
### YOUR TASK
Now, convert the following user query into a single, raw JSON object with 'query' and 'json.facet' keys, strictly following all rules and field definitions provided above and considering the chat history.

**Current User Query:** `{natural_language_query}`
"""
    try:
        response = llm_model.generate_content(prompt)
        # Using a more robust regex to clean the response
        cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
        return json.loads(cleaned_text)
    except Exception as e:
        # `response` may be unbound if generate_content itself raised.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_solr_query_with_history: {e}\nRaw Response:\n{raw_response_text}")
        return None


def llm_generate_visualization_code(query_context, facet_data):
    """Generates Python code for visualization based on query and data.

    Returns raw Python source (string) meant to be executed by
    execute_viz_code_and_get_path, or None on failure.
    """
    prompt = f"""
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn. Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.

**User's Analytical Goal:** "{query_context}"

**Aggregated Data (from Solr Facets):**
```json
{json.dumps(facet_data, indent=2)}
```

---
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**

You MUST follow these rules to avoid errors.

**1. Identify the Data Structure FIRST:** Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.

* **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
* **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
* **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.

**2. Use the Correct Parsing Template:**

---
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# Dynamically find the main facet key (the one with 'buckets')
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key:
    buckets = facet_data[facet_key].get('buckets', [])
    # Check if buckets contain data
    if buckets:
        df = pd.DataFrame(buckets)
        # Check for a nested metric or use 'count'
        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc[0]):
            # Example for nested sum metric
            df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
            y_axis_label = 'Sum of Total Deal Value'
        else:
            df.rename(columns={{'count': 'value'}}, inplace=True)
            y_axis_label = 'Count'
        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
        ax.set_xlabel('Category')
        ax.set_ylabel(y_axis_label)
    else:
        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
# Correct way to rotate labels to prevent errors
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.tight_layout()
```

---
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

labels = []
values = []
# Iterate through top-level keys, skipping the 'count'
for key, data_dict in facet_data.items():
    if key == 'count' or not isinstance(data_dict, dict):
        continue
    # Extract the label (e.g., 'oral_deals' -> 'Oral')
    label = key.replace('_deals', '').replace('_', ' ').title()
    # Find the metric value, which is NOT 'count'
    metric_value = 0
    for sub_key, sub_value in data_dict.items():
        if sub_key != 'count':
            metric_value = sub_value
            break # Found the metric
    labels.append(label)
    values.append(metric_value)

if labels:
    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
    ax.set_ylabel('Total Deal Value') # Or other metric name
    ax.set_xlabel('Category')
else:
    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```

---
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))

# Find the key that has the buckets
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key and facet_data[facet_key].get('buckets'):
    # This list comprehension is robust for parsing nested metrics
    plot_data = []
    for bucket in facet_data[facet_key]['buckets']:
        category = bucket['val']
        # Find all nested metrics (e.g., total_deal_value_2025)
        for sub_key, sub_value in bucket.items():
            if isinstance(sub_value, dict) and 'sum' in sub_value:
                # Extracts year from 'total_deal_value_2025' -> '2025'
                year = sub_key.split('_')[-1]
                value = sub_value['sum']
                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})
    if plot_data:
        df = pd.DataFrame(plot_data)
        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
        ax.set_ylabel('Total Deal Value')
        ax.set_xlabel('Business Model')
        # Correct way to rotate labels to prevent errors
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    else:
        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
else:
    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```

---
**3. Final Code Generation:**
- **DO NOT** include `plt.show()`.
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.

**Your Task:** Now, generate the Python code.
"""
    try:
        # Increase the timeout for potentially complex generation
        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
        response = llm_model.generate_content(prompt, generation_config=generation_config)
        # Clean the response to remove markdown formatting
        code = re.sub(r'^```python\s*|\s*```$', '', response.text, flags=re.MULTILINE)
        return code
    except Exception as e:
        # FIX: `response` is unbound when generate_content raises; guard the access
        # so the real error is reported instead of a NameError.
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
        return None


def execute_viz_code_and_get_path(viz_code, facet_data):
    """Executes visualization code and returns the path to the saved plot image.

    Returns the PNG path on success, or None when no code was given, the code
    failed, or it did not define a `fig` figure object.
    """
    if not viz_code:
        return None
    try:
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs('/tmp/plots', exist_ok=True)
        plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
        # NOTE(security): exec of LLM-generated code is inherently risky; this is
        # only acceptable because the code comes from our own constrained prompt.
        # Consider sandboxing if inputs become less trusted.
        # The exec environment needs access to the required libraries and the data.
        exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
        exec(viz_code, exec_globals)
        fig = exec_globals.get('fig')
        if fig:
            fig.savefig(plot_path, bbox_inches='tight')
            plt.close(fig)  # Important to free up memory
            return plot_path
        return None
    except Exception as e:
        print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
        return None


def llm_generate_summary_and_suggestions_stream(query_context, facet_data):
    """
    Yields a streaming analytical report and strategic, context-aware
    suggestions for further exploration.
    """
    prompt = f"""
You are a leading business intelligence analyst and strategist. Your audience is an executive or decision-maker who relies on you to not just present data, but to uncover its meaning and suggest smart next steps. Your task is to analyze the provided data, deliver a concise, insightful report, and then propose logical follow-up analyses that could uncover deeper trends or causes.

**Today's Date for Context:** {datetime.datetime.now().strftime('%Y-%m-%d')}

**Analysis Context:**
* **User's Core Question:** "{query_context}"
* **Structured Data (Your Evidence):**
```json
{json.dumps(facet_data, indent=2)}
```

**--- INSTRUCTIONS ---**

**PART 1: THE ANALYTICAL REPORT**
Structure your report using Markdown. Your tone should be insightful, data-driven, and forward-looking.
* `## Executive Summary`: A 1-2 sentence, top-line answer to the user's core question. Get straight to the point.
* `### Key Findings & Insights`: Use bullet points. Don't just state the data; interpret it.
    * Highlight the most significant figures, patterns, or anomalies.
    * Where relevant, calculate key differences or growth rates (e.g., "X is 25% higher than Y").
    * Pinpoint what the visualization or data reveals about the core business question.
    * **Data Note:** Briefly mention any important caveats if apparent from the data (e.g., a short time frame, a small sample size).
* `### Context & Implications`: Briefly explain the "so what?" of these findings. What might this mean for our strategy, the market, or operations?

**PART 2: DEEPER DIVE: SUGGESTED FOLLOW-UP ANALYSES**
After the report, create a final section titled `### Deeper Dive: Suggested Follow-up Analyses`.
* **Think like a strategist.** Based on the findings, what would you ask next to validate a trend, understand a change, or uncover a root cause?
* **Propose 2-3 logical next questions.** These should be concise and framed as natural language questions that inspire further exploration.
* **Focus on comparative and trend analysis.** For example:
    * If the user asked for "this year," suggest a comparison: *"How does this year's performance in [X] compare to last year?"*
    * If a category is a clear leader, suggest breaking it down: *"What are the top sub-categories driving the growth in [Leading Category]?"*
    * If there's a time-based trend, suggest exploring correlations: *"Is the decline in [Metric Z] correlated with changes in any other category during the same period?"*
* Format them as a numbered list.
* Ensure your suggestions are answerable using the available field definitions below.

### FIELD DEFINITIONS (Your Source of Truth)
{formatted_field_info}

**--- YOUR TASK ---**
Generate the full report and the strategic suggestions based on the user's question and the data provided.
"""
    try:
        response_stream = llm_model.generate_content(prompt, stream=True)
        for chunk in response_stream:
            yield chunk.text
    except Exception as e:
        print(f"Error in llm_generate_summary_and_suggestions_stream: {e}")
        yield "Sorry, I was unable to generate a summary for this data."


# CORRECTED: Only one, correctly implemented version of this function remains.
def process_analysis_flow(user_input, history, state):
    """
    A generator that manages the conversation and yields tuples of UI updates for Gradio.
    This version treats any user input as a new query and considers conversation history.

    Each yielded tuple is (chatbot history, state, plot update, report update,
    solr-query display update, solr-data display update).
    """
    # Initialize state on the first run
    if state is None:
        state = {'query_count': 0, 'last_suggestions': []}
    # If history is None (from a reset), initialize it as an empty list
    if history is None:
        history = []

    # Reset UI components for the new analysis, but keep chat history
    yield (history, state,
           gr.update(value=None, visible=False),
           gr.update(value=None, visible=False),
           gr.update(value=None, visible=False),
           gr.update(value=None, visible=False))

    query_context = user_input.strip()
    if not query_context:
        history.append((user_input, "Please enter a question to analyze."))
        yield (history, state, None, None, None, None)
        return

    # 1. Acknowledge and start the process
    history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating Solr query...*"))
    yield (history, state, None, None, None, None)

    # 2. Generate Solr Query with history
    llm_solr_obj = llm_generate_solr_query_with_history(query_context, field_metadata, history)
    if not llm_solr_obj or 'query' not in llm_solr_obj or 'json.facet' not in llm_solr_obj:
        history.append((None, "I'm sorry, I couldn't generate a valid Solr query for that request. Please try rephrasing your question."))
        yield (history, state, None, None, None, None)
        return

    solr_q, solr_facet = llm_solr_obj.get('query'), llm_solr_obj.get('json.facet')
    history.append((None, "āœ… Solr query generated!"))
    formatted_query = f"**Query:**\n```\n{solr_q}\n```\n\n**Facet JSON:**\n```json\n{json.dumps(solr_facet, indent=2)}\n```"
    yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)

    # 3. Execute Query
    try:
        history.append((None, "*Executing query against the database...*"))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)
        search_params = {"rows": 0, "json.facet": json.dumps(solr_facet)}
        results = solr_client.search(q=solr_q, **search_params)
        facet_data = results.raw_response.get("facets", {})
        formatted_data = f"**Facet Data:**\n```json\n{json.dumps(facet_data, indent=2)}\n```"

        if not facet_data or facet_data.get('count', 0) == 0:
            history.append((None, "No data was found for your query. Please try a different question."))
            yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            return

        # 4. Generate Visualization
        history.append((None, "āœ… Data retrieved. Generating visualization..."))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start visualization generation in the background
            viz_future = executor.submit(llm_generate_visualization_code, query_context, facet_data)

            # 5. Generate and Stream Report
            history.append((None, "āœ… Plot created. Streaming final report..."))
            output_report = gr.update(value="", visible=True)  # Make it visible before streaming
            yield (history, state, None, output_report, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

            report_text = ""
            # The history object is not modified during streaming, so we pass it once
            # The yield statement for streaming only updates the report text
            stream_history = history[:]  # Make a copy
            for chunk in llm_generate_summary_and_suggestions_stream(query_context, facet_data):
                report_text += chunk
                yield (stream_history, state, None, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

            # Update the main history with the final report text
            history.append((None, report_text))

            # Get the visualization code from the future
            viz_code = viz_future.result()
            plot_path = execute_viz_code_and_get_path(viz_code, facet_data)
            output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
            if not plot_path:
                history.append((None, "*I was unable to generate a plot for this data.*\n"))
            yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

            # 6. Finalize and prompt for next action
            state['query_count'] += 1
            state['last_suggestions'] = parse_suggestions_from_report(report_text)
            next_prompt = "Analysis complete. What would you like to explore next? You can ask a follow-up question, or ask something new."
            history.append((None, next_prompt))
            yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
    except Exception as e:
        error_message = f"An unexpected error occurred during analysis: {e}"
        history.append((None, error_message))
        print(f"Error during analysis execution: {e}")
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)


# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
    state = gr.State()
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# šŸ’Š PharmaCircle AI Data Analyst")
        with gr.Column(scale=1):
            clear_button = gr.Button("šŸ”„ Start New Analysis", variant="primary")
    gr.Markdown("Ask a question to begin your analysis. I will generate a Solr query, retrieve the data, create a visualization, and write a report. You can then ask follow-up questions freely.")
    with gr.Row():
        with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
            msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)
        with gr.Column(scale=2):
            with gr.Accordion("Generated Solr Query", open=False):
                solr_query_display = gr.Markdown("Query will appear here...", visible=True)
            with gr.Accordion("Retrieved Solr Data", open=False):
                solr_data_display = gr.Markdown("Data will appear here...", visible=False)
            plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
            report_display = gr.Markdown("Report will be streamed here...", visible=False)

    # --- Event Wiring ---
    def reset_all():
        """Resets the entire UI for a new analysis session."""
        return (
            [],    # chatbot (cleared)
            None,  # state (reset)
            "",    # msg_textbox (cleared)
            gr.update(value=None, visible=False),  # plot_display
            gr.update(value=None, visible=False),  # report_display
            gr.update(value=None, visible=False),  # solr_query_display
            gr.update(value=None, visible=False),  # solr_data_display
        )

    # Main event handler for all user queries
    msg_textbox.submit(
        fn=process_analysis_flow,
        inputs=[msg_textbox, chatbot, state],
        outputs=[chatbot, state, plot_display, report_display, solr_query_display, solr_data_display],
    ).then(
        lambda: gr.update(value=""),
        None,
        [msg_textbox],
        queue=False,
    )

    clear_button.click(
        fn=reset_all,
        inputs=None,
        outputs=[chatbot, state, msg_textbox, plot_display, report_display, solr_query_display, solr_data_display],
        queue=False,
    )

if is_initialized:
    demo.queue().launch(debug=True, share=True)
else:
    print("\nSkipping Gradio launch due to initialization errors.")