import gradio as gr
import json
import re
import datetime
import pandas as pd
import pysolr
import google.generativeai as genai
from sshtunnel import SSHTunnelForwarder
import matplotlib.pyplot as plt
import seaborn as sns
import io
import os
import logging
import concurrent.futures
from IPython.display import display, Markdown

# --- Suppress Matplotlib Debug Logs ---
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# --- SSH Tunnel Configuration ---
# It's recommended to load secrets securely, e.g., from environment variables.
SSH_HOST = os.environ.get('SSH_HOST')
SSH_PORT = 5322
SSH_USER = os.environ.get('SSH_USER')
SSH_PASS = os.environ.get('SSH_PASS')

# --- Solr Configuration ---
REMOTE_SOLR_HOST = '69.167.186.48'
REMOTE_SOLR_PORT = 8983
LOCAL_BIND_PORT = 8983
SOLR_CORE_NAME = 'news'
SOLR_USER = os.environ.get('SOLR_USER')
SOLR_PASS = os.environ.get('SOLR_PASS')
# --- Google Gemini Configuration ---
try:
    genai.configure(api_key=os.environ.get('GEMINI_API_KEY'))
except Exception as e:
    print(f"❌ Gemini API Key Error: {e}. Please ensure 'GEMINI_API_KEY' is set in your environment.")
# --- Global Variables ---
ssh_tunnel_server = None
solr_client = None
llm_model = None
is_initialized = False

try:
    # 1. Start the SSH Tunnel
    ssh_tunnel_server = SSHTunnelForwarder(
        (SSH_HOST, SSH_PORT),
        ssh_username=SSH_USER,
        ssh_password=SSH_PASS,
        remote_bind_address=(REMOTE_SOLR_HOST, REMOTE_SOLR_PORT),
        local_bind_address=('127.0.0.1', LOCAL_BIND_PORT)
    )
    ssh_tunnel_server.start()
    print(f"🔗 SSH tunnel established: Local Port {ssh_tunnel_server.local_bind_port} -> Remote Solr.")

    # 2. Initialize the pysolr client
    solr_url = f'http://127.0.0.1:{ssh_tunnel_server.local_bind_port}/solr/{SOLR_CORE_NAME}'
    solr_client = pysolr.Solr(solr_url, auth=(SOLR_USER, SOLR_PASS), always_commit=True)
    solr_client.ping()
    print(f"✅ Solr connection successful on core '{SOLR_CORE_NAME}'.")

    # 3. Initialize the LLM
    llm_model = genai.GenerativeModel('gemini-1.5-flash', generation_config=genai.types.GenerationConfig(temperature=0))
    print(f"✅ LLM Model '{llm_model.model_name}' initialized.")
    print("✅ System Initialized Successfully.")
    is_initialized = True
except Exception as e:
    print(f"\n❌ An error occurred during setup: {e}")
    if ssh_tunnel_server and ssh_tunnel_server.is_active:
        ssh_tunnel_server.stop()
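
# Optional: a minimal cleanup sketch (assumption: the tunnel should be closed when the
# process exits; drop or adapt this if the hosting environment already handles teardown).
import atexit

def _close_ssh_tunnel():
    """Stop the SSH tunnel on interpreter shutdown if it is still active."""
    if ssh_tunnel_server and ssh_tunnel_server.is_active:
        ssh_tunnel_server.stop()

atexit.register(_close_ssh_tunnel)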
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments."
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported."
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases."
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report."
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location."
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached."
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes."
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type."
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields."
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'."
    }
]
# Helper function to format the metadata for the prompt
def format_metadata_for_prompt(metadata):
    formatted_string = ""
    for field in metadata:
        formatted_string += f"- **{field['field_name']}**\n"
        formatted_string += f" - **Type**: {field['type']}\n"
        formatted_string += f" - **Definition**: {field['definition']}\n"
        formatted_string += f" - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
    return formatted_string

formatted_field_info = format_metadata_for_prompt(field_metadata)
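
# For illustration, one metadata entry renders roughly as follows (derived from the loop above):
# - **business_model**
#  - **Type**: string (categorical)
#  - **Definition**: The primary business category of the company involved in the news. ...
#  - **Examples**: pharma/bio, drug delivery, pharma services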
def parse_suggestions_from_report(report_text):
    """Extracts numbered suggestions from the report's markdown text."""
    suggestions_match = re.search(r"### (?:Deeper Dive: Suggested Follow-up Analyses|Suggestions for Further Exploration)\s*\n(.*?)$", report_text, re.DOTALL | re.IGNORECASE)
    if not suggestions_match:
        return []
    suggestions_text = suggestions_match.group(1)
    suggestions = re.findall(r"^\s*\d+\.\s*(.*)", suggestions_text, re.MULTILINE)
    return [s.strip() for s in suggestions]
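
# Hypothetical example of the expected behaviour (the suggestion section format is defined
# by the summary prompt further below):
#   parse_suggestions_from_report("...### Deeper Dive: Suggested Follow-up Analyses\n1. How does 2025 compare to 2024?\n2. Which categories drive the growth?")
#   -> ["How does 2025 compare to 2024?", "Which categories drive the growth?"]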
def llm_generate_solr_query_with_history(natural_language_query, field_metadata, chat_history):
    """Generates a Solr query and facet JSON from a natural language query, considering the conversation history."""
    # Format the chat history for the prompt. Only the user's queries are needed for
    # context, not the bot's detailed responses.
    formatted_history = ""
    for user_msg, bot_msg in chat_history:
        if user_msg:
            formatted_history += f"- User: \"{user_msg}\"\n"
prompt = f""" | |
You are an expert Solr query engineer who converts natural language questions into precise Solr JSON Facet API query objects. Your primary goal is to create a valid JSON object with `query` and `json.facet` keys. | |
--- | |
### CONVERSATIONAL CONTEXT & RULES | |
1. **Today's Date for Calculations**: 2025-07-16 | |
2. **Allowed Facet Types**: The `type` key for any facet MUST be one of the following: `terms`, `query`, or `range`. **Do not use `date_histogram`**. For time-series analysis, use a `range` facet on a date field. | |
3. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field. | |
4. **Facet vs. Query Field Distinction**: This is critical. | |
* For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results. | |
* For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping. | |
5. **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents. | |
6. **Allowed Aggregations**: For statistical facets, only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`. The aggregation MUST be a simple string like `"sum(total_deal_value_in_million)"` and not a nested JSON object. | |
7. **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results. | |
8. **Output Format**: Your final output must be a single, raw JSON object and nothing else. Do not add comments, explanations, or markdown formatting like ```json. | |
--- | |
### FIELD DEFINITIONS (Your Source of Truth) | |
`{formatted_field_info}` | |
--- | |
### CHAT HISTORY | |
`{formatted_history}` | |
--- | |
### EXAMPLE OF A FOLLOW-UP QUERY | |
**Initial User Query:** "What are the infections news in this year?" | |
```json | |
{{ | |
"query": "date_year:2025 AND therapeutic_category_s:infections", | |
"json.facet": {{ | |
"infections_news_by_type": {{ | |
"type": "terms", | |
"field": "news_type", | |
"limit": 10 | |
}} | |
}} | |
}} | |
``` | |
**Follow-up User Query:** "Compare deal values for injection vs oral." | |
**Correct JSON Output for the Follow-up:** | |
```json | |
{{ | |
"query": "therapeutic_category_s:infections AND date_year:2025 AND total_deal_value_in_million:[0 TO *]", | |
"json.facet": {{ | |
"injection_deals": {{ | |
"type": "query", | |
"q": "route_branch:injection", | |
"facet": {{ | |
"total_deal_value": "sum(total_deal_value_in_million)" | |
}} | |
}}, | |
"oral_deals": {{ | |
"type": "query", | |
"q": "route_branch:oral", | |
"facet": {{ | |
"total_deal_value": "sum(total_deal_value_in_million)" | |
}} | |
}} | |
}} | |
}} | |
``` | |
--- | |
### YOUR TASK | |
Now, convert the following user query into a single, raw JSON object with 'query' and 'json.facet' keys, strictly following all rules and field definitions provided above and considering the chat history. | |
**Current User Query:** `{natural_language_query}` | |
""" | |
    try:
        response = llm_model.generate_content(prompt)
        # Using a more robust regex to clean the response
        cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
        return json.loads(cleaned_text)
    except Exception as e:
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_solr_query_with_history: {e}\nRaw Response:\n{raw_response_text}")
        return None
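
# The dictionary returned above is expected to mirror the example embedded in the prompt, e.g.:
# {
#     "query": "date_year:2025 AND therapeutic_category_s:infections",
#     "json.facet": {"infections_news_by_type": {"type": "terms", "field": "news_type", "limit": 10}}
# }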
def llm_generate_visualization_code(query_context, facet_data):
    """Generates Python code for visualization based on query and data."""
    prompt = f"""
You are a Python Data Visualization expert specializing in Matplotlib and Seaborn.
Your task is to generate robust, error-free Python code to create a single, insightful visualization based on the user's query and the provided Solr facet data.

**User's Analytical Goal:**
"{query_context}"

**Aggregated Data (from Solr Facets):**
```json
{json.dumps(facet_data, indent=2)}
```
---
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES**
You MUST follow these rules to avoid errors.

**1. Identify the Data Structure FIRST:**
Before writing any code, analyze the `facet_data` JSON to determine its structure. There are three common patterns. Choose the correct template below.
* **Pattern A: Simple `terms` Facet.** The JSON has ONE main key (besides "count") which contains a list of "buckets". Each bucket has a "val" and a "count". Use this for standard bar charts.
* **Pattern B: Multiple `query` Facets.** The JSON has MULTIPLE keys (besides "count"), and each key is an object containing metrics like "count" or "sum(...)". Use this for comparing a few distinct items (e.g., "oral vs injection").
* **Pattern C: Nested `terms` Facet.** The JSON has one main key with a list of "buckets", but inside EACH bucket, there are nested metric objects. This is used for grouped comparisons (e.g., "compare 2024 vs 2025 across categories"). This almost always requires `pandas`.

**2. Use the Correct Parsing Template:**
---
**TEMPLATE FOR PATTERN A (Simple Bar Chart from `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))

# Dynamically find the main facet key (the one with 'buckets')
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key:
    buckets = facet_data[facet_key].get('buckets', [])
    # Check if buckets contain data
    if buckets:
        df = pd.DataFrame(buckets)
        # Check for a nested metric or use 'count'
        if 'total_deal_value' in df.columns and pd.api.types.is_dict_like(df['total_deal_value'].iloc[0]):
            # Example for nested sum metric
            df['value'] = df['total_deal_value'].apply(lambda x: x.get('sum', 0))
            y_axis_label = 'Sum of Total Deal Value'
        else:
            df.rename(columns={{'count': 'value'}}, inplace=True)
            y_axis_label = 'Count'
        sns.barplot(data=df, x='val', y='value', ax=ax, palette='viridis')
        ax.set_xlabel('Category')
        ax.set_ylabel(y_axis_label)
    else:
        ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
# Correct way to rotate labels to prevent errors
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN B (Comparison Bar Chart from `query` facets):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(10, 6))

labels = []
values = []
# Iterate through top-level keys, skipping the 'count'
for key, data_dict in facet_data.items():
    if key == 'count' or not isinstance(data_dict, dict):
        continue
    # Extract the label (e.g., 'oral_deals' -> 'Oral')
    label = key.replace('_deals', '').replace('_', ' ').title()
    # Find the metric value, which is NOT 'count'
    metric_value = 0
    for sub_key, sub_value in data_dict.items():
        if sub_key != 'count':
            metric_value = sub_value
            break  # Found the metric
    labels.append(label)
    values.append(metric_value)

if labels:
    sns.barplot(x=labels, y=values, ax=ax, palette='mako')
    ax.set_ylabel('Total Deal Value')  # Or other metric name
    ax.set_xlabel('Category')
else:
    ax.text(0.5, 0.5, 'No query facet data to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**TEMPLATE FOR PATTERN C (Grouped Bar Chart from nested `terms` facet):**
```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))

# Find the key that has the buckets
facet_key = None
for key, value in facet_data.items():
    if isinstance(value, dict) and 'buckets' in value:
        facet_key = key
        break

if facet_key and facet_data[facet_key].get('buckets'):
    # Build a tidy list of rows; this loop is robust for parsing nested metrics
    plot_data = []
    for bucket in facet_data[facet_key]['buckets']:
        category = bucket['val']
        # Find all nested metrics (e.g., total_deal_value_2025)
        for sub_key, sub_value in bucket.items():
            if isinstance(sub_value, dict) and 'sum' in sub_value:
                # Extracts year from 'total_deal_value_2025' -> '2025'
                year = sub_key.split('_')[-1]
                value = sub_value['sum']
                plot_data.append({{'Category': category, 'Year': year, 'Value': value}})
    if plot_data:
        df = pd.DataFrame(plot_data)
        sns.barplot(data=df, x='Category', y='Value', hue='Year', ax=ax)
        ax.set_ylabel('Total Deal Value')
        ax.set_xlabel('Business Model')
        # Correct way to rotate labels to prevent errors
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    else:
        ax.text(0.5, 0.5, 'No nested data found to plot.', ha='center')
else:
    ax.text(0.5, 0.5, 'No data in buckets to plot.', ha='center')

ax.set_title('Your Insightful Title Here')
plt.tight_layout()
```
---
**3. Final Code Generation:**
- **DO NOT** include `plt.show()`.
- **DO** set a dynamic and descriptive `ax.set_title()`, `ax.set_xlabel()`, and `ax.set_ylabel()`.
- **DO NOT** wrap the code in ```python ... ```. Output only the raw Python code.
- Adapt the chosen template to the specific keys and metrics in the provided `facet_data`.

**Your Task:**
Now, generate the Python code.
    """
    try:
        # Allow a larger output budget for potentially complex generation
        generation_config = genai.types.GenerationConfig(temperature=0, max_output_tokens=2048)
        response = llm_model.generate_content(prompt, generation_config=generation_config)
        # Clean the response to remove markdown formatting
        code = re.sub(r'^```python\s*|\s*```$', '', response.text, flags=re.MULTILINE)
        return code
    except Exception as e:
        raw_response_text = response.text if 'response' in locals() else 'N/A'
        print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
        return None
def execute_viz_code_and_get_path(viz_code, facet_data):
    """Executes visualization code and returns the path to the saved plot image."""
    if not viz_code:
        return None
    try:
        if not os.path.exists('/tmp/plots'):
            os.makedirs('/tmp/plots')
        plot_path = f"/tmp/plots/plot_{datetime.datetime.now().timestamp()}.png"
        # The exec environment needs access to the required libraries and the data
        exec_globals = {'facet_data': facet_data, 'plt': plt, 'sns': sns, 'pd': pd}
        exec(viz_code, exec_globals)
        fig = exec_globals.get('fig')
        if fig:
            fig.savefig(plot_path, bbox_inches='tight')
            plt.close(fig)  # Important to free up memory
            return plot_path
        return None
    except Exception as e:
        print(f"ERROR executing visualization code: {e}\n---Code---\n{viz_code}")
        return None
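
# Hypothetical end-to-end usage with a Pattern A facet payload (shape taken from the
# visualization prompt above):
# _demo_facets = {"count": 12, "top_companies": {"buckets": [{"val": "pfizer inc.", "count": 12}]}}
# _code = llm_generate_visualization_code("Top companies by news count", _demo_facets)
# _path = execute_viz_code_and_get_path(_code, _demo_facets)  # returns a PNG path or None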
def llm_generate_summary_and_suggestions_stream(query_context, facet_data):
    """
    Yields a streaming analytical report and strategic, context-aware suggestions for further exploration.
    """
    prompt = f"""
You are a leading business intelligence analyst and strategist. Your audience is an executive or decision-maker who relies on you to not just present data, but to uncover its meaning and suggest smart next steps.
Your task is to analyze the provided data, deliver a concise, insightful report, and then propose logical follow-up analyses that could uncover deeper trends or causes.

**Today's Date for Context:** {datetime.datetime.now().strftime('%Y-%m-%d')}

**Analysis Context:**
* **User's Core Question:** "{query_context}"
* **Structured Data (Your Evidence):**
```json
{json.dumps(facet_data, indent=2)}
```

**--- INSTRUCTIONS ---**

**PART 1: THE ANALYTICAL REPORT**
Structure your report using Markdown. Your tone should be insightful, data-driven, and forward-looking.
* `## Executive Summary`: A 1-2 sentence, top-line answer to the user's core question. Get straight to the point.
* `### Key Findings & Insights`: Use bullet points. Don't just state the data; interpret it.
    * Highlight the most significant figures, patterns, or anomalies.
    * Where relevant, calculate key differences or growth rates (e.g., "X is 25% higher than Y").
    * Pinpoint what the visualization or data reveals about the core business question.
    * **Data Note:** Briefly mention any important caveats if apparent from the data (e.g., a short time frame, a small sample size).
* `### Context & Implications`: Briefly explain the "so what?" of these findings. What might this mean for our strategy, the market, or operations?

**PART 2: DEEPER DIVE: SUGGESTED FOLLOW-UP ANALYSES**
After the report, create a final section titled `### Deeper Dive: Suggested Follow-up Analyses`.
* **Think like a strategist.** Based on the findings, what would you ask next to validate a trend, understand a change, or uncover a root cause?
* **Propose 2-3 logical next questions.** These should be concise and framed as natural language questions that inspire further exploration.
* **Focus on comparative and trend analysis.** For example:
    * If the user asked for "this year," suggest a comparison: *"How does this year's performance in [X] compare to last year?"*
    * If a category is a clear leader, suggest breaking it down: *"What are the top sub-categories driving the growth in [Leading Category]?"*
    * If there's a time-based trend, suggest exploring correlations: *"Is the decline in [Metric Z] correlated with changes in any other category during the same period?"*
* Format them as a numbered list.
* Ensure your suggestions are answerable using the available field definitions below.

### FIELD DEFINITIONS (Your Source of Truth)
{formatted_field_info}

**--- YOUR TASK ---**
Generate the full report and the strategic suggestions based on the user's question and the data provided.
    """
    try:
        response_stream = llm_model.generate_content(prompt, stream=True)
        for chunk in response_stream:
            yield chunk.text
    except Exception as e:
        print(f"Error in llm_generate_summary_and_suggestions_stream: {e}")
        yield "Sorry, I was unable to generate a summary for this data."
def process_analysis_flow(user_input, history, state):
    """
    A generator that manages the conversation and yields tuples of UI updates for Gradio.
    It treats any user input as a new query and takes the conversation history into account.
    """
    # Initialize state on the first run
    if state is None:
        state = {'query_count': 0, 'last_suggestions': []}
    # If history is None (from a reset), initialize it as an empty list
    if history is None:
        history = []

    # Reset UI components for the new analysis, but keep the chat history
    yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False))
    query_context = user_input.strip()
    if not query_context:
        history.append((user_input, "Please enter a question to analyze."))
        yield (history, state, None, None, None, None)
        return

    # 1. Acknowledge and start the process
    history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating Solr query...*"))
    yield (history, state, None, None, None, None)

    # 2. Generate Solr Query with history
    llm_solr_obj = llm_generate_solr_query_with_history(query_context, field_metadata, history)
    if not llm_solr_obj or 'query' not in llm_solr_obj or 'json.facet' not in llm_solr_obj:
        history.append((None, "I'm sorry, I couldn't generate a valid Solr query for that request. Please try rephrasing your question."))
        yield (history, state, None, None, None, None)
        return
    solr_q, solr_facet = llm_solr_obj.get('query'), llm_solr_obj.get('json.facet')
    history.append((None, "✅ Solr query generated!"))
    formatted_query = f"**Query:**\n```\n{solr_q}\n```\n\n**Facet JSON:**\n```json\n{json.dumps(solr_facet, indent=2)}\n```"
    yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)
    # 3. Execute Query
    try:
        history.append((None, "*Executing query against the database...*"))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)
        search_params = {"rows": 0, "json.facet": json.dumps(solr_facet)}
        results = solr_client.search(q=solr_q, **search_params)
        facet_data = results.raw_response.get("facets", {})
        formatted_data = f"**Facet Data:**\n```json\n{json.dumps(facet_data, indent=2)}\n```"
        if not facet_data or facet_data.get('count', 0) == 0:
            history.append((None, "No data was found for your query. Please try a different question."))
            yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            return

        # 4. Generate the visualization concurrently with the report
        history.append((None, "✅ Data retrieved. Generating visualization..."))
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Start visualization generation in the background
            viz_future = executor.submit(llm_generate_visualization_code, query_context, facet_data)

            # 5. Generate and stream the report while the plot is being prepared
            history.append((None, "✅ Streaming final report..."))
            output_report = gr.update(value="", visible=True)  # Make it visible before streaming
            yield (history, state, None, output_report, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            report_text = ""
            # The history object is not modified during streaming, so we pass a copy once;
            # the streaming yields only update the report text.
            stream_history = history[:]
            for chunk in llm_generate_summary_and_suggestions_stream(query_context, facet_data):
                report_text += chunk
                yield (stream_history, state, None, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
            # Update the main history with the final report text
            history.append((None, report_text))

            # Get the visualization code from the future
            viz_code = viz_future.result()
        plot_path = execute_viz_code_and_get_path(viz_code, facet_data)
        output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
        if not plot_path:
            history.append((None, "*I was unable to generate a plot for this data.*\n"))
        yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))

        # 6. Finalize and prompt for the next action
        state['query_count'] += 1
        state['last_suggestions'] = parse_suggestions_from_report(report_text)
        next_prompt = "Analysis complete. What would you like to explore next? You can ask a follow-up question, or ask something new."
        history.append((None, next_prompt))
        yield (history, state, output_plot, report_text, gr.update(value=formatted_query, visible=True), gr.update(value=formatted_data, visible=True))
    except Exception as e:
        error_message = f"An unexpected error occurred during analysis: {e}"
        history.append((None, error_message))
        print(f"Error during analysis execution: {e}")
        yield (history, state, None, None, gr.update(value=formatted_query, visible=True), None)
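
# Note: every tuple yielded by process_analysis_flow must follow the same order as the
# `outputs` list wired to `msg_textbox.submit` below:
# (chatbot, state, plot_display, report_display, solr_query_display, solr_data_display).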
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
    state = gr.State()
    with gr.Row():
        with gr.Column(scale=4):
            gr.Markdown("# 📊 PharmaCircle AI Data Analyst")
        with gr.Column(scale=1):
            clear_button = gr.Button("🔄 Start New Analysis", variant="primary")
    gr.Markdown("Ask a question to begin your analysis. I will generate a Solr query, retrieve the data, create a visualization, and write a report. You can then ask follow-up questions freely.")
    with gr.Row():
        with gr.Column(scale=1):
            chatbot = gr.Chatbot(label="Analysis Chat Log", height=700, show_copy_button=True)
            msg_textbox = gr.Textbox(placeholder="Ask a question, e.g., 'Show me the top 5 companies by total deal value in 2023'", label="Your Question", interactive=True)
        with gr.Column(scale=2):
            with gr.Accordion("Generated Solr Query", open=False):
                solr_query_display = gr.Markdown("Query will appear here...", visible=True)
            with gr.Accordion("Retrieved Solr Data", open=False):
                solr_data_display = gr.Markdown("Data will appear here...", visible=False)
            plot_display = gr.Image(label="Visualization", type="filepath", visible=False)
            report_display = gr.Markdown("Report will be streamed here...", visible=False)
    # --- Event Wiring ---
    def reset_all():
        """Resets the entire UI for a new analysis session."""
        return (
            [],                                    # chatbot (cleared)
            None,                                  # state (reset)
            "",                                    # msg_textbox (cleared)
            gr.update(value=None, visible=False),  # plot_display
            gr.update(value=None, visible=False),  # report_display
            gr.update(value=None, visible=False),  # solr_query_display
            gr.update(value=None, visible=False)   # solr_data_display
        )

    # Main event handler for all user queries
    msg_textbox.submit(
        fn=process_analysis_flow,
        inputs=[msg_textbox, chatbot, state],
        outputs=[chatbot, state, plot_display, report_display, solr_query_display, solr_data_display],
    ).then(
        lambda: gr.update(value=""),
        None,
        [msg_textbox],
        queue=False,
    )

    clear_button.click(
        fn=reset_all,
        inputs=None,
        outputs=[chatbot, state, msg_textbox, plot_display, report_display, solr_query_display, solr_data_display],
        queue=False
    )

if is_initialized:
    demo.queue().launch(debug=True, share=True)
else:
    print("\nSkipping Gradio launch due to initialization errors.")