token_count #1
opened by uralk

Files changed:
- data_processing.py +10 -31
- llm_prompts.py +9 -19
- requirements.txt +1 -2
- ui.py +22 -58
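This diff removes the tiktoken-based token and cost accounting end to end: the `import tiktoken` and per-call token counts in data_processing.py, the `tiktoken` entry in requirements.txt, and the "Token Usage" accordion plus the cost arithmetic in ui.py. The report streaming helper now yields plain text chunks (with a user-facing error string on failure), the visualization helper returns `None` explicitly on error, and the analysis-plan prompt and the Gradio output wiring are adjusted to match. Lines the page extractor truncated are marked with … below.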
data_processing.py
CHANGED
@@ -22,7 +22,6 @@ import google.generativeai as genai
 import urllib
 import pysolr
 import config  # Import the config module to access remote host details
-import tiktoken

 from llm_prompts import (
     get_analysis_plan_prompt,
@@ -52,11 +51,11 @@ def llm_generate_analysis_plan_with_history(llm_model, natural_language_query, c

         if intent != 'search_list':
             print(f"API returned intent '{intent}' which is not 'search_list'. Aborting analysis.")
-            return None, None, None, intent
+            return None, None, None, intent

     except Exception as e:
         print(f"Warning: Could not retrieve dynamic search fields. Proceeding without them. Error: {e}")
-        return None, [], None, 'api_error'
+        return None, [], None, 'api_error'

     core_name = search_name if search_name else 'news'

@@ -76,21 +75,17 @@ def llm_generate_analysis_plan_with_history(llm_model, natural_language_query, c

     try:
         response = llm_model.generate_content(prompt)
-        encoding = tiktoken.encoding_for_model("gpt-4")
-        input_token_count = len(encoding.encode(prompt))
-        output_token_count = len(encoding.encode(response.text))
-        total_token_count = (input_token_count if input_token_count is not None else 0) + (output_token_count if output_token_count is not None else 0)
         cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
         plan = json.loads(cleaned_text)
-        return plan, mapped_search_fields, core_name, intent
+        return plan, mapped_search_fields, core_name, intent
     except json.JSONDecodeError as e:
         raw_response_text = response.text if 'response' in locals() else 'N/A'
         print(f"Error decoding JSON from LLM response: {e}\nRaw Response:\n{raw_response_text}")
-        return None, mapped_search_fields, core_name, intent
+        return None, mapped_search_fields, core_name, intent
     except Exception as e:
         raw_response_text = response.text if 'response' in locals() else 'N/A'
         print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}")
-        return None, mapped_search_fields, core_name, intent
+        return None, mapped_search_fields, core_name, intent

 def execute_quantitative_query(solr_client, plan):
     """Executes the facet query to get aggregate data."""
@@ -155,24 +150,12 @@ def llm_synthesize_enriched_report_stream(llm_model, query, quantitative_data, q
     """
     prompt = get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan)
     try:
-        response_stream = llm_model.generate_content(prompt, stream=True)
-        response_text = ""
+        response_stream = llm_model.generate_content(prompt, stream=True)
         for chunk in response_stream:
-            yield …
-            response_text += chunk.text
-        encoding = tiktoken.encoding_for_model("gpt-4")
-        input_token_count = len(encoding.encode(prompt))
-        output_token_count = len(encoding.encode(response_text))
-        total_token_count = (input_token_count if input_token_count is not None else 0) + (output_token_count if output_token_count is not None else 0)
-        tokens = {
-            "input": input_token_count,
-            "output": output_token_count,
-            "total": total_token_count,
-        }
-        yield {"text": None, "tokens": tokens}
+            yield chunk.text
     except Exception as e:
         print(f"Error in llm_synthesize_enriched_report_stream: {e}")
-        yield …
+        yield "Sorry, an error occurred while generating the report. Please check the logs for details."

 def llm_generate_visualization_code(llm_model, query_context, facet_data):
     """Generates Python code for visualization based on query and data."""
@@ -180,16 +163,12 @@ def llm_generate_visualization_code(llm_model, query_context, facet_data):
     try:
         generation_config = genai.types.GenerationConfig(temperature=0)
         response = llm_model.generate_content(prompt, generation_config=generation_config)
-        encoding = tiktoken.encoding_for_model("gpt-4")
-        input_token_count = len(encoding.encode(prompt))
-        output_token_count = len(encoding.encode(response.text))
-        total_token_count = (input_token_count if input_token_count is not None else 0) + (output_token_count if output_token_count is not None else 0)
         code = re.sub(r'^```python\s*|```$', '', response.text, flags=re.MULTILINE)
-        return code
+        return code
     except Exception as e:
         raw_response_text = response.text if 'response' in locals() else 'N/A'
         print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
-        return
+        return None

 def execute_viz_code_and_get_path(viz_code, facet_data):
     """Executes visualization code and returns the path to the saved plot image."""
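For quick reference while reviewing, this is how the streaming helper reads once the hunks above are applied — a minimal sketch assembled from the `+` lines, assuming `get_synthesis_report_prompt` and the Gemini `llm_model` from the surrounding module:

```python
def llm_synthesize_enriched_report_stream(llm_model, query, quantitative_data, qualitative_data, plan):
    """Streams the synthesized report as plain text chunks; token accounting is gone."""
    prompt = get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan)
    try:
        # Iterate Gemini's streaming response and pass each text fragment straight through.
        response_stream = llm_model.generate_content(prompt, stream=True)
        for chunk in response_stream:
            yield chunk.text
    except Exception as e:
        print(f"Error in llm_synthesize_enriched_report_stream: {e}")
        # A user-facing string is yielded so the UI shows the failure in place of the report.
        yield "Sorry, an error occurred while generating the report. Please check the logs for details."
```

Callers now receive plain strings instead of the previous `{"text": ..., "tokens": ...}` dicts; the ui.py hunks below consume exactly that.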
llm_prompts.py
CHANGED
@@ -44,11 +44,9 @@ An external API has identified the following field-value pairs from the user que
     """

     return f"""
-You are …
+You are an expert financial and scientific analyst specializing in the pharmaceutical industry. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).

-Your …
-
-Your most important job is to correctly infer the user's intent and choose an `analysis_dimension` and `analysis_measure` that provides a meaningful, non-obvious breakdown of the data that aligns with PharmaCircle's mission of tracking drug development and innovation.
+Your most important job is to correctly infer the user's intent (are they a scientist or a financial analyst?) and choose an `analysis_dimension` and `analysis_measure` that provides a meaningful, non-obvious breakdown of the data for them.

 ---
 ### CONTEXT & RULES
@@ -73,9 +71,8 @@ never add an additional filter by yourself like `total_deal_value_in_million:[0
 This is the most critical part of your task. A bad choice leads to a useless, boring analysis. You must first determine the user's persona and then select the analysis parameters accordingly.

 **USER PERSONAS:**
-
-* **The …
-* **The Scientific Analyst:** This user cares about the science. They track drug development, from discovery to market. They look for product pipelines, clinical trial phases, therapeutic breakthroughs, formulation details, and compound data. Their queries contain terms like "drug approvals," "phase 2," "therapeutic category," "compounds," "molecule," or "mechanism."
+* **The Financial Analyst:** This user cares about the money. They look for investments, acquisitions, deal values, and company financials. Their queries contain terms like "deal," "value," "acquisition," "financing," "investment," or "revenue."
+* **The Scientific Analyst:** This user cares about the science. They look for product pipelines, clinical trial phases, therapeutic breakthroughs, and compound details. Their queries contain terms like "drug approvals," "phase 2," "therapeutic category," "compounds," "molecule," or "mechanism."

 **1. Choosing the `analysis_measure` (The metric):**

@@ -88,20 +85,13 @@ Your users are PharmaCircle clients, primarily from the US (70%), Europe, and As

 * **USER INTENT FIRST:** If the user explicitly asks to group by a field (e.g., "by company," "by country"), use that field.

-* **INFERENCE HEURISTICS (If the user doesn't specify a dimension):** Think "What is the next logical question for this user persona…
-
-    * **PharmaCircle Mission Priority:** Given PharmaCircle's focus on product pipelines and development timelines, **you should strongly prioritize `product_name`, `compound_name`, and date related fields as `analysis_dimension`s.** A time-based analysis (e.g., 'by year') or a product-focused analysis is often the most valuable insight for our users who are tracking progress, approvals, or activities over time.
-
+* **INFERENCE HEURISTICS (If the user doesn't specify a dimension):** Think "What is the next logical question for this user persona?"
     * For a **Financial Analyst** asking about "top deals" or "recent financings," a good dimension is `company_name` (who is making deals?) or `news_type` (what kind of deals?). If the query is about "recent deals about infection," the dimension should be `company_name_invested`. Using `company_name` would pollute the data with both investor and invested companies.
-
-    * For a **Scientific Analyst** asking about …
-
-    * For a **Scientific Analyst** asking about phase movements (e.g., "phase 2 to phase 3" or "phase 2 or phase 3"), a highly valuable dimension is `compound_name` or `product_name`. This reveals which specific products are progressing through the pipeline.
-
+    * For a **Scientific Analyst** asking about "drug approvals," a good dimension is `therapeutic_category` (what diseases are the approvals for?) or `company_name` (who is getting the approvals?).
+    * For a **Scientific Analyst** asking about phase movements (e.g., "phase 2 to phase 3" or "phase 2 or phase 3"), a highly valuable dimension is `compound_name`. This reveals which specific compounds are progressing through the pipeline.
     * If the query compares concepts like "cancer vs. infection," the dimension is `therapeutic_category`.
     * If the query compares "oral vs. injection," the dimension is `route_branch`.
-
-    * Your goal is to find a dimension that reveals a meaningful pattern in the filtered data that is relevant to the user's likely persona and PharmaCircle's core value proposition.
+    * Your goal is to find a dimension that reveals a meaningful pattern in the filtered data that is relevant to the user's likely persona.
 ---
 ### FIELD DEFINITIONS (Your Source of Truth for Core: {core_name})

@@ -225,7 +215,7 @@ Your users are PharmaCircle clients, primarily from the US (70%), Europe, and As
             "limit": 2,
             "sort": "total_deal_value desc",
             "facet": {{
-                "…
+                "total_deal_value": "sum(total_deal_value_in_million)"
             }}
         }}
     }}
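A note for reviewers on the doubled braces in the hunk above: the prompt is built as a Python f-string, so `{{` and `}}` render as literal `{` and `}` while `{core_name}` is interpolated. A minimal sketch with illustrative values only:

```python
core_name = "news"  # assumption: example core name, not from this PR

facet_example = f"""
### FIELD DEFINITIONS (Your Source of Truth for Core: {core_name})
"facet": {{
    "total_deal_value": "sum(total_deal_value_in_million)"
}}
"""

# The doubled braces reach the LLM as plain JSON braces:
print(facet_example)
```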
requirements.txt
CHANGED
@@ -5,5 +5,4 @@ google-generativeai
 pandas
 seaborn
 matplotlib
-IPython
-tiktoken
+IPython
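Dropping `tiktoken` also drops the gpt-4 tokenizer approximation that was being applied to a Gemini model. If token accounting is ever reintroduced, the Gemini SDK reports actual usage on each response — a sketch only, assuming a recent `google-generativeai` release; the model name and key handling are illustrative, not part of this PR:

```python
import google.generativeai as genai

genai.configure(api_key="...")  # assumption: key loaded from env/config
model = genai.GenerativeModel("gemini-1.5-flash")  # hypothetical model choice

response = model.generate_content("Summarize recent phase 3 oncology deals.")
usage = response.usage_metadata  # populated by the API; no local tokenizer needed
print(usage.prompt_token_count, usage.candidates_token_count, usage.total_token_count)
```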
ui.py
CHANGED
@@ -70,8 +70,6 @@ def create_ui(llm_model, solr_client):
                 "Qualitative URL will appear here...", visible=False)
             qualitative_data_display = gr.Markdown(
                 "Example data will appear here...", visible=False)
-            with gr.Accordion("Token Usage", open=False):
-                token_summary_box = gr.Markdown(visible=False)
             plot_display = gr.Image(
                 label="Visualization", type="filepath", visible=False)
             report_display = gr.Markdown(
@@ -81,28 +79,25 @@ def create_ui(llm_model, solr_client):
         """
         Manages the conversation and yields UI updates.
         """
-        analysis_plan_input_token_count = analysis_plan_output_token_count = analysis_plan_total_token_count = None
-        enriched_report_input_token_count = enriched_report_output_token_count = enriched_report_total_token_count = None
-        visualization_input_token_count = visualization_output_token_count = visualization_total_token_count = None
         if state is None:
             state = {'query_count': 0, 'last_suggestions': []}
         if history is None:
             history = []

         # Reset all displays at the beginning of a new flow
-        yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=…
+        yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value="Suggestions from the external API will appear here...", visible=False))

         query_context = user_input.strip()
         if not query_context:
             history.append((user_input, "Please enter a question to analyze."))
-            yield (history, state, None, None, None, None, None, None, None, None…
+            yield (history, state, None, None, None, None, None, None, None, None)
             return

         history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*"))
-        yield (history, state, None, None, None, None, None, None, None, None…
+        yield (history, state, None, None, None, None, None, None, None, None)

         # Generate plan, get search field suggestions, and intent.
-        analysis_plan, mapped_search_fields, core_name, intent…
+        analysis_plan, mapped_search_fields, core_name, intent = llm_generate_analysis_plan_with_history(llm_model, query_context, history)

         # Update and display search field suggestions in its own accordion
         if mapped_search_fields:
@@ -117,7 +112,7 @@ def create_ui(llm_model, solr_client):
         else:
             message = "I'm sorry, I couldn't generate a valid analysis plan. Please try rephrasing your question."
             history.append((None, message))
-            yield (history, state, None, None, None, None, None, None, None,…
+            yield (history, state, None, None, None, None, None, None, None, suggestions_display_update)
             return

         history.append((None, f"✅ Analysis plan generated for core: **`{core_name}`**"))
@@ -128,10 +123,10 @@ def create_ui(llm_model, solr_client):
         """
         history.append((None, plan_summary))
         formatted_plan = f"**Full Analysis Plan (Core: `{core_name}`):**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```"
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None,…
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, suggestions_display_update)

         history.append((None, "*Executing queries for aggregates and examples...*"))
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None,…
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, suggestions_display_update)

         # --- DYNAMIC CORE SWITCH (Thread-safe) ---
         with solr_lock:
@@ -158,7 +153,7 @@ def create_ui(llm_model, solr_client):

         if not aggregate_data or aggregate_data.get('count', 0) == 0:
             history.append((None, f"No data was found for your query in the '{core_name}' core. Please try a different question."))
-            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None,…
+            yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, suggestions_display_update)
             return

         # Display retrieved data
@@ -167,66 +162,36 @@ def create_ui(llm_model, solr_client):
         formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```"
         formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```"
         qual_data_display_update = gr.update(value=formatted_qual_data, visible=True)
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update,…
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)

         history.append((None, "✅ Data retrieved. Generating visualization and final report..."))
-        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update,…
+        yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)
+
         # Generate viz and report
         with concurrent.futures.ThreadPoolExecutor() as executor:
             viz_future = executor.submit(llm_generate_visualization_code, llm_model, query_context, aggregate_data)
-
+
             report_text = ""
             stream_history = history[:]
-
-            for item in llm_synthesize_enriched_report_stream(llm_model, query_context, aggregate_data, example_data, analysis_plan):
-                if item["text"] is not None:
-                    report_text += item["text"]
-                    yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, None, suggestions_display_update)
-                elif item["tokens"] is not None:
-                    enriched_report_input_token_count = item["tokens"]["input"]
-                    enriched_report_output_token_count = item["tokens"]["output"]
-                    enriched_report_total_token_count = item["tokens"]["total"]
+            for chunk in llm_synthesize_enriched_report_stream(llm_model, query_context, aggregate_data, example_data, analysis_plan):
+                report_text += chunk
+                yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)
+
             history.append((None, report_text))

+            viz_code = viz_future.result()
             plot_path = execute_viz_code_and_get_path(viz_code, aggregate_data)
             output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
             if not plot_path:
                 history.append((None, "*I was unable to generate a plot for this data.*\n"))
-
-            cumulative_tokens = sum(filter(None, [
-                analysis_plan_total_token_count,
-                enriched_report_total_token_count,
-                visualization_total_token_count
-            ]))
-
-            total_input = sum(filter(None, [
-                analysis_plan_input_token_count,
-                enriched_report_input_token_count,
-                visualization_input_token_count
-            ]))
-            total_output = sum(filter(None, [
-                analysis_plan_output_token_count,
-                enriched_report_output_token_count,
-                visualization_output_token_count
-            ]))
-            expected_cost = round((total_input*0.3+total_output*2.5)/1000000, 3)
-
-            token_summary_box_update = gr.update(
-                value=f"""**Analysis Plan Tokens** → Prompt: `{analysis_plan_input_token_count or '-'}`, Output: `{analysis_plan_output_token_count or '-'}`, Total: `{analysis_plan_total_token_count or '-'}`
-**Report Tokens** → Prompt: `{enriched_report_input_token_count or '-'}`, Output: `{enriched_report_output_token_count or '-'}`, Total: `{enriched_report_total_token_count or '-'}`
-**Visualization Tokens** → Prompt: `{visualization_input_token_count or '-'}`, Output: `{visualization_output_token_count or '-'}`, Total: `{visualization_total_token_count or '-'}`
-
-**Cumulative Tokens** → `{cumulative_tokens}`
-**Expected Cost** → `{expected_cost}$`""",
-                visible=True
-            )
-            yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, token_summary_box_update, suggestions_display_update)
+
+            yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)

         state['query_count'] += 1
         state['last_suggestions'] = parse_suggestions_from_report(report_text)
         next_prompt = "Analysis complete. What would you like to explore next?"
         history.append((None, next_prompt))
-        yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update,…
+        yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)

     def reset_all():
         """Resets the entire UI for a new analysis session."""
@@ -241,7 +206,6 @@ def create_ui(llm_model, solr_client):
             gr.update(value=None, visible=False),
             gr.update(value=None, visible=False),
             gr.update(value=None, visible=False),
-            gr.update(value=None, visible=False),
             gr.update(value=None, visible=False)
         )

@@ -249,7 +213,7 @@ def create_ui(llm_model, solr_client):
         fn=process_analysis_flow,
         inputs=[msg_textbox, chatbot, state],
         outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_url_display,
-                 quantitative_data_display, qualitative_url_display, qualitative_data_display,…
+                 quantitative_data_display, qualitative_url_display, qualitative_data_display, suggestions_display],
     ).then(
         lambda: gr.update(value=""),
         None,
@@ -261,7 +225,7 @@ def create_ui(llm_model, solr_client):
         fn=reset_all,
         inputs=None,
         outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_url_display,
-                 quantitative_data_display, qualitative_url_display, qualitative_data_display,…
+                 quantitative_data_display, qualitative_url_display, qualitative_data_display, suggestions_display],
         queue=False
     )
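Taken together, the producer/consumer contract after this diff is simply "generator of strings in, concatenated Markdown out". A self-contained sketch of that contract — the fake model below is illustrative only, standing in for the Gemini client:

```python
# Illustrative stand-ins for the Gemini streaming response; not part of the PR.
class _FakeChunk:
    def __init__(self, text):
        self.text = text

class _FakeModel:
    def generate_content(self, prompt, stream=False):
        # Mimics Gemini's streaming iterator of chunks with a .text attribute.
        for piece in ("## Report\n", "- Finding one\n", "- Finding two\n"):
            yield _FakeChunk(piece)

def synthesize_stream(llm_model, prompt):
    """Mirrors llm_synthesize_enriched_report_stream after this diff: plain text out."""
    try:
        for chunk in llm_model.generate_content(prompt, stream=True):
            yield chunk.text
    except Exception as e:
        print(f"Error: {e}")
        yield "Sorry, an error occurred while generating the report."

# The UI loop reduces to string accumulation; each iteration re-renders the Markdown.
report_text = ""
for chunk in synthesize_stream(_FakeModel(), "example prompt"):
    report_text += chunk
print(report_text)
```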