token_count

#1
by uralk - opened
Files changed (4) hide show
  1. data_processing.py +10 -31
  2. llm_prompts.py +9 -19
  3. requirements.txt +1 -2
  4. ui.py +22 -58
data_processing.py CHANGED
@@ -22,7 +22,6 @@ import google.generativeai as genai
22
  import urllib
23
  import pysolr
24
  import config # Import the config module to access remote host details
25
- import tiktoken
26
 
27
  from llm_prompts import (
28
  get_analysis_plan_prompt,
@@ -52,11 +51,11 @@ def llm_generate_analysis_plan_with_history(llm_model, natural_language_query, c
52
 
53
  if intent != 'search_list':
54
  print(f"API returned intent '{intent}' which is not 'search_list'. Aborting analysis.")
55
- return None, None, None, intent, None, None, None
56
 
57
  except Exception as e:
58
  print(f"Warning: Could not retrieve dynamic search fields. Proceeding without them. Error: {e}")
59
- return None, [], None, 'api_error', None, None, None
60
 
61
  core_name = search_name if search_name else 'news'
62
 
@@ -76,21 +75,17 @@ def llm_generate_analysis_plan_with_history(llm_model, natural_language_query, c
76
 
77
  try:
78
  response = llm_model.generate_content(prompt)
79
- encoding = tiktoken.encoding_for_model("gpt-4")
80
- input_token_count = len(encoding.encode(prompt))
81
- output_token_count = len(encoding.encode(response.text))
82
- total_token_count = (input_token_count if input_token_count is not None else 0) + (output_token_count if output_token_count is not None else 0)
83
  cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
84
  plan = json.loads(cleaned_text)
85
- return plan, mapped_search_fields, core_name, intent, input_token_count, output_token_count, total_token_count
86
  except json.JSONDecodeError as e:
87
  raw_response_text = response.text if 'response' in locals() else 'N/A'
88
  print(f"Error decoding JSON from LLM response: {e}\nRaw Response:\n{raw_response_text}")
89
- return None, mapped_search_fields, core_name, intent, None, None, None
90
  except Exception as e:
91
  raw_response_text = response.text if 'response' in locals() else 'N/A'
92
  print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}")
93
- return None, mapped_search_fields, core_name, intent, None, None, None
94
 
95
  def execute_quantitative_query(solr_client, plan):
96
  """Executes the facet query to get aggregate data."""
@@ -155,24 +150,12 @@ def llm_synthesize_enriched_report_stream(llm_model, query, quantitative_data, q
155
  """
156
  prompt = get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan)
157
  try:
158
- response_stream = llm_model.generate_content(prompt, stream=True)
159
- response_text = ""
160
  for chunk in response_stream:
161
- yield {"text": chunk.text, "tokens": None}
162
- response_text += chunk.text
163
- encoding = tiktoken.encoding_for_model("gpt-4")
164
- input_token_count = len(encoding.encode(prompt))
165
- output_token_count = len(encoding.encode(response_text))
166
- total_token_count = (input_token_count if input_token_count is not None else 0) + (output_token_count if output_token_count is not None else 0)
167
- tokens = {
168
- "input": input_token_count,
169
- "output": output_token_count,
170
- "total": total_token_count,
171
- }
172
- yield {"text": None, "tokens": tokens}
173
  except Exception as e:
174
  print(f"Error in llm_synthesize_enriched_report_stream: {e}")
175
- yield {"text": "Sorry, an error occurred while generating the report. Please check the logs for details.", "tokens": None}
176
 
177
  def llm_generate_visualization_code(llm_model, query_context, facet_data):
178
  """Generates Python code for visualization based on query and data."""
@@ -180,16 +163,12 @@ def llm_generate_visualization_code(llm_model, query_context, facet_data):
180
  try:
181
  generation_config = genai.types.GenerationConfig(temperature=0)
182
  response = llm_model.generate_content(prompt, generation_config=generation_config)
183
- encoding = tiktoken.encoding_for_model("gpt-4")
184
- input_token_count = len(encoding.encode(prompt))
185
- output_token_count = len(encoding.encode(response.text))
186
- total_token_count = (input_token_count if input_token_count is not None else 0) + (output_token_count if output_token_count is not None else 0)
187
  code = re.sub(r'^```python\s*|```$', '', response.text, flags=re.MULTILINE)
188
- return code, input_token_count, output_token_count, total_token_count
189
  except Exception as e:
190
  raw_response_text = response.text if 'response' in locals() else 'N/A'
191
  print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
192
- return
193
 
194
  def execute_viz_code_and_get_path(viz_code, facet_data):
195
  """Executes visualization code and returns the path to the saved plot image."""
 
22
  import urllib
23
  import pysolr
24
  import config # Import the config module to access remote host details
 
25
 
26
  from llm_prompts import (
27
  get_analysis_plan_prompt,
 
51
 
52
  if intent != 'search_list':
53
  print(f"API returned intent '{intent}' which is not 'search_list'. Aborting analysis.")
54
+ return None, None, None, intent
55
 
56
  except Exception as e:
57
  print(f"Warning: Could not retrieve dynamic search fields. Proceeding without them. Error: {e}")
58
+ return None, [], None, 'api_error'
59
 
60
  core_name = search_name if search_name else 'news'
61
 
 
75
 
76
  try:
77
  response = llm_model.generate_content(prompt)
 
 
 
 
78
  cleaned_text = re.sub(r'```json\s*|\s*```', '', response.text, flags=re.MULTILINE | re.DOTALL).strip()
79
  plan = json.loads(cleaned_text)
80
+ return plan, mapped_search_fields, core_name, intent
81
  except json.JSONDecodeError as e:
82
  raw_response_text = response.text if 'response' in locals() else 'N/A'
83
  print(f"Error decoding JSON from LLM response: {e}\nRaw Response:\n{raw_response_text}")
84
+ return None, mapped_search_fields, core_name, intent
85
  except Exception as e:
86
  raw_response_text = response.text if 'response' in locals() else 'N/A'
87
  print(f"Error in llm_generate_analysis_plan_with_history: {e}\nRaw Response:\n{raw_response_text}")
88
+ return None, mapped_search_fields, core_name, intent
89
 
90
  def execute_quantitative_query(solr_client, plan):
91
  """Executes the facet query to get aggregate data."""
 
150
  """
151
  prompt = get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan)
152
  try:
153
+ response_stream = llm_model.generate_content(prompt, stream=True)
 
154
  for chunk in response_stream:
155
+ yield chunk.text
 
 
 
 
 
 
 
 
 
 
 
156
  except Exception as e:
157
  print(f"Error in llm_synthesize_enriched_report_stream: {e}")
158
+ yield "Sorry, an error occurred while generating the report. Please check the logs for details."
159
 
160
  def llm_generate_visualization_code(llm_model, query_context, facet_data):
161
  """Generates Python code for visualization based on query and data."""
 
163
  try:
164
  generation_config = genai.types.GenerationConfig(temperature=0)
165
  response = llm_model.generate_content(prompt, generation_config=generation_config)
 
 
 
 
166
  code = re.sub(r'^```python\s*|```$', '', response.text, flags=re.MULTILINE)
167
+ return code
168
  except Exception as e:
169
  raw_response_text = response.text if 'response' in locals() else 'N/A'
170
  print(f"Error in llm_generate_visualization_code: {e}\nRaw response: {raw_response_text}")
171
+ return None
172
 
173
  def execute_viz_code_and_get_path(viz_code, facet_data):
174
  """Executes visualization code and returns the path to the saved plot image."""
llm_prompts.py CHANGED
@@ -44,11 +44,9 @@ An external API has identified the following field-value pairs from the user que
44
  """
45
 
46
  return f"""
47
- You are the AI Data Analyst for PharmaCircle, a leading knowledge management company dedicated to curating vast amounts of pharmaceutical, biotechnology, and drug delivery industry data into due diligence-level intelligence. Your purpose is to make PharmaCircle's complex and powerful database easily accessible through natural language, providing insightful analysis that would typically require navigating complex search interfaces.
48
 
49
- Your primary task is to convert a user's natural language question into a structured JSON "Analysis Plan". This plan will drive two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).
50
-
51
- Your most important job is to correctly infer the user's intent and choose an `analysis_dimension` and `analysis_measure` that provides a meaningful, non-obvious breakdown of the data that aligns with PharmaCircle's mission of tracking drug development and innovation.
52
 
53
  ---
54
  ### CONTEXT & RULES
@@ -73,9 +71,8 @@ never add an additional filter by yourself like `total_deal_value_in_million:[0
73
  This is the most critical part of your task. A bad choice leads to a useless, boring analysis. You must first determine the user's persona and then select the analysis parameters accordingly.
74
 
75
  **USER PERSONAS:**
76
- Your users are PharmaCircle clients, primarily from the US (70%), Europe, and Asia. They fall into two main categories:
77
- * **The Financial Analyst:** This user cares about the money. They look for investments, acquisitions, deal values, and company financials to identify partnering and investment opportunities. Their queries contain terms like "deal," "value," "acquisition," "financing," "investment," or "revenue."
78
- * **The Scientific Analyst:** This user cares about the science. They track drug development, from discovery to market. They look for product pipelines, clinical trial phases, therapeutic breakthroughs, formulation details, and compound data. Their queries contain terms like "drug approvals," "phase 2," "therapeutic category," "compounds," "molecule," or "mechanism."
79
 
80
  **1. Choosing the `analysis_measure` (The metric):**
81
 
@@ -88,20 +85,13 @@ Your users are PharmaCircle clients, primarily from the US (70%), Europe, and As
88
 
89
  * **USER INTENT FIRST:** If the user explicitly asks to group by a field (e.g., "by company," "by country"), use that field.
90
 
91
- * **INFERENCE HEURISTICS (If the user doesn't specify a dimension):** Think "What is the next logical question for this user persona, keeping PharmaCircle's mission in mind?"
92
-
93
- * **PharmaCircle Mission Priority:** Given PharmaCircle's focus on product pipelines and development timelines, **you should strongly prioritize `product_name`, `compound_name`, and date related fields as `analysis_dimension`s.** A time-based analysis (e.g., 'by year') or a product-focused analysis is often the most valuable insight for our users who are tracking progress, approvals, or activities over time.
94
-
95
  * For a **Financial Analyst** asking about "top deals" or "recent financings," a good dimension is `company_name` (who is making deals?) or `news_type` (what kind of deals?). If the query is about "recent deals about infection," the dimension should be `company_name_invested`. Using `company_name` would pollute the data with both investor and invested companies.
96
-
97
- * For a **Scientific Analyst** asking about "drug approvals," a good dimension is `therapeutic_category` (what diseases are the approvals for?) or `company_name` (who is getting the approvals?). See the Mission Priority rule above—if the query implies a timeline, `date_year` might be even better.
98
-
99
- * For a **Scientific Analyst** asking about phase movements (e.g., "phase 2 to phase 3" or "phase 2 or phase 3"), a highly valuable dimension is `compound_name` or `product_name`. This reveals which specific products are progressing through the pipeline.
100
-
101
  * If the query compares concepts like "cancer vs. infection," the dimension is `therapeutic_category`.
102
  * If the query compares "oral vs. injection," the dimension is `route_branch`.
103
-
104
- * Your goal is to find a dimension that reveals a meaningful pattern in the filtered data that is relevant to the user's likely persona and PharmaCircle's core value proposition.
105
  ---
106
  ### FIELD DEFINITIONS (Your Source of Truth for Core: {core_name})
107
 
@@ -225,7 +215,7 @@ Your users are PharmaCircle clients, primarily from the US (70%), Europe, and As
225
  "limit": 2,
226
  "sort": "total_deal_value desc",
227
  "facet": {{
228
- "total_value": "sum(total_deal_value_in_million)"
229
  }}
230
  }}
231
  }}
 
44
  """
45
 
46
  return f"""
47
+ You are an expert financial and scientific analyst specializing in the pharmaceutical industry. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping).
48
 
49
+ Your most important job is to correctly infer the user's intent (are they a scientist or a financial analyst?) and choose an `analysis_dimension` and `analysis_measure` that provides a meaningful, non-obvious breakdown of the data for them.
 
 
50
 
51
  ---
52
  ### CONTEXT & RULES
 
71
  This is the most critical part of your task. A bad choice leads to a useless, boring analysis. You must first determine the user's persona and then select the analysis parameters accordingly.
72
 
73
  **USER PERSONAS:**
74
+ * **The Financial Analyst:** This user cares about the money. They look for investments, acquisitions, deal values, and company financials. Their queries contain terms like "deal," "value," "acquisition," "financing," "investment," or "revenue."
75
+ * **The Scientific Analyst:** This user cares about the science. They look for product pipelines, clinical trial phases, therapeutic breakthroughs, and compound details. Their queries contain terms like "drug approvals," "phase 2," "therapeutic category," "compounds," "molecule," or "mechanism."
 
76
 
77
  **1. Choosing the `analysis_measure` (The metric):**
78
 
 
85
 
86
  * **USER INTENT FIRST:** If the user explicitly asks to group by a field (e.g., "by company," "by country"), use that field.
87
 
88
+ * **INFERENCE HEURISTICS (If the user doesn't specify a dimension):** Think "What is the next logical question for this user persona?"
 
 
 
89
  * For a **Financial Analyst** asking about "top deals" or "recent financings," a good dimension is `company_name` (who is making deals?) or `news_type` (what kind of deals?). If the query is about "recent deals about infection," the dimension should be `company_name_invested`. Using `company_name` would pollute the data with both investor and invested companies.
90
+ * For a **Scientific Analyst** asking about "drug approvals," a good dimension is `therapeutic_category` (what diseases are the approvals for?) or `company_name` (who is getting the approvals?).
91
+ * For a **Scientific Analyst** asking about phase movements (e.g., "phase 2 to phase 3" or "phase 2 or phase 3"), a highly valuable dimension is `compound_name`. This reveals which specific compounds are progressing through the pipeline.
 
 
 
92
  * If the query compares concepts like "cancer vs. infection," the dimension is `therapeutic_category`.
93
  * If the query compares "oral vs. injection," the dimension is `route_branch`.
94
+ * Your goal is to find a dimension that reveals a meaningful pattern in the filtered data that is relevant to the user's likely persona.
 
95
  ---
96
  ### FIELD DEFINITIONS (Your Source of Truth for Core: {core_name})
97
 
 
215
  "limit": 2,
216
  "sort": "total_deal_value desc",
217
  "facet": {{
218
+ "total_deal_value": "sum(total_deal_value_in_million)"
219
  }}
220
  }}
221
  }}
requirements.txt CHANGED
@@ -5,5 +5,4 @@ google-generativeai
5
  pandas
6
  seaborn
7
  matplotlib
8
- IPython
9
- tiktoken
 
5
  pandas
6
  seaborn
7
  matplotlib
8
+ IPython
 
ui.py CHANGED
@@ -70,8 +70,6 @@ def create_ui(llm_model, solr_client):
70
  "Qualitative URL will appear here...", visible=False)
71
  qualitative_data_display = gr.Markdown(
72
  "Example data will appear here...", visible=False)
73
- with gr.Accordion("Token Usage", open=False):
74
- token_summary_box = gr.Markdown(visible=False)
75
  plot_display = gr.Image(
76
  label="Visualization", type="filepath", visible=False)
77
  report_display = gr.Markdown(
@@ -81,28 +79,25 @@ def create_ui(llm_model, solr_client):
81
  """
82
  Manages the conversation and yields UI updates.
83
  """
84
- analysis_plan_input_token_count = analysis_plan_output_token_count = analysis_plan_total_token_count = None
85
- enriched_report_input_token_count = enriched_report_output_token_count = enriched_report_total_token_count = None
86
- visualization_input_token_count = visualization_output_token_count = visualization_total_token_count = None
87
  if state is None:
88
  state = {'query_count': 0, 'last_suggestions': []}
89
  if history is None:
90
  history = []
91
 
92
  # Reset all displays at the beginning of a new flow
93
- yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value="Suggestions from the external API will appear here...", visible=False))
94
 
95
  query_context = user_input.strip()
96
  if not query_context:
97
  history.append((user_input, "Please enter a question to analyze."))
98
- yield (history, state, None, None, None, None, None, None, None, None, None)
99
  return
100
 
101
  history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*"))
102
- yield (history, state, None, None, None, None, None, None, None, None, None)
103
 
104
  # Generate plan, get search field suggestions, and intent.
105
- analysis_plan, mapped_search_fields, core_name, intent, analysis_plan_input_token_count, analysis_plan_output_token_count, analysis_plan_total_token_count = llm_generate_analysis_plan_with_history(llm_model, query_context, history)
106
 
107
  # Update and display search field suggestions in its own accordion
108
  if mapped_search_fields:
@@ -117,7 +112,7 @@ def create_ui(llm_model, solr_client):
117
  else:
118
  message = "I'm sorry, I couldn't generate a valid analysis plan. Please try rephrasing your question."
119
  history.append((None, message))
120
- yield (history, state, None, None, None, None, None, None, None, None, suggestions_display_update)
121
  return
122
 
123
  history.append((None, f"✅ Analysis plan generated for core: **`{core_name}`**"))
@@ -128,10 +123,10 @@ def create_ui(llm_model, solr_client):
128
  """
129
  history.append((None, plan_summary))
130
  formatted_plan = f"**Full Analysis Plan (Core: `{core_name}`):**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```"
131
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, None, suggestions_display_update)
132
 
133
  history.append((None, "*Executing queries for aggregates and examples...*"))
134
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, None, suggestions_display_update)
135
 
136
  # --- DYNAMIC CORE SWITCH (Thread-safe) ---
137
  with solr_lock:
@@ -158,7 +153,7 @@ def create_ui(llm_model, solr_client):
158
 
159
  if not aggregate_data or aggregate_data.get('count', 0) == 0:
160
  history.append((None, f"No data was found for your query in the '{core_name}' core. Please try a different question."))
161
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, None, suggestions_display_update)
162
  return
163
 
164
  # Display retrieved data
@@ -167,66 +162,36 @@ def create_ui(llm_model, solr_client):
167
  formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```"
168
  formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```"
169
  qual_data_display_update = gr.update(value=formatted_qual_data, visible=True)
170
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, None, suggestions_display_update)
171
 
172
  history.append((None, "✅ Data retrieved. Generating visualization and final report..."))
173
- yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, None, suggestions_display_update)
 
174
  # Generate viz and report
175
  with concurrent.futures.ThreadPoolExecutor() as executor:
176
  viz_future = executor.submit(llm_generate_visualization_code, llm_model, query_context, aggregate_data)
177
- viz_code, visualization_input_token_count, visualization_output_token_count, visualization_total_token_count = viz_future.result()
178
  report_text = ""
179
  stream_history = history[:]
180
- report_stream = llm_synthesize_enriched_report_stream(llm_model, query_context, aggregate_data, example_data, analysis_plan)
181
- for item in report_stream:
182
- if item["text"] is not None:
183
- report_text += item["text"]
184
- yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, None, suggestions_display_update)
185
- elif item["tokens"] is not None:
186
- enriched_report_input_token_count = item["tokens"]["input"]
187
- enriched_report_output_token_count = item["tokens"]["output"]
188
- enriched_report_total_token_count = item["tokens"]["total"]
189
  history.append((None, report_text))
190
 
 
191
  plot_path = execute_viz_code_and_get_path(viz_code, aggregate_data)
192
  output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
193
  if not plot_path:
194
  history.append((None, "*I was unable to generate a plot for this data.*\n"))
195
-
196
- cumulative_tokens = sum(filter(None, [
197
- analysis_plan_total_token_count,
198
- enriched_report_total_token_count,
199
- visualization_total_token_count
200
- ]))
201
-
202
- total_input = sum(filter(None, [
203
- analysis_plan_input_token_count,
204
- enriched_report_input_token_count,
205
- visualization_input_token_count
206
- ]))
207
- total_output = sum(filter(None, [
208
- analysis_plan_output_token_count,
209
- enriched_report_output_token_count,
210
- visualization_output_token_count
211
- ]))
212
- expected_cost = round((total_input*0.3+total_output*2.5)/1000000, 3)
213
-
214
- token_summary_box_update = gr.update(
215
- value=f"""**Analysis Plan Tokens** → Prompt: `{analysis_plan_input_token_count or '-'}`, Output: `{analysis_plan_output_token_count or '-'}`, Total: `{analysis_plan_total_token_count or '-'}`
216
- **Report Tokens** → Prompt: `{enriched_report_input_token_count or '-'}`, Output: `{enriched_report_output_token_count or '-'}`, Total: `{enriched_report_total_token_count or '-'}`
217
- **Visualization Tokens** → Prompt: `{visualization_input_token_count or '-'}`, Output: `{visualization_output_token_count or '-'}`, Total: `{visualization_total_token_count or '-'}`
218
-
219
- **Cumulative Tokens** → `{cumulative_tokens}`
220
- **Expected Cost** → `{expected_cost}$`""",
221
- visible=True
222
- )
223
- yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, token_summary_box_update, suggestions_display_update)
224
 
225
  state['query_count'] += 1
226
  state['last_suggestions'] = parse_suggestions_from_report(report_text)
227
  next_prompt = "Analysis complete. What would you like to explore next?"
228
  history.append((None, next_prompt))
229
- yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, token_summary_box_update, suggestions_display_update)
230
 
231
  def reset_all():
232
  """Resets the entire UI for a new analysis session."""
@@ -241,7 +206,6 @@ def create_ui(llm_model, solr_client):
241
  gr.update(value=None, visible=False),
242
  gr.update(value=None, visible=False),
243
  gr.update(value=None, visible=False),
244
- gr.update(value=None, visible=False),
245
  gr.update(value=None, visible=False)
246
  )
247
 
@@ -249,7 +213,7 @@ def create_ui(llm_model, solr_client):
249
  fn=process_analysis_flow,
250
  inputs=[msg_textbox, chatbot, state],
251
  outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_url_display,
252
- quantitative_data_display, qualitative_url_display, qualitative_data_display, token_summary_box, suggestions_display],
253
  ).then(
254
  lambda: gr.update(value=""),
255
  None,
@@ -261,7 +225,7 @@ def create_ui(llm_model, solr_client):
261
  fn=reset_all,
262
  inputs=None,
263
  outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_url_display,
264
- quantitative_data_display, qualitative_url_display, qualitative_data_display, token_summary_box, suggestions_display],
265
  queue=False
266
  )
267
 
 
70
  "Qualitative URL will appear here...", visible=False)
71
  qualitative_data_display = gr.Markdown(
72
  "Example data will appear here...", visible=False)
 
 
73
  plot_display = gr.Image(
74
  label="Visualization", type="filepath", visible=False)
75
  report_display = gr.Markdown(
 
79
  """
80
  Manages the conversation and yields UI updates.
81
  """
 
 
 
82
  if state is None:
83
  state = {'query_count': 0, 'last_suggestions': []}
84
  if history is None:
85
  history = []
86
 
87
  # Reset all displays at the beginning of a new flow
88
+ yield (history, state, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value="Suggestions from the external API will appear here...", visible=False))
89
 
90
  query_context = user_input.strip()
91
  if not query_context:
92
  history.append((user_input, "Please enter a question to analyze."))
93
+ yield (history, state, None, None, None, None, None, None, None, None)
94
  return
95
 
96
  history.append((user_input, f"Analyzing: '{query_context}'\n\n*Generating analysis plan...*"))
97
+ yield (history, state, None, None, None, None, None, None, None, None)
98
 
99
  # Generate plan, get search field suggestions, and intent.
100
+ analysis_plan, mapped_search_fields, core_name, intent = llm_generate_analysis_plan_with_history(llm_model, query_context, history)
101
 
102
  # Update and display search field suggestions in its own accordion
103
  if mapped_search_fields:
 
112
  else:
113
  message = "I'm sorry, I couldn't generate a valid analysis plan. Please try rephrasing your question."
114
  history.append((None, message))
115
+ yield (history, state, None, None, None, None, None, None, None, suggestions_display_update)
116
  return
117
 
118
  history.append((None, f"✅ Analysis plan generated for core: **`{core_name}`**"))
 
123
  """
124
  history.append((None, plan_summary))
125
  formatted_plan = f"**Full Analysis Plan (Core: `{core_name}`):**\n```json\n{json.dumps(analysis_plan, indent=2)}\n```"
126
+ yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, suggestions_display_update)
127
 
128
  history.append((None, "*Executing queries for aggregates and examples...*"))
129
+ yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, suggestions_display_update)
130
 
131
  # --- DYNAMIC CORE SWITCH (Thread-safe) ---
132
  with solr_lock:
 
153
 
154
  if not aggregate_data or aggregate_data.get('count', 0) == 0:
155
  history.append((None, f"No data was found for your query in the '{core_name}' core. Please try a different question."))
156
+ yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), None, None, None, None, suggestions_display_update)
157
  return
158
 
159
  # Display retrieved data
 
162
  formatted_agg_data = f"**Quantitative (Aggregate) Data:**\n```json\n{json.dumps(aggregate_data, indent=2)}\n```"
163
  formatted_qual_data = f"**Qualitative (Example) Data:**\n```json\n{json.dumps(example_data, indent=2)}\n```"
164
  qual_data_display_update = gr.update(value=formatted_qual_data, visible=True)
165
+ yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)
166
 
167
  history.append((None, "✅ Data retrieved. Generating visualization and final report..."))
168
+ yield (history, state, None, None, gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)
169
+
170
  # Generate viz and report
171
  with concurrent.futures.ThreadPoolExecutor() as executor:
172
  viz_future = executor.submit(llm_generate_visualization_code, llm_model, query_context, aggregate_data)
173
+
174
  report_text = ""
175
  stream_history = history[:]
176
+ for chunk in llm_synthesize_enriched_report_stream(llm_model, query_context, aggregate_data, example_data, analysis_plan):
177
+ report_text += chunk
178
+ yield (stream_history, state, None, gr.update(value=report_text, visible=True), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)
179
+
 
 
 
 
 
180
  history.append((None, report_text))
181
 
182
+ viz_code = viz_future.result()
183
  plot_path = execute_viz_code_and_get_path(viz_code, aggregate_data)
184
  output_plot = gr.update(value=plot_path, visible=True) if plot_path else gr.update(visible=False)
185
  if not plot_path:
186
  history.append((None, "*I was unable to generate a plot for this data.*\n"))
187
+
188
+ yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  state['query_count'] += 1
191
  state['last_suggestions'] = parse_suggestions_from_report(report_text)
192
  next_prompt = "Analysis complete. What would you like to explore next?"
193
  history.append((None, next_prompt))
194
+ yield (history, state, output_plot, gr.update(value=report_text), gr.update(value=formatted_plan, visible=True), quantitative_url_update, gr.update(value=formatted_agg_data, visible=True), qualitative_url_update, qual_data_display_update, suggestions_display_update)
195
 
196
  def reset_all():
197
  """Resets the entire UI for a new analysis session."""
 
206
  gr.update(value=None, visible=False),
207
  gr.update(value=None, visible=False),
208
  gr.update(value=None, visible=False),
 
209
  gr.update(value=None, visible=False)
210
  )
211
 
 
213
  fn=process_analysis_flow,
214
  inputs=[msg_textbox, chatbot, state],
215
  outputs=[chatbot, state, plot_display, report_display, plan_display, quantitative_url_display,
216
+ quantitative_data_display, qualitative_url_display, qualitative_data_display, suggestions_display],
217
  ).then(
218
  lambda: gr.update(value=""),
219
  None,
 
225
  fn=reset_all,
226
  inputs=None,
227
  outputs=[chatbot, state, msg_textbox, plot_display, report_display, plan_display, quantitative_url_display,
228
+ quantitative_data_display, qualitative_url_display, qualitative_data_display, suggestions_display],
229
  queue=False
230
  )
231