|
""" |
|
Contains the prompt templates for interacting with the Gemini LLM. |
|
|
|
Separating prompts from the application logic makes them easier to manage, |
|
modify, and version. This module provides functions that return the formatted |
|
prompt strings required by the data processing module. |
|
""" |
|
|
|
|
|
import datetime |
|
import json |
|
from solr_metadata import format_metadata_for_prompt |
|
|
|
def get_analysis_plan_prompt(natural_language_query, chat_history, search_fields=None, core_name="news"): |
|
""" |
|
Generates the prompt for creating a Solr analysis plan from a user query. |
|
Args: |
|
natural_language_query (str): The user's query. |
|
chat_history (list): A list of previous user and bot messages. |
|
search_fields (list, optional): A list of dictionaries with 'field_name' and 'field_value'. |
|
core_name (str): The name of the Solr core to use for field metadata. |
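
Returns:

    str: The fully formatted prompt string.

Example (illustrative values only):

    prompt = get_analysis_plan_prompt(

        "top 5 companies by total deal value in 2023",

        chat_history=[],

        search_fields=[{"field_name": "date", "field_value": "2023"}],

    )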
|
""" |
|
|
|
formatted_field_info = format_metadata_for_prompt(core_name) |
|
formatted_history = "" |
|
for user_msg, bot_msg in chat_history: |
|
if user_msg: |
|
formatted_history += f"- User: \"{user_msg}\"\n" |
|
|
|
dynamic_fields_prompt_section = "" |
|
if search_fields: |
|
|
|
formatted_fields = "\n".join([f" - {field['field_name']}: {field['field_value']}" for field in search_fields]) |
|
dynamic_fields_prompt_section = f""" |
|
--- |
|
### MANDATORY DYNAMIC FILTERS |
|
|
|
An external API has identified the following field-value pairs from the user query. |
|
**You MUST use ALL of these fields and values to construct the `query_filter`.** |
|
- Construct the `query_filter` by combining these key-value pairs using the 'AND' operator. |
|
- Do NOT add any other fields or conditions to the `query_filter`. This section is the definitive source for it. |
|
|
|
**Mandatory Fields for Query Filter:** |
|
{formatted_fields} |
|
""" |
|
|
|
return f""" |
|
You are an expert data analyst and Solr query engineer. Your task is to convert a natural language question into a structured JSON "Analysis Plan". This plan will be used to run two separate, efficient queries: one for aggregate data (facets) and one for finding illustrative examples (grouping). |
|
|
|
Your most important job is to think like an analyst and choose an `analysis_dimension` and `analysis_measure` that provide a meaningful, non-obvious breakdown of the data.
|
|
|
--- |
|
### CONTEXT & RULES |
|
|
|
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")} |
|
2. **Query Filter Construction**: The `query_filter` MUST be built exclusively from the fields provided in the "MANDATORY DYNAMIC FILTERS" section, if present. |
|
3. **Field Usage**: You MUST use the fields described in the 'Field Definitions'. Pay close attention to the definitions to select the correct field, especially the `_s` fields for searching. Do not use fields ending with `_s` in `group.field` or facet `field` unless necessary for the analysis. |
|
4. **Crucial Sorting Rules**: |
|
* For `group.sort`: If `analysis_measure` involves a function on a field (e.g., `sum(total_deal_value_in_million)`), you MUST use the full function: `group.sort: 'sum(total_deal_value_in_million) desc'`. |
|
* If `analysis_measure` is 'count', you MUST OMIT the `group.sort` parameter entirely (see the illustrative fragment after this list).
|
* In `terms` facets, NEVER sort on 'date_year' directly; use 'index asc' or 'index desc' instead. For other sorts, use 'date'.
|
5. **Qualitative Data (Group Operation)**:

* Show the user **standout examples** for each chosen category.

For example: if the user asks for "USA approved drugs last 5 years", show a standout example for each year. In this context, "standout" means, for instance, the news item with the largest deal value in millions for that year.
|
6. **Output Format**: Your final output must be a single, raw JSON object. Do not add comments or markdown formatting. The JSON MUST include a `reasoning` object explaining your choices. |
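
For example, applying rule 4 (an illustrative fragment, not a complete plan), a sum-based measure pairs with a matching `group.sort`:

```json

{{

"analysis_measure": "sum(total_deal_value_in_million)",

"qualitative_request": {{

"group.sort": "sum(total_deal_value_in_million) desc"

}}

}}

```

A count-based plan, by contrast, carries no `group.sort` key at all.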
|
|
|
--- |
|
### HOW TO CHOOSE THE ANALYSIS DIMENSION AND MEASURE (ANALYTICAL STRATEGY) |
|
|
|
This is the most critical part of your task. A bad choice leads to a useless, boring analysis. |
|
|
|
**1. Choosing the `analysis_dimension` (The "Group By" field):** |
|
|
|
* **THE ANTI-REDUNDANCY RULE (MOST IMPORTANT):** If you use a field in the `query_filter` with a specific value (e.g., `news_type:"product approvals"`), you **MUST NOT** use that same field (`news_type`) as the `analysis_dimension`. The user already knows the news type; they want to know something *else* about it. Choosing a redundant dimension is a critical failure. |
|
|
|
* **USER INTENT FIRST:** If the user explicitly asks to group by a field (e.g., "by company", "by country"), use that field. |
|
|
|
* **INFERENCE HEURISTICS (If the user doesn't specify a dimension):** Think "What is the next logical question?" to find the most insightful breakdown. |
|
* If the query is about "drug approvals," a good dimension is `therapeutic_category` (what diseases are the approvals for?) or `company_name` (who is getting the approvals?). |
|
* If the query compares concepts like "cancer vs. infection," the dimension is `therapeutic_category`. |
|
* If the query compares "oral vs. injection," the dimension is `route_branch`. |
|
* For general "recent news" or "top deals," `news_type` or `company_name` are often good starting points. |
|
* Your goal is to find a dimension that reveals a meaningful pattern in the filtered data. |
|
|
|
**2. Choosing the `analysis_measure` (The metric):** |
|
|
|
* **EXPLICIT METRIC:** If the user asks for a value (e.g., "by total deal value", "highest revenue"), use the corresponding field and function (e.g., `sum(total_deal_value_in_million)`). |
|
* **IMPLICIT COUNT:** If the user asks a "what," "who," "how many," or "most common" question without specifying a value metric, the measure is `count`. |
|
|
|
--- |
|
### FIELD DEFINITIONS (Your Source of Truth for Core: {core_name}) |
|
|
|
{formatted_field_info} |
|
{dynamic_fields_prompt_section} |
|
|
|
--- |
|
### CHAT HISTORY |
|
{formatted_history} |
|
--- |
|
### EXAMPLES |
|
|
|
**User Query 1:** "What are the top 5 companies by total deal value in 2023?" |
|
**API Filter Input 1:** |
|
``` |
|
### MANDATORY DYNAMIC FILTERS |
|
**Mandatory Fields for Query Filter:** |
|
- date: '2023' |
|
``` |
|
**Correct JSON Output 1:** |
|
```json |
|
{{ |
|
"reasoning": {{ |
|
"dimension_choice": "User explicitly asked for 'top 5 companies', so 'company_name' is the correct dimension.", |
|
"measure_choice": "User explicitly asked for 'total deal value', so 'sum(total_deal_value_in_million)' is the correct measure.", |
|
"filter_choice": "The query filter was constructed from the mandatory fields provided by the API: date(date is converted to ISO 8601 format) and total_deal_value_in_million." |
|
}}, |
|
"analysis_dimension": "company_name", |
|
"analysis_measure": "sum(total_deal_value_in_million)", |
|
"sort_field_for_examples": "total_deal_value_in_million", |
|
"query_filter": "date:["2023-01-01T00:00:00Z" TO \"2023-12-31T23:59:59Z\"]", |
|
"quantitative_request": {{ |
|
"json.facet": {{ |
|
"companies_by_deal_value": {{ |
|
"type": "terms", |
|
"field": "company_name", |
|
"limit": 5, |
|
"sort": "total_value desc", |
|
"facet": {{ |
|
"total_value": "sum(total_deal_value_in_million)" |
|
}} |
|
}} |
|
}} |
|
}}, |
|
"qualitative_request": {{ |
|
"group": true, |
|
"group.field": "company_name", |
|
"group.limit": 2, |
|
"group.sort": "sum(total_deal_value_in_million) desc", |
|
"sort": "total_deal_value_in_million desc" |
|
}} |
|
}} |
|
``` |
|
|
|
**User Query 2:** "What are the most common news types for infections in 2025?" |
|
**API Filter Input 2:** |
|
```

### MANDATORY DYNAMIC FILTERS
|
**Mandatory Fields for Query Filter:** |
|
- therapeutic_category_s: infections |
|
- date: '2025' |
|
``` |
|
**Correct JSON Output 2:** |
|
```json |
|
{{ |
|
"reasoning": {{ |
|
"dimension_choice": "User asked for 'most common news types', so 'news_type' is the correct dimension. This is not redundant as the filter is on 'therapeutic_category'.", |
|
"measure_choice": "User asked for 'most common', which implies counting occurrences. Therefore, the measure is 'count'.", |
|
"filter_choice": "The query filter was constructed from the mandatory fields provided by the API: therapeutic_category_s and date(date is converted to ISO 8601 format)." |
|
}}, |
|
"analysis_dimension": "news_type", |
|
"analysis_measure": "count", |
|
"sort_field_for_examples": "date", |
|
"query_filter": "therapeutic_category_s:infections AND date:["2025-01-01T00:00:00Z" TO *]", |
|
"quantitative_request": {{ |
|
"json.facet": {{ |
|
"news_by_type": {{ |
|
"type": "terms", |
|
"field": "news_type", |
|
"limit": 10, |
|
"sort": "count desc" |
|
}} |
|
}} |
|
}}, |
|
"qualitative_request": {{ |
|
"group": true, |
|
"group.field": "news_type", |
|
"group.limit": 2, |
|
"group.sort": "sum(total_deal_value_in_million) desc", |
|
"sort": "total_deal_value_in_million desc" |
|
}} |
|
}} |
|
``` |
|
|
|
|
|
|
|
**User Query 3:** "Compare deal values for injection vs oral related to infection news." |
|
**API Filter Input 3:** |
|
``` |
|
### MANDATORY DYNAMIC FILTERS |
|
**Mandatory Fields for Query Filter:** |
|
- drug_delivery_branch_s: (injection OR oral) |
|
- therapeutic_category_s: infections |
|
``` |
|
**Correct JSON Output 3:** |
|
```json |
|
{{ |
|
"reasoning": {{ |
|
"dimension_choice": "The user wants to compare 'injection' vs 'oral', making 'route_branch' the appropriate analysis dimension.", |
|
"measure_choice": "The user explicitly asks to compare 'deal values', so 'sum(total_deal_value_in_million)' is the correct measure.", |
|
"filter_choice": "The query filter was constructed directly from the mandatory fields provided by the API: drug_delivery_branch_s and therapeutic_category_s." |
|
}}, |
|
"analysis_dimension": "route_branch", |
|
"analysis_measure": "sum(total_deal_value_in_million)", |
|
"sort_field_for_examples": "total_deal_value_in_million", |
|
"query_filter": "drug_delivery_branch_s:(injection OR oral) AND therapeutic_category_s:infections", |
|
"quantitative_request": {{ |
|
"json.facet": {{ |
|
"deal_values_by_route": {{ |
|
"type": "terms", |
|
"field": "route_branch", |
|
"limit": 2, |
|
"sort": "total_deal_value desc", |
|
"facet": {{ |
|
"total_deal_value": "sum(total_deal_value_in_million)" |
|
}} |
|
}} |
|
}} |
|
}}, |
|
"qualitative_request": {{ |
|
"group": true, |
|
"group.field": "route_branch", |
|
"group.limit": 2, |
|
"group.sort": "sum(total_deal_value_in_million) desc", |
|
"sort": "total_deal_value_in_million desc" |
|
}} |
|
}} |
|
``` |
|
--- |
|
### YOUR TASK |
|
|
|
Convert the following user query into a single, raw JSON "Analysis Plan" object. Strictly follow all rules, especially the rule for building the `query_filter` from the mandatory dynamic filters. Your JSON output MUST include the `reasoning` field. |
|
|
|
**Current User Query:** `{natural_language_query}` |
|
""" |
|
|
|
def get_synthesis_report_prompt(query, quantitative_data, qualitative_data, plan): |
|
""" |
|
Generates the prompt for synthesizing a final report from the query results.

Args:

    query (str): The user's original question.

    quantitative_data (dict): Aggregate results from the facet query.

    qualitative_data (dict): Grouped results holding the illustrative example documents.

    plan (dict): The analysis plan; supplies `analysis_dimension` and `sort_field_for_examples`.
|
""" |
|
qualitative_prompt_str = "" |
|
dimension = plan.get('analysis_dimension', 'N/A') |
|
if qualitative_data and dimension in qualitative_data: |
|
for group in qualitative_data.get(dimension, {}).get('groups', []): |
|
group_value = group.get('groupValue', 'N/A') |
|
if group.get('doclist', {}).get('docs'): |
|
doc = group.get('doclist', {}).get('docs', [{}])[0] |
|
# 'abstract' may be a list (multi-valued field); take its first entry as the title.

title_raw = doc.get('abstract', ['No Title'])

title = title_raw[0] if isinstance(title_raw, list) else title_raw
|
content_list = doc.get('content', []) |
|
content_snip = (' '.join(content_list[0].split()[:40]) + '...') if content_list else 'No content available.' |
|
metric_val_raw = doc.get(plan.get('sort_field_for_examples'), 'N/A') |
|
metric_val = metric_val_raw[0] if isinstance(metric_val_raw, list) else metric_val_raw |
|
|
|
qualitative_prompt_str += f"- **For category `{group_value}`:**\n" |
|
qualitative_prompt_str += f" - **Top Example Title:** {title}\n" |
|
qualitative_prompt_str += f" - **Metric Value:** {metric_val}\n" |
|
qualitative_prompt_str += f" - **Content Snippet:** {content_snip}\n\n" |
|
|
|
return f""" |
|
You are a top-tier business intelligence analyst. Your task is to write an insightful, data-driven report for an executive. You must synthesize quantitative data (the 'what') with qualitative examples (the 'why') to tell a complete story. |
|
|
|
--- |
|
### AVAILABLE INFORMATION |
|
|
|
**1. The User's Core Question:** |
|
\"{query}\" |
|
|
|
**2. Quantitative Data (The 'What'):** |
|
This data shows the high-level aggregates. |
|
```json |
|
{json.dumps(quantitative_data, indent=2)} |
|
``` |
|
|
|
**3. Qualitative Data (The 'Why'):** |
|
These are the single most significant documents driving the numbers for each category. |
|
{qualitative_prompt_str} |
|
|
|
--- |
|
### REPORTING INSTRUCTIONS |
|
|
|
Your report must be in clean, professional Markdown and follow this structure precisely. |
|
|
|
**Report Structure:** |
|
|
|
`## Executive Summary` |
|
- A 1-2 sentence, top-line answer to the user's question based on the quantitative data. |
|
|
|
`### Key Findings` |
|
- Use bullet points to highlight the main figures from the quantitative data. Interpret the numbers. |
|
|
|
`### Key Drivers & Illustrative Examples` |
|
- **This is the most important section.** Explain the "so what?" behind the numbers. |
|
- Use the qualitative examples to explain *why* a category is high or low. Reference the top example document for each main category. |
|
|
|
`### Deeper Dive: Suggested Follow-up Analyses` |
|
- Propose 2-3 logical next questions based on your analysis to uncover deeper trends. |
|
|
|
--- |
|
**Generate the full report now, paying close attention to all formatting and spacing rules.** |
|
""" |
|
|
|
def get_visualization_code_prompt(query_context, facet_data): |
|
""" |
|
Generates a flexible prompt for creating Python visualization code.

Args:

    query_context (str): The user's analytical goal, used to frame the chart.

    facet_data (dict): Aggregated Solr facet results to visualize.
|
""" |
|
return f""" |
|
You are a world-class Python data visualization expert specializing in Matplotlib and Seaborn. |
|
Your primary task is to generate a single, insightful, and robust Python script to visualize the provided data. The visualization should directly answer the user's analytical goal. |
|
|
|
**1. User's Analytical Goal:** |
|
"{query_context}" |
|
|
|
**2. Aggregated Data (from Solr Facets):** |
|
```json |
|
{json.dumps(facet_data, indent=2)} |
|
``` |
|
|
|
--- |
|
### **CRITICAL INSTRUCTIONS: CODE GENERATION RULES** |
|
You MUST follow these rules meticulously to ensure the code runs without errors in a server environment. |
|
|
|
**A. Analyze the Data & Choose the Right Chart:** |
|
- **Inspect the Data:** Before writing any code, carefully examine the structure of the `facet_data` JSON. Is it a simple list of categories and counts? Is it a nested structure comparing metrics across categories? Is it a time-series? |
|
- **Select the Best Chart Type:** Based on the data and the user's goal, choose the most effective chart. |
|
- **Bar Chart:** Ideal for comparing quantities across different categories (e.g., top companies by deal value). |
|
- **Grouped Bar Chart:** Use when comparing a metric across categories for a few groups (e.g., deal values for 2023 vs. 2024 by company). |
|
- **Line Chart:** Best for showing a trend over time (e.g., number of approvals per year). |
|
- **Pie Chart:** Use ONLY for showing parts of a whole, and only with a few (2-5) categories. Generally, bar charts are better. |
|
- **Tell a Story:** Your visualization should be more than just a plot; it should reveal the key insight from the data. |
|
- **Direct Answer:** If the user asks to "compare x with y", produce a comparison visualization of x versus y and nothing more.
|
|
|
**B. Non-Negotiable Code Requirements:** |
|
1. **Imports:** You must import `matplotlib.pyplot as plt`, `seaborn as sns`, and `pandas as pd`. |
|
2. **Use Pandas:** ALWAYS parse the `facet_data` into a pandas DataFrame. This is more robust and flexible than iterating through dictionaries directly. |
|
3. **Figure and Axes:** Use `fig, ax = plt.subplots()` to create the figure and axes objects. This gives you better control. |
|
4. **Styling:** Apply a clean and professional style, for example: `plt.style.use('seaborn-v0_8-whitegrid')` and use a suitable Seaborn palette (e.g., `palette='viridis'`). |
|
5. **NO `plt.show()`:** Your code will be run on a server. **DO NOT** include `plt.show()`. |
|
6. **Save the Figure:** The execution environment expects a Matplotlib figure object named `fig`. Your code does not need to handle the saving path directly, but it **MUST** produce the final `fig` object correctly. The calling function will handle saving it. |
|
7. **Titles and Labels:** You MUST set a clear and descriptive title and labels for the x and y axes. The title should reflect the user's query. |
|
8. **Axis Label Readability:** If x-axis labels are long, you MUST rotate them to prevent overlap. Use this robust method: `plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")`. |
|
9. **Layout:** Use `plt.tight_layout()` at the end to ensure all elements fit nicely. |
|
10. **Error Handling:** Your code should be robust. If the `facet_data` contains no "buckets" or data to plot, the code should not crash. It should instead produce a plot with a message like "No data available to plot." |
|
|
|
--- |
|
### **High-Quality Example (Grouped Bar Chart)** |
|
|
|
This example shows how to parse a nested facet structure into a DataFrame and create an insightful grouped bar chart. Adapt its principles to your specific task. |
|
|
|
```python |
|
# --- Imports and Style --- |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import pandas as pd |
|
|
|
plt.style.use('seaborn-v0_8-whitegrid') |
|
fig, ax = plt.subplots(figsize=(14, 8)) |
|
|
|
# --- Data Parsing --- |
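# NOTE: `facet_data` is assumed to be provided as a dict by the execution environment.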
|
# Dynamically find the main facet key (the one with 'buckets') |
|
facet_key = None |
|
for key, value in facet_data.items(): |
|
if isinstance(value, dict) and 'buckets' in value: |
|
facet_key = key |
|
break |
|
|
|
plot_data = [] |
|
# Check if a valid key and buckets were found |
|
if facet_key and facet_data[facet_key].get('buckets'): |
|
# This robustly parses nested metrics (e.g., a sum for each year) |
|
for bucket in facet_data[facet_key]['buckets']: |
|
category = bucket.get('val', 'N/A') |
|
# Find all nested metrics inside the bucket |
|
for sub_key, sub_value in bucket.items(): |
|
if isinstance(sub_value, dict) and 'sum' in sub_value: |
|
# Extracts '2025' from a key like 'total_value_2025' |
|
group = sub_key.split('_')[-1] |
|
value = sub_value['sum'] |
|
plot_data.append({{'Category': category, 'Group': group, 'Value': value}}) |
|
|
|
# --- Plotting --- |
|
if plot_data: |
|
df = pd.DataFrame(plot_data) |
|
sns.barplot(data=df, x='Category', y='Value', hue='Group', ax=ax, palette='viridis') |
|
|
|
# --- Labels and Titles --- |
|
ax.set_title('Comparison of Total Value by Category and Group') |
|
ax.set_xlabel('Category') |
|
ax.set_ylabel('Total Value') |
|
|
|
# --- Formatting --- |
|
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") |
|
else: |
|
# --- Handle No Data --- |
|
ax.text(0.5, 0.5, 'No data available to plot.', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes) |
|
ax.set_title('Data Visualization') |
|
|
|
# --- Final Layout --- |
|
plt.tight_layout() |
|
``` |
|
|
|
--- |
|
### **Your Task:** |
|
|
|
Now, generate the raw Python code to create the best possible visualization for the user's goal based on the provided data. |
|
Do not wrap the code in ```python ... ```. |
|
""" |