import gradio as gr
import pandas as pd
import duckdb
from datasets import load_dataset
from huggingface_hub import login
import openai
import os
from typing import Dict, List, Any


class SALTAnalytics:

    def __init__(self):
        """Initialize SALT Analytics"""
        self.con = duckdb.connect(':memory:')
        self.data_loaded = False
        self.schema_info = ""
        self.available_columns = []

    def load_salt_dataset(self):
        """Load SAP SALT dataset from Hugging Face into DuckDB"""
        if self.data_loaded:
            return "Dataset already loaded!"

        try:
            hf_token = os.getenv('HF_TOKEN')

            if hf_token:
                dataset = load_dataset(
                    "SAP/SALT",
                    "joined_table",
                    split="train",
                    token=hf_token,
                    streaming=False
                )
            else:
                # No explicit HF_TOKEN set: fall back to any token cached by
                # `huggingface-cli login` (token= replaces the deprecated use_auth_token=)
                dataset = load_dataset(
                    "SAP/SALT",
                    "joined_table",
                    split="train",
                    token=True,
                    streaming=False
                )

            df = dataset.to_pandas()

            # Downsample very large splits so the in-memory demo stays responsive
            if len(df) > 100000:
                df = df.sample(n=50000, random_state=42)

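            # DuckDB's Python client resolves the bare `df` name in the SQL below
            # through its replacement scan of in-scope pandas DataFrames, so no
            # explicit registration call is needed before querying it.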
            self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")

            schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
            self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
            self.available_columns = [col[0] for col in schema_result]

            self.data_loaded = True

            return (
                f"✅ Successfully loaded {len(df)} records into DuckDB\n\n"
                "📋 Available columns:\n"
                + "\n".join(f"• {col}" for col in self.available_columns[:20])
                + ("\n... and more" if len(self.available_columns) > 20 else "")
            )

        except Exception as e:
            error_msg = str(e)
            if "gated dataset" in error_msg or "authentication" in error_msg.lower():
                return (
                    f"❌ Authentication Error: {error_msg}\n\n"
                    "To fix this:\n"
                    "1. Go to https://huggingface.co/datasets/SAP/SALT\n"
                    "2. Request access to the dataset\n"
                    "3. Wait for approval\n"
                    "4. Set HF_TOKEN in your Space secrets"
                )
            else:
                return f"❌ Error loading dataset: {error_msg}"

    def get_predefined_insights(self):
        """Generate predefined analytical insights"""
        if not self.data_loaded:
            return "Please load the dataset first"

        try:
            insights = {}

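            # High-level overview: total record count and the creation-date range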
            insights['Dataset Overview'] = self.con.execute("""
                SELECT
                    COUNT(*) as total_records,
                    COUNT(DISTINCT CREATIONDATE) as unique_dates,
                    MIN(CREATIONDATE) as earliest_date,
                    MAX(CREATIONDATE) as latest_date
                FROM salt_data
            """).fetchdf()

            if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
                insights['Payment Terms Distribution'] = self.con.execute("""
                    SELECT CUSTOMERPAYMENTTERMS,
                           COUNT(*) as frequency,
                           ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                    FROM salt_data
                    WHERE CUSTOMERPAYMENTTERMS IS NOT NULL AND CUSTOMERPAYMENTTERMS != ''
                    GROUP BY CUSTOMERPAYMENTTERMS
                    ORDER BY frequency DESC
                    LIMIT 10
                """).fetchdf()

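            # The exact sales office column name can vary, so look it up by
            # name pattern instead of hard-coding it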
            sales_office_col = None
            for col in self.available_columns:
                if 'SALES' in col.upper() and 'OFFICE' in col.upper():
                    sales_office_col = col
                    break

            if sales_office_col:
                query = f"""
                    SELECT {sales_office_col},
                           COUNT(*) as total_orders
                    FROM salt_data
                    WHERE {sales_office_col} IS NOT NULL AND {sales_office_col} != ''
                    GROUP BY {sales_office_col}
                    ORDER BY total_orders DESC
                    LIMIT 10
                """
                insights['Sales Office Performance'] = self.con.execute(query).fetchdf()

            shipping_col = None
            for col in self.available_columns:
                if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
                    shipping_col = col
                    break

            if shipping_col:
                query = f"""
                    SELECT {shipping_col},
                           COUNT(*) as order_count
                    FROM salt_data
                    WHERE {shipping_col} IS NOT NULL AND {shipping_col} != ''
                    GROUP BY {shipping_col}
                    ORDER BY order_count DESC
                    LIMIT 10
                """
                insights['Shipping Conditions'] = self.con.execute(query).fetchdf()

            if 'SALESDOCUMENTITEMCATEGORY' in self.available_columns:
                insights['Sales Document Categories'] = self.con.execute("""
                    SELECT SALESDOCUMENTITEMCATEGORY,
                           COUNT(*) as frequency,
                           ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                    FROM salt_data
                    WHERE SALESDOCUMENTITEMCATEGORY IS NOT NULL AND SALESDOCUMENTITEMCATEGORY != ''
                    GROUP BY SALESDOCUMENTITEMCATEGORY
                    ORDER BY frequency DESC
                    LIMIT 10
                """).fetchdf()

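            # Always include a small column listing so the insights view shows
            # something even when none of the expected columns are present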
            insights['Available Columns Sample'] = pd.DataFrame({
                'Column Name': self.available_columns[:20],
                'Index': range(len(self.available_columns[:20]))
            })

            return insights

        except Exception as e:
            return (
                f"❌ Error generating insights: {str(e)}\n\n🔍 Debug Info:\n"
                f"Data loaded: {self.data_loaded}\n"
                f"Available columns ({len(self.available_columns)}): {', '.join(self.available_columns[:15])}...\n"
                f"Error type: {type(e).__name__}"
            )

    def clean_sql_response(self, sql_query: str) -> str:
        """Strip Markdown code fences from the model's SQL response"""
        # The fence is built by concatenation to avoid a literal triple backtick
        # in the source
        backticks = "`" + "`" + "`"
        sql_marker = backticks + "sql"

        if sql_query.startswith(sql_marker):
            sql_query = sql_query[len(sql_marker):]
        elif sql_query.startswith(backticks):
            sql_query = sql_query[len(backticks):]

        if sql_query.endswith(backticks):
            sql_query = sql_query[:-len(backticks)]

        return sql_query.strip()

    def natural_language_query(self, question: str, api_key: str):
        """Convert natural language to SQL and execute"""
        if not self.data_loaded:
            return "Please load the dataset first"

        if not api_key:
            return "Please provide OpenAI API key"

        try:
            client = openai.OpenAI(api_key=api_key)

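            # Only the first 30 column names go into the prompt to keep it compact;
            # the prompt instructs the model to use only these columns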
            columns_list = ", ".join(self.available_columns[:30])

            prompt = f"""
            You are a SQL expert analyzing the SAP SALT dataset. The database has a table called 'salt_data' with these available columns:

            {columns_list}

            The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.

            IMPORTANT: Use only the column names provided above. Do not assume column names that don't exist.

            Convert this question to a DuckDB SQL query: "{question}"

            Return ONLY the SQL query, no explanation. Limit results to 20 rows and use WHERE clauses to filter out NULL values.
            """

            response = client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1
            )

            sql_query = response.choices[0].message.content.strip()
            sql_query = self.clean_sql_response(sql_query)

            result_df = self.con.execute(sql_query).fetchdf()

            explanation_prompt = f"""
            Question: {question}
            Results: {result_df.head(10).to_string()}

            Provide a clear business explanation of these SAP ERP results in 2-3 sentences, focusing on actionable insights for sales operations.
            """

            explanation_response = client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": explanation_prompt}],
                temperature=0.3
            )

            explanation = explanation_response.choices[0].message.content

            code_block = "`" + "`" + "`"
            return (
                f"**SQL Query:**\n{code_block}sql\n{sql_query}\n{code_block}\n\n"
                f"**Results:**\n{result_df.to_string(index=False)}\n\n"
                f"**Explanation:**\n{explanation}"
            )

        except Exception as e:
            return f"Error: {str(e)}\n\nTry rephrasing your question. Available columns: {', '.join(self.available_columns[:10])}..."


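# A single shared SALTAnalytics instance backs all Gradio callbacks below; the
# wrapper functions only give Gradio plain-function entry points.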
analytics = SALTAnalytics()


def load_dataset_interface():
    return analytics.load_salt_dataset()


def show_insights_interface():
    """Render the pre-built insights as a single markdown report"""
    insights = analytics.get_predefined_insights()

    if isinstance(insights, str):
        return insights

    output = "# 📊 SAP SALT Dataset Insights\n\n"

    for title, df in insights.items():
        output += f"## {title}\n\n"
        if isinstance(df, pd.DataFrame) and len(df) > 0:
            output += df.to_markdown(index=False)
        else:
            output += "*No data available for this analysis*"
        output += "\n\n---\n\n"

    return output


def qa_interface(question: str, api_key: str):
    if not question.strip():
        return "Please enter a question"
    return analytics.natural_language_query(question, api_key)


sample_questions = [
    "Which sales offices process the most orders?",
    "What are the most common payment terms?",
    "Show me the distribution of shipping conditions",
    "What is the date range of orders in the dataset?",
    "Which document categories are most frequent?"
]

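# Gradio UI: a dataset-loading tab, a pre-built insights tab, an AI Q&A tab,
# and a static About tab.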
with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🚀 SAP SALT Dataset Analytics Demo
    ## Open Source Analytics + AI for SAP ERP

    This demo uses the **authentic SAP SALT dataset** - real ERP data from sales orders, items, customers, and addresses.
    """)

    with gr.Tab("📥 Load Dataset"):
        gr.Markdown("### Load SAP SALT Dataset from Hugging Face")

        load_btn = gr.Button("Load SALT Dataset", variant="primary")
        load_output = gr.Textbox(label="Status", lines=8)

        load_btn.click(fn=load_dataset_interface, outputs=load_output)

    with gr.Tab("📊 Insights"):
        gr.Markdown("### Pre-built Analytics Insights")

        insights_btn = gr.Button("Generate Insights", variant="primary")
        insights_output = gr.Markdown()

        insights_btn.click(fn=show_insights_interface, outputs=insights_output)

    with gr.Tab("🤖 AI Q&A"):
        gr.Markdown("### Ask Questions in Natural Language")

        with gr.Row():
            with gr.Column(scale=3):
                api_key_input = gr.Textbox(
                    label="OpenAI API Key",
                    type="password",
                    placeholder="Enter your OpenAI API key"
                )

                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="e.g., Which sales offices process the most orders?",
                    lines=2
                )

                sample_dropdown = gr.Dropdown(
                    choices=sample_questions,
                    label="Or choose a sample question",
                    value=None
                )

                ask_btn = gr.Button("Get Answer", variant="primary")

            with gr.Column(scale=4):
                qa_output = gr.Markdown()

        sample_dropdown.change(
            fn=lambda x: x if x else "",
            inputs=sample_dropdown,
            outputs=question_input
        )

        ask_btn.click(
            fn=qa_interface,
            inputs=[question_input, api_key_input],
            outputs=qa_output
        )

    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### About the SALT Dataset

        **SAP SALT** (Sales Autocompletion Linked Business Tables) contains:
        - **500,908 sales orders** from a real SAP S/4HANA system
        - **2.3M sales order line items**
        - **139,611 unique customers**
        - **Data from 2018-2020** with full business context

        **Key Use Cases:**
        - Sales process automation (70-80% accuracy)
        - Customer behavior analysis
        - Shipping and logistics optimization
        - Payment terms prediction

        **Technology Stack:**
        - **DuckDB**: High-performance analytics
        - **OpenAI GPT-4**: Natural language to SQL
        - **Gradio**: Interactive interface
        - **Real ERP Data**: Authentic business scenarios

        This demonstrates how **open source tools** can unlock massive value from enterprise SAP systems at zero licensing cost.
        """)


if __name__ == "__main__":
    demo.launch()