# app.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import os
import warnings
warnings.filterwarnings('ignore')
# ---------- Page & Styles ----------
st.set_page_config(page_title="SAP Sales KPI Dashboard", page_icon="📊", layout="wide")
st.markdown("""
""", unsafe_allow_html=True)
# ---------- Kaggle load ----------
@st.cache_data(ttl=3600)
def load_kaggle_sap_data():
try:
if 'KAGGLE_USERNAME' not in st.secrets or 'KAGGLE_KEY' not in st.secrets:
return "Kaggle credentials not found in Streamlit secrets."
os.environ['KAGGLE_USERNAME'] = st.secrets['KAGGLE_USERNAME']
os.environ['KAGGLE_KEY'] = st.secrets['KAGGLE_KEY']
import kaggle
dataset_name = "mustafakeser4/sap-dataset-bigquery-dataset"
download_path = "./kaggle_data"
if not os.path.exists(os.path.join(download_path, 'vbak.csv')):
with st.spinner("Downloading dataset from Kaggle..."):
kaggle.api.authenticate()
kaggle.api.dataset_download_files(dataset_name, path=download_path, unzip=True)
needed = {'vbak': 'vbak.csv', 'vbap': 'vbap.csv', 'kna1': 'kna1.csv', 'makt': 'makt.csv'}
tables = {}
for k, fn in needed.items():
fp = os.path.join(download_path, fn)
if not os.path.exists(fp):
return f"Expected file missing: {fn}"
tables[k] = pd.read_csv(fp, low_memory=False)
return tables
except Exception as e:
return f"Error during Kaggle data loading: {e}"
# ---------- Processing (robust revenue + safe merges) ----------
@st.cache_data
def create_sales_analytics(_tables):
try:
vbak = _tables['vbak'].copy()
vbap = _tables['vbap'].copy()
kna1 = _tables['kna1'].copy()
makt = _tables['makt'].copy()
# normalize column names
for df in [vbak, vbap, kna1, makt]:
df.columns = [c.upper().strip() for c in df.columns]
# keep SAP keys as strings (avoid leading-zero loss)
def as_str(df, cols):
for c in cols:
if c in df.columns:
df[c] = df[c].astype(str).str.strip()
as_str(vbak, ['VBELN','KUNNR','VKORG','VTWEG'])
as_str(vbap, ['VBELN','MATNR'])
as_str(kna1, ['KUNNR'])
as_str(makt, ['MATNR'])
# choose numeric helper
def pick_numeric(df, cols):
for c in cols:
if c in df.columns:
s = pd.to_numeric(df[c], errors='coerce')
if s.notna().sum() > 0 and s.abs().sum() > 0:
return s
return pd.Series(0.0, index=df.index)
# Build item-level REVENUE
# primary: NETWR at item level (VBAP)
netwr_item = pick_numeric(vbap, ['NETWR'])
# fallback: price * qty using common SAP columns
price = pick_numeric(vbap, ['NETPR', 'KBETR', 'NETPR_I'])
qty = pick_numeric(vbap, ['KWMENG', 'KTMNG', 'MENGE'])
fallback_rev = (price.fillna(0) * qty.fillna(0)).fillna(0)
vbap['REVENUE'] = np.where(netwr_item > 0, netwr_item, fallback_rev).astype(float)
# header fields (include currency if present)
keep_vbak = ['VBELN','KUNNR','VKORG','VTWEG','ERDAT'] + (['WAERK'] if 'WAERK' in vbak.columns else [])
vbak_small = vbak[keep_vbak].drop_duplicates('VBELN')
vbap_small = vbap[['VBELN','MATNR','REVENUE']]
kna1_small = kna1[['KUNNR','NAME1','LAND1']] if {'KUNNR','NAME1','LAND1'}.issubset(kna1.columns) else pd.DataFrame(columns=['KUNNR','NAME1','LAND1'])
# product text in English
makt_en = makt[makt['SPRAS'].eq('E')] if 'SPRAS' in makt.columns else makt
makt_small = makt_en[['MATNR','MAKTX']].drop_duplicates('MATNR') if {'MATNR','MAKTX'}.issubset(makt_en.columns) else pd.DataFrame(columns=['MATNR','MAKTX'])
# final sales table
sales = (vbap_small
.merge(vbak_small, on='VBELN', how='inner')
.merge(kna1_small, on='KUNNR', how='left')
.merge(makt_small, on='MATNR', how='left'))
if 'ERDAT' in sales.columns:
sales['ERDAT'] = pd.to_datetime(sales['ERDAT'], errors='coerce')
# ensure columns exist
for col in ['REVENUE','LAND1','VTWEG','NAME1','MAKTX','VBELN','KUNNR','VKORG']:
if col not in sales.columns:
sales[col] = np.nan if col != 'REVENUE' else 0.0
if 'WAERK' not in sales.columns:
sales['WAERK'] = 'N/A'
# drop obvious junk rows
sales = sales.replace([np.inf, -np.inf], np.nan).dropna(subset=['REVENUE'])
return sales
except Exception as e:
return f"Error processing sales data: {e}"
# ---------- App ----------
st.markdown('
🎯 SAP Sales KPI Dashboard
Real SAP ERP sample data (Kaggle)
', unsafe_allow_html=True)
tables = load_kaggle_sap_data()
if isinstance(tables, str):
st.error(tables)
st.stop()
sales_df = create_sales_analytics(tables)
if isinstance(sales_df, str):
st.error(sales_df)
st.stop()
# ---------- Filter Bar (no sidebar) ----------
with st.container():
st.markdown('', unsafe_allow_html=True)
c1, c2, c3, c4 = st.columns([1.2, 1.2, 3, 0.9])
# currency filter
currencies = [c for c in sales_df['WAERK'].dropna().unique().tolist() if c != 'N/A']
default_cur = sales_df['WAERK'].mode().iat[0] if len(sales_df) and sales_df['WAERK'].notna().any() else 'N/A'
with c1:
currency = st.selectbox("Currency", options=(['All'] + sorted(currencies)) if currencies else ['All'], index=0 if currencies else 0)
# Top N
with c2:
top_n_countries = st.slider("Top N Countries", 5, 20, 10)
# Region multiselect inside expander to keep tidy
with c3:
with st.expander("Region (Country) – click to choose", expanded=False):
all_countries = sorted(sales_df['LAND1'].dropna().unique().tolist())
# buttons to select/clear
b1, b2 = st.columns([1,1])
if 'selected_countries' not in st.session_state:
st.session_state.selected_countries = all_countries
with b1:
if st.button("Select All"):
st.session_state.selected_countries = all_countries
with b2:
if st.button("Clear"):
st.session_state.selected_countries = []
selected_region = st.multiselect("Countries", options=all_countries, default=st.session_state.selected_countries, key="countries_ms")
with c4:
if st.button("🔄 Clear Cache"):
st.cache_data.clear()
st.rerun()
st.markdown('
', unsafe_allow_html=True)
# apply filters
filtered_df = sales_df.copy()
if currency and currency != 'All':
filtered_df = filtered_df[filtered_df['WAERK'] == currency]
if 'countries_ms' in st.session_state:
filtered_df = filtered_df[filtered_df['LAND1'].isin(st.session_state.countries_ms)]
st.success(f"✅ Loaded and processed {len(filtered_df):,} sales line-items after filters.")
# ---------- KPIs ----------
st.subheader("Sales KPIs")
k1,k2,k3,k4 = st.columns(4)
with k1: st.markdown(f'${float(filtered_df["REVENUE"].sum()):,.0f}
Total Revenue
', unsafe_allow_html=True)
with k2: st.markdown(f'{int(filtered_df["KUNNR"].nunique())}
Active Customers
', unsafe_allow_html=True)
with k3:
aov = float(filtered_df.loc[filtered_df['REVENUE']>0,'REVENUE'].mean() or 0.0)
st.markdown(f'${aov:,.0f}
Avg Order Value (item)
', unsafe_allow_html=True)
with k4: st.markdown(f'{int(filtered_df["VBELN"].nunique())}
Sales Orders
', unsafe_allow_html=True)
# ---------- Tabs ----------
tab1, tab2, tab3, tab4 = st.tabs(["👥 Top Customers", "🌍 Regional Analysis", "📈 Distribution Channels", "🛍️ Top Products"])
with tab1:
st.subheader("Top 10 Customers by Revenue")
customer_summary = (filtered_df.dropna(subset=['NAME1'])
.groupby('NAME1', as_index=False)['REVENUE'].sum()
.nlargest(10, 'REVENUE'))
if customer_summary.empty:
st.info("No customer data to display.")
else:
fig = px.bar(customer_summary, x='REVENUE', y='NAME1', orientation='h',
labels={'REVENUE':'Revenue','NAME1':'Customer'}, color='REVENUE')
st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True)
with tab2:
st.subheader("Revenue by Country")
regional_summary = (filtered_df.dropna(subset=['LAND1'])
.groupby('LAND1', as_index=False)['REVENUE'].sum()
.nlargest(top_n_countries, 'REVENUE'))
if regional_summary.empty:
st.info("No country data to display.")
else:
fig = px.pie(regional_summary, values='REVENUE', names='LAND1',
title=f"Top {top_n_countries} Countries by Revenue")
st.plotly_chart(fig, use_container_width=True)
with tab3:
st.subheader("Revenue by Distribution Channel")
channel_summary = (filtered_df.dropna(subset=['VTWEG'])
.groupby('VTWEG', as_index=False)['REVENUE'].sum())
channel_summary['VTWEG'] = channel_summary['VTWEG'].astype(str)
if channel_summary.empty:
st.info("No distribution channel data to display.")
else:
fig = px.bar(channel_summary, x='VTWEG', y='REVENUE',
title="Total Revenue by Distribution Channel",
labels={'REVENUE':'Total Revenue','VTWEG':'Distribution Channel'},
color='REVENUE')
st.plotly_chart(fig, use_container_width=True)
with tab4:
st.subheader("Top 10 Products by Revenue")
product_summary = (filtered_df.dropna(subset=['MAKTX'])
.groupby('MAKTX', as_index=False)['REVENUE'].sum()
.nlargest(10, 'REVENUE'))
if product_summary.empty:
st.info("No product data to display.")
else:
fig = px.bar(product_summary, x='REVENUE', y='MAKTX', orientation='h',
labels={'REVENUE':'Revenue','MAKTX':'Product'}, color='REVENUE')
st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True)
st.markdown("---")
st.markdown("Built with Streamlit • Real SAP ERP sample data (Kaggle)
", unsafe_allow_html=True)