# app.py import streamlit as st import pandas as pd import numpy as np import plotly.express as px import os import warnings warnings.filterwarnings('ignore') # ---------- Page & Styles ---------- st.set_page_config(page_title="SAP Sales KPI Dashboard", page_icon="📊", layout="wide") st.markdown(""" """, unsafe_allow_html=True) # ---------- Kaggle load ---------- @st.cache_data(ttl=3600) def load_kaggle_sap_data(): try: if 'KAGGLE_USERNAME' not in st.secrets or 'KAGGLE_KEY' not in st.secrets: return "Kaggle credentials not found in Streamlit secrets." os.environ['KAGGLE_USERNAME'] = st.secrets['KAGGLE_USERNAME'] os.environ['KAGGLE_KEY'] = st.secrets['KAGGLE_KEY'] import kaggle dataset_name = "mustafakeser4/sap-dataset-bigquery-dataset" download_path = "./kaggle_data" if not os.path.exists(os.path.join(download_path, 'vbak.csv')): with st.spinner("Downloading dataset from Kaggle..."): kaggle.api.authenticate() kaggle.api.dataset_download_files(dataset_name, path=download_path, unzip=True) needed = {'vbak': 'vbak.csv', 'vbap': 'vbap.csv', 'kna1': 'kna1.csv', 'makt': 'makt.csv'} tables = {} for k, fn in needed.items(): fp = os.path.join(download_path, fn) if not os.path.exists(fp): return f"Expected file missing: {fn}" tables[k] = pd.read_csv(fp, low_memory=False) return tables except Exception as e: return f"Error during Kaggle data loading: {e}" # ---------- Processing (robust revenue + safe merges) ---------- @st.cache_data def create_sales_analytics(_tables): try: vbak = _tables['vbak'].copy() vbap = _tables['vbap'].copy() kna1 = _tables['kna1'].copy() makt = _tables['makt'].copy() # normalize column names for df in [vbak, vbap, kna1, makt]: df.columns = [c.upper().strip() for c in df.columns] # keep SAP keys as strings (avoid leading-zero loss) def as_str(df, cols): for c in cols: if c in df.columns: df[c] = df[c].astype(str).str.strip() as_str(vbak, ['VBELN','KUNNR','VKORG','VTWEG']) as_str(vbap, ['VBELN','MATNR']) as_str(kna1, ['KUNNR']) as_str(makt, ['MATNR']) # choose numeric helper def pick_numeric(df, cols): for c in cols: if c in df.columns: s = pd.to_numeric(df[c], errors='coerce') if s.notna().sum() > 0 and s.abs().sum() > 0: return s return pd.Series(0.0, index=df.index) # Build item-level REVENUE # primary: NETWR at item level (VBAP) netwr_item = pick_numeric(vbap, ['NETWR']) # fallback: price * qty using common SAP columns price = pick_numeric(vbap, ['NETPR', 'KBETR', 'NETPR_I']) qty = pick_numeric(vbap, ['KWMENG', 'KTMNG', 'MENGE']) fallback_rev = (price.fillna(0) * qty.fillna(0)).fillna(0) vbap['REVENUE'] = np.where(netwr_item > 0, netwr_item, fallback_rev).astype(float) # header fields (include currency if present) keep_vbak = ['VBELN','KUNNR','VKORG','VTWEG','ERDAT'] + (['WAERK'] if 'WAERK' in vbak.columns else []) vbak_small = vbak[keep_vbak].drop_duplicates('VBELN') vbap_small = vbap[['VBELN','MATNR','REVENUE']] kna1_small = kna1[['KUNNR','NAME1','LAND1']] if {'KUNNR','NAME1','LAND1'}.issubset(kna1.columns) else pd.DataFrame(columns=['KUNNR','NAME1','LAND1']) # product text in English makt_en = makt[makt['SPRAS'].eq('E')] if 'SPRAS' in makt.columns else makt makt_small = makt_en[['MATNR','MAKTX']].drop_duplicates('MATNR') if {'MATNR','MAKTX'}.issubset(makt_en.columns) else pd.DataFrame(columns=['MATNR','MAKTX']) # final sales table sales = (vbap_small .merge(vbak_small, on='VBELN', how='inner') .merge(kna1_small, on='KUNNR', how='left') .merge(makt_small, on='MATNR', how='left')) if 'ERDAT' in sales.columns: sales['ERDAT'] = pd.to_datetime(sales['ERDAT'], errors='coerce') # ensure columns exist for col in ['REVENUE','LAND1','VTWEG','NAME1','MAKTX','VBELN','KUNNR','VKORG']: if col not in sales.columns: sales[col] = np.nan if col != 'REVENUE' else 0.0 if 'WAERK' not in sales.columns: sales['WAERK'] = 'N/A' # drop obvious junk rows sales = sales.replace([np.inf, -np.inf], np.nan).dropna(subset=['REVENUE']) return sales except Exception as e: return f"Error processing sales data: {e}" # ---------- App ---------- st.markdown('

🎯 SAP Sales KPI Dashboard

Real SAP ERP sample data (Kaggle)

', unsafe_allow_html=True) tables = load_kaggle_sap_data() if isinstance(tables, str): st.error(tables) st.stop() sales_df = create_sales_analytics(tables) if isinstance(sales_df, str): st.error(sales_df) st.stop() # ---------- Filter Bar (no sidebar) ---------- with st.container(): st.markdown('

', unsafe_allow_html=True) c1, c2, c3, c4 = st.columns([1.2, 1.2, 3, 0.9]) # currency filter currencies = [c for c in sales_df['WAERK'].dropna().unique().tolist() if c != 'N/A'] default_cur = sales_df['WAERK'].mode().iat[0] if len(sales_df) and sales_df['WAERK'].notna().any() else 'N/A' with c1: currency = st.selectbox("Currency", options=(['All'] + sorted(currencies)) if currencies else ['All'], index=0 if currencies else 0) # Top N with c2: top_n_countries = st.slider("Top N Countries", 5, 20, 10) # Region multiselect inside expander to keep tidy with c3: with st.expander("Region (Country) – click to choose", expanded=False): all_countries = sorted(sales_df['LAND1'].dropna().unique().tolist()) # buttons to select/clear b1, b2 = st.columns([1,1]) if 'selected_countries' not in st.session_state: st.session_state.selected_countries = all_countries with b1: if st.button("Select All"): st.session_state.selected_countries = all_countries with b2: if st.button("Clear"): st.session_state.selected_countries = [] selected_region = st.multiselect("Countries", options=all_countries, default=st.session_state.selected_countries, key="countries_ms") with c4: if st.button("🔄 Clear Cache"): st.cache_data.clear() st.rerun() st.markdown('

', unsafe_allow_html=True) # apply filters filtered_df = sales_df.copy() if currency and currency != 'All': filtered_df = filtered_df[filtered_df['WAERK'] == currency] if 'countries_ms' in st.session_state: filtered_df = filtered_df[filtered_df['LAND1'].isin(st.session_state.countries_ms)] st.success(f"✅ Loaded and processed {len(filtered_df):,} sales line-items after filters.") # ---------- KPIs ---------- st.subheader("Sales KPIs") k1,k2,k3,k4 = st.columns(4) with k1: st.markdown(f'

${float(filtered_df["REVENUE"].sum()):,.0f}

Total Revenue

', unsafe_allow_html=True) with k2: st.markdown(f'

{int(filtered_df["KUNNR"].nunique())}

Active Customers

', unsafe_allow_html=True) with k3: aov = float(filtered_df.loc[filtered_df['REVENUE']>0,'REVENUE'].mean() or 0.0) st.markdown(f'

${aov:,.0f}

Avg Order Value (item)

', unsafe_allow_html=True) with k4: st.markdown(f'

{int(filtered_df["VBELN"].nunique())}

Sales Orders

', unsafe_allow_html=True) # ---------- Tabs ---------- tab1, tab2, tab3, tab4 = st.tabs(["👥 Top Customers", "🌍 Regional Analysis", "📈 Distribution Channels", "🛍️ Top Products"]) with tab1: st.subheader("Top 10 Customers by Revenue") customer_summary = (filtered_df.dropna(subset=['NAME1']) .groupby('NAME1', as_index=False)['REVENUE'].sum() .nlargest(10, 'REVENUE')) if customer_summary.empty: st.info("No customer data to display.") else: fig = px.bar(customer_summary, x='REVENUE', y='NAME1', orientation='h', labels={'REVENUE':'Revenue','NAME1':'Customer'}, color='REVENUE') st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True) with tab2: st.subheader("Revenue by Country") regional_summary = (filtered_df.dropna(subset=['LAND1']) .groupby('LAND1', as_index=False)['REVENUE'].sum() .nlargest(top_n_countries, 'REVENUE')) if regional_summary.empty: st.info("No country data to display.") else: fig = px.pie(regional_summary, values='REVENUE', names='LAND1', title=f"Top {top_n_countries} Countries by Revenue") st.plotly_chart(fig, use_container_width=True) with tab3: st.subheader("Revenue by Distribution Channel") channel_summary = (filtered_df.dropna(subset=['VTWEG']) .groupby('VTWEG', as_index=False)['REVENUE'].sum()) channel_summary['VTWEG'] = channel_summary['VTWEG'].astype(str) if channel_summary.empty: st.info("No distribution channel data to display.") else: fig = px.bar(channel_summary, x='VTWEG', y='REVENUE', title="Total Revenue by Distribution Channel", labels={'REVENUE':'Total Revenue','VTWEG':'Distribution Channel'}, color='REVENUE') st.plotly_chart(fig, use_container_width=True) with tab4: st.subheader("Top 10 Products by Revenue") product_summary = (filtered_df.dropna(subset=['MAKTX']) .groupby('MAKTX', as_index=False)['REVENUE'].sum() .nlargest(10, 'REVENUE')) if product_summary.empty: st.info("No product data to display.") else: fig = px.bar(product_summary, x='REVENUE', y='MAKTX', orientation='h', labels={'REVENUE':'Revenue','MAKTX':'Product'}, color='REVENUE') st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True) st.markdown("---") st.markdown("

Built with Streamlit • Real SAP ERP sample data (Kaggle)

", unsafe_allow_html=True)