File size: 11,589 Bytes
a5de76b
82197ce
73a7361
a5de76b
fcccb84
f7dbc16
82197ce
 
fcccb84
ec9c243
 
82197ce
 
ec9c243
 
 
 
 
 
 
 
 
82197ce
 
 
ec9c243
4303c4f
82197ce
6b2600f
06e3d05
73f080a
06e3d05
 
a5de76b
f7dbc16
f325816
f7dbc16
06e3d05
4303c4f
ec9c243
73f080a
 
06e3d05
ec9c243
f7dbc16
ec9c243
 
 
 
 
1611d3c
6b2600f
73f080a
6b2600f
ec9c243
4303c4f
 
6b2600f
4303c4f
 
 
 
06e3d05
ec9c243
4303c4f
a5de76b
 
ec9c243
 
a5de76b
 
 
 
ec9c243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5de76b
ec9c243
 
 
 
 
 
 
 
 
a5de76b
ec9c243
 
 
 
 
 
 
a5de76b
 
 
 
ec9c243
 
a5de76b
ec9c243
 
 
a5de76b
ec9c243
 
a5de76b
6b2600f
73f080a
82197ce
ec9c243
 
 
 
 
 
73f080a
 
ec9c243
73f080a
 
 
 
ec9c243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73f080a
 
 
 
ec9c243
 
 
 
 
a5de76b
ec9c243
 
 
73f080a
 
 
ec9c243
 
 
 
 
a5de76b
ec9c243
 
 
73f080a
 
 
ec9c243
 
73f080a
ec9c243
 
a5de76b
ec9c243
 
 
 
 
73f080a
 
 
ec9c243
 
 
 
 
a5de76b
ec9c243
 
 
73f080a
 
ec9c243
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# app.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import os
import warnings
warnings.filterwarnings('ignore')

# ---------- Page & Styles ----------
st.set_page_config(page_title="SAP Sales KPI Dashboard", page_icon="πŸ“Š", layout="wide")
st.markdown("""
<style>
  /* hide default sidebar entirely */
  [data-testid="stSidebar"] { display: none; }
  .main-header { font-size: 2.2rem; font-weight: 800; color: #1f4e79; text-align: left; margin-bottom: .25rem; }
  .subtle { color:#6b7280; margin-bottom:1.25rem; }
  .filter-card { background:#f8f9fa; padding: .9rem 1rem; border-radius:12px; border:1px solid #edf2f7; }
  .kpi-card { background: #ffffff; padding: 1.25rem; border-radius: 14px; border:1px solid #e5e7eb; box-shadow: 0 1px 2px rgba(0,0,0,.03); }
  .kpi-value { font-size: 2.1rem; font-weight: 800; color: #1f4e79; line-height:1; }
  .kpi-label { font-size: .95rem; color: #6b7280; }
  .block-container { padding-top: 1.2rem; }
</style>
""", unsafe_allow_html=True)

# ---------- Kaggle load ----------
@st.cache_data(ttl=3600)
def load_kaggle_sap_data():
    try:
        if 'KAGGLE_USERNAME' not in st.secrets or 'KAGGLE_KEY' not in st.secrets:
            return "Kaggle credentials not found in Streamlit secrets."
        os.environ['KAGGLE_USERNAME'] = st.secrets['KAGGLE_USERNAME']
        os.environ['KAGGLE_KEY'] = st.secrets['KAGGLE_KEY']

        import kaggle
        dataset_name = "mustafakeser4/sap-dataset-bigquery-dataset"
        download_path = "./kaggle_data"

        if not os.path.exists(os.path.join(download_path, 'vbak.csv')):
            with st.spinner("Downloading dataset from Kaggle..."):
                kaggle.api.authenticate()
                kaggle.api.dataset_download_files(dataset_name, path=download_path, unzip=True)

        needed = {'vbak': 'vbak.csv', 'vbap': 'vbap.csv', 'kna1': 'kna1.csv', 'makt': 'makt.csv'}
        tables = {}
        for k, fn in needed.items():
            fp = os.path.join(download_path, fn)
            if not os.path.exists(fp):
                return f"Expected file missing: {fn}"
            tables[k] = pd.read_csv(fp, low_memory=False)
        return tables
    except Exception as e:
        return f"Error during Kaggle data loading: {e}"

# ---------- Processing (robust revenue + safe merges) ----------
@st.cache_data
def create_sales_analytics(_tables):
    try:
        vbak = _tables['vbak'].copy()
        vbap = _tables['vbap'].copy()
        kna1 = _tables['kna1'].copy()
        makt = _tables['makt'].copy()

        # normalize column names
        for df in [vbak, vbap, kna1, makt]:
            df.columns = [c.upper().strip() for c in df.columns]

        # keep SAP keys as strings (avoid leading-zero loss)
        def as_str(df, cols):
            for c in cols:
                if c in df.columns:
                    df[c] = df[c].astype(str).str.strip()

        as_str(vbak, ['VBELN','KUNNR','VKORG','VTWEG'])
        as_str(vbap, ['VBELN','MATNR'])
        as_str(kna1, ['KUNNR'])
        as_str(makt, ['MATNR'])

        # choose numeric helper
        def pick_numeric(df, cols):
            for c in cols:
                if c in df.columns:
                    s = pd.to_numeric(df[c], errors='coerce')
                    if s.notna().sum() > 0 and s.abs().sum() > 0:
                        return s
            return pd.Series(0.0, index=df.index)

        # Build item-level REVENUE
        # primary: NETWR at item level (VBAP)
        netwr_item = pick_numeric(vbap, ['NETWR'])

        # fallback: price * qty using common SAP columns
        price = pick_numeric(vbap, ['NETPR', 'KBETR', 'NETPR_I'])
        qty   = pick_numeric(vbap, ['KWMENG', 'KTMNG', 'MENGE'])
        fallback_rev = (price.fillna(0) * qty.fillna(0)).fillna(0)

        vbap['REVENUE'] = np.where(netwr_item > 0, netwr_item, fallback_rev).astype(float)

        # header fields (include currency if present)
        keep_vbak = ['VBELN','KUNNR','VKORG','VTWEG','ERDAT'] + (['WAERK'] if 'WAERK' in vbak.columns else [])
        vbak_small = vbak[keep_vbak].drop_duplicates('VBELN')
        vbap_small = vbap[['VBELN','MATNR','REVENUE']]
        kna1_small = kna1[['KUNNR','NAME1','LAND1']] if {'KUNNR','NAME1','LAND1'}.issubset(kna1.columns) else pd.DataFrame(columns=['KUNNR','NAME1','LAND1'])

        # product text in English
        makt_en = makt[makt['SPRAS'].eq('E')] if 'SPRAS' in makt.columns else makt
        makt_small = makt_en[['MATNR','MAKTX']].drop_duplicates('MATNR') if {'MATNR','MAKTX'}.issubset(makt_en.columns) else pd.DataFrame(columns=['MATNR','MAKTX'])

        # final sales table
        sales = (vbap_small
                 .merge(vbak_small, on='VBELN', how='inner')
                 .merge(kna1_small, on='KUNNR', how='left')
                 .merge(makt_small, on='MATNR', how='left'))

        if 'ERDAT' in sales.columns:
            sales['ERDAT'] = pd.to_datetime(sales['ERDAT'], errors='coerce')

        # ensure columns exist
        for col in ['REVENUE','LAND1','VTWEG','NAME1','MAKTX','VBELN','KUNNR','VKORG']:
            if col not in sales.columns:
                sales[col] = np.nan if col != 'REVENUE' else 0.0
        if 'WAERK' not in sales.columns:
            sales['WAERK'] = 'N/A'

        # drop obvious junk rows
        sales = sales.replace([np.inf, -np.inf], np.nan).dropna(subset=['REVENUE'])
        return sales
    except Exception as e:
        return f"Error processing sales data: {e}"

# ---------- App ----------
st.markdown('<div class="main-header">🎯 SAP Sales KPI Dashboard</div><div class="subtle">Real SAP ERP sample data (Kaggle)</div>', unsafe_allow_html=True)

tables = load_kaggle_sap_data()
if isinstance(tables, str):
    st.error(tables)
    st.stop()

sales_df = create_sales_analytics(tables)
if isinstance(sales_df, str):
    st.error(sales_df)
    st.stop()

# ---------- Filter Bar (no sidebar) ----------
with st.container():
    st.markdown('<div class="filter-card">', unsafe_allow_html=True)
    c1, c2, c3, c4 = st.columns([1.2, 1.2, 3, 0.9])

    # currency filter
    currencies = [c for c in sales_df['WAERK'].dropna().unique().tolist() if c != 'N/A']
    default_cur = sales_df['WAERK'].mode().iat[0] if len(sales_df) and sales_df['WAERK'].notna().any() else 'N/A'
    with c1:
        currency = st.selectbox("Currency", options=(['All'] + sorted(currencies)) if currencies else ['All'], index=0 if currencies else 0)

    # Top N
    with c2:
        top_n_countries = st.slider("Top N Countries", 5, 20, 10)

    # Region multiselect inside expander to keep tidy
    with c3:
        with st.expander("Region (Country) – click to choose", expanded=False):
            all_countries = sorted(sales_df['LAND1'].dropna().unique().tolist())
            # buttons to select/clear
            b1, b2 = st.columns([1,1])
            if 'selected_countries' not in st.session_state:
                st.session_state.selected_countries = all_countries
            with b1:
                if st.button("Select All"):
                    st.session_state.selected_countries = all_countries
            with b2:
                if st.button("Clear"):
                    st.session_state.selected_countries = []
            selected_region = st.multiselect("Countries", options=all_countries, default=st.session_state.selected_countries, key="countries_ms")

    with c4:
        if st.button("πŸ”„ Clear Cache"):
            st.cache_data.clear()
            st.rerun()
    st.markdown('</div>', unsafe_allow_html=True)

# apply filters
filtered_df = sales_df.copy()
if currency and currency != 'All':
    filtered_df = filtered_df[filtered_df['WAERK'] == currency]
if 'countries_ms' in st.session_state:
    filtered_df = filtered_df[filtered_df['LAND1'].isin(st.session_state.countries_ms)]

st.success(f"βœ… Loaded and processed {len(filtered_df):,} sales line-items after filters.")

# ---------- KPIs ----------
st.subheader("Sales KPIs")
k1,k2,k3,k4 = st.columns(4)
with k1: st.markdown(f'<div class="kpi-card"><div class="kpi-value">${float(filtered_df["REVENUE"].sum()):,.0f}</div><div class="kpi-label">Total Revenue</div></div>', unsafe_allow_html=True)
with k2: st.markdown(f'<div class="kpi-card"><div class="kpi-value">{int(filtered_df["KUNNR"].nunique())}</div><div class="kpi-label">Active Customers</div></div>', unsafe_allow_html=True)
with k3:
    aov = float(filtered_df.loc[filtered_df['REVENUE']>0,'REVENUE'].mean() or 0.0)
    st.markdown(f'<div class="kpi-card"><div class="kpi-value">${aov:,.0f}</div><div class="kpi-label">Avg Order Value (item)</div></div>', unsafe_allow_html=True)
with k4: st.markdown(f'<div class="kpi-card"><div class="kpi-value">{int(filtered_df["VBELN"].nunique())}</div><div class="kpi-label">Sales Orders</div></div>', unsafe_allow_html=True)

# ---------- Tabs ----------
tab1, tab2, tab3, tab4 = st.tabs(["πŸ‘₯ Top Customers", "🌍 Regional Analysis", "πŸ“ˆ Distribution Channels", "πŸ›οΈ Top Products"])

with tab1:
    st.subheader("Top 10 Customers by Revenue")
    customer_summary = (filtered_df.dropna(subset=['NAME1'])
                        .groupby('NAME1', as_index=False)['REVENUE'].sum()
                        .nlargest(10, 'REVENUE'))
    if customer_summary.empty:
        st.info("No customer data to display.")
    else:
        fig = px.bar(customer_summary, x='REVENUE', y='NAME1', orientation='h',
                     labels={'REVENUE':'Revenue','NAME1':'Customer'}, color='REVENUE')
        st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True)

with tab2:
    st.subheader("Revenue by Country")
    regional_summary = (filtered_df.dropna(subset=['LAND1'])
                        .groupby('LAND1', as_index=False)['REVENUE'].sum()
                        .nlargest(top_n_countries, 'REVENUE'))
    if regional_summary.empty:
        st.info("No country data to display.")
    else:
        fig = px.pie(regional_summary, values='REVENUE', names='LAND1',
                     title=f"Top {top_n_countries} Countries by Revenue")
        st.plotly_chart(fig, use_container_width=True)

with tab3:
    st.subheader("Revenue by Distribution Channel")
    channel_summary = (filtered_df.dropna(subset=['VTWEG'])
                       .groupby('VTWEG', as_index=False)['REVENUE'].sum())
    channel_summary['VTWEG'] = channel_summary['VTWEG'].astype(str)
    if channel_summary.empty:
        st.info("No distribution channel data to display.")
    else:
        fig = px.bar(channel_summary, x='VTWEG', y='REVENUE',
                     title="Total Revenue by Distribution Channel",
                     labels={'REVENUE':'Total Revenue','VTWEG':'Distribution Channel'},
                     color='REVENUE')
        st.plotly_chart(fig, use_container_width=True)

with tab4:
    st.subheader("Top 10 Products by Revenue")
    product_summary = (filtered_df.dropna(subset=['MAKTX'])
                       .groupby('MAKTX', as_index=False)['REVENUE'].sum()
                       .nlargest(10, 'REVENUE'))
    if product_summary.empty:
        st.info("No product data to display.")
    else:
        fig = px.bar(product_summary, x='REVENUE', y='MAKTX', orientation='h',
                     labels={'REVENUE':'Revenue','MAKTX':'Product'}, color='REVENUE')
        st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True)

st.markdown("---")
st.markdown("<p style='text-align:center;'>Built with Streamlit β€’ Real SAP ERP sample data (Kaggle)</p>", unsafe_allow_html=True)