Spaces:

DavMelchi
/

db_query

Running

File size: 13,593 Bytes

9d2b604

from datetime import datetime

import pandas as pd
import plotly.express as px
import streamlit as st

from utils.convert_to_excel import convert_dfs, save_dataframe
from utils.utils_vars import get_physical_db


class TraficAnalysis:
    last_period_df: pd.DataFrame = None


############### PROCESSING ###############
def extract_code(name):
    name = name.replace(" ", "_") if isinstance(name, str) else None
    return int(name.split("_")[0]) if name and len(name) >= 10 else None


def preprocess_2g(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df["BCF name"].str.len() >= 10].copy()
    df["2g_data_trafic"] = df["TRAFFIC_PS DL"] + df["PS_UL_Load"]
    df.rename(columns={"2G_Carried Traffic": "2g_voice_trafic"}, inplace=True)
    df["code"] = df["BCF name"].apply(extract_code)
    df["date"] = pd.to_datetime(df["PERIOD_START_TIME"], format="%m.%d.%Y")
    df["ID"] = df["date"].astype(str) + "_" + df["code"].astype(str)
    df = df.groupby(["date", "ID", "code"], as_index=False)[
        ["2g_data_trafic", "2g_voice_trafic"]
    ].sum()
    return df


def preprocess_3g(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df["WBTS name"].str.len() >= 10].copy()
    df["code"] = df["WBTS name"].apply(extract_code)
    df["date"] = pd.to_datetime(df["PERIOD_START_TIME"], format="%m.%d.%Y")
    df["ID"] = df["date"].astype(str) + "_" + df["code"].astype(str)
    df.rename(
        columns={
            "Total CS traffic - Erl": "3g_voice_trafic",
            "Total_Data_Traffic": "3g_data_trafic",
        },
        inplace=True,
    )
    df = df.groupby(["date", "ID", "code"], as_index=False)[
        ["3g_voice_trafic", "3g_data_trafic"]
    ].sum()
    return df


def preprocess_lte(df: pd.DataFrame) -> pd.DataFrame:
    df = df[df["LNBTS name"].str.len() >= 10].copy()
    df["lte_data_trafic"] = (
        df["4G/LTE DL Traffic Volume (GBytes)"]
        + df["4G/LTE UL Traffic Volume (GBytes)"]
    )
    df["code"] = df["LNBTS name"].apply(extract_code)
    df["date"] = pd.to_datetime(df["PERIOD_START_TIME"], format="%m.%d.%Y")
    df["ID"] = df["date"].astype(str) + "_" + df["code"].astype(str)
    df = df.groupby(["date", "ID", "code"], as_index=False)[["lte_data_trafic"]].sum()
    return df


############################## ANALYSIS ################
def merge_and_compare(df_2g, df_3g, df_lte, pre_range, post_range, last_period_range):

    # Load physical database
    physical_db = get_physical_db()
    physical_db["code"] = physical_db["Code_Sector"].str.split("_").str[0]
    physical_db["code"] = (
        pd.to_numeric(physical_db["code"], errors="coerce").fillna(0).astype(int)
    )
    physical_db = physical_db[["code", "Longitude", "Latitude", "City"]]
    physical_db = physical_db.drop_duplicates(subset="code")

    df = pd.merge(df_2g, df_3g, on=["date", "ID", "code"], how="outer")
    df = pd.merge(df, df_lte, on=["date", "ID", "code"], how="outer")
    # print(df)

    for col in [
        "2g_data_trafic",
        "2g_voice_trafic",
        "3g_voice_trafic",
        "3g_data_trafic",
        "lte_data_trafic",
    ]:
        if col not in df:
            df[col] = 0

    df.fillna(0, inplace=True)

    df["total_voice_trafic"] = df["2g_voice_trafic"] + df["3g_voice_trafic"]
    df["total_data_trafic"] = (
        df["2g_data_trafic"] + df["3g_data_trafic"] + df["lte_data_trafic"]
    )
    df = pd.merge(df, physical_db, on=["code"], how="left")

    # Assign period based on date range
    pre_start, pre_end = pd.to_datetime(pre_range[0]), pd.to_datetime(pre_range[1])
    post_start, post_end = pd.to_datetime(post_range[0]), pd.to_datetime(post_range[1])
    last_period_start, last_period_end = pd.to_datetime(
        last_period_range[0]
    ), pd.to_datetime(last_period_range[1])

    last_period = df[
        (df["date"] >= last_period_start) & (df["date"] <= last_period_end)
    ]

    def assign_period(date):
        if pre_start <= date <= pre_end:
            return "pre"
        elif post_start <= date <= post_end:
            return "post"
        else:
            return "other"

    df["period"] = df["date"].apply(assign_period)

    comparison = df[df["period"].isin(["pre", "post"])]

    pivot = (
        comparison.groupby(["code", "period"])[
            ["total_voice_trafic", "total_data_trafic"]
        ]
        .sum()
        .unstack()
    )
    pivot.columns = [f"{metric}_{period}" for metric, period in pivot.columns]
    pivot = pivot.reset_index()

    # Differences
    pivot["total_voice_trafic_diff"] = (
        pivot["total_voice_trafic_post"] - pivot["total_voice_trafic_pre"]
    )
    pivot["total_data_trafic_diff"] = (
        pivot["total_data_trafic_post"] - pivot["total_data_trafic_pre"]
    )

    for metric in ["total_voice_trafic", "total_data_trafic"]:
        pivot[f"{metric}_diff_pct"] = (
            (pivot.get(f"{metric}_post", 0) - pivot.get(f"{metric}_pre", 0))
            / pivot.get(f"{metric}_pre", 1)
        ) * 100
    return df, last_period, pivot.round(2)


############################## UI #########################
st.title("📊 Global Trafic Analysis - 2G / 3G / LTE")
doc_col, image_col = st.columns(2)

with doc_col:
    st.write(
        """
        The report analyzes 2G / 3G / LTE traffic :
        - 2G Traffic Report in CSV format (required columns : BCF name, PERIOD_START_TIME, TRAFFIC_PS DL, PS_UL_Load)
        - 3G Traffic Report in CSV format (required columns : WBTS name, PERIOD_START_TIME, Total CS traffic - Erl, Total_Data_Traffic)
        - LTE Traffic Report in CSV format (required columns : LNBTS name, PERIOD_START_TIME, 4G/LTE DL Traffic Volume (GBytes), 4G/LTE UL Traffic Volume (GBytes))
        """
    )

# with image_col:
#     st.image("./assets/trafic_analysis.png", width=250)


upload_2g_col, upload_3g_col, upload_lte_col = st.columns(3)
with upload_2g_col:
    two_g_file = st.file_uploader(
        "Upload 2G Traffic Report", type=["csv", "xls", "xlsx"]
    )
with upload_3g_col:
    three_g_file = st.file_uploader(
        "Upload 3G Traffic Report", type=["csv", "xls", "xlsx"]
    )
with upload_lte_col:
    lte_file = st.file_uploader(
        "Upload LTE Traffic Report", type=["csv", "xls", "xlsx"]
    )

pre_range_col, post_range_col = st.columns(2)
with pre_range_col:
    pre_range = st.date_input("Pre-period (from - to)", [])
with post_range_col:
    post_range = st.date_input("Post-period (from - to)", [])

last_period_range_col, number_of_top_trafic_sites_col = st.columns(2)
with last_period_range_col:
    last_period_range = st.date_input("Last period (from - to)", [])
with number_of_top_trafic_sites_col:
    number_of_top_trafic_sites = st.number_input(
        "Number of top traffic sites", value=25
    )

if len(pre_range) != 2 or len(post_range) != 2:
    st.warning("⚠️ Please select 2 dates for each period (pre and post).")
    st.stop()
if not all([two_g_file, three_g_file, lte_file]):
    st.info("Please upload all 3 reports and select the comparison periods.")
    st.stop()

if st.button("🔍 Run Analysis"):

    df_2g = pd.read_csv(two_g_file, delimiter=";")
    df_3g = pd.read_csv(three_g_file, delimiter=";")
    df_lte = pd.read_csv(lte_file, delimiter=";")

    df_2g_clean = preprocess_2g(df_2g)
    df_3g_clean = preprocess_3g(df_3g)
    df_lte_clean = preprocess_lte(df_lte)

    full_df, last_period, summary_df = merge_and_compare(
        df_2g_clean, df_3g_clean, df_lte_clean, pre_range, post_range, last_period_range
    )

    # 🔍 Display Summary
    st.success("✅ Analysis completed")
    st.subheader("📈 Summary Analysis Pre / Post")
    st.dataframe(summary_df)
    TraficAnalysis.last_period_df = last_period

#######################################################################################################""

#######################################################################################################
if TraficAnalysis.last_period_df is not None:

    df = TraficAnalysis.last_period_df
    # Get top trafics sites based on total data trafic during last period
    top_sites = (
        df.groupby(["code", "City"])["total_data_trafic"]
        .sum()
        .sort_values(ascending=False)
    )
    top_sites = top_sites.head(number_of_top_trafic_sites)

    st.subheader(f"Top {number_of_top_trafic_sites} sites by data traffic")
    chart_col, data_col = st.columns(2)
    with data_col:
        st.dataframe(top_sites.sort_values(ascending=True))
    # chart
    fig = px.bar(
        top_sites.reset_index(),
        y=top_sites.reset_index()[["City", "code"]].agg(
            lambda x: "_".join(map(str, x)), axis=1
        ),
        x="total_data_trafic",
        title=f"Top {number_of_top_trafic_sites} sites by data traffic",
        orientation="h",
        text="total_data_trafic",
        text_auto=True,
    )
    # fig.update_layout(height=600)
    with chart_col:
        st.plotly_chart(fig)

    # Top sites by voice trafic during last period
    top_sites_voice = (
        df.groupby(["code", "City"])["total_voice_trafic"]
        .sum()
        .sort_values(ascending=False)
    )
    top_sites_voice = top_sites_voice.head(number_of_top_trafic_sites)

    st.subheader(f"Top {number_of_top_trafic_sites} sites by voice traffic")
    chart_col, data_col = st.columns(2)
    with data_col:
        st.dataframe(top_sites_voice.sort_values(ascending=True))
    # chart
    fig = px.bar(
        top_sites_voice.reset_index(),
        y=top_sites_voice.reset_index()[["City", "code"]].agg(
            lambda x: "_".join(map(str, x)), axis=1
        ),
        x="total_voice_trafic",
        title=f"Top {number_of_top_trafic_sites} sites by voice traffic",
        orientation="h",
        text="total_voice_trafic",
        text_auto=True,
    )
    # fig.update_layout(height=600)
    with chart_col:
        st.plotly_chart(fig)

    #####################################################
    min_size = 5
    max_size = 40
    # Map of sum of data trafic during last period
    # Aggregate total data traffic
    df_data = (
        df.groupby(["code", "City", "Latitude", "Longitude"])["total_data_trafic"]
        .sum()
        .reset_index()
    )

    st.subheader("Map of data trafic during last period")

    # Define size range

    # Linear size scaling
    traffic_data_min = df_data["total_data_trafic"].min()
    traffic_data_max = df_data["total_data_trafic"].max()
    df_data["bubble_size"] = df_data["total_data_trafic"].apply(
        lambda x: min_size
        + (max_size - min_size)
        * (x - traffic_data_min)
        / (traffic_data_max - traffic_data_min)
    )

    # Custom blue color scale: start from visible blue
    custom_blue_red = [
        [0.0, "#4292c6"],  # light blue
        [0.2, "#2171b5"],
        [0.4, "#084594"],  # dark blue
        [0.6, "#cb181d"],  # Strong red
        [0.8, "#a50f15"],  # Darker red
        [1.0, "#67000d"],  # Very dark red
    ]

    fig = px.scatter_map(
        df_data,
        lat="Latitude",
        lon="Longitude",
        color="total_data_trafic",
        size="bubble_size",
        color_continuous_scale=custom_blue_red,
        size_max=max_size,
        zoom=10,
        height=600,
        title="Data traffic distribution",
        hover_data={"code": True, "total_data_trafic": True},
        hover_name="code",
        text=[str(x) for x in df_data["code"]],
    )

    fig.update_layout(
        mapbox_style="open-street-map",
        coloraxis_colorbar=dict(title="Total Data Traffic (MB)"),
        coloraxis=dict(cmin=traffic_data_min, cmax=traffic_data_max),
        font=dict(size=10, color="black"),
    )

    st.plotly_chart(fig)

    ########################################################################################
    # Map of sum of voice trafic during last period
    # Aggregate total voice traffic
    df_voice = (
        df.groupby(["code", "City", "Latitude", "Longitude"])["total_voice_trafic"]
        .sum()
        .reset_index()
    )
    st.subheader("Map of voice trafic during last period")

    # Linear size scaling
    traffic_voice_min = df_voice["total_voice_trafic"].min()
    traffic_voice_max = df_voice["total_voice_trafic"].max()
    df_voice["bubble_size"] = df_voice["total_voice_trafic"].apply(
        lambda x: min_size
        + (max_size - min_size)
        * (x - traffic_voice_min)
        / (traffic_voice_max - traffic_voice_min)
    )

    fig = px.scatter_map(
        df_voice,
        lat="Latitude",
        lon="Longitude",
        color="total_voice_trafic",
        size="bubble_size",
        color_continuous_scale=custom_blue_red,
        size_max=max_size,
        zoom=10,
        height=600,
        title="Voice traffic distribution",
        hover_data={"code": True, "total_voice_trafic": True},
        hover_name="code",
        text=[str(x) for x in df_voice["code"]],
    )

    fig.update_layout(
        mapbox_style="open-street-map",
        coloraxis_colorbar=dict(title="Total Voice Traffic (MB)"),
        coloraxis=dict(cmin=traffic_voice_min, cmax=traffic_voice_max),
        font=dict(size=10, color="black"),
    )

    st.plotly_chart(fig)

    final_dfs = convert_dfs(
        [full_df, summary_df], ["Global_Trafic_Analysis", "Pre_Post_analysis"]
    )
    # 📥 Bouton de téléchargement
    st.download_button(
        on_click="ignore",
        type="primary",
        label="Download the Analysis Report",
        data=final_dfs,
        file_name=f"Global_Trafic_Analysis_Report_{datetime.now()}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )