# db_query/apps/kpi_analysis/trafic_analysis.py
# Author: DavMelchi — "Adding global trafic analysis" (commit 9d2b604)
from datetime import datetime
from typing import ClassVar, Optional

import pandas as pd
import plotly.express as px
import streamlit as st

from utils.convert_to_excel import convert_dfs, save_dataframe
from utils.utils_vars import get_physical_db
class TraficAnalysis:
    """Holds state shared between the analysis run and the display section.

    Streamlit re-executes this script top-to-bottom on every interaction;
    the "Run Analysis" branch parks the last-period slice here so the
    display section further down can render it in the same run.
    """

    # None until "Run Analysis" has been executed; the original annotation
    # claimed a plain pd.DataFrame, which the None default contradicted.
    last_period_df: ClassVar[Optional[pd.DataFrame]] = None
############### PROCESSING ###############
def extract_code(name):
    """Extract the leading numeric site code from an equipment name.

    Names look like "12345_SITE_NAME"; spaces are normalized to
    underscores first.  Returns the integer code, or None when the input
    is not a string, is shorter than 10 characters, or does not start
    with a numeric prefix.
    """
    if not isinstance(name, str):
        return None
    name = name.replace(" ", "_")
    if len(name) < 10:
        return None
    prefix = name.split("_")[0]
    # Guard against non-numeric prefixes: the original int() call raised
    # ValueError here and aborted the whole .apply() over the report.
    return int(prefix) if prefix.isdigit() else None
def preprocess_2g(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw 2G traffic export and aggregate it per site and day.

    Rows whose "BCF name" is shorter than 10 characters are dropped, the
    numeric site code and the day are derived, then data/voice traffic is
    summed per (date, ID, code).
    """
    frame = df.loc[df["BCF name"].str.len() >= 10].copy()
    # Data traffic = downlink PS traffic plus uplink PS load.
    frame["2g_data_trafic"] = frame["TRAFFIC_PS DL"] + frame["PS_UL_Load"]
    frame = frame.rename(columns={"2G_Carried Traffic": "2g_voice_trafic"})
    frame["code"] = frame["BCF name"].apply(extract_code)
    frame["date"] = pd.to_datetime(frame["PERIOD_START_TIME"], format="%m.%d.%Y")
    # Synthetic per-day/per-site key used downstream when merging techs.
    frame["ID"] = frame["date"].astype(str) + "_" + frame["code"].astype(str)
    metrics = ["2g_data_trafic", "2g_voice_trafic"]
    return frame.groupby(["date", "ID", "code"], as_index=False)[metrics].sum()
def preprocess_3g(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw 3G traffic export and aggregate it per site and day.

    Rows whose "WBTS name" is shorter than 10 characters are dropped, the
    numeric site code and the day are derived, then voice/data traffic is
    summed per (date, ID, code).
    """
    frame = df.loc[df["WBTS name"].str.len() >= 10].copy()
    frame["code"] = frame["WBTS name"].apply(extract_code)
    frame["date"] = pd.to_datetime(frame["PERIOD_START_TIME"], format="%m.%d.%Y")
    # Synthetic per-day/per-site key used downstream when merging techs.
    frame["ID"] = frame["date"].astype(str) + "_" + frame["code"].astype(str)
    frame = frame.rename(
        columns={
            "Total CS traffic - Erl": "3g_voice_trafic",
            "Total_Data_Traffic": "3g_data_trafic",
        }
    )
    metrics = ["3g_voice_trafic", "3g_data_trafic"]
    return frame.groupby(["date", "ID", "code"], as_index=False)[metrics].sum()
def preprocess_lte(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw LTE traffic export and aggregate it per site and day.

    Rows whose "LNBTS name" is shorter than 10 characters are dropped,
    DL+UL volume is summed into one data-traffic metric, then aggregated
    per (date, ID, code).
    """
    frame = df.loc[df["LNBTS name"].str.len() >= 10].copy()
    downlink = frame["4G/LTE DL Traffic Volume (GBytes)"]
    uplink = frame["4G/LTE UL Traffic Volume (GBytes)"]
    frame["lte_data_trafic"] = downlink + uplink
    frame["code"] = frame["LNBTS name"].apply(extract_code)
    frame["date"] = pd.to_datetime(frame["PERIOD_START_TIME"], format="%m.%d.%Y")
    # Synthetic per-day/per-site key used downstream when merging techs.
    frame["ID"] = frame["date"].astype(str) + "_" + frame["code"].astype(str)
    grouped = frame.groupby(["date", "ID", "code"], as_index=False)
    return grouped[["lte_data_trafic"]].sum()
############################## ANALYSIS ################
def merge_and_compare(df_2g, df_3g, df_lte, pre_range, post_range, last_period_range):
    """Merge the per-technology aggregates and compare pre/post periods.

    Parameters
    ----------
    df_2g, df_3g, df_lte : pd.DataFrame
        Per-site daily aggregates produced by the preprocess_* helpers,
        keyed on (date, ID, code).
    pre_range, post_range, last_period_range : sequence
        2-item (start, end) date selections from the UI.

    Returns
    -------
    tuple
        (full merged frame with period tags, last-period slice,
        per-code pre/post pivot with diffs and percentage deltas,
        rounded to 2 decimals).
    """
    # Load physical database
    physical_db = get_physical_db()
    # Site code is the numeric prefix of "Code_Sector"; unparsable codes
    # become 0 rather than NaN so the column can stay integer.
    physical_db["code"] = physical_db["Code_Sector"].str.split("_").str[0]
    physical_db["code"] = (
        pd.to_numeric(physical_db["code"], errors="coerce").fillna(0).astype(int)
    )
    physical_db = physical_db[["code", "Longitude", "Latitude", "City"]]
    physical_db = physical_db.drop_duplicates(subset="code")
    # Outer merges keep sites/days present in only some technologies.
    df = pd.merge(df_2g, df_3g, on=["date", "ID", "code"], how="outer")
    df = pd.merge(df, df_lte, on=["date", "ID", "code"], how="outer")
    # print(df)
    # A metric column can be missing entirely if an input had no valid
    # rows; backfill with zeros so the totals below never KeyError.
    for col in [
        "2g_data_trafic",
        "2g_voice_trafic",
        "3g_voice_trafic",
        "3g_data_trafic",
        "lte_data_trafic",
    ]:
        if col not in df:
            df[col] = 0
    # Per-row gaps created by the outer merges count as zero traffic.
    df.fillna(0, inplace=True)
    df["total_voice_trafic"] = df["2g_voice_trafic"] + df["3g_voice_trafic"]
    df["total_data_trafic"] = (
        df["2g_data_trafic"] + df["3g_data_trafic"] + df["lte_data_trafic"]
    )
    df = pd.merge(df, physical_db, on=["code"], how="left")
    # Assign period based on date range
    pre_start, pre_end = pd.to_datetime(pre_range[0]), pd.to_datetime(pre_range[1])
    post_start, post_end = pd.to_datetime(post_range[0]), pd.to_datetime(post_range[1])
    last_period_start, last_period_end = pd.to_datetime(
        last_period_range[0]
    ), pd.to_datetime(last_period_range[1])
    # Slice used by the display section (maps / top-N charts).
    last_period = df[
        (df["date"] >= last_period_start) & (df["date"] <= last_period_end)
    ]
    def assign_period(date):
        # Inclusive on both ends; dates outside both windows are "other".
        if pre_start <= date <= pre_end:
            return "pre"
        elif post_start <= date <= post_end:
            return "post"
        else:
            return "other"
    df["period"] = df["date"].apply(assign_period)
    comparison = df[df["period"].isin(["pre", "post"])]
    # One row per site code, one column per (metric, period) pair.
    pivot = (
        comparison.groupby(["code", "period"])[
            ["total_voice_trafic", "total_data_trafic"]
        ]
        .sum()
        .unstack()
    )
    pivot.columns = [f"{metric}_{period}" for metric, period in pivot.columns]
    pivot = pivot.reset_index()
    # Differences
    # NOTE(review): these direct lookups KeyError if one period has no
    # rows at all, while the pct loop below uses .get defaults — confirm
    # the inconsistency is intentional.
    pivot["total_voice_trafic_diff"] = (
        pivot["total_voice_trafic_post"] - pivot["total_voice_trafic_pre"]
    )
    pivot["total_data_trafic_diff"] = (
        pivot["total_data_trafic_post"] - pivot["total_data_trafic_pre"]
    )
    for metric in ["total_voice_trafic", "total_data_trafic"]:
        # .get(...) only substitutes when the column is missing entirely;
        # NOTE(review): a site with zero pre-traffic still divides by 0
        # and yields inf here — confirm that is acceptable downstream.
        pivot[f"{metric}_diff_pct"] = (
            (pivot.get(f"{metric}_post", 0) - pivot.get(f"{metric}_pre", 0))
            / pivot.get(f"{metric}_pre", 1)
        ) * 100
    return df, last_period, pivot.round(2)
############################## UI #########################
# --- Page header and expected input formats --------------------------------
st.title("📊 Global Trafic Analysis - 2G / 3G / LTE")
doc_col, image_col = st.columns(2)
with doc_col:
    st.write(
        """
The report analyzes 2G / 3G / LTE traffic :
- 2G Traffic Report in CSV format (required columns : BCF name, PERIOD_START_TIME, TRAFFIC_PS DL, PS_UL_Load)
- 3G Traffic Report in CSV format (required columns : WBTS name, PERIOD_START_TIME, Total CS traffic - Erl, Total_Data_Traffic)
- LTE Traffic Report in CSV format (required columns : LNBTS name, PERIOD_START_TIME, 4G/LTE DL Traffic Volume (GBytes), 4G/LTE UL Traffic Volume (GBytes))
"""
    )
# with image_col:
#     st.image("./assets/trafic_analysis.png", width=250)
# --- One uploader per technology; all three are required downstream --------
upload_2g_col, upload_3g_col, upload_lte_col = st.columns(3)
with upload_2g_col:
    two_g_file = st.file_uploader(
        "Upload 2G Traffic Report", type=["csv", "xls", "xlsx"]
    )
with upload_3g_col:
    three_g_file = st.file_uploader(
        "Upload 3G Traffic Report", type=["csv", "xls", "xlsx"]
    )
with upload_lte_col:
    lte_file = st.file_uploader(
        "Upload LTE Traffic Report", type=["csv", "xls", "xlsx"]
    )
# --- Comparison windows and top-N selector ---------------------------------
# Each date_input returns a (possibly incomplete) sequence of picked dates.
pre_range_col, post_range_col = st.columns(2)
with pre_range_col:
    pre_range = st.date_input("Pre-period (from - to)", [])
with post_range_col:
    post_range = st.date_input("Post-period (from - to)", [])
last_period_range_col, number_of_top_trafic_sites_col = st.columns(2)
with last_period_range_col:
    last_period_range = st.date_input("Last period (from - to)", [])
with number_of_top_trafic_sites_col:
    number_of_top_trafic_sites = st.number_input(
        "Number of top traffic sites", value=25
    )
# Guard clauses: stop the script until the user has supplied everything.
# The date pickers return fewer than 2 dates while a range is half-picked,
# and merge_and_compare indexes [0]/[1] of all THREE ranges — the original
# code forgot to validate last_period_range and crashed with IndexError.
if (
    len(pre_range) != 2
    or len(post_range) != 2
    or len(last_period_range) != 2
):
    st.warning("⚠️ Please select 2 dates for each period (pre, post and last period).")
    st.stop()
if not all([two_g_file, three_g_file, lte_file]):
    st.info("Please upload all 3 reports and select the comparison periods.")
    st.stop()
def _read_report(uploaded_file) -> pd.DataFrame:
    """Read an uploaded traffic report into a DataFrame.

    The uploaders accept csv/xls/xlsx, but the original code always used
    read_csv with ";" — which cannot parse Excel uploads.  Dispatch on the
    uploaded file's extension instead.
    """
    if uploaded_file.name.lower().endswith((".xls", ".xlsx")):
        return pd.read_excel(uploaded_file)
    return pd.read_csv(uploaded_file, delimiter=";")


if st.button("🔍 Run Analysis"):
    df_2g = _read_report(two_g_file)
    df_3g = _read_report(three_g_file)
    df_lte = _read_report(lte_file)
    df_2g_clean = preprocess_2g(df_2g)
    df_3g_clean = preprocess_3g(df_3g)
    df_lte_clean = preprocess_lte(df_lte)
    full_df, last_period, summary_df = merge_and_compare(
        df_2g_clean, df_3g_clean, df_lte_clean, pre_range, post_range, last_period_range
    )
    # 🔍 Display Summary
    st.success("✅ Analysis completed")
    st.subheader("📈 Summary Analysis Pre / Post")
    st.dataframe(summary_df)
    # Park the last-period slice on the class so the display section
    # below can render it.
    TraficAnalysis.last_period_df = last_period
#######################################################################################################
#######################################################################################################
if TraficAnalysis.last_period_df is not None:
    # ---- Display section: runs only after an analysis was computed ----
    df = TraficAnalysis.last_period_df
    # Get top trafics sites based on total data trafic during last period
    top_sites = (
        df.groupby(["code", "City"])["total_data_trafic"]
        .sum()
        .sort_values(ascending=False)
    )
    top_sites = top_sites.head(number_of_top_trafic_sites)
    st.subheader(f"Top {number_of_top_trafic_sites} sites by data traffic")
    chart_col, data_col = st.columns(2)
    with data_col:
        st.dataframe(top_sites.sort_values(ascending=True))
    # chart
    fig = px.bar(
        top_sites.reset_index(),
        # Bar labels are "City_code" strings built from the two group keys.
        y=top_sites.reset_index()[["City", "code"]].agg(
            lambda x: "_".join(map(str, x)), axis=1
        ),
        x="total_data_trafic",
        title=f"Top {number_of_top_trafic_sites} sites by data traffic",
        orientation="h",
        text="total_data_trafic",
        text_auto=True,
    )
    # fig.update_layout(height=600)
    with chart_col:
        st.plotly_chart(fig)
    # Top sites by voice trafic during last period
    top_sites_voice = (
        df.groupby(["code", "City"])["total_voice_trafic"]
        .sum()
        .sort_values(ascending=False)
    )
    top_sites_voice = top_sites_voice.head(number_of_top_trafic_sites)
    st.subheader(f"Top {number_of_top_trafic_sites} sites by voice traffic")
    chart_col, data_col = st.columns(2)
    with data_col:
        st.dataframe(top_sites_voice.sort_values(ascending=True))
    # chart
    fig = px.bar(
        top_sites_voice.reset_index(),
        y=top_sites_voice.reset_index()[["City", "code"]].agg(
            lambda x: "_".join(map(str, x)), axis=1
        ),
        x="total_voice_trafic",
        title=f"Top {number_of_top_trafic_sites} sites by voice traffic",
        orientation="h",
        text="total_voice_trafic",
        text_auto=True,
    )
    # fig.update_layout(height=600)
    with chart_col:
        st.plotly_chart(fig)
    #####################################################
    # Bubble sizes on both maps are scaled linearly into [min_size, max_size].
    min_size = 5
    max_size = 40
    # Map of sum of data trafic during last period
    # Aggregate total data traffic
    df_data = (
        df.groupby(["code", "City", "Latitude", "Longitude"])["total_data_trafic"]
        .sum()
        .reset_index()
    )
    st.subheader("Map of data trafic during last period")
    # Define size range
    # Linear size scaling
    traffic_data_min = df_data["total_data_trafic"].min()
    traffic_data_max = df_data["total_data_trafic"].max()
    # NOTE(review): divides by (max - min) — breaks if every site has the
    # same total; confirm that case cannot occur with real data.
    df_data["bubble_size"] = df_data["total_data_trafic"].apply(
        lambda x: min_size
        + (max_size - min_size)
        * (x - traffic_data_min)
        / (traffic_data_max - traffic_data_min)
    )
    # Custom blue color scale: start from visible blue
    custom_blue_red = [
        [0.0, "#4292c6"],  # light blue
        [0.2, "#2171b5"],
        [0.4, "#084594"],  # dark blue
        [0.6, "#cb181d"],  # Strong red
        [0.8, "#a50f15"],  # Darker red
        [1.0, "#67000d"],  # Very dark red
    ]
    fig = px.scatter_map(
        df_data,
        lat="Latitude",
        lon="Longitude",
        color="total_data_trafic",
        size="bubble_size",
        color_continuous_scale=custom_blue_red,
        size_max=max_size,
        zoom=10,
        height=600,
        title="Data traffic distribution",
        hover_data={"code": True, "total_data_trafic": True},
        hover_name="code",
        text=[str(x) for x in df_data["code"]],
    )
    # NOTE(review): px.scatter_map is the MapLibre-based API whose layout
    # key is `map_style`; `mapbox_style` here is presumably ignored — verify.
    fig.update_layout(
        mapbox_style="open-street-map",
        coloraxis_colorbar=dict(title="Total Data Traffic (MB)"),
        coloraxis=dict(cmin=traffic_data_min, cmax=traffic_data_max),
        font=dict(size=10, color="black"),
    )
    st.plotly_chart(fig)
    ########################################################################################
    # Map of sum of voice trafic during last period
    # Aggregate total voice traffic
    df_voice = (
        df.groupby(["code", "City", "Latitude", "Longitude"])["total_voice_trafic"]
        .sum()
        .reset_index()
    )
    st.subheader("Map of voice trafic during last period")
    # Linear size scaling
    traffic_voice_min = df_voice["total_voice_trafic"].min()
    traffic_voice_max = df_voice["total_voice_trafic"].max()
    # Same linear scaling (and same max==min caveat) as the data map above.
    df_voice["bubble_size"] = df_voice["total_voice_trafic"].apply(
        lambda x: min_size
        + (max_size - min_size)
        * (x - traffic_voice_min)
        / (traffic_voice_max - traffic_voice_min)
    )
    fig = px.scatter_map(
        df_voice,
        lat="Latitude",
        lon="Longitude",
        color="total_voice_trafic",
        size="bubble_size",
        color_continuous_scale=custom_blue_red,
        size_max=max_size,
        zoom=10,
        height=600,
        title="Voice traffic distribution",
        hover_data={"code": True, "total_voice_trafic": True},
        hover_name="code",
        text=[str(x) for x in df_voice["code"]],
    )
    fig.update_layout(
        mapbox_style="open-street-map",
        coloraxis_colorbar=dict(title="Total Voice Traffic (MB)"),
        coloraxis=dict(cmin=traffic_voice_min, cmax=traffic_voice_max),
        font=dict(size=10, color="black"),
    )
    st.plotly_chart(fig)
    # Build the two-sheet Excel payload for download.
    # NOTE(review): full_df / summary_df only exist when "Run Analysis"
    # was pressed in this same script run — the only case in which
    # last_period_df is set — so this branch appears coherent; confirm.
    final_dfs = convert_dfs(
        [full_df, summary_df], ["Global_Trafic_Analysis", "Pre_Post_analysis"]
    )
    # 📥 Download button
    st.download_button(
        on_click="ignore",
        type="primary",
        label="Download the Analysis Report",
        data=final_dfs,
        file_name=f"Global_Trafic_Analysis_Report_{datetime.now()}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )