varalakshmi55 committed on
Commit ccea267 · verified · 1 Parent(s): e2b5481

Delete pages/Dashboard.py

Files changed (1)
  1. pages/Dashboard.py +0 -177
pages/Dashboard.py DELETED
@@ -1,177 +0,0 @@
- import streamlit as st
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- from Utility.data_loader import load_train_series,load_train_events,load_sample_submission,load_test_series
- from sklearn.model_selection import train_test_split
- from sklearn.preprocessing import LabelEncoder, StandardScaler
- from xgboost import XGBClassifier # or XGBRegressor depending on your task
- import xgboost as xgb
- import numpy as np
-
- @st.cache_data
- def load_sampled_data():
-     df3 = pd.read_parquet("train_series.parquet", columns=['series_id', 'step', 'anglez', 'enmo'])
-     df4 = pd.read_parquet("test_series.parquet", columns=['series_id', 'step', 'anglez', 'enmo'])
-     df2 = pd.read_csv("train_events.csv")
-
-     # Sample safely based on available data
-     df3_sample = df3.sample(n=min(5_000_000, len(df3)), random_state=42)
-     df4_sample = df4.sample(n=min(1_000_000, len(df4)), random_state=42)
-
-     return df3_sample, df4_sample, df2
-
- # Load
- df3, df4, df2 = load_sampled_data()
- df = pd.concat([df3, df4], axis=0, ignore_index=True)
- merged_df = pd.merge(df, df2, on=['series_id', 'step'], how='inner')
-
- # Rename timestamp columns if they exist
- if 'timestamp_x' in merged_df.columns:
-     merged_df.rename(columns={'timestamp_x': 'sensor_timestamp'}, inplace=True)
- if 'timestamp_y' in merged_df.columns:
-     merged_df.rename(columns={'timestamp_y': 'event_timestamp'}, inplace=True)
-
- # Box plots for each numerical feature
- fig, ax = plt.subplots(figsize=(2, 1))
- sns.boxplot(x=df2['step'], ax=ax)
- ax.set_title('Boxplot of Step')
-
- # Show the plot in Streamlit
- st.pyplot(fig)
-
- st.write("1. Data Visualization - Scatter Plot (feature vs feature or vs target)")
- # Assume merged_df is already defined or loaded
- df_sample = merged_df # or use df_sample = merged_df.sample(n=50000) to downsample
-
- st.subheader("Scatter Plot: anglez vs enmo")
-
- # Create the plot
- fig, ax = plt.subplots(figsize=(10, 6))
- sns.scatterplot(x='anglez', y='enmo', data=df_sample, ax=ax)
- ax.set_title("Scatter Plot: anglez vs enmo")
-
- # Display in Streamlit
- st.pyplot(fig)
-
- # df_sample = merged_df.sample(n=10000) # adjust sample size for performance
-
- # # Subheader
- # st.subheader("Pair Plot of Features")
-
- # # Create pairplot
- # fig = sns.pairplot(df_sample[['anglez', 'enmo', 'step']])
- # fig.fig.suptitle("Pair Plot of Features", y=1.02)
-
- # # Display in Streamlit
- # st.pyplot(fig)
- # Define columns to plot
- plot_columns = ['anglez', 'enmo', 'step']
-
- # Safety check: make sure required columns exist
- if all(col in merged_df.columns for col in plot_columns):
-
-     # Check data size and sample accordingly
-     max_rows = len(merged_df)
-     sample_size = min(10000, max_rows) # Don't exceed available rows
-
-     df_sample = merged_df.sample(n=sample_size)
-
-     # Subheader
-     st.subheader("Pair Plot of Features")
-
-     # Create pairplot
-     fig = sns.pairplot(df_sample[plot_columns])
-     fig.fig.suptitle("Pair Plot of Features", y=1.02)
-
-     # Display in Streamlit
-     st.pyplot(fig)
-
- else:
-     st.error("One or more required columns ('anglez', 'enmo', 'step') are missing in the dataset.")
-
-
-
-
- # Define features to plot
- plot_features = ['anglez', 'enmo']
-
- # Check if the required columns exist in the DataFrame
- if all(col in merged_df.columns for col in plot_features):
-     total_rows = len(merged_df)
-     sample_size = 10000
-
-     # Handle small datasets
-     if total_rows < sample_size:
-         st.info(f"Only {total_rows} rows available — using full dataset.")
-         df_sample = merged_df.copy()
-     else:
-         df_sample = merged_df.sample(n=sample_size)
-
-     # Plot
-     fig, axes = plt.subplots(1, 2, figsize=(14, 5))
-
-     sns.histplot(df_sample['anglez'], kde=True, bins=50, ax=axes[0])
-     axes[0].set_title("Distribution of anglez")
-
-     sns.histplot(df_sample['enmo'], kde=True, bins=50, ax=axes[1])
-     axes[1].set_title("Distribution of enmo")
-
-     plt.tight_layout()
-     st.pyplot(fig)
-
- else:
-     st.error("Required columns not found in the dataset.")
-
-
-
-
- st.write("Multicollinearity Check - Correlation Matrix")
- features = ['anglez', 'enmo', 'step', 'night']
- df_subset = merged_df[features]
-
- # Streamlit title
- st.subheader("Multicollinearity Check - Correlation Matrix")
-
- # Calculate correlation matrix
- corr_matrix = df_subset.corr()
-
- # Plot heatmap
- fig, ax = plt.subplots(figsize=(6, 4))
- sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
- ax.set_title("Correlation Matrix")
-
- # Display in Streamlit
- st.pyplot(fig)
-
- # Encode
- le = LabelEncoder()
- merged_df['series_id'] = le.fit_transform(merged_df['series_id'])
- merged_df['event'] = le.fit_transform(merged_df['event'])
-
- # Drop columns with string or datetime values
- drop_cols = ['sensor_timestamp', 'event_timestamp', 'night', 'step', 'sleep_duration_hrs', 'series_id']
- df_cleaned = merged_df.drop(columns=[col for col in drop_cols if col in merged_df.columns])
-
- # Ensure only numeric features in X
- X = df_cleaned.drop('event', axis=1).select_dtypes(include=[np.number])
- y = merged_df['event']
-
- # Split and scale
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)
-
- st.write("Feature Importance")
- # Create model instance
- xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss') # example for classification
-
- # Fit the model
- xgb_model.fit(X_train, y_train)
-
- # Plot feature importance
- fig, ax = plt.subplots()
- xgb.plot_importance(xgb_model, ax=ax)
- ax.set_title("XGBoost Feature Importance")
-
- # Show in Streamlit
- st.subheader("XGBoost Feature Importance")
- st.pyplot(fig)
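
For context on the removed page: it merged the parquet series with train_events.csv, plotted distributions, and fit an XGBClassifier for a feature-importance view, but it imported StandardScaler and the Utility.data_loader helpers without using them, and it passed use_label_encoder=False, which recent XGBoost releases deprecate. Below is a minimal, self-contained sketch of just the encode/split/fit/plot-importance step, using a synthetic stand-in for merged_df (the anglez, enmo, and event column names follow the deleted script); it is an illustration, not the committed code.

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Synthetic stand-in for the real merged train_series/train_events frame.
rng = np.random.default_rng(42)
n = 5_000
merged_df = pd.DataFrame({
    "anglez": rng.normal(0, 30, n),           # stand-in for the z-angle feature
    "enmo": rng.gamma(1.0, 0.05, n),          # stand-in for the ENMO activity feature
    "event": rng.choice(["onset", "wakeup"], size=n),
})

# Encode the string target, as the deleted script did with LabelEncoder.
le = LabelEncoder()
y = le.fit_transform(merged_df["event"])
X = merged_df[["anglez", "enmo"]]

# Same split parameters as the deleted script; no scaling is needed for
# tree-based XGBoost, which may be why StandardScaler went unused there.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# use_label_encoder is deprecated in recent XGBoost releases, so it is omitted here.
model = XGBClassifier(eval_metric="logloss")
model.fit(X_train, y_train)

# Plot feature importance; in the Streamlit page this figure went to st.pyplot(fig).
fig, ax = plt.subplots()
xgb.plot_importance(model, ax=ax)
ax.set_title("XGBoost Feature Importance")
plt.show()
```

In a restored Streamlit page, the synthetic frame would be replaced by the real merged data and plt.show() by st.pyplot(fig).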