MigrationUtility / pages /2_Data _Play_Ground.py
rahgadda's picture
Initial Draft
81edf33
import os
import sys
import io
import re
import base64
import streamlit as st
import pandas as pd
import pandasql as psql
################################
######### Variables ############
################################
# -- Loading Variables
script_directory = os.path.dirname(os.path.abspath(sys.argv[0]))
file_details = pd.DataFrame(columns=['file_name', 'data'])
# -- Loading Session Data
if 'project_data' not in st.session_state:
st.session_state.project_data = pd.read_csv(script_directory+'/data/project.csv')
if 'global_dataframe' not in st.session_state:
st.session_state.global_dataframe=file_details
if 'load_sql' not in st.session_state:
st.session_state.load_sql=False
if 'run_sql' not in st.session_state:
st.session_state.run_sql=False
################################
####### GenericFunctions #######
################################
# -- Create Dynamic Columns
def generate_column_names(end):
if 1 > end:
raise ValueError("End value must be grater than 1")
column_names = [f"Col{i}" for i in range(1, end+2)]
return column_names
# -- Add missing separator
def add_missing_separators(file_data,separator,max_header_count):
# Create a list to hold the modified rows
modified_rows = []
for line in file_data:
# Count the occurrences of the separator
count = line.count(separator)
# Append the separator if the count is less than the max_header_count
if count < max_header_count:
separator_str=separator * (max_header_count - count)
line = line + separator_str
# Added modified line
modified_rows.append(line)
return modified_rows
# -- Create global dataframes
def create_global_df(sep=",", usecols=None, max_header_count=1):
file_details = pd.DataFrame(columns=['file_name','data'])
try:
if uploaded_files is not None:
for file in uploaded_files:
if usecols is not None:
file_data = io.StringIO(file.read().decode())
modified_rows = add_missing_separators(file_data, sep,max_header_count)
df = pd.DataFrame(each_row.split(sep) for each_row in modified_rows)
df.columns = usecols
else:
df = pd.read_csv(file, sep=sep)
pattern = r'([^/]+)\.csv$'
match = re.search(pattern, file.name)
file_name = match.group(1)
file_details.loc[len(file_details)] = {
'file_name':file_name,
'data':df
}
st.session_state.global_dataframe = file_details
except Exception as e:
st.error(f"Error processing csv: {str(e)}")
raise e
# -- Load global dataframes
def load_global_df():
if st.session_state.header:
print("Added Headers")
usecols = generate_column_names(st.session_state.header_count)
create_global_df(sep,usecols,st.session_state.header_count)
else:
print("No Headers Added")
create_global_df(sep)
# -- Run SQL Data
def run_sql_df():
for index, row in st.session_state.global_dataframe.iterrows():
globals()['%s' % row['file_name']] = row['data']
try:
sql_query = st.text_area(label="Sql Query", value="", key="sql_query", height=200)
if st.button("Run SQL Query"):
result_df = psql.sqldf(sql_query, globals())
st.write("Query Result")
st.dataframe(result_df)
csv_data = result_df.to_csv(index=False)
b64 = base64.b64encode(csv_data.encode()).decode()
st.markdown(f'<a href="data:file/csv;base64,{b64}" download="result.csv">Download Result CSV</a>', unsafe_allow_html=True)
except Exception as e:
st.error(f"Error executing SQL query: {str(e)}")
################################
####### Display of data ########
################################
# -- Streamlit Settings
st.set_page_config(layout='wide')
st.title("Data Play Ground")
# -- Delimiter
st.text("")
st.text("")
st.text("")
col1, col2, col3 = st.columns(3)
delimiter = col1.selectbox(
label="File Delimiter",
options=[",","|"],
key="delimiter"
)
# -- Upload Sample Files
st.text("")
st.text("")
col1, col2, col3, col4 = st.columns([1,0.3,0.7,1])
uploaded_files = col1.file_uploader(
"Choose a file",
type="csv",
key="uploaded_files",
accept_multiple_files=True
)
# -- Add header Indicator
header=col3.checkbox(
label='Add Header',
key="header"
)
# -- Dynamic Headers Count
if header:
header_count=col4.number_input(
label="No of Header",
value=2,
key="header_count",
min_value=1,
max_value=100,
step=1
)
# -- Load Data
st.text("")
col1, col2, col3 = st.columns([1,1,8])
sep = st.session_state.delimiter
if col1.button("Load Data"):
st.session_state.load_sql=True
st.session_state.run_sql=False
load_global_df()
# -- Run SQL Query
if col2.button("SQL"):
st.session_state.load_sql=False
st.session_state.run_sql=True
run_sql_df()
# -- Display SQL Query Data
if st.session_state.run_sql:
run_sql_df()
# -- Display Loaded Data
if (len(st.session_state.global_dataframe)>0 and st.session_state.load_sql):
# print("Count of stored files - "+str(len(st.session_state.global_dataframe)))
col1, col2, col3 = st.columns(3)
col1.selectbox(
label="Select Table Name",
key="table_name",
options=st.session_state.global_dataframe['file_name']
)
for index, row in st.session_state.global_dataframe.iterrows():
globals()['%s' % row['file_name']] = row['data']
st.dataframe(psql.sqldf("select * from "+st.session_state.table_name, globals()))