Spaces:
Sleeping
Sleeping
import os | |
import sys | |
import streamlit as st | |
from streamlit_extras.switch_page_button import switch_page | |
import pandas as pd | |
import numpy as np | |
import torch | |
import faiss | |
from sentence_transformers import SentenceTransformer | |
import csv | |
################################ | |
######### Variables ############ | |
################################ | |
# -- Loading Variables | |
script_directory = os.path.dirname(os.path.abspath(sys.argv[0])) | |
source_df = pd.DataFrame() | |
destination_df = pd.DataFrame() | |
model = SentenceTransformer('all-mpnet-base-v2') | |
# -- Loading Session Data | |
if 'project_data' not in st.session_state: | |
st.session_state.project_data = pd.read_csv(script_directory+'/data/project.csv') | |
################################ | |
####### GenericFunctions ####### | |
################################ | |
# -- Create Embedding - all-mpnet-base-v2 - https://www.sbert.net/docs/pretrained_models.html | |
def embed_text(text): | |
embedding = model.encode(text) | |
return embedding | |
def embed_list(list): | |
embeddings = [] | |
for text in list: | |
embeddings.append(embed_text(text)) | |
return embeddings | |
# -- Store embeddings in a FAISS Vector database | |
def store_embeddings(embeddings): | |
dimension = embeddings[0].shape[0] | |
index = faiss.IndexFlatIP(dimension) | |
index.add(np.array(embeddings)) | |
# faiss.write_index(index, "data/vector_db.index") | |
return index | |
# -- Perform semantic search using embeddings | |
def semantic_search(query_embedding, index, k=1): | |
D, I = index.search(np.array([query_embedding]), k) | |
return I[0][0] | |
################################ | |
####### Display of data ######## | |
################################ | |
# -- Streamlit Settings | |
st.set_page_config(layout='wide') | |
st.title("Mapping") | |
# -- Add Project Dropdown | |
st.text("") | |
st.text("") | |
st.text("") | |
col1, col2, col3 = st.columns(3) | |
option = col1.selectbox('Select Project',st.session_state.project_data['Project']) | |
col1, col2, col3 = st.columns(3) | |
# -- Destination File Name | |
st.text("") | |
st.text("") | |
col1, col2, col3 = st.columns(3) | |
cond = (st.session_state.project_data['Project'] == option) | |
result = st.session_state.project_data[cond].Destination.values[0] | |
with col1: | |
destination_file_format = st.file_uploader( | |
"Destination file name - "+str(result)+".csv", | |
type="csv", | |
key="destination_file_format", | |
accept_multiple_files=True | |
) | |
if destination_file_format is not None: | |
for file in destination_file_format: | |
destination_df = pd.read_csv(file) | |
# -- Source File Name | |
cond = (st.session_state.project_data['Project'] == option) | |
result = st.session_state.project_data[cond].Source.values[0] | |
with col3: | |
source_file_format = st.file_uploader( | |
"Source file name - "+str(result)+".csv", | |
type="csv", | |
key="source_file_format", | |
accept_multiple_files=True | |
) | |
if source_file_format is not None: | |
for file in source_file_format: | |
source_df = pd.read_csv(file) | |
# -- Suggest Button | |
st.text("") | |
st.text("") | |
col1, col2, col3 = st.columns([0.25,0.2,2.55]) | |
if col1.button("AI Suggest"): | |
st.session_state.mapping_df = pd.DataFrame(columns=["Sno","DestinationColumn","SourceColumn","Type","Expression"]) | |
if len(destination_df) == 0 or len(source_df) == 0: | |
st.error("Select Source and Destination Files") | |
else: | |
new_data = [] | |
# Source - KnowledgeBase | |
input_text = source_df["Columns"].tolist() | |
embeddings = embed_list(input_text) | |
index = store_embeddings(embeddings) | |
# Map to Source | |
for i in range(len(destination_df)): | |
search_text = destination_df.loc[i, "Columns"] | |
query_embeddings = embed_text(search_text) | |
result = input_text[semantic_search(query_embeddings, index)] | |
row = { | |
"Sno": i+1, | |
"DestinationColumn": destination_df.loc[i, "Columns"], | |
"SourceColumn": result, | |
"Type": None, | |
"Expression":None | |
} | |
new_data.append(row) | |
# Saving Mapping and displaying | |
if new_data or len(mapping_df) <0: | |
st.session_state.mapping_df = pd.concat( | |
[ st.session_state.mapping_df, pd.DataFrame(new_data)], | |
ignore_index=True | |
) | |
else: | |
st.error("Unable to map Source and Destination Files") | |
# -- Save Button | |
if col2.button("Save"): | |
if (len(destination_df) > 0 and len(source_df) > 0 and len(st.session_state.mapping_df)>0): | |
cond = (st.session_state.project_data['Project'] == option) | |
file_name = script_directory+'/data/'+str(st.session_state.project_data[cond].Id.values[0])+"_"+st.session_state.project_data[cond].Source.values[0]+"_"+st.session_state.project_data[cond].Destination.values[0]+'.csv' | |
st.session_state.mapping_df.to_csv(file_name, index=False, sep="|",quoting=csv.QUOTE_NONE) | |
else: | |
st.error("Transformation not created") | |
# -- Load Exisitng Mapping | |
if col3.button("Load Mapping"): | |
cond = (st.session_state.project_data['Project'] == option) | |
file_name = script_directory+'/data/'+str(st.session_state.project_data[cond].Id.values[0])+"_"+st.session_state.project_data[cond].Source.values[0]+"_"+st.session_state.project_data[cond].Destination.values[0]+'.csv' | |
st.session_state.mapping_df = pd.read_csv(file_name,sep="|",quoting=csv.QUOTE_NONE) | |
# -- Display Mapping Table | |
if (len(destination_df) > 0 and len(source_df) > 0 and len(st.session_state.mapping_df)>0): | |
st.text("") | |
st.header("Mapping Details") | |
st.text("") | |
st.text("") | |
st.session_state.mapping_df = st.data_editor( | |
st.session_state.mapping_df, | |
height=400, | |
width=1200, | |
hide_index=True, | |
column_config={ | |
"Sno": st.column_config.TextColumn( | |
"Sno" | |
), | |
"DestinationColumn": st.column_config.TextColumn( | |
"DestinationColumn" | |
), | |
"SourceColumn": st.column_config.SelectboxColumn( | |
"SourceColumn", | |
width="medium", | |
options= source_df["Columns"], | |
), | |
"Type": st.column_config.SelectboxColumn( | |
"Type", | |
width="medium", | |
options=[ | |
"Pandas", | |
"Constant" | |
] | |
), | |
"Expression": st.column_config.TextColumn( | |
"Expression" | |
) | |
}, | |
disabled=["Sno","DestinationColumn"] | |
) |