import requests
import streamlit as st
from bs4 import BeautifulSoup
import pandas as pd
from transformers import pipeline
import plotly.express as px
import time
import io
import os  
from comet_ml import Experiment 
import zipfile
import re
from streamlit_extras.stylable_container import stylable_container


st.set_page_config(layout="wide", page_title="Named Entity Recognition App")


COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")

comet_initialized = False
if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
    comet_initialized = True


st.subheader("9-Personal Data Named Entity Recognition Web App", divider="rainbow")
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")

expander = st.expander("**Important notes on the 9-Personal Data Named Entity Recognition Web App**")
expander.write('''
    
    **Named Entities:**
    This 9-Personal Data Named Entity Recognition Web App predicts nine (9) categories:
    
    1. **Account-related information**: Account name, account number, and transaction amounts 
    
    2. **Banking details**: BIC, IBAN, and Bitcoin or Ethereum addresses 
     
    3. **Personal information**: Full name, first name, middle name, last name, gender, and date of birth   
     
    4. **Contact information**: Email, phone number, and street address (including building number, city, county, state, and zip code) 
     
    5. **Job-related data**: Job title, job area, job descriptor, and job type
     
    6. **Financial data**: Credit card number, issuer, CVV, and currency information (code, name, and symbol)  
     
    7. **Digital identifiers**: IP addresses (IPv4 and IPv6), MAC addresses, and user agents 
     
    8. **Online presence**: URL, usernames, and passwords    
     
    9. **Other sensitive data**: SSN, vehicle VIN and VRM, phone IMEI, and nearby GPS coordinates 
    
    Results are presented in an easy-to-read table, visualized in an interactive tree map, pie chart, and bar chart, and are available for download along with a Glossary of tags.
    
    **How to Use:**
    Paste a URL, and then press Enter. If you type or paste text, just press Ctrl + Enter.
    
    **Usage Limits:**
     You can request results up to 10 times.
 
    **Customization:**
    To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings and then Choose app theme, colors and fonts.
    
    **Technical issues:**
    If your connection times out, please refresh the page or reopen the app's URL.
    
    For any errors or inquiries, please contact us at info@nlpblogs.com
    
''')


with st.sidebar:
    container = st.container(border=True)
    container.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events etc.")
    st.subheader("Related NLP Web Apps", divider="rainbow")
    st.link_button("8-Named Entity Recognition Web App", "https://nlpblogs.com/shop/named-entity-recognition-ner/8-named-entity-recognition-web-app/", type="primary")
    

if 'source_type_attempts' not in st.session_state:
    st.session_state['source_type_attempts'] = 0
max_attempts = 10 

def clear_url_input():
    
    st.session_state.url = ""

def clear_text_input():
    
    st.session_state.my_text_area = ""

url = st.text_input("Enter URL from the internet, and then press Enter:", key="url")
st.button("Clear URL", on_click=clear_url_input)

text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area')
st.button("Clear Text", on_click=clear_text_input)


source_type = None
input_content = None
text_to_process = None 

if url:
    source_type = 'url'
    input_content = url
elif text:
    source_type = 'text'
    input_content = text

if source_type:
    
    st.subheader("Results", divider = "rainbow")

    
    if st.session_state['source_type_attempts'] >= max_attempts:
        st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
        st.stop()

    st.session_state['source_type_attempts'] += 1

    
    @st.cache_resource
    def load_ner_model():
        
        return pipeline("token-classification", model="h2oai/deberta_finetuned_pii", aggregation_strategy="first")

    model = load_ner_model()
    experiment = None 

    try:
        if source_type == 'url':
            if not url.startswith(("http://", "https://")):
                st.error("Please enter a valid URL starting with 'http://' or 'https://'.")
            else:
                with st.spinner(f"Fetching and parsing content from **{url}**...", show_time=True):
                    f = requests.get(url, timeout=10)
                    f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
                    soup = BeautifulSoup(f.text, 'html.parser')
                    text_to_process = soup.get_text(separator=' ', strip=True)
                    st.divider()
                    st.write("**Input text content**")
                    st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)

                    
        elif source_type == 'text':
            text_to_process = text
            st.divider()
            st.write("**Input text content**")
            
            st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)

        if text_to_process and len(text_to_process.strip()) > 0: 
            with st.spinner("Analyzing text...", show_time=True):
                entities = model(text_to_process)
                data = []
                for entity in entities:
                    data.append({
                        'word': entity['word'],
                        'entity_group': entity['entity_group'],
                        'score': entity['score'],
                        'start': entity['start'], # Include start and end for download
                        'end': entity['end']
                    })
                df = pd.DataFrame(data)

                
                pattern = r'[^\w\s]'
                df['word'] = df['word'].replace(pattern, '', regex=True)
                
                df = df.replace('', 'Unknown')
                st.dataframe(df)

                
                if comet_initialized:
                    experiment = Experiment(
                        api_key=COMET_API_KEY,
                        workspace=COMET_WORKSPACE,
                        project_name=COMET_PROJECT_NAME,
                    )
                    experiment.log_parameter("input_source_type", source_type)
                    experiment.log_parameter("input_content_length", len(input_content))
                    experiment.log_table("predicted_entities", df)

                with st.expander("See Glossary of tags"):
                    st.write('''
                    '**word**': ['entity extracted from your text data']
                    
                    '**score**': ['accuracy score; how accurately a tag has been assigned to a given entity']
            
                    '**entity_group**': ['label (tag) assigned to a given extracted entity']
            
                    '**start**': ['index of the start of the corresponding entity']
            
                    '**end**': ['index of the end of the corresponding entity'] 
                    ''')
                
                
                if not df.empty:
                    
                    st.markdown("---")
                    st.subheader("Treemap", divider="rainbow")
                    fig = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'],
                                     values='score', color='entity_group',
                                     )
                    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
                    st.plotly_chart(fig, use_container_width=True)
                    if comet_initialized and experiment: 
                        experiment.log_figure(figure=fig, figure_name="entity_treemap")

                    
                    value_counts = df['entity_group'].value_counts().reset_index()
                    value_counts.columns = ['entity_group', 'count']

                    col1, col2 = st.columns(2)
                    with col1:
                        st.subheader("Pie Chart", divider="rainbow")
                        fig1 = px.pie(value_counts, values='count', names='entity_group',
                                      hover_data=['count'], labels={'count': 'count'},
                                      title='Percentage of Predicted Labels')
                        fig1.update_traces(textposition='inside', textinfo='percent+label')
                        st.plotly_chart(fig1, use_container_width=True)
                        if comet_initialized and experiment: # Check if experiment is initialized
                            experiment.log_figure(figure=fig1, figure_name="label_pie_chart")

                    with col2:
                        st.subheader("Bar Chart", divider="rainbow")
                        fig2 = px.bar(value_counts, x="count", y="entity_group", color="entity_group",
                                      text_auto=True, title='Occurrences of Predicted Labels')
                        st.plotly_chart(fig2, use_container_width=True)
                        if comet_initialized and experiment: # Check if experiment is initialized
                            experiment.log_figure(figure=fig2, figure_name="label_bar_chart")
                else:
                    st.warning("No entities were extracted from the provided text.")

                
                dfa = pd.DataFrame(
                    data={
                        'word': ['entity extracted from your text data'],
                        'score': ['accuracy score; how accurately a tag has been assigned to a given entity'],
                        'entity_group': ['label (tag) assigned to a given extracted entity'],
                        'start': ['index of the start of the corresponding entity'],
                        'end': ['index of the end of the corresponding entity'],
                    }
                )
                buf = io.BytesIO()
                with zipfile.ZipFile(buf, "w") as myzip:
                    if not df.empty:
                        myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))

                with stylable_container(
                     key="download_button",
                     css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
                 ):
                    st.download_button(
                         label="Download zip file",
                         data=buf.getvalue(),
                         file_name="nlpblogs_ner_results.zip",
                         mime="application/zip",)
                    

                st.divider()
        else:
            st.warning("No meaningful text found to process. Please enter a URL or text.")

    
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
    finally:
        if comet_initialized and experiment:
            experiment.end()

st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")