Spaces:

nlpblogs
/

9-Personal-Data-NER-TXT-URL-Web-App

Sleeping

App Files Files Community

Maria Tsilimos committed 12 days ago

Commit

40cca74

unverified ·

1 Parent(s): 96fc515

Create app.py

Browse files

Files changed (1) hide show

app.py +279 -0

app.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import requests
+import streamlit as st
+from bs4 import BeautifulSoup
+import pandas as pd
+from transformers import pipeline
+import plotly.express as px
+import time
+import io
+import os
+from comet_ml import Experiment
+import zipfile
+import re
+from streamlit_extras.stylable_container import stylable_container
+st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
+COMET_API_KEY = os.environ.get("COMET_API_KEY")
+COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
+COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
+comet_initialized = False
+if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
+    comet_initialized = True
+st.subheader("9-Personal Data Named Entity Recognition Web App", divider="rainbow")
+st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
+expander = st.expander("**Important notes on the 9-Personal Data Named Entity Recognition Web App**")
+expander.write('''
+    **Named Entities:**
+    This 9-Personal Data Named Entity Recognition Web App predicts nine (9) categories:
+    1. **Account-related information**: Account name, account number, and transaction amounts
+    2. **Banking details**: BIC, IBAN, and Bitcoin or Ethereum addresses
+    3. **Personal information**: Full name, first name, middle name, last name, gender, and date of birth
+    4. **Contact information**: Email, phone number, and street address (including building number, city, county, state, and zip code)
+    5. **Job-related data**: Job title, job area, job descriptor, and job type
+    6. **Financial data**: Credit card number, issuer, CVV, and currency information (code, name, and symbol)
+    7. **Digital identifiers**: IP addresses (IPv4 and IPv6), MAC addresses, and user agents
+    8. **Online presence**: URL, usernames, and passwords
+    9. **Other sensitive data**: SSN, vehicle VIN and VRM, phone IMEI, and nearby GPS coordinates
+    Results are presented in an easy-to-read table, visualized in an interactive tree map, pie chart, and bar chart, and are available for download along with a Glossary of tags.
+    **How to Use:**
+    Upload your .pdf or .docx file. Then, click the 'Results' button to extract and tag entities in your text data.
+    **Usage Limits:**
+     You can request results up to 10 times.
+    **Customization:**
+    To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings and then Choose app theme, colors and fonts.
+    **Technical issues:**
+    If your connection times out, please refresh the page or reopen the app's URL.
+    For any errors or inquiries, please contact us at info@nlpblogs.com
+''')
+with st.sidebar:
+    container = st.container(border=True)
+    container.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events etc.")
+    st.subheader("Related NLP Web Apps", divider="rainbow")
+    st.link_button("8-Named Entity Recognition Web App", "https://nlpblogs.com/shop/named-entity-recognition-ner/8-named-entity-recognition-web-app/", type="primary")
+if 'source_type_attempts' not in st.session_state:
+    st.session_state['source_type_attempts'] = 0
+max_attempts = 10
+def clear_url_input():
+    st.session_state.url = ""
+def clear_text_input():
+    st.session_state.my_text_area = ""
+url = st.text_input("Enter URL from the internet, and then press Enter:", key="url")
+st.button("Clear URL", on_click=clear_url_input)
+text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area')
+st.button("Clear Text", on_click=clear_text_input)
+source_type = None
+input_content = None
+text_to_process = None
+if url:
+    source_type = 'url'
+    input_content = url
+elif text:
+    source_type = 'text'
+    input_content = text
+if source_type:
+    st.subheader("Results", divider = "rainbow")
+    if st.session_state['source_type_attempts'] >= max_attempts:
+        st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
+        st.stop()
+    st.session_state['source_type_attempts'] += 1
+    @st.cache_resource
+    def load_ner_model():
+        return pipeline("token-classification", model="h2oai/deberta_finetuned_pii", aggregation_strategy="first")
+    model = load_ner_model()
+    experiment = None
+    try:
+        if source_type == 'url':
+            if not url.startswith(("http://", "https://")):
+                st.error("Please enter a valid URL starting with 'http://' or 'https://'.")
+            else:
+                with st.spinner(f"Fetching and parsing content from **{url}**...", show_time=True):
+                    f = requests.get(url, timeout=10)
+                    f.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
+                    soup = BeautifulSoup(f.text, 'html.parser')
+                    text_to_process = soup.get_text(separator=' ', strip=True)
+                    st.divider()
+                    st.write("**Input text content**")
+                    st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)
+        elif source_type == 'text':
+            text_to_process = text
+            st.divider()
+            st.write("**Input text content**")
+            st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process)
+        if text_to_process and len(text_to_process.strip()) > 0:
+            with st.spinner("Analyzing text...", show_time=True):
+                entities = model(text_to_process)
+                data = []
+                for entity in entities:
+                    data.append({
+                        'word': entity['word'],
+                        'entity_group': entity['entity_group'],
+                        'score': entity['score'],
+                        'start': entity['start'], # Include start and end for download
+                        'end': entity['end']
+                    })
+                df = pd.DataFrame(data)
+                pattern = r'[^\w\s]'
+                df['word'] = df['word'].replace(pattern, '', regex=True)
+                df = df.replace('', 'Unknown')
+                st.dataframe(df)
+                if comet_initialized:
+                    experiment = Experiment(
+                        api_key=COMET_API_KEY,
+                        workspace=COMET_WORKSPACE,
+                        project_name=COMET_PROJECT_NAME,
+                    )
+                    experiment.log_parameter("input_source_type", source_type)
+                    experiment.log_parameter("input_content_length", len(input_content))
+                    experiment.log_table("predicted_entities", df)
+                with st.expander("See Glossary of tags"):
+                    st.write('''
+                    '**word**': ['entity extracted from your text data']
+                    '**score**': ['accuracy score; how accurately a tag has been assigned to a given entity']
+                    '**entity_group**': ['label (tag) assigned to a given extracted entity']
+                    '**start**': ['index of the start of the corresponding entity']
+                    '**end**': ['index of the end of the corresponding entity']
+                    ''')
+                if not df.empty:
+                    st.markdown("---")
+                    st.subheader("Treemap", divider="rainbow")
+                    fig = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'],
+                                     values='score', color='entity_group',
+                                     )
+                    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
+                    st.plotly_chart(fig, use_container_width=True)
+                    if comet_initialized and experiment:
+                        experiment.log_figure(figure=fig, figure_name="entity_treemap")
+                    value_counts = df['entity_group'].value_counts().reset_index()
+                    value_counts.columns = ['entity_group', 'count']
+                    col1, col2 = st.columns(2)
+                    with col1:
+                        st.subheader("Pie Chart", divider="rainbow")
+                        fig1 = px.pie(value_counts, values='count', names='entity_group',
+                                      hover_data=['count'], labels={'count': 'count'},
+                                      title='Percentage of Predicted Labels')
+                        fig1.update_traces(textposition='inside', textinfo='percent+label')
+                        st.plotly_chart(fig1, use_container_width=True)
+                        if comet_initialized and experiment: # Check if experiment is initialized
+                            experiment.log_figure(figure=fig1, figure_name="label_pie_chart")
+                    with col2:
+                        st.subheader("Bar Chart", divider="rainbow")
+                        fig2 = px.bar(value_counts, x="count", y="entity_group", color="entity_group",
+                                      text_auto=True, title='Occurrences of Predicted Labels')
+                        st.plotly_chart(fig2, use_container_width=True)
+                        if comet_initialized and experiment: # Check if experiment is initialized
+                            experiment.log_figure(figure=fig2, figure_name="label_bar_chart")
+                else:
+                    st.warning("No entities were extracted from the provided text.")
+                dfa = pd.DataFrame(
+                    data={
+                        'word': ['entity extracted from your text data'],
+                        'score': ['accuracy score; how accurately a tag has been assigned to a given entity'],
+                        'entity_group': ['label (tag) assigned to a given extracted entity'],
+                        'start': ['index of the start of the corresponding entity'],
+                        'end': ['index of the end of the corresponding entity'],
+                    }
+                )
+                buf = io.BytesIO()
+                with zipfile.ZipFile(buf, "w") as myzip:
+                    if not df.empty:
+                        myzip.writestr("Summary_of_results.csv", df.to_csv(index=False))
+                    myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False))
+                with stylable_container(
+                     key="download_button",
+                     css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
+                 ):
+                    st.download_button(
+                         label="Download zip file",
+                         data=buf.getvalue(),
+                         file_name="nlpblogs_ner_results.zip",
+                         mime="application/zip",)
+                if comet_initialized and experiment: # Ensure experiment exists before ending
+                    experiment.end()
+        else:
+            st.warning("No meaningful text found to process. Please enter a URL or text.")
+    except requests.exceptions.RequestException as e:
+        st.error(f"Error fetching the URL: {e}. Please check the URL and your internet connection.")
+st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")