import requests import streamlit as st from bs4 import BeautifulSoup import pandas as pd from transformers import pipeline import plotly.express as px import time import io import os from comet_ml import Experiment import zipfile import re from streamlit_extras.stylable_container import stylable_container st.set_page_config(layout="wide", page_title="Named Entity Recognition App") COMET_API_KEY = os.environ.get("COMET_API_KEY") COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE") COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME") comet_initialized = False if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME: comet_initialized = True st.subheader("9-Personal Data Named Entity Recognition Web App", divider="rainbow") st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary") expander = st.expander("**Important notes on the 9-Personal Data Named Entity Recognition Web App**") expander.write(''' **Named Entities:** This 9-Personal Data Named Entity Recognition Web App predicts nine (9) categories: 1. **Account-related information**: Account name, account number, and transaction amounts 2. **Banking details**: BIC, IBAN, and Bitcoin or Ethereum addresses 3. **Personal information**: Full name, first name, middle name, last name, gender, and date of birth 4. **Contact information**: Email, phone number, and street address (including building number, city, county, state, and zip code) 5. **Job-related data**: Job title, job area, job descriptor, and job type 6. **Financial data**: Credit card number, issuer, CVV, and currency information (code, name, and symbol) 7. **Digital identifiers**: IP addresses (IPv4 and IPv6), MAC addresses, and user agents 8. **Online presence**: URL, usernames, and passwords 9. **Other sensitive data**: SSN, vehicle VIN and VRM, phone IMEI, and nearby GPS coordinates Results are presented in an easy-to-read table, visualized in an interactive tree map, pie chart, and bar chart, and are available for download along with a Glossary of tags. **How to Use:** Paste a URL, and then press Enter. If you type or paste text, just press Ctrl + Enter. **Usage Limits:** You can request results up to 10 times. **Customization:** To change the app's background color to white or black, click the three-dot menu on the right-hand side of your app, go to Settings and then Choose app theme, colors and fonts. **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL. For any errors or inquiries, please contact us at info@nlpblogs.com ''') with st.sidebar: container = st.container(border=True) container.write("**Named Entity Recognition (NER)** is the task of extracting and tagging entities in text data. Entities can be persons, organizations, locations, countries, products, events etc.") st.subheader("Related NLP Web Apps", divider="rainbow") st.link_button("8-Named Entity Recognition Web App", "https://nlpblogs.com/shop/named-entity-recognition-ner/8-named-entity-recognition-web-app/", type="primary") if 'source_type_attempts' not in st.session_state: st.session_state['source_type_attempts'] = 0 max_attempts = 10 def clear_url_input(): st.session_state.url = "" def clear_text_input(): st.session_state.my_text_area = "" url = st.text_input("Enter URL from the internet, and then press Enter:", key="url") st.button("Clear URL", on_click=clear_url_input) text = st.text_area("Type or paste your text below, and then press Ctrl + Enter", key='my_text_area') st.button("Clear Text", on_click=clear_text_input) source_type = None input_content = None text_to_process = None if url: source_type = 'url' input_content = url elif text: source_type = 'text' input_content = text if source_type: st.subheader("Results", divider = "rainbow") if st.session_state['source_type_attempts'] >= max_attempts: st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.") st.stop() st.session_state['source_type_attempts'] += 1 @st.cache_resource def load_ner_model(): return pipeline("token-classification", model="h2oai/deberta_finetuned_pii", aggregation_strategy="first") model = load_ner_model() experiment = None try: if source_type == 'url': if not url.startswith(("http://", "https://")): st.error("Please enter a valid URL starting with 'http://' or 'https://'.") else: with st.spinner(f"Fetching and parsing content from **{url}**...", show_time=True): f = requests.get(url, timeout=10) f.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx) soup = BeautifulSoup(f.text, 'html.parser') text_to_process = soup.get_text(separator=' ', strip=True) st.divider() st.write("**Input text content**") st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process) elif source_type == 'text': text_to_process = text st.divider() st.write("**Input text content**") st.write(text_to_process[:500] + "..." if len(text_to_process) > 500 else text_to_process) if text_to_process and len(text_to_process.strip()) > 0: with st.spinner("Analyzing text...", show_time=True): entities = model(text_to_process) data = [] for entity in entities: data.append({ 'word': entity['word'], 'entity_group': entity['entity_group'], 'score': entity['score'], 'start': entity['start'], # Include start and end for download 'end': entity['end'] }) df = pd.DataFrame(data) pattern = r'[^\w\s]' df['word'] = df['word'].replace(pattern, '', regex=True) df = df.replace('', 'Unknown') st.dataframe(df) if comet_initialized: experiment = Experiment( api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME, ) experiment.log_parameter("input_source_type", source_type) experiment.log_parameter("input_content_length", len(input_content)) experiment.log_table("predicted_entities", df) with st.expander("See Glossary of tags"): st.write(''' '**word**': ['entity extracted from your text data'] '**score**': ['accuracy score; how accurately a tag has been assigned to a given entity'] '**entity_group**': ['label (tag) assigned to a given extracted entity'] '**start**': ['index of the start of the corresponding entity'] '**end**': ['index of the end of the corresponding entity'] ''') if not df.empty: st.markdown("---") st.subheader("Treemap", divider="rainbow") fig = px.treemap(df, path=[px.Constant("all"), 'entity_group', 'word'], values='score', color='entity_group', ) fig.update_layout(margin=dict(t=50, l=25, r=25, b=25)) st.plotly_chart(fig, use_container_width=True) if comet_initialized and experiment: experiment.log_figure(figure=fig, figure_name="entity_treemap") value_counts = df['entity_group'].value_counts().reset_index() value_counts.columns = ['entity_group', 'count'] col1, col2 = st.columns(2) with col1: st.subheader("Pie Chart", divider="rainbow") fig1 = px.pie(value_counts, values='count', names='entity_group', hover_data=['count'], labels={'count': 'count'}, title='Percentage of Predicted Labels') fig1.update_traces(textposition='inside', textinfo='percent+label') st.plotly_chart(fig1, use_container_width=True) if comet_initialized and experiment: # Check if experiment is initialized experiment.log_figure(figure=fig1, figure_name="label_pie_chart") with col2: st.subheader("Bar Chart", divider="rainbow") fig2 = px.bar(value_counts, x="count", y="entity_group", color="entity_group", text_auto=True, title='Occurrences of Predicted Labels') st.plotly_chart(fig2, use_container_width=True) if comet_initialized and experiment: # Check if experiment is initialized experiment.log_figure(figure=fig2, figure_name="label_bar_chart") else: st.warning("No entities were extracted from the provided text.") dfa = pd.DataFrame( data={ 'word': ['entity extracted from your text data'], 'score': ['accuracy score; how accurately a tag has been assigned to a given entity'], 'entity_group': ['label (tag) assigned to a given extracted entity'], 'start': ['index of the start of the corresponding entity'], 'end': ['index of the end of the corresponding entity'], } ) buf = io.BytesIO() with zipfile.ZipFile(buf, "w") as myzip: if not df.empty: myzip.writestr("Summary_of_results.csv", df.to_csv(index=False)) myzip.writestr("Glossary_of_tags.csv", dfa.to_csv(index=False)) with stylable_container( key="download_button", css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""", ): st.download_button( label="Download zip file", data=buf.getvalue(), file_name="nlpblogs_ner_results.zip", mime="application/zip",) st.divider() else: st.warning("No meaningful text found to process. Please enter a URL or text.") except Exception as e: st.error(f"An unexpected error occurred: {e}") finally: if comet_initialized and experiment: experiment.end() st.write(f"Number of times you requested results: **{st.session_state['source_type_attempts']}/{max_attempts}**")