import streamlit as st from transformers import pipeline import pandas as pd import re from PyPDF2 import PdfReader # Cache the NER model for performance @st.cache_resource def load_ner_model(): return pipeline("ner", model="dslim/bert-base-NER", grouped_entities=True) ner_model = load_ner_model() # Function to extract text from PDF def extract_text_from_pdf(file): pdf_reader = PdfReader(file) text = "" for page in pdf_reader.pages: text += page.extract_text() or "" return text # Main app def main(): st.title("Invoice Data Extraction (NER)") st.write("Extract entities like company names, dates, amounts, and other details from invoice text or uploaded files using Named Entity Recognition.") # Input options: Text or File Upload input_method = st.radio("Choose input method:", ("Text Input", "Upload File")) invoice_text = "" if input_method == "Text Input": invoice_text = st.text_area("Invoice Text", placeholder="Paste your invoice text here...", height=200) elif input_method == "Upload File": uploaded_file = st.file_uploader("Upload Invoice (PDF or TXT)", type=["pdf", "txt"]) if uploaded_file is not None: if uploaded_file.type == "application/pdf": invoice_text = extract_text_from_pdf(uploaded_file) else: # txt invoice_text = uploaded_file.read().decode("utf-8") st.text_area("Extracted Text", value=invoice_text, height=200, disabled=True) # Button and output if st.button("Extract"): if not invoice_text: st.warning("Please enter text or upload a file to extract!") return with st.spinner("Extracting entities..."): # Perform NER entities = ner_model(invoice_text) # Initialize entity dictionary with broader fields entity_dict = { "Organization": [], "Date": [], "Amount": [], "Supplier": [], "Item": [], "Due By": [], "Invoice Number": [] } # Process NER entities for entity in entities: if entity["entity_group"] == "ORG": entity_dict["Organization"].append(entity["word"]) elif entity["entity_group"] in ["DATE", "TIME"]: entity_dict["Date"].append(entity["word"]) # Enhanced heuristic rules for invoice-specific fields # Supplier (look for "Supplier:", "From:", or "From" prefix) supplier_match = re.search(r"(?:Supplier|From):?\s*([^\n]+)", invoice_text, re.IGNORECASE) if supplier_match and "Global Trading Ltd" not in entity_dict["Organization"]: # Avoid duplication entity_dict["Supplier"].append(supplier_match.group(1).strip()) # Date (enhanced regex for multiple formats) date_patterns = [ r"\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b", # e.g., 03/22/2025 r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b", # e.g., 12 December 2024 r"\b\d{1,2}-[A-Za-z]{3}-\d{4}\b" # e.g., 15-Jan-2025 ] for pattern in date_patterns: dates = re.findall(pattern, invoice_text) if dates: entity_dict["Date"].extend(dates) # Item (look for "Item:" prefix) item_matches = re.findall(r"Item:?\s*([^\n$€]+)(?:\s*[$€]\d+\.\d{2})?", invoice_text, re.IGNORECASE) if item_matches: entity_dict["Item"].extend([item.strip() for item in item_matches if item.strip()]) # Due By (look for "Due by:", "Payment due by:", or similar) due_by_match = re.search(r"(?:Due by|Payment due by):?\s*([^\n]+)", invoice_text, re.IGNORECASE) if due_by_match: entity_dict["Due By"].append(due_by_match.group(1).strip()) # Invoice Number (look for "Invoice #") invoice_match = re.search(r"Invoice #?(\w+)", invoice_text, re.IGNORECASE) if invoice_match: entity_dict["Invoice Number"].append(invoice_match.group(1).strip()) # Amount (improved regex for currency and numbers) amount_pattern = r'(?:\$\d+\.\d{2}|\€\d+\.\d{2})' amounts = re.findall(amount_pattern, invoice_text) if amounts: entity_dict["Amount"].extend(amounts) # Clean up empty categories entity_dict = {k: list(set(v)) if v else ["Not found"] for k, v in entity_dict.items()} # Remove duplicates # Display results as a table df = pd.DataFrame({ "Entity Type": list(entity_dict.keys()), "Extracted Value": [", ".join(v) for v in entity_dict.values()] }) st.success("Extracted Entities:") st.table(df) # Footer (shown only after extraction) st.markdown("""

Developed By: Krishna Prakash LinkedIn

""", unsafe_allow_html=True) if __name__ == "__main__": main()