Spaces:
Running
Running
File size: 5,794 Bytes
92f9bba f57f846 935c855 92f9bba 935c855 92f9bba 935c855 92f9bba 935c855 92f9bba 935c855 b744c43 935c855 b744c43 935c855 b744c43 935c855 b744c43 935c855 b744c43 92f9bba 935c855 92f9bba 935c855 92f9bba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import streamlit as st
from transformers import pipeline
import pandas as pd
import re
from PyPDF2 import PdfReader
# Cache the NER model for performance
@st.cache_resource
def load_ner_model():
    """Build the Hugging Face NER pipeline used by the app.

    The function is wrapped in ``st.cache_resource`` so the model is
    constructed once and the same pipeline object is handed back on later
    calls instead of being reloaded.

    Returns:
        A ``transformers`` "ner" pipeline for ``dslim/bert-base-NER`` whose
        results are grouped into whole entities (each result exposes
        ``entity_group`` and ``word``).
    """
    # ``aggregation_strategy="simple"`` is the supported spelling of the
    # deprecated ``grouped_entities=True`` flag and yields the same grouping.
    return pipeline(
        "ner",
        model="dslim/bert-base-NER",
        aggregation_strategy="simple",
    )
# Module-level handle to the NER pipeline; main() calls it on the invoice text.
ner_model = load_ner_model()
# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF as a single string.

    Args:
        file: PDF source handed straight to ``PdfReader`` (``main`` passes the
            Streamlit upload object).

    Returns:
        The extracted page texts joined with newlines. Pages for which
        ``extract_text()`` returns nothing are skipped, so the result is
        ``""`` when no page yields text.
    """
    reader = PdfReader(file)
    # ``extract_text()`` can come back empty/None for a page, hence the ``or``.
    page_texts = (page.extract_text() or "" for page in reader.pages)
    # Join with a newline so the last line of one page is not fused with the
    # first line of the next; the field regexes in main() are line-based.
    return "\n".join(text for text in page_texts if text)
# Main app
# Labelled-field and value patterns used by _extract_invoice_fields.
_SUPPLIER_RE = re.compile(r"(?:Supplier|From):?\s*([^\n]+)", re.IGNORECASE)
_ITEM_RE = re.compile(r"Item:?\s*([^\n$€]+)(?:\s*[$€]\d+\.\d{2})?", re.IGNORECASE)
_DUE_BY_RE = re.compile(r"(?:Due by|Payment due by):?\s*([^\n]+)", re.IGNORECASE)
# Optional label ("#", "No.", "Number"), optional ":"/"#", then a token that
# must contain a digit so label words such as "Number" or "Date" are never
# returned as the invoice number. Hyphenated ids (INV-2025-001) stay whole.
_INVOICE_NUMBER_RE = re.compile(
    r"Invoice\s*(?:No\.?|Number|#)?\s*[:#]?\s*"
    r"((?=[A-Za-z-]*\d)[A-Za-z0-9][A-Za-z0-9-]*)",
    re.IGNORECASE,
)
# "$"/"€" amounts, with optional thousands separators and optional cents.
_AMOUNT_RE = re.compile(r"[$€]\d+(?:,\d{3})*(?:\.\d{1,2})?")
_DATE_RES = (
    re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b"),  # e.g., 03/22/2025
    re.compile(
        r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August"
        r"|September|October|November|December)\s+\d{4}\b"
    ),  # e.g., 12 December 2024
    re.compile(r"\b\d{1,2}-[A-Za-z]{3}-\d{4}\b"),  # e.g., 15-Jan-2025
)

# Kept at column 0 inside the literal: indented HTML would be rendered by
# Markdown as a code block.
_FOOTER_HTML = """
<p style="font-size: small; color: grey; text-align: center; margin-top: 20px; border-top: 1px solid #eee; padding-top: 10px;">
Developed By: Krishna Prakash
<a href="https://www.linkedin.com/in/krishnaprakash-profile/" target="_blank">
<img src="https://img.icons8.com/ios-filled/30/0077b5/linkedin.png" alt="LinkedIn" style="vertical-align: middle; margin: 0 5px;"/>
</a>
</p>
"""


def _extract_invoice_fields(invoice_text, entities):
    """Combine NER results and regex heuristics into invoice fields.

    Args:
        invoice_text: Raw invoice text to scan with the regex heuristics.
        entities: Grouped NER results; each item must provide
            ``entity_group`` and ``word``.

    Returns:
        A dict mapping each field name to a list of unique values in
        first-seen order, or ``["Not found"]`` when nothing matched.
    """
    fields = {
        "Organization": [],
        "Date": [],
        "Amount": [],
        "Supplier": [],
        "Item": [],
        "Due By": [],
        "Invoice Number": [],
    }

    # NOTE(review): the model loaded in this file may never emit DATE/TIME
    # groups; the date regexes below do not depend on it either way.
    for entity in entities:
        group = entity["entity_group"]
        if group == "ORG":
            fields["Organization"].append(entity["word"])
        elif group in ("DATE", "TIME"):
            fields["Date"].append(entity["word"])

    supplier_match = _SUPPLIER_RE.search(invoice_text)
    if supplier_match:
        supplier = supplier_match.group(1).strip()
        # Avoid listing the same name under both Organization and Supplier.
        if supplier and supplier not in fields["Organization"]:
            fields["Supplier"].append(supplier)

    for date_re in _DATE_RES:
        fields["Date"].extend(date_re.findall(invoice_text))

    fields["Item"].extend(
        item.strip() for item in _ITEM_RE.findall(invoice_text) if item.strip()
    )

    due_by_match = _DUE_BY_RE.search(invoice_text)
    if due_by_match:
        fields["Due By"].append(due_by_match.group(1).strip())

    invoice_match = _INVOICE_NUMBER_RE.search(invoice_text)
    if invoice_match:
        fields["Invoice Number"].append(invoice_match.group(1))

    fields["Amount"].extend(_AMOUNT_RE.findall(invoice_text))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # table shows values in a stable order (a set would not).
    return {
        name: list(dict.fromkeys(values)) or ["Not found"]
        for name, values in fields.items()
    }


def main():
    """Render the Streamlit page: collect invoice text, extract, show a table.

    The text comes either from a text area or from an uploaded PDF/TXT file.
    When the "Extract" button is pressed the text is run through
    ``ner_model`` and the regex heuristics, and the resulting fields are
    displayed as a two-column table followed by the footer.
    """
    st.title("Invoice Data Extraction (NER)")
    st.write("Extract entities like company names, dates, amounts, and other details from invoice text or uploaded files using Named Entity Recognition.")

    # Input options: Text or File Upload
    input_method = st.radio("Choose input method:", ("Text Input", "Upload File"))
    invoice_text = ""
    if input_method == "Text Input":
        invoice_text = st.text_area("Invoice Text", placeholder="Paste your invoice text here...", height=200)
    elif input_method == "Upload File":
        uploaded_file = st.file_uploader("Upload Invoice (PDF or TXT)", type=["pdf", "txt"])
        if uploaded_file is not None:
            if uploaded_file.type == "application/pdf":
                invoice_text = extract_text_from_pdf(uploaded_file)
            else:  # txt
                # Replace undecodable bytes instead of crashing on files that
                # are not valid UTF-8.
                invoice_text = uploaded_file.read().decode("utf-8", errors="replace")
            st.text_area("Extracted Text", value=invoice_text, height=200, disabled=True)

    if not st.button("Extract"):
        return

    # Whitespace-only input carries nothing to extract either.
    if not invoice_text or not invoice_text.strip():
        st.warning("Please enter text or upload a file to extract!")
        return

    with st.spinner("Extracting entities..."):
        entities = ner_model(invoice_text)
        entity_dict = _extract_invoice_fields(invoice_text, entities)

    # Display results as a table
    df = pd.DataFrame({
        "Entity Type": list(entity_dict.keys()),
        "Extracted Value": [", ".join(values) for values in entity_dict.values()],
    })
    st.success("Extracted Entities:")
    st.table(df)

    # Footer (shown only after extraction)
    st.markdown(_FOOTER_HTML, unsafe_allow_html=True)
# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()