File size: 5,794 Bytes
92f9bba
 
 
f57f846
935c855
92f9bba
 
 
 
 
 
 
 
935c855
 
 
 
 
 
 
 
92f9bba
 
 
935c855
 
 
 
92f9bba
935c855
 
 
 
 
 
 
 
 
 
 
92f9bba
 
 
935c855
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b744c43
 
 
 
935c855
 
b744c43
 
 
 
 
 
 
 
 
 
 
935c855
 
 
 
 
b744c43
 
935c855
 
 
 
 
 
 
 
 
b744c43
935c855
 
 
 
 
b744c43
92f9bba
935c855
 
 
 
 
 
 
92f9bba
935c855
 
 
 
 
 
 
 
 
92f9bba
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import streamlit as st
from transformers import pipeline
import pandas as pd
import re
from PyPDF2 import PdfReader

# Cache the NER model so the pipeline is built once and reused
@st.cache_resource
def load_ner_model():
    """Build the token-classification pipeline used for entity extraction.

    ``aggregation_strategy="simple"`` is the current spelling of the
    deprecated ``grouped_entities=True`` flag: sub-word tokens are merged
    into whole entities, each reported with an ``entity_group`` and ``word``.

    Returns:
        The ``transformers`` pipeline for the ``dslim/bert-base-NER`` model.
    """
    return pipeline(
        "ner",
        model="dslim/bert-base-NER",
        aggregation_strategy="simple",
    )

# Module-level pipeline instance; main() calls it on the invoice text.
ner_model = load_ner_model()

# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: The PDF source handed to ``PdfReader`` (the app passes the
            uploaded file object).

    Returns:
        The page texts joined with newlines. A page whose extraction
        yields nothing contributes an empty string.
    """
    pdf_reader = PdfReader(file)
    # Join with "\n" so the last line of one page is not fused with the
    # first line of the next; the callers' regexes work line by line.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

# Main app

# Label-driven patterns for invoice fields that the NER step does not fill.
_SUPPLIER_RE = re.compile(r"(?:Supplier|From):?\s*([^\n]+)", re.IGNORECASE)
_ITEM_RE = re.compile(r"Item:?\s*([^\n$€]+)", re.IGNORECASE)
_DUE_BY_RE = re.compile(r"(?:Due by|Payment due by):?\s*([^\n]+)", re.IGNORECASE)
# Optional "No."/"Number"/"Num" label and ":"/"#" separators, then an
# identifier that must contain a digit (so "Invoice Date" is not read as an
# ID) and may contain hyphens (e.g. INV-2024-001).
_INVOICE_NUMBER_RE = re.compile(
    r"\bInvoice\s*(?:(?:Number|Num|No)\b\.?\s*)?[:#\s]*((?=[\w-]*\d)[\w-]+)",
    re.IGNORECASE,
)
# "$"/"€" amounts, with optional thousands separators and optional cents.
_AMOUNT_RE = re.compile(r"[$€](?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?")
_DATE_PATTERNS = (
    re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{4}\b"),  # e.g., 03/22/2025
    re.compile(
        r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August"
        r"|September|October|November|December)\s+\d{4}\b"
    ),  # e.g., 12 December 2024
    re.compile(r"\b\d{1,2}-[A-Za-z]{3}-\d{4}\b"),  # e.g., 15-Jan-2025
)


def _unique_in_order(values):
    """Return *values* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(values))


def _extract_heuristic_fields(invoice_text, organizations=()):
    """Extract invoice-specific fields from *invoice_text* with regexes.

    Args:
        invoice_text: The raw invoice text.
        organizations: Organization names already found by NER; a supplier
            equal to one of them is not reported again.

    Returns:
        A dict mapping "Supplier", "Date", "Item", "Due By",
        "Invoice Number" and "Amount" to lists of matched strings
        (possibly empty, possibly containing duplicates).
    """
    fields = {
        "Supplier": [],
        "Date": [],
        "Item": [],
        "Due By": [],
        "Invoice Number": [],
        "Amount": [],
    }

    supplier_match = _SUPPLIER_RE.search(invoice_text)
    if supplier_match:
        supplier = supplier_match.group(1).strip()
        # Compare against the supplier actually found, not a fixed sample name.
        if supplier and supplier not in organizations:
            fields["Supplier"].append(supplier)

    for date_re in _DATE_PATTERNS:
        fields["Date"].extend(date_re.findall(invoice_text))

    fields["Item"].extend(
        item.strip() for item in _ITEM_RE.findall(invoice_text) if item.strip()
    )

    due_by_match = _DUE_BY_RE.search(invoice_text)
    if due_by_match:
        fields["Due By"].append(due_by_match.group(1).strip())

    invoice_match = _INVOICE_NUMBER_RE.search(invoice_text)
    if invoice_match:
        fields["Invoice Number"].append(invoice_match.group(1))

    fields["Amount"].extend(_AMOUNT_RE.findall(invoice_text))
    return fields


def main():
    """Render the Streamlit page: collect invoice text, extract, show a table.

    The text comes from a text area or an uploaded PDF/TXT file. On
    "Extract" the NER pipeline supplies organizations, regex heuristics
    supply the invoice-specific fields, and the de-duplicated results are
    displayed as a two-column table followed by the footer.
    """
    st.title("Invoice Data Extraction (NER)")
    st.write("Extract entities like company names, dates, amounts, and other details from invoice text or uploaded files using Named Entity Recognition.")

    # Input options: Text or File Upload
    input_method = st.radio("Choose input method:", ("Text Input", "Upload File"))

    invoice_text = ""
    if input_method == "Text Input":
        invoice_text = st.text_area("Invoice Text", placeholder="Paste your invoice text here...", height=200)
    elif input_method == "Upload File":
        uploaded_file = st.file_uploader("Upload Invoice (PDF or TXT)", type=["pdf", "txt"])
        if uploaded_file is not None:
            # The reported MIME type may not be "application/pdf" for every
            # upload, so also accept the file extension.
            is_pdf = (
                uploaded_file.type == "application/pdf"
                or uploaded_file.name.lower().endswith(".pdf")
            )
            if is_pdf:
                invoice_text = extract_text_from_pdf(uploaded_file)
            else:  # txt
                # "utf-8-sig" drops a leading BOM; errors="replace" keeps a
                # file with stray non-UTF-8 bytes from crashing the app.
                invoice_text = uploaded_file.read().decode("utf-8-sig", errors="replace")
            st.text_area("Extracted Text", value=invoice_text, height=200, disabled=True)

    # Button and output
    if st.button("Extract"):
        # Whitespace-only input carries nothing to extract either.
        if not invoice_text or not invoice_text.strip():
            st.warning("Please enter text or upload a file to extract!")
            return

        with st.spinner("Extracting entities..."):
            # Perform NER
            entities = ner_model(invoice_text)

            # Initialize entity dictionary with broader fields
            entity_dict = {
                "Organization": [],
                "Date": [],
                "Amount": [],
                "Supplier": [],
                "Item": [],
                "Due By": [],
                "Invoice Number": []
            }

            # Process NER entities
            for entity in entities:
                group = entity["entity_group"]
                if group == "ORG":
                    entity_dict["Organization"].append(entity["word"])
                elif group in ("DATE", "TIME"):
                    # NOTE(review): the configured model presumably emits only
                    # PER/ORG/LOC/MISC groups, so this branch may never fire —
                    # confirm against the model card.
                    entity_dict["Date"].append(entity["word"])

            # Regex heuristics for the invoice-specific fields
            heuristics = _extract_heuristic_fields(invoice_text, entity_dict["Organization"])
            for field, values in heuristics.items():
                entity_dict[field].extend(values)

            # De-duplicate in first-seen order (stable display) and mark
            # categories with no hits
            entity_dict = {
                k: _unique_in_order(v) or ["Not found"]
                for k, v in entity_dict.items()
            }

            # Display results as a table
            df = pd.DataFrame({
                "Entity Type": list(entity_dict.keys()),
                "Extracted Value": [", ".join(v) for v in entity_dict.values()]
            })
            st.success("Extracted Entities:")
            st.table(df)

            # Footer (shown only after extraction)
            st.markdown("""
                <p style="font-size: small; color: grey; text-align: center; margin-top: 20px; border-top: 1px solid #eee; padding-top: 10px;">
                    Developed By: Krishna Prakash
                    <a href="https://www.linkedin.com/in/krishnaprakash-profile/" target="_blank">
                        <img src="https://img.icons8.com/ios-filled/30/0077b5/linkedin.png" alt="LinkedIn" style="vertical-align: middle; margin: 0 5px;"/>
                    </a>
                </p>
            """, unsafe_allow_html=True)

# Entry point: run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()