Spaces:
Sleeping
Sleeping
File size: 14,862 Bytes
db33631 e2c450b db33631 e2c450b 3767c30 db33631 ded79c9 db33631 ded79c9 db33631 ded79c9 db33631 ded79c9 db33631 ded79c9 db33631 ded79c9 db33631 ded79c9 e2c450b ded79c9 db33631 ded79c9 e2c450b ded79c9 e2c450b 12eb018 ded79c9 12eb018 ded79c9 e2c450b ded79c9 e2c450b ded79c9 3767c30 ded79c9 3767c30 ded79c9 3767c30 ded79c9 db33631 ded79c9 12eb018 ded79c9 12eb018 e2c450b 12eb018 db33631 12eb018 db33631 12eb018 ded79c9 db33631 12eb018 e2c450b 12eb018 db33631 e2c450b 12eb018 db33631 ded79c9 db33631 ded79c9 db33631 ded79c9 db33631 3767c30 e2c450b 3767c30 db33631 ded79c9 db33631 3767c30 e2c450b db33631 ded79c9 db33631 12eb018 db33631 12eb018 db33631 ded79c9 db33631 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 |
import os
import re
import json
import tempfile
import gradio as gr
from paddleocr import PaddleOCR
import fitz # PyMuPDF
from simple_salesforce import Salesforce
from dotenv import load_dotenv
import logging
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
import time
import base64
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
# Route INFO-and-above records to the root handler; every function in this
# module logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Merge variables from a local .env file into the process environment.
load_dotenv()

# Salesforce credentials read from the environment (None when unset).
SF_USERNAME = os.getenv('SF_USERNAME')
SF_PASSWORD = os.getenv('SF_PASSWORD')
SF_SECURITY_TOKEN = os.getenv('SF_SECURITY_TOKEN')

# Shared PaddleOCR engine: English model, with use_angle_cls=True to handle
# text orientation. If construction fails the module still imports and `ocr`
# is left as None, which process_pdf checks before running OCR.
try:
    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    logger.info("PaddleOCR initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize PaddleOCR: {str(e)}")
    ocr = None

# Labels that analyze_document looks for (case-insensitively) in the
# extracted text.
required_values = [
    "Vendor Name",
    "Tax Identification Number (TIN)",
    "Address",
    "Certification Details",
    "Contract Terms",
    "Payment Terms",
    "Signature",
]

# Valid picklist values for Category_Match__c.
VALID_CATEGORIES = [
    'Compliant',
    'Partially Compliant',
    'Non-Compliant',
    'Not Applicable',
]

# Possible values for Flags__c (multi-select picklist).
VALID_FLAGS = [
    'Compliant',
    'Partially Compliant',
    'Non-Compliant',
    'Not Applicable',
]

# FastAPI application that hosts the /process_pdf/ endpoint.
app = FastAPI()
# PDF generation helper
def generate_pdf_from_text(text, vendor_name):
    """Render plain text into an in-memory, letter-sized PDF.

    The text is split on newlines and written one line per row starting
    40 points from the top-left corner. When the cursor drops below the
    bottom margin a new page is started, so long documents are no longer
    cut off after the first page.

    Args:
        text: The text to render.
        vendor_name: Unused here; kept so existing callers keep working.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the PDF, or ``None``
        if rendering failed (the error is logged).
    """
    margin = 40
    try:
        pdf_buffer = BytesIO()
        pdf = canvas.Canvas(pdf_buffer, pagesize=letter)
        _, page_height = letter
        text_object = pdf.beginText(margin, page_height - margin)
        for line in text.split('\n'):
            # textLine() moves the cursor down after each row; once it has
            # passed the bottom margin, flush this page and start a new one.
            if text_object.getY() < margin:
                pdf.drawText(text_object)
                pdf.showPage()
                text_object = pdf.beginText(margin, page_height - margin)
            text_object.textLine(line)
        pdf.drawText(text_object)
        pdf.showPage()
        pdf.save()
        pdf_buffer.seek(0)
        return pdf_buffer
    except Exception as e:
        logger.error(f"Error generating PDF: {e}")
        return None
# Upload PDF to Salesforce ContentVersion and get public URL
def upload_pdf_to_salesforce(sf, pdf_buffer, vendor_name):
    """Store a PDF as a Salesforce ContentVersion and return its download URL.

    Args:
        sf: Authenticated Salesforce client; must expose ``ContentVersion.create``
            and ``sf_instance``.
        pdf_buffer: Buffer whose ``getvalue()`` yields the PDF bytes.
        vendor_name: Used as the prefix of the generated file name.

    Returns:
        The download URL built from the instance host and the new record id,
        or ``None`` if anything fails (the error is logged).
    """
    try:
        # VersionData must be sent as base64 text.
        payload = base64.b64encode(pdf_buffer.getvalue()).decode('utf-8')
        # The epoch-seconds suffix keeps repeated uploads for one vendor apart.
        file_name = f"{vendor_name}_ExtractedText_{int(time.time())}.pdf"
        created = sf.ContentVersion.create({
            "Title": file_name,
            "PathOnClient": file_name,
            "VersionData": payload,
        })
        file_url = f"https://{sf.sf_instance}/sfc/servlet.shepherd/version/download/{created['id']}"
        logger.info(f"PDF uploaded to Salesforce: {file_url}")
        return file_url
    except Exception as e:
        logger.error(f"Error uploading PDF to Salesforce: {e}")
        return None
def process_pdf(pdf_file_path):
    """OCR a PDF, grade it against the required values and record the result.

    Steps:
      1. Check that the path exists and the file starts with ``%PDF``.
      2. Render each page to a PNG in a temporary directory (2x zoom, with a
         default-resolution fallback) and run PaddleOCR on it.
      3. If OCR produced no text, fall back to PyMuPDF's embedded-text
         extraction.
      4. Extract a vendor name, count missing required values, derive the
         category/score/comments/flags and insert a scorecard into Salesforce.

    The PyMuPDF document is always closed before returning.

    Args:
        pdf_file_path: Filesystem path of the PDF to process.

    Returns:
        A 5-tuple ``(extracted_text, category, score, comments, flags)``. On
        failure the first element is an error message, the score is ``0`` and
        the remaining elements are the string ``"Error"``.
    """
    pdf_document = None
    try:
        if not pdf_file_path or not os.path.exists(pdf_file_path):
            logger.error("No valid file path provided or file does not exist")
            return _error_result("No valid file provided")

        # Validate PDF file by its leading magic bytes.
        try:
            with open(pdf_file_path, 'rb') as f:
                if not f.read(4).startswith(b'%PDF'):
                    logger.error("Uploaded file is not a valid PDF")
                    return _error_result("Invalid PDF file")
        except Exception as e:
            logger.error(f"Error reading file: {e}")
            return _error_result(f"Error reading file: {e}")

        with tempfile.TemporaryDirectory() as path:
            logger.info(f"Temporary directory created at {path}")

            # Open PDF with fitz
            try:
                pdf_document = fitz.open(pdf_file_path)
                num_pages = pdf_document.page_count
                logger.info(f"PDF has {num_pages} pages.")
            except Exception as e:
                logger.error(f"Failed to open PDF with fitz: {e}")
                return _error_result(f"Failed to open PDF: {e}")

            extracted_text = ""
            for page_num in range(num_pages):
                page = pdf_document.load_page(page_num)
                try:
                    # Render at 2x zoom without an alpha channel.
                    zoom = 2
                    mat = fitz.Matrix(zoom, zoom)
                    pix = page.get_pixmap(matrix=mat, alpha=False)
                    page_path = os.path.join(path, f"page_{page_num + 1}.png")
                    pix.save(page_path)
                    logger.info(f"Processing page {page_num + 1} saved at {page_path}.")
                except Exception as e:
                    logger.error(f"Error creating pixmap for page {page_num + 1}: {e}")
                    try:
                        # Retry at the default resolution before giving up on the page.
                        pix = page.get_pixmap(alpha=False)
                        page_path = os.path.join(path, f"page_{page_num + 1}_fallback.png")
                        pix.save(page_path)
                        logger.info(f"Fallback processing succeeded for page {page_num + 1} at {page_path}.")
                    except Exception as e2:
                        logger.error(f"Fallback failed for page {page_num + 1}: {e2}")
                        continue

                if ocr is None:
                    logger.error("PaddleOCR not initialized")
                    return _error_result("OCR engine not initialized")

                try:
                    # Called without the cls argument (it caused an error).
                    result = ocr.ocr(page_path)
                    if result and result[0]:
                        # Each entry's [1][0] element is read as the recognized text.
                        page_text = "\n".join([line[1][0] for line in result[0] if line[1][0]]) + "\n"
                        extracted_text += page_text
                        logger.info(f"Extracted text from page {page_num + 1} (first 200 chars): {page_text[:200]}...")
                    else:
                        logger.warning(f"No text extracted from page {page_num + 1}")
                except Exception as e:
                    logger.error(f"Error performing OCR on page {page_num + 1}: {e}")

        logger.info(f"Full extracted text (first 200 chars): {extracted_text[:200]}...")

        if not extracted_text.strip():
            logger.error("No text extracted from PDF")
            # Fallback: Try extracting text directly from PDF using PyMuPDF
            try:
                for page_num in range(num_pages):
                    page = pdf_document.load_page(page_num)
                    page_text = page.get_text("text")
                    if page_text:
                        extracted_text += page_text + "\n"
                        logger.info(f"Fallback text extracted from page {page_num + 1} using PyMuPDF.")
                if not extracted_text.strip():
                    return _error_result("No text extracted from PDF even after fallback")
            except Exception as e:
                logger.error(f"Error in fallback text extraction: {e}")
                return _error_result("No text extracted from PDF even after fallback")

        vendor_name = extract_vendor_name(extracted_text)
        logger.info(f"Extracted Vendor Name: {vendor_name}")

        missing_values = analyze_document(extracted_text)
        logger.info(f"Missing values: {missing_values}")
        category, score, comments, flags = _classify_missing(len(missing_values))

        insert_result = insert_into_salesforce(vendor_name, extracted_text, category, score, comments, flags)
        logger.info(f"Salesforce Insert Result: {insert_result}")
        return extracted_text, category, score, comments, flags
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return _error_result(f"Error: {e}")
    finally:
        # Release the PyMuPDF document on every exit path, including the
        # early error returns above.
        if pdf_document is not None:
            try:
                pdf_document.close()
            except Exception as close_error:
                logger.warning(f"Failed to close PDF document: {close_error}")


def _error_result(message):
    """Build the 5-tuple process_pdf returns when processing fails."""
    return message, "Error", 0, "Error", "Error"


def _classify_missing(missing_count):
    """Map a count of missing required values to (category, score, comments, flags)."""
    if missing_count == 0:
        return 'Compliant', 100, 'Document contains all required values.', 'Compliant'
    if missing_count == 1:
        return 'Partially Compliant', 85, 'Document is missing one required value.', 'Partially Compliant'
    if missing_count == 2:
        return 'Non-Compliant', 60, 'Document is missing two required values.', 'Non-Compliant'
    return 'Not Applicable', 40, 'Document is missing three or more required values.', 'Not Applicable'
def extract_vendor_name(text):
    """Guess the vendor's name (or, failing that, an email address) from text.

    Label patterns are tried from most to least specific, case-insensitively;
    the first one whose captured value is non-blank wins. A capture that is
    only whitespace (a label at the very end of the text followed by spaces)
    is skipped instead of being returned as an empty name. If no label yields
    a value, the first email-looking token is returned.

    Args:
        text: The extracted document text.

    Returns:
        The vendor name, an email address, or ``"Unknown Vendor"``.
    """
    patterns = [
        r"Vendor Name[:\s]*([^\n]+)",
        r"Vendor[:\s]*([^\n]+)",
        r"Company Name[:\s]*([^\n]+)",
        r"Supplier[:\s]*([^\n]+)",
        r"Name[:\s]*([^\n]+)",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        vendor_name = match.group(1).strip()
        if not vendor_name:
            # The label matched but carried no value; try the next pattern.
            continue
        logger.info(f"Vendor name extracted: {vendor_name}")
        return vendor_name

    logger.warning("No vendor name found in text, attempting to extract email")
    email_match = re.search(r"[\w\.-]+@[\w\.-]+\.\w+", text)
    if email_match:
        email = email_match.group(0)
        logger.info(f"Email extracted: {email}")
        return email

    logger.warning("No email found in text")
    return "Unknown Vendor"
def analyze_document(document_text):
    """Return the required values whose label is absent from the document.

    Matching is a case-insensitive substring test of each entry in
    ``required_values`` against the whole text. The text is normalized once
    rather than once per required value, and ``None`` is treated as an empty
    document (every value missing).

    Args:
        document_text: The extracted document text, or ``None``.

    Returns:
        The missing labels, in ``required_values`` order.
    """
    haystack = (document_text or "").lower().strip()
    return [value for value in required_values if value.lower() not in haystack]
def insert_into_salesforce(vendor_name_or_email, extracted_text, category, score, comments, flags):
    """Create a Vendor_Scorecard__c record for a processed document.

    Logs in to Salesforce, looks the vendor up in Vendor__c (by Email__c when
    the identifier starts with an email-shaped token, otherwise by Name),
    uploads the extracted text as a PDF and inserts the scorecard.

    The identifier is escaped with SOQL backslash sequences before being
    placed in the query, and only the query sees the escaped form: the
    scorecard field and the uploaded file title keep the original text.

    Args:
        vendor_name_or_email: Vendor name or email taken from the document.
        extracted_text: Full extracted text; stored truncated to 32768 chars.
        category: Value for Category_Match__c.
        score: Value for Score__c.
        comments: Value for Comments__c.
        flags: Value for Flags__c.

    Returns:
        The create() result when it contains an ``'id'``; otherwise the string
        ``"Failed to insert record"`` or ``"Error: <message>"``.
    """
    try:
        sf = Salesforce(username=SF_USERNAME, password=SF_PASSWORD, security_token=SF_SECURITY_TOKEN)
        logger.info("Salesforce authentication successful.")

        vendor_id = None
        vendor_name_clean = vendor_name_or_email.strip()
        logger.info(f"Querying Salesforce for Vendor: {vendor_name_clean}")

        # The value comes from OCR'd document text, so treat it as untrusted.
        soql_value = _escape_soql_literal(vendor_name_clean)

        is_email = bool(re.match(r"[\w\.-]+@[\w\.-]+\.\w+", vendor_name_clean))
        if is_email:
            query = f"SELECT Id, Name FROM Vendor__c WHERE Email__c = '{soql_value}' LIMIT 1"
            vendor_record = sf.query(query)
            if vendor_record['totalSize'] > 0:
                vendor_id = vendor_record['records'][0]['Id']
                # Prefer the vendor's stored name over the email from here on.
                vendor_name_clean = vendor_record['records'][0]['Name']
                logger.info(f"Vendor found by email with ID: {vendor_id}, Name: {vendor_name_clean}")
            else:
                logger.warning(f"Vendor with email '{vendor_name_clean}' not found!")
        else:
            query = f"SELECT Id FROM Vendor__c WHERE Name = '{soql_value}' LIMIT 1"
            vendor_record = sf.query(query)
            if vendor_record['totalSize'] > 0:
                vendor_id = vendor_record['records'][0]['Id']
                logger.info(f"Vendor found by name with ID: {vendor_id}")
            else:
                logger.warning(f"Vendor '{vendor_name_clean}' not found!")

        pdf_buffer = generate_pdf_from_text(extracted_text, vendor_name_clean)
        pdf_url = None
        if pdf_buffer:
            pdf_url = upload_pdf_to_salesforce(sf, pdf_buffer, vendor_name_clean)
        else:
            logger.error("Failed to generate PDF from extracted text.")

        vendor_field_value = vendor_name_clean
        logger.info(f"Setting Vendor_Name__c field to: {vendor_field_value}")

        # Slicing is a no-op for shorter text, so no length check is needed.
        extracted_text_truncated = extracted_text[:32768]
        flags_value = flags  # Can be extended to "Value1;Value2" for multi-select

        scorecard_data = {
            'Vendor_Name__c': vendor_field_value,
            'Extracted_Text_URL__c': pdf_url or "",
            'Score__c': score,
            'Category_Match__c': category,
            'Comments__c': comments,
            'Flags__c': flags_value,
            'Uploaded_File__c': extracted_text_truncated,
        }
        result = sf.Vendor_Scorecard__c.create(scorecard_data)
        if result and 'id' in result:
            logger.info(f"Record inserted successfully with ID: {result['id']}")
            return result
        logger.error("Failed to insert Vendor_Scorecard__c record.")
        return "Failed to insert record"
    except Exception as e:
        logger.error(f"Error inserting into Salesforce: {e}")
        return f"Error: {e}"


def _escape_soql_literal(value):
    """Escape *value* for use inside a single-quoted SOQL string literal."""
    escapes = {
        "\\": "\\\\",
        "'": "\\'",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
    # translate() rewrites each character exactly once, so the backslashes it
    # inserts are never escaped a second time.
    return value.translate(str.maketrans(escapes))
@app.post("/process_pdf/")
async def process_pdf_api(file: UploadFile = File(...)):
    """HTTP endpoint: process an uploaded PDF and return the result as JSON.

    The upload is written to a temporary ``.pdf`` file, passed to
    ``process_pdf`` and removed afterwards, even if processing raises.

    Returns:
        200 with ``extracted_text``, ``category``, ``score``, ``comments`` and
        ``flags``; 400 when the upload is empty; 500 with an ``error`` message
        on any unexpected exception.
    """
    temp_path = None
    try:
        contents = await file.read()
        if not contents:
            logger.error("Uploaded file is empty")
            return JSONResponse(content={"error": "Uploaded file is empty"}, status_code=400)

        # delete=False: the file must outlive the `with` so process_pdf can
        # reopen it by name; leaving the block flushes and closes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_path = temp_file.name
            temp_file.write(contents)

        extracted_text, category, score, comments, flags = process_pdf(temp_path)
        return JSONResponse(content={
            "extracted_text": extracted_text,
            "category": category,
            "score": score,
            "comments": comments,
            "flags": flags
        })
    except Exception as e:
        logger.error(f"Error processing the file via API: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        # Remove the temporary upload on success and failure alike.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {temp_path}: {cleanup_error}")
def gradio_interface(pdf_file):
    """Gradio callback: run process_pdf on the uploaded file.

    Args:
        pdf_file: The value Gradio passes for the File input. It may be
            ``None`` (nothing uploaded), a path (``str`` or ``os.PathLike``),
            or an object exposing the path as ``.name``. Which form arrives
            depends on the Gradio version and File configuration.

    Returns:
        The 5-tuple produced by ``process_pdf``, or an error tuple when no
        file was uploaded.
    """
    if pdf_file is None:
        return "No file uploaded", "Error", 0, "Error", "Error"
    # A path has no usable `.name` (str has none; Path.name is only the
    # basename), so paths are passed through unchanged.
    if isinstance(pdf_file, (str, os.PathLike)):
        return process_pdf(pdf_file)
    return process_pdf(pdf_file.name)
# Gradio UI: one PDF upload in, the five fields returned by gradio_interface
# out (same order as the returned tuple). live=True is passed so the callback
# runs when the input changes.
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload PDF Document"),
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Category Match"),
        gr.Number(label="Score"),
        gr.Textbox(label="Comments"),
        gr.Textbox(label="Flags"),
    ],
    live=True,
)
if __name__ == "__main__":
    # Script entry point: serve the Gradio UI from a background thread and the
    # FastAPI app (via uvicorn, port 8000) from the main thread.
    import threading

    import uvicorn

    def run_gradio():
        gr_interface.launch()

    # Daemon thread: launch() may block indefinitely in its thread, and a
    # non-daemon thread would keep the process alive after uvicorn stops.
    threading.Thread(target=run_gradio, daemon=True).start()

    uvicorn.run(app, host="0.0.0.0", port=8000)