File size: 19,253 Bytes
7ced643
 
cb55723
ff1359f
e3a183a
cb55723
 
 
e3a183a
49f2a82
9c3de60
cb55723
 
 
 
9c3de60
f09938e
5635c5e
c309f5d
b54b20c
e3a183a
 
7ced643
e3a183a
 
 
 
 
 
 
0b83b3d
 
e3a183a
0b83b3d
e3a183a
0b83b3d
62f4d85
7ced643
e3a183a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394f654
 
011deb8
e3a183a
 
 
 
011deb8
e3a183a
 
 
 
 
 
 
0ff6a51
e3a183a
 
7ced643
e3a183a
 
 
7ced643
e3a183a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03f2467
e3a183a
b54b20c
11f7a30
e3a183a
11f7a30
e3a183a
b54b20c
11f7a30
e3a183a
d2c954e
9293af4
 
 
b54b20c
 
e3a183a
9293af4
 
 
d2c954e
e3a183a
384f5fe
e3a183a
 
 
 
 
 
384f5fe
e3a183a
 
384f5fe
 
 
cb55723
7ced643
 
8707304
2a522d7
e3a183a
8995446
e3a183a
 
 
7ced643
 
e3a183a
7ced643
b54b20c
 
e3a183a
cc4cae5
e3a183a
 
d2c954e
2a522d7
e3a183a
db6d664
7ced643
e3a183a
5f8fe09
cb55723
e3a183a
ff1359f
5635c5e
e3a183a
d487f80
e3a183a
 
 
 
 
 
 
 
 
d487f80
e3a183a
 
ff1359f
e3a183a
 
77f64eb
5bccda2
8e698cd
e3a183a
 
8e698cd
e3a183a
8e698cd
e3a183a
 
8e698cd
e3a183a
8e698cd
e3a183a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff1359f
5bccda2
 
e3a183a
 
5bccda2
 
e3a183a
 
 
5bccda2
e3a183a
49f2a82
e3a183a
 
 
7ced643
e3a183a
 
5635c5e
e3a183a
6cf034e
9235b45
e3a183a
 
 
 
6cf034e
 
e3a183a
 
6cf034e
 
e3a183a
6cf034e
 
e3a183a
 
6cf034e
e3a183a
 
7ced643
e3a183a
 
 
 
cc79bb4
 
e3a183a
 
2c466e2
cb55723
7ced643
e3a183a
3446718
2c466e2
 
 
8707304
23f873a
e3a183a
 
7ced643
e3a183a
ff1359f
7ced643
e3a183a
 
7ced643
e3a183a
 
7ced643
e3a183a
 
6cf034e
8cf12a5
2c466e2
e3a183a
9293af4
e3a183a
381e439
25e76ef
e3a183a
 
 
 
 
 
 
 
 
 
 
 
 
d354c8f
 
0c0c59a
e54c2ca
 
ae1ed4d
e3a183a
7ced643
e3a183a
 
7ced643
f943e52
e3a183a
6cf034e
2c466e2
e3a183a
384f5fe
 
e3a183a
384f5fe
e3a183a
3777dcf
e3a183a
 
 
 
 
cb55723
e3a183a
 
 
 
 
 
0ff6a51
e3a183a
9293af4
2c466e2
e3a183a
 
 
9293af4
e3a183a
7ced643
e3a183a
 
7ced643
0d2e0b4
e3a183a
 
 
 
1b0e496
e3a183a
 
 
 
 
 
 
1b0e496
e3a183a
 
 
 
1b0e496
e3a183a
 
 
 
 
 
 
 
 
 
 
 
1b0e496
e3a183a
 
 
 
1b0e496
e3a183a
 
 
 
 
 
 
1b0e496
e3a183a
 
 
 
 
 
 
 
014ac68
e3a183a
77f64eb
e3a183a
014ac68
7ced643
6cf034e
e3a183a
6e9c01b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
import os
import time
import logging
import re
from datetime import datetime, timedelta
from dotenv import load_dotenv
from cryptography.fernet import Fernet
from simple_salesforce import Salesforce
from transformers import pipeline
from PIL import Image
import pytesseract
import pandas as pd
from docx import Document
import PyPDF2
import gradio as gr
from pdf2image import convert_from_path
import tempfile
from pytz import timezone
import shutil
import unicodedata
import asyncio
import torch

# Module-level caches: _fernet/_summarizer are set by init_globals(),
# _sf by init_salesforce(); _lock serialises Salesforce connection setup.
_sf = _summarizer = _fernet = None
_lock = asyncio.Lock()

# All log records go to a single file in the system temp directory
log_file = os.path.join(tempfile.gettempdir(), 'app.log')
logging.basicConfig(
    handlers=[logging.FileHandler(log_file)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Preload models and dependencies
def init_globals():
    """Load configuration and build the shared Fernet cipher and summarizer.

    Reads the .env file, checks that every required environment variable
    is set, then assigns the module-level ``_fernet`` and ``_summarizer``.

    Returns:
        bool: True when everything was initialised. False (after logging
        the reason) when a variable is missing, the encryption key is
        rejected, or the summarization pipeline cannot be created.
    """
    global _summarizer, _fernet
    load_dotenv()

    required_env_vars = [
        'ENCRYPTION_KEY', 'SALESFORCE_USERNAME', 'SALESFORCE_PASSWORD',
        'SALESFORCE_SECURITY_TOKEN', 'SALESFORCE_DOMAIN'
    ]
    env = {}
    for name in required_env_vars:
        env[name] = os.getenv(name)

    # Unset and empty values are both treated as missing
    missing = [name for name in required_env_vars if not env[name]]
    if missing:
        logger.error(f"Missing env vars: {', '.join(missing)}")
        return False

    try:
        key_bytes = env['ENCRYPTION_KEY'].encode()
        _fernet = Fernet(key_bytes)
    except Exception as e:
        logger.error(f"Invalid encryption key: {str(e)}")
        return False

    try:
        # device 0 selects the first CUDA GPU, -1 selects the CPU
        device_index = 0 if torch.cuda.is_available() else -1
        _summarizer = pipeline(
            "summarization",
            model="t5-small",
            tokenizer="t5-small",
            framework="pt",
            device=device_index
        )
        logger.info("Summarizer initialized successfully")
    except Exception as e:
        logger.error(f"Summarizer init failed: {str(e)}")
        return False

    return True

# Check critical dependencies
def check_dependencies():
    """Locate the external Tesseract and Poppler binaries on PATH.

    When Tesseract is found, pytesseract is pointed at its full path.
    Poppler is detected through its ``pdfinfo`` executable. Both tools are
    always checked, so a machine missing both reports both.

    Returns:
        tuple[list[str], list]: names of the missing tools (a subset of
        ``["Tesseract", "Poppler"]``) and a second list that is always
        empty. If the check itself raises, both tools are reported missing.
    """
    missing = []
    try:
        tesseract_path = shutil.which('tesseract')
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
        else:
            logger.warning("Tesseract not found. OCR unavailable.")
            missing.append("Tesseract")

        # Checked independently of Tesseract so neither problem hides the other
        if not shutil.which('pdfinfo'):
            logger.warning("Poppler not found.")
            missing.append("Poppler")

        return missing, []
    except Exception as e:
        logger.error(f"Dependency check failed: {str(e)}")
        return ["Tesseract", "Poppler"], []

# Runs at import time: without the env vars, cipher and summarizer the module
# cannot work, so importing it fails with RuntimeError.
if not init_globals():
    raise RuntimeError("Failed to initialize global dependencies")

# Missing OCR/PDF tools are only logged; startup continues without them.
missing_deps, _ = check_dependencies()
if missing_deps:
    logger.warning(f"Missing dependencies: {', '.join(missing_deps)}")

# Salesforce connection (async)
async def init_salesforce(max_retries=2, initial_delay=1):
    """Return the shared Salesforce client, connecting on first use.

    The connection is cached in the module-level ``_sf`` and created under
    ``_lock``. The blocking login runs in the default executor. Failed
    attempts are logged and retried with a delay that doubles each time.

    Args:
        max_retries: Total number of login attempts.
        initial_delay: Seconds to wait after the first failed attempt.

    Returns:
        The cached ``Salesforce`` client.

    Raises:
        ValueError: When every attempt failed.
    """
    global _sf

    def _login():
        # Credentials are read from the environment at call time
        return Salesforce(
            username=os.getenv('SALESFORCE_USERNAME'),
            password=os.getenv('SALESFORCE_PASSWORD'),
            security_token=os.getenv('SALESFORCE_SECURITY_TOKEN'),
            domain=os.getenv('SALESFORCE_DOMAIN'),
            version='58.0'
        )

    async with _lock:
        if _sf is not None:
            return _sf

        for attempt in range(max_retries):
            try:
                loop = asyncio.get_event_loop()
                _sf = await loop.run_in_executor(None, _login)
                logger.info("Salesforce connection established")
                return _sf
            except Exception as e:
                logger.error(f"Salesforce connection attempt {attempt + 1} failed: {str(e)}")
                attempts_left = max_retries - 1 - attempt
                if attempts_left > 0:
                    backoff = initial_delay * (2 ** attempt)
                    await asyncio.sleep(backoff)

        raise ValueError("Salesforce connection failed after retries")

# Preprocess image for OCR (optimized)
def preprocess_image(image):
    """Return a grayscale ('L' mode) version of *image* for OCR.

    The grayscale image is resized to the image's own width and height
    with bilinear resampling. If that fails, the error is logged and a
    plain grayscale conversion is returned instead.
    """
    try:
        grayscale = image.convert('L')
        target_size = (image.width, image.height)
        return grayscale.resize(target_size, Image.BILINEAR)
    except Exception as e:
        logger.error(f"Image preprocess failed: {str(e)}")
        fallback = image.convert('L')
        return fallback

# Clean text (optimized)
def clean_text(text):
    """Normalise *text* and cap it at 512 characters.

    Applies NFKC Unicode normalisation, trims the ends, and collapses each
    run of whitespace into a single space. Falsy input yields "". Any
    error is logged and also yields "".
    """
    try:
        if not text:
            return ""
        normalized = unicodedata.normalize('NFKC', text)
        trimmed = normalized.strip()
        single_spaced = re.sub(r'\s+', ' ', trimmed)
        return single_spaced[:512]
    except Exception as e:
        logger.error(f"Text cleaning failed: {str(e)}")
        return ""

# Validate file
def validate_file(file_path):
    """Check that *file_path* has a supported extension and real content.

    Returns:
        tuple: ``(True, None)`` when the file is acceptable, otherwise
        ``(False, message)``. The extension is checked before the file
        system is touched.
    """
    supported = ('.pdf', '.docx', '.png', '.jpg', '.jpeg', '.csv', '.xls', '.xlsx')
    _, raw_ext = os.path.splitext(file_path)
    ext = raw_ext.lower()
    if ext not in supported:
        return False, f"Unsupported file type: {ext}"

    missing_or_empty = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    if missing_or_empty:
        return False, f"File not found or empty: {file_path}"

    return True, None

# Extract text (async)
async def extract_text_async(file_path):
    """Extract a short, cleaned text sample from a supported document.

    Only the start of each document is read: the first PDF page (with an
    OCR fallback when it yields fewer than 50 characters), the first 25
    non-empty DOCX paragraphs, the OCR text of an image, or the first 500
    characters of flattened spreadsheet cells. The result is passed
    through ``clean_text``.

    Args:
        file_path: Path to a .pdf, .docx, .png, .jpg, .jpeg, .csv, .xls
            or .xlsx file.

    Returns:
        tuple: ``(text, None)`` on success, or ``(None, error_message)``
        when validation fails, fewer than 50 characters were extracted, or
        an exception occurred.
    """
    is_valid, error = validate_file(file_path)
    if not is_valid:
        return None, error
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == '.pdf':
            with open(file_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                text = "".join([p.extract_text() or "" for p in pdf_reader.pages[:1]])
                if not text or len(text.strip()) < 50:
                    # Too little embedded text: rasterise page 1 and OCR it
                    images = convert_from_path(file_path, dpi=100, first_page=1, last_page=1, thread_count=2)
                    text = pytesseract.image_to_string(preprocess_image(images[0]), config='--psm 6')
            logger.info(f"Extracted text: {text[:100]}...")
        elif ext == '.docx':
            doc = Document(file_path)
            text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()][:25])
        elif ext in ['.png', '.jpg', '.jpeg']:
            # The context manager releases the image's file handle after OCR
            with Image.open(file_path) as img:
                text = pytesseract.image_to_string(preprocess_image(img), config='--psm 6')
        elif ext in ['.csv', '.xls', '.xlsx']:
            df = pd.read_csv(file_path, encoding='utf-8') if ext == '.csv' else pd.read_excel(file_path)
            text = " ".join(df.astype(str).values.flatten())[:500]
        text = clean_text(text)
        if not text or len(text) < 50:
            return None, f"No valid text extracted from {file_path}"
        return text, None
    except Exception as e:
        logger.error(f"Text extraction failed: {str(e)} with file {file_path}")
        return None, f"Text extraction failed: {str(e)}"

# Parse dates (enhanced for better end date detection)
def parse_dates(text):
    """Derive a contract start and end date from free text.

    The first valid ``YYYY-MM-DD`` date in *text* is the start date. The
    end date is the start plus the stated term ("term of N years" or
    "duration for N years"), or one year when no term is stated. A year is
    counted as 365 days.

    Returns:
        tuple[str, str]: ``(start_date, end_date)`` as ``YYYY-MM-DD``.
        Both are today's date in Asia/Kolkata when the text has no valid
        date or parsing fails.
    """
    ist = timezone('Asia/Kolkata')
    current_date = datetime.now(ist).strftime('%Y-%m-%d')
    try:
        date_pattern = r'\b\d{4}-\d{2}-\d{2}\b'
        term_pattern = r'(?:term|duration)\s*(?:of|for)\s*(\d+)\s*(?:year|years)'

        parsed_dates = []
        for candidate in re.findall(date_pattern, text):
            try:
                parsed_dates.append(datetime.strptime(candidate, '%Y-%m-%d'))
            except ValueError:
                # Matches the digit pattern but is not a real date (e.g. month 13)
                continue

        if parsed_dates:
            term_match = re.search(term_pattern, text, re.IGNORECASE)
            term_years = int(term_match.group(1)) if term_match else 1
            start = parsed_dates[0]
            # Years are converted to days exactly once
            end = start + timedelta(days=term_years * 365)
            start_date = start.strftime('%Y-%m-%d')
            end_date = end.strftime('%Y-%m-%d')
        else:
            start_date = end_date = current_date

        logger.info(f"Parsed dates - Start: {start_date}, End: {end_date}")
        return start_date, end_date
    except Exception as e:
        logger.error(f"Date parsing failed: {str(e)} with text {text[:50]}...")
        return current_date, current_date

# Summarize contract (async)
async def summarize_contract_async(text, summarizer, file_name):
    """Build a rule-based summary of a contract from its extracted text.

    Four aspects (parties, payment terms, obligations, termination
    clauses) are pulled out with regular expressions. Any that do not
    match are reported as "Not extracted". The headline summary uses a
    fixed template when both parties and obligations were found, otherwise
    the first 60 characters of the text. Dates come from ``parse_dates``.

    Args:
        text: Extracted contract text.
        summarizer: Accepted for interface compatibility; not used here.
        file_name: Accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(summary_dict, None)`` on success, or
        ``(fallback_summary_dict, error_message)`` when an exception
        occurred. The dict has the keys ``full_summary``,
        ``aspect_summaries``, ``start_date`` and ``end_date``.
    """
    aspects = ["parties", "payment terms", "obligations", "termination clauses"]
    # One extraction pattern per aspect; group 1 holds the value.
    aspect_patterns = {
        "parties": r'(?:parties|between)\s+([A-Za-z\s&]+?)(?:\sand|\,|\.)',
        "payment terms": r'(?:payment|terms)\s+([\d,.]+\s*(?:EUR|USD|INR)\s*(?:monthly|annually|quarterly))',
        # Stops at the word "by"; the previous '\by' only matched a word starting with "y"
        "obligations": r'(?:obligations|services|duties)\s+(.+?)(?:\bby\b|\,|\.)',
        "termination clauses": r'(?:termination|notice)\s+(\d+\s*days\'?\s*notice)',
    }

    def _today():
        return datetime.now(timezone('Asia/Kolkata')).strftime('%Y-%m-%d')

    try:
        if not text or len(text.strip()) < 50:
            current_date = _today()
            return {
                "full_summary": "No summary due to insufficient text",
                "aspect_summaries": {asp: "Not extracted" for asp in aspects},
                "start_date": current_date,
                "end_date": current_date
            }, None

        text = clean_text(text)[:512]
        aspect_summaries = {}
        for asp in aspects:
            match = re.search(aspect_patterns[asp], text, re.IGNORECASE)
            aspect_summaries[asp] = match.group(1).strip()[:100] if match else "Not extracted"

        # Custom summary template
        parties = aspect_summaries.get("parties", "Not extracted")
        obligations = aspect_summaries.get("obligations", "Not extracted")
        if parties != "Not extracted" and obligations != "Not extracted":
            full_summary = f"Logistics agreement between {parties} for {obligations}..."
        else:
            full_summary = text[:60] + "..."
        logger.info(f"Final summary: {full_summary}")

        start_date, end_date = parse_dates(text)
        return {
            "full_summary": full_summary,
            "aspect_summaries": aspect_summaries,
            "start_date": start_date,
            "end_date": end_date
        }, None
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)} with text {text[:50]}...")
        current_date = _today()
        return {
            "full_summary": text[:60] + "..." if len(text) > 60 else text,
            "aspect_summaries": {asp: "Not extracted" for asp in aspects},
            "start_date": current_date,
            "end_date": current_date
        }, f"Summarization error: {str(e)}"

# Create Contract Document (async)
async def create_contract_document(sf, file_name):
    """Return the Id of the Contract_Document__c named *file_name*, creating it if needed.

    An existing record with the same Name is reused. Otherwise a new
    record is created with status 'Uploaded' and the current upload time.
    Blocking Salesforce calls run in the default executor.

    Args:
        sf: Connected Salesforce client.
        file_name: Document name, used both for lookup and as the record Name.

    Returns:
        tuple: ``(record_id, None)`` on success, or ``(None, error_message)``
        when the query or the create call fails.
    """
    # The trailing 'Z' denotes UTC, so format a UTC clock reading, not IST
    current_time = datetime.now(timezone('UTC')).strftime('%Y-%m-%dT%H:%M:%SZ')
    try:
        # Escape backslashes first, then quotes, so the name cannot break out
        # of the quoted SOQL literal
        escaped_file_name = file_name.replace("\\", "\\\\").replace("'", "\\'")
        query = f"SELECT Id FROM Contract_Document__c WHERE Name = '{escaped_file_name}' LIMIT 1"
        result = await asyncio.get_event_loop().run_in_executor(None, sf.query, query)
        if result['totalSize'] > 0:
            return result['records'][0]['Id'], None
        record = {
            'Name': file_name,
            'Document_URL__c': '',
            'Upload_Date__c': current_time,
            'Status__c': 'Uploaded'
        }
        result = await asyncio.get_event_loop().run_in_executor(None, sf.Contract_Document__c.create, record)
        return result['id'], None
    except Exception as e:
        logger.error(f"Contract document creation failed: {str(e)}")
        return None, f"Contract document creation failed: {str(e)}"

# Store summary in Salesforce (async)
async def store_in_salesforce(sf, summary_data, file_name, contract_doc_id):
    """Persist a contract summary as a Contract_Summary__c record.

    If a summary already exists for *contract_doc_id* its Id is returned
    and nothing is written. Otherwise a record is created whose aspect
    fields are capped at 100 characters and whose full summary is stored
    Fernet-encrypted in ``Custom_Field_1__c``.

    Returns:
        tuple: ``(result, None)`` where *result* is ``{'id': ...}`` for an
        existing record or the create call's response, or
        ``(None, error_message)`` on failure or a missing document ID.
    """
    try:
        if not contract_doc_id:
            return None, "Contract document ID is missing"

        loop = asyncio.get_event_loop()
        query = f"SELECT Id FROM Contract_Summary__c WHERE Contract_Document__c = '{contract_doc_id}' LIMIT 1"
        existing = await loop.run_in_executor(None, sf.query, query)
        if existing['totalSize'] > 0:
            return {'id': existing['records'][0]['Id']}, None

        plain_summary = summary_data['full_summary'].encode()
        encrypted_summary = _fernet.encrypt(plain_summary).decode()

        aspect_summaries = summary_data['aspect_summaries']

        def clipped(aspect, length=100):
            # Empty or absent aspects are stored as 'Not extracted'
            value = aspect_summaries.get(aspect, 'Not extracted')
            if not value:
                return 'Not extracted'
            return value[:length]

        record = {
            'Name': file_name,
            'Contract_Document__c': contract_doc_id,
            'Parties__c': clipped('parties'),
            'Payment_Terms__c': clipped('payment terms'),
            'Obligations__c': clipped('obligations'),
            'Termination_Clause__c': clipped('termination clauses'),
            'Custom_Field_1__c': encrypted_summary,
            'Validation_Status__c': 'Pending',
            'Start_Date__c': summary_data['start_date'][:10],
            'End_Date__c': summary_data['end_date'][:10],
        }
        created = await asyncio.get_event_loop().run_in_executor(None, sf.Contract_Summary__c.create, record)
        return created, None
    except Exception as e:
        logger.error(f"Store summary failed: {str(e)}")
        return None, f"Store summary failed: {str(e)}"

# Generate CSV report (async)
async def generate_report(sf, output_file, contract_doc_id):
    """Write the stored summary for a contract document to a two-column CSV.

    Queries the Contract_Summary__c linked to *contract_doc_id*, decrypts
    the full summary, and lays the fields out as (Field, Value) rows.
    Fields that Salesforce returns as null or that are absent are reported
    as 'Not extracted'. Text fields are capped at 100 characters.

    Args:
        sf: Connected Salesforce client.
        output_file: Path the CSV is written to.
        contract_doc_id: Id of the parent Contract_Document__c record.

    Returns:
        tuple: ``(dataframe, output_file)`` on success. On failure, or
        when *contract_doc_id* is missing, ``(empty_dataframe,
        error_message)``; the second element is then NOT a file path.
    """
    columns = ['Field', 'Value']
    try:
        if not contract_doc_id:
            return pd.DataFrame(columns=columns), "Contract document ID is missing"
        query = (
            f"SELECT Id, Name, Parties__c, Payment_Terms__c, Obligations__c, Termination_Clause__c, Custom_Field_1__c, "
            f"Validation_Status__c, Start_Date__c, End_Date__c "
            f"FROM Contract_Summary__c WHERE Contract_Document__c = '{contract_doc_id}' LIMIT 1"
        )
        results = (await asyncio.get_event_loop().run_in_executor(None, sf.query, query))['records']
        rows = []
        for r in results:
            def field(name, record=r):
                # A key present with value None must not reach the slice below
                return record.get(name) or 'Not extracted'

            encrypted = r.get('Custom_Field_1__c')
            decrypted_summary = _fernet.decrypt(encrypted.encode()).decode() if encrypted else 'Not extracted'
            rows.extend([
                ('Contract Name', field('Name')),
                ('Parties', field('Parties__c')[:100]),
                ('Payment Terms', field('Payment_Terms__c')[:100]),
                ('Obligations', field('Obligations__c')[:100]),
                ('Termination Clause', field('Termination_Clause__c')[:100]),
                ('Full Summary', decrypted_summary[:100]),
                ('Validation Status', field('Validation_Status__c')),
                ('Start Date', field('Start_Date__c')),
                ('End Date', field('End_Date__c')),
            ])
        # Create DataFrame without the "Summary Report" header row
        df = pd.DataFrame(rows, columns=columns)
        df.to_csv(output_file, index=False, encoding='utf-8')
        return df, output_file
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return pd.DataFrame(columns=columns), f"Report generation failed: {str(e)}"

# Gradio interface function (async)
async def gradio_process_async(file, progress=gr.Progress()):
    """Run the full pipeline for one uploaded contract (the Gradio click handler).

    Steps: validate the upload, extract text, connect to Salesforce,
    summarise, store the document and summary records, and generate the
    CSV report. Every failure is logged and produces an empty result
    instead of raising.

    Args:
        file: The uploaded file; either an object with a ``name``
            attribute holding its path, or the path itself.
        progress: Gradio progress tracker.

    Returns:
        tuple: ``(report_dataframe, csv_path)`` on success, otherwise
        ``(empty_dataframe, None)``.
    """
    def _empty_result():
        return pd.DataFrame(columns=['Field', 'Value']), None

    file_name = None
    try:
        if not file:
            return _empty_result()
        file_path = file.name if hasattr(file, 'name') else file
        file_name = os.path.basename(file_path)
        progress(0.1, desc="Validating...")
        is_valid, error = validate_file(file_path)
        if not is_valid:
            logger.error(f"Validation failed for {file_name}: {error}")
            return _empty_result()
        progress(0.2, desc="Extracting text...")
        text, error = await extract_text_async(file_path)
        if error:
            logger.error(f"Extraction failed for {file_name}: {error}")
            return _empty_result()
        progress(0.4, desc="Initializing...")
        sf = await init_salesforce()
        progress(0.5, desc="Summarizing...")
        summary_data, err = await summarize_contract_async(text, _summarizer, file_name)
        if err:
            logger.error(f"Summarization failed for {file_name}: {err}")
            return _empty_result()
        progress(0.7, desc="Storing in Salesforce...")
        contract_doc_id, err = await create_contract_document(sf, file_name)
        if err or not contract_doc_id:
            logger.error(f"Contract document step failed for {file_name}: {err}")
            return _empty_result()
        store_result, err = await store_in_salesforce(sf, summary_data, file_name, contract_doc_id)
        if err:
            logger.error(f"Storing summary failed for {file_name}: {err}")
            return _empty_result()
        progress(0.9, desc="Generating report...")
        csv_path = os.path.join(tempfile.gettempdir(), f"contract_summary_{file_name}.csv")
        report_df, report_result = await generate_report(sf, csv_path, contract_doc_id)
        # On failure generate_report puts an error message, not a path, in
        # the second slot; only hand a real file to the download widget
        if not report_result or not os.path.isfile(report_result):
            logger.error(f"Report generation failed for {file_name}: {report_result}")
            return _empty_result()
        progress(1.0, desc="Complete!")
        return report_df, report_result
    except Exception as e:
        logger.error(f"Processing error for {file_name or 'file'}: {str(e)} at {datetime.now(timezone('Asia/Kolkata')).strftime('%H:%M:%S %Y-%m-%d')}")
        return _empty_result()

# Gradio UI setup: one file upload, a submit button, the summary table and a
# CSV download, all wired to gradio_process_async.
# Button text is white; it previously used the same colour as the button
# background, which made the label unreadable.
with gr.Blocks(theme="soft", css="""
    .gr-button {
        background-color: #6A5ACD;
        color: white;
        font-weight: bold;
        font-size: 16px;
        border: none;
        padding: 5px 20px;
    }
    .gr-button:hover {
        background-color: #5A4ABF;
        color: white;
        font-weight: bold;
        font-size: 16px;
    }
    .gr-label {
        color: #6A5ACD;
        font-weight: bold;
        font-size: 16px;
        background-color: #F0F0FF;
        padding: 5px;
    }
    .gr-textbox { border: 1px solid #6A5ACD; background-color: white; }
    .gr-file { border: 1px solid #6A5ACD; background-color: white; }
    .gr-dataframe {
        border: 1px solid #6A5ACD;
        background-color: white;
    }
    .gr-dataframe td, .gr-dataframe th {
        color: #6A5ACD;
        font-weight: bold;
        font-size: 16px;
    }
    #summary-report-label {
        color: #6A5ACD;
        font-weight: bold;
        font-size: 16px;
        background-color: #F0F0FF;
        padding: 5px;
    }
    .gr-dataframe tr:first-child td {
        background-color: #F0F0FF;
        color: #6A5ACD;
        font-weight: bold;
        font-size: 16px;
        padding: 5px;
    }
""") as iface:
    file_input = gr.File(label="Upload Contract (PDF, DOCX, CSV)")
    submit_btn = gr.Button("Submit")
    report_output = gr.DataFrame(label="Summary Report", headers=['Field', 'Value'], interactive=False, elem_id="summary-report")
    csv_output = gr.File(label="Download CSV")
    # The handler returns (dataframe, csv path), matching the two outputs
    submit_btn.click(
        fn=gradio_process_async,
        inputs=[file_input],
        outputs=[report_output, csv_output]
    )

if __name__ == "__main__":
    # Record the launch time in Asia/Kolkata before starting the Gradio server
    startup_stamp = datetime.now(timezone('Asia/Kolkata')).strftime('%H:%M:%S %Y-%m-%d')
    logger.info("Application startup at %s", startup_stamp)
    iface.launch()