Commit
·
77e1eaf
1
Parent(s):
e31bcf4
printing extracted text11
Browse files
main.py
CHANGED
@@ -2,10 +2,18 @@ import os
|
|
2 |
import tempfile
|
3 |
import gc
|
4 |
import psutil
|
|
|
5 |
|
6 |
-
# Set
|
7 |
os.environ['TRANSFORMERS_CACHE'] = os.path.join(tempfile.gettempdir(), 'huggingface_cache')
|
8 |
-
os.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
# Now import the rest of the dependencies
|
11 |
import sys
|
@@ -23,49 +31,53 @@ import json
|
|
23 |
import queue
|
24 |
import threading
|
25 |
from threading import Thread, Event
|
26 |
-
import time
|
27 |
-
import nltk
|
28 |
-
import gensim
|
29 |
-
from gensim.models import FastText
|
30 |
-
from sentence_transformers import SentenceTransformer
|
31 |
-
from transformers import pipeline
|
32 |
-
import torch
|
33 |
-
import torch.nn.functional as F
|
34 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
35 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
36 |
-
import re
|
37 |
-
import string
|
38 |
-
import unicodedata
|
39 |
import warnings
|
40 |
from flask_cors import CORS
|
41 |
from dotenv import load_dotenv
|
42 |
warnings.filterwarnings('ignore')
|
43 |
|
44 |
-
#
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
#
|
52 |
-
|
53 |
-
|
|
|
54 |
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
from HTR.strike import struck_images
|
59 |
-
from HTR.hcr import text
|
60 |
-
from HTR.spell_and_gramer_check import spell_grammer
|
61 |
|
62 |
-
|
63 |
-
|
|
|
64 |
|
65 |
-
#
|
66 |
-
|
67 |
|
68 |
-
# Create
|
69 |
BASE_DIR = '/tmp' # Use direct /tmp path for Hugging Face
|
70 |
log_dir = os.path.join(BASE_DIR, 'app_logs')
|
71 |
cache_dir = os.path.join(BASE_DIR, 'app_cache')
|
@@ -142,7 +154,7 @@ def get_or_load_model(model_name):
|
|
142 |
return None
|
143 |
except Exception as e:
|
144 |
log_print(f"Error loading {model_name} model: {e}", "ERROR")
|
145 |
-
|
146 |
return global_models.get(model_name)
|
147 |
|
148 |
def initialize_resources():
|
@@ -162,7 +174,8 @@ def initialize_resources():
|
|
162 |
log_print(f"Downloading NLTK data: {data}")
|
163 |
nltk.download(data, download_dir=nltk_data_dir, quiet=True)
|
164 |
except Exception as e:
|
165 |
-
log_print(f"Error downloading NLTK data {data}: {e}", "
|
|
|
166 |
|
167 |
# Initialize models
|
168 |
try:
|
|
|
2 |
import tempfile
|
3 |
import gc
|
4 |
import psutil
|
5 |
+
import time
|
6 |
|
7 |
+
# Set environment variables before any other imports
|
8 |
os.environ['TRANSFORMERS_CACHE'] = os.path.join(tempfile.gettempdir(), 'huggingface_cache')
|
9 |
+
os.environ['HF_HOME'] = os.path.join(tempfile.gettempdir(), 'huggingface')
|
10 |
+
os.environ['TORCH_HOME'] = os.path.join(tempfile.gettempdir(), 'torch')
|
11 |
+
os.environ['XDG_CACHE_HOME'] = os.path.join(tempfile.gettempdir(), 'cache')
|
12 |
+
|
13 |
+
# Create cache directories
|
14 |
+
for cache_dir in [os.environ['TRANSFORMERS_CACHE'], os.environ['HF_HOME'],
|
15 |
+
os.environ['TORCH_HOME'], os.environ['XDG_CACHE_HOME']]:
|
16 |
+
os.makedirs(cache_dir, exist_ok=True)
|
17 |
|
18 |
# Now import the rest of the dependencies
|
19 |
import sys
|
|
|
31 |
import queue
|
32 |
import threading
|
33 |
from threading import Thread, Event
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
import warnings
|
35 |
from flask_cors import CORS
|
36 |
from dotenv import load_dotenv
|
37 |
warnings.filterwarnings('ignore')
|
38 |
|
39 |
+
# Import ML libraries with timeout protection
|
40 |
+
def import_with_timeout(import_statement, timeout=30):
    """Import a module in a worker thread, giving up after ``timeout`` seconds.

    Args:
        import_statement: Either a module name as a string (dotted names
            such as ``"package.submodule"`` are supported), or a non-string
            object accepted by ``exec`` (e.g. a compiled code object) that
            performs the import as a side effect.
        timeout: Maximum number of seconds to wait for the import to finish.

    Returns:
        A ``(module, error)`` tuple:

        * ``(module, None)`` when a string name was imported successfully.
        * ``(None, None)`` when a non-string statement ran successfully;
          nothing is captured from ``exec``, so no module object is returned.
        * ``(None, message)`` when the import raised or timed out. ``message``
          is always a non-empty string, so ``if error:`` reliably detects
          failure.

    Note:
        On timeout the worker is not stopped: it is a daemon thread and keeps
        running in the background while this function returns.
    """
    # Local import so the block does not depend on a file-level import.
    import importlib

    result = {'success': False, 'module': None, 'error': None}

    def _import():
        try:
            if isinstance(import_statement, str):
                # import_module returns the named (sub)module itself, whereas
                # __import__('a.b') would return the top-level package 'a'.
                result['module'] = importlib.import_module(import_statement)
            else:
                # NOTE(review): exec runs arbitrary code; only pass trusted,
                # internally-built statements here.
                exec(import_statement)
            result['success'] = True
        except Exception as e:
            # str(e) is empty for exceptions raised without a message; fall
            # back to repr(e) so the failure is never reported as falsy.
            result['error'] = str(e) or repr(e)

    thread = Thread(target=_import)
    thread.daemon = True
    thread.start()
    thread.join(timeout=timeout)

    if thread.is_alive():
        return None, f"Import timed out after {timeout} seconds"

    return result['module'], result['error']
|
63 |
|
64 |
+
# Import ML libraries safely
|
65 |
+
nltk, nltk_error = import_with_timeout('nltk')
|
66 |
+
if nltk_error:
|
67 |
+
log_print(f"Warning: NLTK import failed: {nltk_error}", "WARNING")
|
68 |
|
69 |
+
gensim, gensim_error = import_with_timeout('gensim')
|
70 |
+
if gensim_error:
|
71 |
+
log_print(f"Warning: Gensim import failed: {gensim_error}", "WARNING")
|
|
|
|
|
|
|
72 |
|
73 |
+
torch, torch_error = import_with_timeout('torch')
|
74 |
+
if torch_error:
|
75 |
+
log_print(f"Warning: PyTorch import failed: {torch_error}", "WARNING")
|
76 |
|
77 |
+
# Add the project root directory to Python path
|
78 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
79 |
|
80 |
+
# Create cache directory if it doesn't exist
|
81 |
BASE_DIR = '/tmp' # Use direct /tmp path for Hugging Face
|
82 |
log_dir = os.path.join(BASE_DIR, 'app_logs')
|
83 |
cache_dir = os.path.join(BASE_DIR, 'app_cache')
|
|
|
154 |
return None
|
155 |
except Exception as e:
|
156 |
log_print(f"Error loading {model_name} model: {e}", "ERROR")
|
157 |
+
return None
|
158 |
return global_models.get(model_name)
|
159 |
|
160 |
def initialize_resources():
|
|
|
174 |
log_print(f"Downloading NLTK data: {data}")
|
175 |
nltk.download(data, download_dir=nltk_data_dir, quiet=True)
|
176 |
except Exception as e:
|
177 |
+
log_print(f"Error downloading NLTK data {data}: {e}", "WARNING")
|
178 |
+
continue
|
179 |
|
180 |
# Initialize models
|
181 |
try:
|