Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ import re
|
|
| 7 |
import base64
|
| 8 |
import mimetypes
|
| 9 |
from datasets import Dataset
|
| 10 |
-
from huggingface_hub import HfApi,
|
| 11 |
import huggingface_hub
|
| 12 |
import os
|
| 13 |
from mistralai import Mistral
|
|
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
|
|
| 18 |
|
| 19 |
# --- Mistral OCR Setup ---
|
| 20 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
| 21 |
-
hf_token_global = None
|
| 22 |
client = None
|
| 23 |
|
| 24 |
if not api_key:
|
|
@@ -112,18 +112,33 @@ def perform_ocr_file(file_obj: Any) -> Tuple[str, str, Dict[str, str]]:
|
|
| 112 |
uploaded_file_id = None
|
| 113 |
|
| 114 |
if file_ext == '.pdf':
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
| 116 |
logger.info(f"Uploading PDF {file_name} to Mistral...")
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
uploaded_file_id = uploaded_pdf.id
|
|
|
|
|
|
|
| 119 |
signed_url_response = client.files.get_signed_url(file_id=uploaded_file_id)
|
| 120 |
ocr_response = client.ocr.process(
|
| 121 |
model="mistral-ocr-latest",
|
| 122 |
document={"type": "document_url", "document_url": signed_url_response.url},
|
| 123 |
include_image_base64=True
|
| 124 |
)
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.webp', '.bmp']:
|
| 129 |
with open(file_path, "rb") as f:
|
|
|
|
| 7 |
import base64
|
| 8 |
import mimetypes
|
| 9 |
from datasets import Dataset
|
| 10 |
+
from huggingface_hub import HfApi, get_token
|
| 11 |
import huggingface_hub
|
| 12 |
import os
|
| 13 |
from mistralai import Mistral
|
|
|
|
| 18 |
|
| 19 |
# --- Mistral OCR Setup ---
|
| 20 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
| 21 |
+
hf_token_global = None
|
| 22 |
client = None
|
| 23 |
|
| 24 |
if not api_key:
|
|
|
|
| 112 |
uploaded_file_id = None
|
| 113 |
|
| 114 |
if file_ext == '.pdf':
|
| 115 |
+
try:
|
| 116 |
+
with open(file_path, "rb") as f:
|
| 117 |
+
file_content = f.read()
|
| 118 |
+
|
| 119 |
logger.info(f"Uploading PDF {file_name} to Mistral...")
|
| 120 |
+
files = {
|
| 121 |
+
"file": (file_name, file_content, "application/pdf")
|
| 122 |
+
}
|
| 123 |
+
uploaded_pdf = client.files.upload(
|
| 124 |
+
file=files["file"],
|
| 125 |
+
purpose="ocr"
|
| 126 |
+
)
|
| 127 |
uploaded_file_id = uploaded_pdf.id
|
| 128 |
+
logger.info(f"PDF uploaded successfully. File ID: {uploaded_file_id}")
|
| 129 |
+
|
| 130 |
signed_url_response = client.files.get_signed_url(file_id=uploaded_file_id)
|
| 131 |
ocr_response = client.ocr.process(
|
| 132 |
model="mistral-ocr-latest",
|
| 133 |
document={"type": "document_url", "document_url": signed_url_response.url},
|
| 134 |
include_image_base64=True
|
| 135 |
)
|
| 136 |
+
finally:
|
| 137 |
+
if uploaded_file_id:
|
| 138 |
+
try:
|
| 139 |
+
client.files.delete(file_id=uploaded_file_id)
|
| 140 |
+
except Exception as delete_err:
|
| 141 |
+
logger.warning(f"Failed to delete temporary file {uploaded_file_id}: {delete_err}")
|
| 142 |
|
| 143 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.webp', '.bmp']:
|
| 144 |
with open(file_path, "rb") as f:
|