# Spaces: Running  (appears to be page-status residue, not program code)
import io
import os
import platform
import re

import fitz
import google.generativeai as genai
import pytesseract
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
def extract_images_from_pdf_bytes(pdf_bytes: bytes) -> list:
    """Render every page of an in-memory PDF to PNG-encoded bytes.

    Args:
        pdf_bytes: Raw contents of a PDF document.

    Returns:
        A list with one ``bytes`` object per page, in page order, each
        holding that page's rendered pixmap encoded as PNG.
    """
    # Use the document as a context manager so it is closed even when
    # rendering a page raises; otherwise the handle stays open.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # tobytes("png") already yields the encoded bytes, so no intermediate
        # BytesIO buffer is needed.
        return [page.get_pixmap().tobytes("png") for page in doc]
def clean_ocr_text(text: str) -> str:
    """Normalize raw OCR output into a single whitespace-tidy string.

    Form feeds and non-breaking spaces become ordinary spaces, decimals
    that were split around the dot (``"12 . 5"``) are rejoined, every run
    of whitespace is collapsed to one space, and the ends are trimmed.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # One pass maps both the form feed (U+000C) and NBSP (U+00A0) to a space.
    normalized = text.translate({0x0C: " ", 0xA0: " "})
    # Rejoin a digit, optional whitespace, dot, optional whitespace, digit.
    normalized = re.sub(r'(\d)\s*\.\s*(\d)', r'\1.\2', normalized)
    # Squash whitespace runs (newlines included) and trim the ends.
    squashed = re.sub(r'\s+', ' ', normalized)
    return squashed.strip()
def ocr_text_from_image(image_bytes: bytes) -> str:
    """Run pytesseract OCR over an encoded image held in memory.

    Args:
        image_bytes: Encoded image data in a format PIL can open.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image
        after it has been converted to RGB.
    """
    stream = io.BytesIO(image_bytes)
    # Convert to RGB before handing the image to the OCR call.
    rgb_image = Image.open(stream).convert("RGB")
    recognized = pytesseract.image_to_string(rgb_image)
    return recognized
def load_pytesseract():
    """Point pytesseract at the Tesseract executable for the current OS.

    On macOS both Homebrew install locations are probed and the first one
    that exists is used. On Windows the path
    ``C:\\Program Files\\Tesseract-OCR\\tesseract.exe`` is set. On any other
    platform ``tesseract_cmd`` is left untouched.
    """
    system = platform.system()
    if system == "Darwin":
        # Homebrew installs under /opt/homebrew on Apple Silicon and under
        # /usr/local on Intel Macs; support both instead of hard-coding one.
        candidates = (
            '/opt/homebrew/bin/tesseract',
            '/usr/local/bin/tesseract',
        )
        # If neither path exists, keep the previous default (first candidate)
        # so behavior is unchanged on machines where it already worked.
        pytesseract.pytesseract.tesseract_cmd = next(
            (path for path in candidates if os.path.exists(path)),
            candidates[0],
        )
    elif system == "Windows":
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def load_genai(genai_api_key: str):
    """Configure the ``google.generativeai`` client with the given API key.

    Args:
        genai_api_key: API key passed to ``genai.configure``.

    Raises:
        RuntimeError: If ``genai.configure`` raises; the underlying
            exception is attached as ``__cause__``.
    """
    try:
        genai.configure(api_key=genai_api_key)
    except Exception as e:
        # Chain explicitly so the traceback marks the underlying error as the
        # direct cause rather than an incidental error during handling.
        raise RuntimeError(f"Failed to configure Gemini API: {e}") from e
def setupFastAPI() -> FastAPI:
    """Create the FastAPI application with CORS enabled for local origins.

    Returns:
        A new ``FastAPI`` instance with ``CORSMiddleware`` installed,
        allowing credentials, all methods and all headers for the three
        listed localhost origins.
    """
    app = FastAPI()
    app.add_middleware(
        CORSMiddleware,
        # Each origin must be its own list element. Without the separating
        # commas Python concatenates adjacent string literals into a single
        # bogus origin, so none of the real origins would be allowed.
        allow_origins=[
            "http://localhost:8002",
            "http://localhost:9000",
            "http://localhost:5501",
        ],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    return app