File size: 1,968 Bytes
d3a44ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from PIL import Image
import io
import fitz 
import re
import pytesseract
import google.generativeai as genai
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import platform

def extract_images_from_pdf_bytes(pdf_bytes: bytes) -> list:
    """Render every page of an in-memory PDF to PNG-encoded bytes.

    Args:
        pdf_bytes: Raw contents of a PDF file.

    Returns:
        A list holding one PNG ``bytes`` object per page, in page order.
    """
    # Use the document as a context manager so it is closed even when
    # rendering one of the pages raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # tobytes("png") already returns bytes, so no intermediate buffer
        # is needed.
        return [page.get_pixmap().tobytes("png") for page in doc]

def clean_ocr_text(text: str) -> str:
    """Normalise raw OCR output into a single whitespace-collapsed line.

    Steps, in order:
      1. Form feeds and non-breaking spaces become plain spaces.
      2. Digits separated by a dot with stray whitespace around it
         (e.g. ``"12 . 50"``) are joined back together (``"12.50"``).
      3. Every run of whitespace is collapsed to one space and the result
         is stripped at both ends.
    """
    # One pass for both single-character substitutions.
    substitutions = str.maketrans({"\x0c": " ", "\u00a0": " "})
    normalised = text.translate(substitutions)

    # Re-attach decimals that OCR split around the dot.
    rejoined = re.sub(r'(\d)\s*\.\s*(\d)', r'\1.\2', normalised)

    # Collapse spaces/newlines and trim the edges.
    collapsed = re.sub(r'\s+', ' ', rejoined)
    return collapsed.strip()



def ocr_text_from_image(image_bytes: bytes) -> str:
    """Return the text pytesseract extracts from an encoded image.

    The bytes are opened with Pillow and converted to RGB before being
    handed to ``pytesseract.image_to_string``.
    """
    stream = io.BytesIO(image_bytes)
    rgb_image = Image.open(stream).convert("RGB")
    extracted = pytesseract.image_to_string(rgb_image)
    return extracted



def load_pytesseract():
    """Point pytesseract at the Tesseract executable for the current OS.

    On macOS the first existing path among the known install locations is
    used; if none exists, the original default
    (``/opt/homebrew/bin/tesseract``) is kept so behaviour is unchanged.
    On Windows the standard installer path is used. On any other system
    pytesseract's own default is left untouched.
    """
    import os  # local import: this module has no file-level ``import os``

    system = platform.system()
    if system == "Darwin":
        # Both locations have been used for this project (the second one
        # was previously toggled by hand via a commented-out line);
        # commonly /opt/homebrew on Apple Silicon, /usr/local on Intel.
        candidates = (
            '/opt/homebrew/bin/tesseract',
            '/usr/local/bin/tesseract',
        )
        pytesseract.pytesseract.tesseract_cmd = next(
            (path for path in candidates if os.path.isfile(path)),
            candidates[0],
        )
    elif system == "Windows":
        pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def load_genai(genai_api_key: str):
    """Configure the ``google.generativeai`` client with an API key.

    Args:
        genai_api_key: Key forwarded to ``genai.configure(api_key=...)``.

    Raises:
        RuntimeError: If ``genai.configure`` raises. The original exception
            is attached as ``__cause__``.
    """
    try:
        genai.configure(api_key=genai_api_key)
    except Exception as e:
        # Chain explicitly so the traceback marks the underlying error as
        # the direct cause rather than an incidental context.
        raise RuntimeError(f"Failed to configure Gemini API: {e}") from e


def setupFastAPI()-> FastAPI:
    """Create a FastAPI application with CORS middleware installed.

    Returns:
        A new ``FastAPI`` instance that accepts cross-origin requests, with
        credentials and any method or header, from the localhost origins
        listed below.
    """
    app = FastAPI()
    app.add_middleware(
        CORSMiddleware,
        # Each origin must be its own list element. Without the commas,
        # adjacent string literals are concatenated into one invalid
        # origin and none of the three would ever match.
        allow_origins=[
            "http://localhost:8002",
            "http://localhost:9000",
            "http://localhost:5501",
        ],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    return app