Spaces:
Sleeping
Sleeping
File size: 2,139 Bytes
6e8141b be6eecc 6e8141b be6eecc 6e8141b be6eecc 6e8141b be6eecc 6e8141b be6eecc 6e8141b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
import pytesseract
from PIL import Image
import os
# Application instance that the route decorators in this file attach to.
app = FastAPI()

# Working directories: raw uploads land in one, generated Markdown in the other.
UPLOAD_DIR = "uploads"
OUTPUT_DIR = "outputs"
for _directory in (UPLOAD_DIR, OUTPUT_DIR):
    # exist_ok=True keeps repeated start-ups from failing on existing folders.
    os.makedirs(_directory, exist_ok=True)

# Language codes handed to the OCR engine, mapped to human-readable names.
SUPPORTED_LANGUAGES = dict(
    eng="English",
    rus="Russian",
    msa="Malay",
)
def detect_language(text: str) -> str:
    """Detect the language of the extracted text.

    Args:
        text: Text to classify, typically raw OCR output.

    Returns:
        The language code reported by ``langdetect.detect``, or
        ``"unknown"`` when the text is blank, not a string, or detection
        fails for any reason.
    """
    # Blank input (e.g. an image with no recognizable text) has nothing to
    # classify; answer directly instead of relying on the detector raising.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # Imported lazily so the module can load without langdetect installed.
    from langdetect import detect, DetectorFactory

    DetectorFactory.seed = 0  # Ensures consistent results
    try:
        return detect(text)
    except Exception:
        # Deliberate best-effort: any detector failure degrades to "unknown"
        # rather than aborting the caller.
        return "unknown"
def _safe_upload_name(filename):
    """Return the bare file name from a client-supplied name, or "" if unusable."""
    # Treat backslashes as separators too, so Windows-style paths are stripped.
    candidate = os.path.basename((filename or "").replace("\\", "/"))
    return "" if candidate in ("", ".", "..") else candidate


@app.post("/upload/")
async def upload_image(file: UploadFile = File(...)):
    """Store an uploaded image, OCR it, and write the result as Markdown.

    The upload is saved under ``UPLOAD_DIR``, OCR is run with every language
    in ``SUPPORTED_LANGUAGES``, the language of the extracted text is
    detected, and a Markdown file is written to ``OUTPUT_DIR``.

    Args:
        file: The uploaded image.

    Returns:
        ``{"download_url": "/download/<name>.md"}`` on success, or
        ``{"error": <message>}`` when the file name is unusable or OCR /
        writing the result fails.
    """
    # The file name comes from the client: reduce it to a bare name so values
    # like "../../x" or "/etc/x" cannot escape UPLOAD_DIR / OUTPUT_DIR.
    safe_name = _safe_upload_name(file.filename)
    if not safe_name:
        return {"error": "Invalid filename"}

    file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(file_path, "wb") as f:
        f.write(await file.read())

    try:
        # Perform OCR; the context manager releases the image's file handle.
        # NOTE(review): image_to_string is a synchronous call inside an async
        # handler - consider offloading it if request concurrency matters.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(
                image, lang="+".join(SUPPORTED_LANGUAGES)
            )

        # Detect language of the extracted text
        detected_lang = detect_language(text)

        # Save OCR result as Markdown
        markdown_path = os.path.join(
            OUTPUT_DIR, f"{os.path.splitext(safe_name)[0]}.md"
        )
        with open(markdown_path, "w", encoding="utf-8") as md_file:
            md_file.write(f"# Detected Language: {detected_lang}\n\n{text}")
    except Exception as e:
        # Failures are reported in the response body instead of being raised.
        return {"error": str(e)}

    return {"download_url": f"/download/{os.path.basename(markdown_path)}"}
@app.get("/download/{filename}")
async def download_file(filename: str):
    """Serve a generated Markdown file from ``OUTPUT_DIR``.

    Args:
        filename: Bare name of a file inside ``OUTPUT_DIR``.

    Returns:
        A ``FileResponse`` with media type ``text/markdown`` when the file
        exists, otherwise ``{"error": "File not found"}``.
    """
    # Accept only a bare file name: anything carrying a directory component
    # is treated as missing rather than resolved outside OUTPUT_DIR.
    bare_name = os.path.basename(filename)
    file_path = os.path.join(OUTPUT_DIR, bare_name)

    # isfile (not exists) so directory names such as ".." are rejected here
    # instead of failing later inside FileResponse.
    if bare_name == filename and os.path.isfile(file_path):
        return FileResponse(file_path, media_type='text/markdown', filename=filename)

    # NOTE(review): this is returned with the default 200 status; kept as-is
    # for compatibility with existing clients - consider a 404 response.
    return {"error": "File not found"}
# Static front end (HTML and assets) served from the "static" directory.
# NOTE(review): this mount at "/" is declared after the API routes; presumably
# that order keeps it from shadowing them - confirm before moving it.
_static_site = StaticFiles(directory="static", html=True)
app.mount("/", _static_site, name="static")
|