|
from fastapi import FastAPI, File, UploadFile |
|
from transformers import pipeline |
|
import PyPDF2 |
|
import docx |
|
import os |
|
import uvicorn |
|
from io import BytesIO |
|
|
|
# FastAPI application object; the endpoint defined in this file is registered on it.
app = FastAPI()

# Translation pipeline, built once at import time and shared by all requests.
# NOTE(review): the model id suggests English -> French (opus-mt-en-fr) — confirm.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr")
|
|
|
def extract_text_from_pdf(file: BytesIO) -> str:
    """Extract the text of a PDF document.

    Args:
        file: In-memory binary stream holding the PDF content.

    Returns:
        The text of every page that yields non-empty text, joined with
        newlines. Pages whose extraction result is empty or ``None`` are
        skipped.
    """
    reader = PyPDF2.PdfReader(file)
    # Extract each page exactly once (extraction is the expensive step) and
    # filter the results afterwards instead of extracting again to test them.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(chunk for chunk in page_texts if chunk)
|
|
|
def extract_text_from_docx(file: BytesIO) -> str:
    """Extract the text of a DOCX document.

    Args:
        file: In-memory binary stream holding the DOCX content.

    Returns:
        The text of every entry in the document's ``paragraphs``, joined with
        newlines; empty paragraphs are kept as empty lines.
    """
    document = docx.Document(file)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
|
|
|
@app.post("/translate/")
async def translate_file(file: UploadFile = File(...)):
    """Translate the text of an uploaded PDF or DOCX file.

    Args:
        file: The uploaded document. Its format is decided from the filename
            extension, compared case-insensitively: ``pdf`` or ``docx``.

    Returns:
        ``{"original_text": ..., "translated_text": ...}`` with each value cut
        to its first 500 characters, or ``{"error": ...}`` when the format is
        not supported or any step raises.
    """
    try:
        contents = await file.read()
        file_io = BytesIO(contents)

        # The filename may be missing and extensions are often upper-case
        # ("REPORT.PDF"), so normalise before comparing.
        file_extension = (file.filename or "").rsplit(".", 1)[-1].lower()

        if file_extension == "pdf":
            text = extract_text_from_pdf(file_io)
        elif file_extension == "docx":
            text = extract_text_from_docx(file_io)
        else:
            return {"error": "Format non supporté. Utilisez PDF ou DOCX."}

        # Nothing to translate (e.g. no extractable text in the document):
        # skip the model call rather than feed it an empty input.
        if not text.strip():
            return {"original_text": text[:500], "translated_text": ""}

        # NOTE(review): this is a synchronous call inside an async endpoint and
        # the whole document is passed as one input — consider a threadpool and
        # chunking for long documents.
        translation = translator(text, max_length=1000)
        translated_text = " ".join(t["translation_text"] for t in translation)

        return {"original_text": text[:500], "translated_text": translated_text[:500]}
    except Exception as e:
        # Boundary handler: any failure is reported in the response body
        # instead of propagating to the framework.
        return {"error": str(e)}
|
|
|
if __name__ == "__main__":
    # Serve on all interfaces; the port comes from $PORT and defaults to 7860.
    port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, port=port, host="0.0.0.0")
|
|
|
|