Spaces:

brestok
/

ocr-backend

Running

Istvan-Adem committed on Feb 21

Commit

7e8e988

1 Parent(s): 2ae9761

add original text with filtering

Files changed (4) hide show

ocr/api/message/openai_request.py CHANGED Viewed

@@ -3,7 +3,7 @@ from ocr.core.wrappers import openai_wrapper
 @openai_wrapper(model='gpt-4o-mini')
-async def generate_report(text: str):
     messages = [
         {
             "role": "system",
@@ -11,7 +11,22 @@ async def generate_report(text: str):
         },
         {
             "role": "user",
-            "content": f"Generate a report based on this data:\n\n```\n{text}\n```"
         }
     ]
     return messages

 @openai_wrapper(model='gpt-4o-mini')
+async def generate_report(content: list):
     messages = [
         {
             "role": "system",
         },
         {
             "role": "user",
+            "content": content
+        }
+    ]
+    return messages
+@openai_wrapper(model='gpt-4o-mini')
+async def extract_original_text(content: list):
+    messages = [
+        {
+            "role": "system",
+            "content": OCRPrompts.extract_original_text
+        },
+        {
+            "role": "user",
+            "content": content
         }
     ]
     return messages

ocr/api/message/prompts.py CHANGED Viewed

@@ -24,4 +24,21 @@ The report must be structured as follows, with each section containing only rele
 - **Do not invent or infer any information.** Only use data provided in the user request.
 - Ensure that the format is followed strictly, and the output is complete without any deviations.
-[/INST]"""

 - **Do not invent or infer any information.** Only use data provided in the user request.
 - Ensure that the format is followed strictly, and the output is complete without any deviations.
+[/INST]"""
+    extract_original_text = """## Task
+You must extract all text from the provided images and return it in the **text** field. However, you must **strictly** exclude any information related to the **patient’s name, contact details, or demographic data**.
+## Requirements
+- Extract **all readable text** from the images.
+- **Do not** include any **patient-identifiable information**, including:
+  - Names (first, last, middle, initials)
+  - Contact details (phone numbers, email addresses, addresses)
+  - Demographic information (age, date of birth, gender, ethnicity, etc.)
+- Preserve **the structure and order** of the text as much as possible.
+## Formatting Guidelines
+- Do not alter or interpret the content—your task is **only extraction**.
+- If a section contains both medical and personal data, extract only the medical data and redact the personal information."""

ocr/api/message/utils.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import io
 import re
-import pytesseract
-from PIL import Image
 from pdf2image import convert_from_bytes
@@ -17,15 +16,20 @@ def divide_images(contents: bytes) -> list[bytes]:
     return image_bytes_list
-def extract_text_from_images(images: list[bytes]) -> str:
-    extracted_texts = []
-    for image_bytes in images:
-        image = Image.open(io.BytesIO(image_bytes))
-        text = pytesseract.image_to_string(image)
-        extracted_texts.append(text)
-    return '\n'.join(extracted_texts)
 def clean_response(text: str) -> str:
     try:

+import base64
 import io
 import re
 from pdf2image import convert_from_bytes
     return image_bytes_list
+def prepare_request_content(images: list[bytes]) -> list:
+    content = [
+        {"type": "text", "text": "Generate a report on the attached document"},
+        *[
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}",
+                },
+            }
+            for image in images
+        ]
+    ]
+    return content
 def clean_response(text: str) -> str:
     try:

ocr/api/message/views.py CHANGED Viewed

@@ -1,9 +1,11 @@
 from fastapi import File, UploadFile, HTTPException
 from ocr.api.message import ocr_router
-from ocr.api.message.openai_request import generate_report
 from ocr.api.message.schemas import OcrResponse
-from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
 from ocr.core.wrappers import OcrResponseWrapper
@@ -19,8 +21,11 @@ async def get_all_chat_messages(
             images = [contents]
         else:
             raise HTTPException(status_code=400, detail='Unsupported file type.')
-        text_content = extract_text_from_images(images)
-        response = await generate_report(text_content)
-        return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=text_content))
     finally:
         await file.close()

+import asyncio
 from fastapi import File, UploadFile, HTTPException
 from ocr.api.message import ocr_router
+from ocr.api.message.openai_request import generate_report, extract_original_text
 from ocr.api.message.schemas import OcrResponse
+from ocr.api.message.utils import divide_images, clean_response, prepare_request_content
 from ocr.core.wrappers import OcrResponseWrapper
             images = [contents]
         else:
             raise HTTPException(status_code=400, detail='Unsupported file type.')
+        content = prepare_request_content(images)
+        original_text, response = await asyncio.gather(
+            extract_original_text(content),
+            generate_report(content)
+        )
+        return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
     finally:
         await file.close()