Spaces:
Running
Running
Istvan-Adem
committed on
Commit
·
7e8e988
1
Parent(s):
2ae9761
add original text with filtering
Browse files
- ocr/api/message/openai_request.py +17 -2
- ocr/api/message/prompts.py +18 -1
- ocr/api/message/utils.py +15 -11
- ocr/api/message/views.py +10 -5
ocr/api/message/openai_request.py
CHANGED
@@ -3,7 +3,7 @@ from ocr.core.wrappers import openai_wrapper
|
|
3 |
|
4 |
|
5 |
@openai_wrapper(model='gpt-4o-mini')
|
6 |
-
async def generate_report(
|
7 |
messages = [
|
8 |
{
|
9 |
"role": "system",
|
@@ -11,7 +11,22 @@ async def generate_report(text: str):
|
|
11 |
},
|
12 |
{
|
13 |
"role": "user",
|
14 |
-
"content":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
}
|
16 |
]
|
17 |
return messages
|
|
|
3 |
|
4 |
|
5 |
@openai_wrapper(model='gpt-4o-mini')
|
6 |
+
async def generate_report(content: list):
|
7 |
messages = [
|
8 |
{
|
9 |
"role": "system",
|
|
|
11 |
},
|
12 |
{
|
13 |
"role": "user",
|
14 |
+
"content": content
|
15 |
+
}
|
16 |
+
]
|
17 |
+
return messages
|
18 |
+
|
19 |
+
|
20 |
+
@openai_wrapper(model='gpt-4o-mini')
async def extract_original_text(content: list):
    """Build the chat messages for extracting the document's original text.

    The system message carries ``OCRPrompts.extract_original_text``; the
    user message carries ``content`` unchanged.

    Args:
        content: Payload placed verbatim into the user message.

    Returns:
        A two-element list: the system message followed by the user message.
        NOTE(review): the ``openai_wrapper`` decorator presumably sends these
        messages to the model and returns its reply -- confirm in
        ``ocr.core.wrappers``.
    """
    system_message = {"role": "system", "content": OCRPrompts.extract_original_text}
    user_message = {"role": "user", "content": content}
    return [system_message, user_message]
|
ocr/api/message/prompts.py
CHANGED
@@ -24,4 +24,21 @@ The report must be structured as follows, with each section containing only rele
|
|
24 |
- **Do not invent or infer any information.** Only use data provided in the user request.
|
25 |
- Ensure that the format is followed strictly, and the output is complete without any deviations.
|
26 |
|
27 |
-
[/INST]"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
- **Do not invent or infer any information.** Only use data provided in the user request.
|
25 |
- Ensure that the format is followed strictly, and the output is complete without any deviations.
|
26 |
|
27 |
+
[/INST]"""
|
28 |
+
extract_original_text = """## Task
|
29 |
+
|
30 |
+
You must extract all text from the provided images and return it in the **text** field. However, you must **strictly** exclude any information related to the **patient’s name, contact details, or demographic data**.
|
31 |
+
|
32 |
+
## Requirements
|
33 |
+
|
34 |
+
- Extract **all readable text** from the images.
|
35 |
+
- **Do not** include any **patient-identifiable information**, including:
|
36 |
+
- Names (first, last, middle, initials)
|
37 |
+
- Contact details (phone numbers, email addresses, addresses)
|
38 |
+
- Demographic information (age, date of birth, gender, ethnicity, etc.)
|
39 |
+
- Preserve **the structure and order** of the text as much as possible.
|
40 |
+
|
41 |
+
## Formatting Guidelines
|
42 |
+
|
43 |
+
- Do not alter or interpret the content—your task is **only extraction**.
|
44 |
+
- If a section contains both medical and personal data, extract only the medical data and redact the personal information."""
|
ocr/api/message/utils.py
CHANGED
@@ -1,8 +1,7 @@
|
|
|
|
1 |
import io
|
2 |
import re
|
3 |
|
4 |
-
import pytesseract
|
5 |
-
from PIL import Image
|
6 |
from pdf2image import convert_from_bytes
|
7 |
|
8 |
|
@@ -17,15 +16,20 @@ def divide_images(contents: bytes) -> list[bytes]:
|
|
17 |
return image_bytes_list
|
18 |
|
19 |
|
20 |
-
def
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
def clean_response(text: str) -> str:
|
31 |
try:
|
|
|
1 |
+
import base64
|
2 |
import io
|
3 |
import re
|
4 |
|
|
|
|
|
5 |
from pdf2image import convert_from_bytes
|
6 |
|
7 |
|
|
|
16 |
return image_bytes_list
|
17 |
|
18 |
|
19 |
+
# Leading magic bytes of image formats that can be identified by prefix alone.
_IMAGE_SIGNATURES = (
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
)


def _guess_image_mime(image: bytes) -> str:
    """Return the MIME type implied by the image's magic bytes.

    Falls back to ``image/jpeg`` (the label previously used for every image)
    when the signature is not recognised, so JPEG and unknown input keep
    their former data-URL prefix.
    """
    for signature, mime in _IMAGE_SIGNATURES:
        if image.startswith(signature):
            return mime
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if image[:4] == b"RIFF" and image[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def prepare_request_content(images: list[bytes]) -> list:
    """Build the multimodal user-message content for a list of page images.

    The result starts with a fixed text instruction, followed by one
    ``image_url`` part per image, each encoded as a base64 data URL.

    The data URL's MIME type is taken from the image's magic bytes instead of
    being hard-coded to JPEG, so a PNG, GIF or WebP image is no longer
    mislabelled. Unrecognised data is still labelled ``image/jpeg``.

    Args:
        images: Raw encoded image bytes, one entry per page or image.

    Returns:
        A list of content parts. It contains only the text instruction when
        ``images`` is empty.
    """
    content = [
        {"type": "text", "text": "Generate a report on the attached document"},
    ]
    for image in images:
        encoded = base64.b64encode(image).decode('utf-8')
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{_guess_image_mime(image)};base64,{encoded}",
                },
            }
        )
    return content
|
33 |
|
34 |
def clean_response(text: str) -> str:
|
35 |
try:
|
ocr/api/message/views.py
CHANGED
@@ -1,9 +1,11 @@
|
|
|
|
|
|
1 |
from fastapi import File, UploadFile, HTTPException
|
2 |
|
3 |
from ocr.api.message import ocr_router
|
4 |
-
from ocr.api.message.openai_request import generate_report
|
5 |
from ocr.api.message.schemas import OcrResponse
|
6 |
-
from ocr.api.message.utils import divide_images, clean_response,
|
7 |
from ocr.core.wrappers import OcrResponseWrapper
|
8 |
|
9 |
|
@@ -19,8 +21,11 @@ async def get_all_chat_messages(
|
|
19 |
images = [contents]
|
20 |
else:
|
21 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
22 |
-
|
23 |
-
response = await
|
24 |
-
|
|
|
|
|
|
|
25 |
finally:
|
26 |
await file.close()
|
|
|
1 |
+
import asyncio
|
2 |
+
|
3 |
from fastapi import File, UploadFile, HTTPException
|
4 |
|
5 |
from ocr.api.message import ocr_router
|
6 |
+
from ocr.api.message.openai_request import generate_report, extract_original_text
|
7 |
from ocr.api.message.schemas import OcrResponse
|
8 |
+
from ocr.api.message.utils import divide_images, clean_response, prepare_request_content
|
9 |
from ocr.core.wrappers import OcrResponseWrapper
|
10 |
|
11 |
|
|
|
21 |
images = [contents]
|
22 |
else:
|
23 |
raise HTTPException(status_code=400, detail='Unsupported file type.')
|
24 |
+
content = prepare_request_content(images)
|
25 |
+
original_text, response = await asyncio.gather(
|
26 |
+
extract_original_text(content),
|
27 |
+
generate_report(content)
|
28 |
+
)
|
29 |
+
return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
|
30 |
finally:
|
31 |
await file.close()
|