Istvan-Adem committed on
Commit
7e8e988
·
1 Parent(s): 2ae9761

add original text with filtering

Browse files
ocr/api/message/openai_request.py CHANGED
@@ -3,7 +3,7 @@ from ocr.core.wrappers import openai_wrapper
3
 
4
 
5
  @openai_wrapper(model='gpt-4o-mini')
6
- async def generate_report(text: str):
7
  messages = [
8
  {
9
  "role": "system",
@@ -11,7 +11,22 @@ async def generate_report(text: str):
11
  },
12
  {
13
  "role": "user",
14
- "content": f"Generate a report based on this data:\n\n```\n{text}\n```"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  }
16
  ]
17
  return messages
 
3
 
4
 
5
  @openai_wrapper(model='gpt-4o-mini')
6
+ async def generate_report(content: list):
7
  messages = [
8
  {
9
  "role": "system",
 
11
  },
12
  {
13
  "role": "user",
14
+ "content": content
15
+ }
16
+ ]
17
+ return messages
18
+
19
+
20
+ @openai_wrapper(model='gpt-4o-mini')
21
+ async def extract_original_text(content: list):
22
+ messages = [
23
+ {
24
+ "role": "system",
25
+ "content": OCRPrompts.extract_original_text
26
+ },
27
+ {
28
+ "role": "user",
29
+ "content": content
30
  }
31
  ]
32
  return messages
ocr/api/message/prompts.py CHANGED
@@ -24,4 +24,21 @@ The report must be structured as follows, with each section containing only rele
24
  - **Do not invent or infer any information.** Only use data provided in the user request.
25
  - Ensure that the format is followed strictly, and the output is complete without any deviations.
26
 
27
- [/INST]"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  - **Do not invent or infer any information.** Only use data provided in the user request.
25
  - Ensure that the format is followed strictly, and the output is complete without any deviations.
26
 
27
+ [/INST]"""
28
+ extract_original_text = """## Task
29
+
30
+ You must extract all text from the provided images and return it in the **text** field. However, you must **strictly** exclude any information related to the **patient’s name, contact details, or demographic data**.
31
+
32
+ ## Requirements
33
+
34
+ - Extract **all readable text** from the images.
35
+ - **Do not** include any **patient-identifiable information**, including:
36
+ - Names (first, last, middle, initials)
37
+ - Contact details (phone numbers, email addresses, addresses)
38
+ - Demographic information (age, date of birth, gender, ethnicity, etc.)
39
+ - Preserve **the structure and order** of the text as much as possible.
40
+
41
+ ## Formatting Guidelines
42
+
43
+ - Do not alter or interpret the content—your task is **only extraction**.
44
+ - If a section contains both medical and personal data, extract only the medical data and redact the personal information."""
ocr/api/message/utils.py CHANGED
@@ -1,8 +1,7 @@
 
1
  import io
2
  import re
3
 
4
- import pytesseract
5
- from PIL import Image
6
  from pdf2image import convert_from_bytes
7
 
8
 
@@ -17,15 +16,20 @@ def divide_images(contents: bytes) -> list[bytes]:
17
  return image_bytes_list
18
 
19
 
20
- def extract_text_from_images(images: list[bytes]) -> str:
21
- extracted_texts = []
22
-
23
- for image_bytes in images:
24
- image = Image.open(io.BytesIO(image_bytes))
25
- text = pytesseract.image_to_string(image)
26
- extracted_texts.append(text)
27
-
28
- return '\n'.join(extracted_texts)
 
 
 
 
 
29
 
30
  def clean_response(text: str) -> str:
31
  try:
 
1
+ import base64
2
  import io
3
  import re
4
 
 
 
5
  from pdf2image import convert_from_bytes
6
 
7
 
 
16
  return image_bytes_list
17
 
18
 
19
+ def prepare_request_content(images: list[bytes]) -> list:
20
+ content = [
21
+ {"type": "text", "text": "Generate a report on the attached document"},
22
+ *[
23
+ {
24
+ "type": "image_url",
25
+ "image_url": {
26
+ "url": f"data:image/jpeg;base64,{base64.b64encode(image).decode('utf-8')}",
27
+ },
28
+ }
29
+ for image in images
30
+ ]
31
+ ]
32
+ return content
33
 
34
  def clean_response(text: str) -> str:
35
  try:
ocr/api/message/views.py CHANGED
@@ -1,9 +1,11 @@
 
 
1
  from fastapi import File, UploadFile, HTTPException
2
 
3
  from ocr.api.message import ocr_router
4
- from ocr.api.message.openai_request import generate_report
5
  from ocr.api.message.schemas import OcrResponse
6
- from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
7
  from ocr.core.wrappers import OcrResponseWrapper
8
 
9
 
@@ -19,8 +21,11 @@ async def get_all_chat_messages(
19
  images = [contents]
20
  else:
21
  raise HTTPException(status_code=400, detail='Unsupported file type.')
22
- text_content = extract_text_from_images(images)
23
- response = await generate_report(text_content)
24
- return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=text_content))
 
 
 
25
  finally:
26
  await file.close()
 
1
+ import asyncio
2
+
3
  from fastapi import File, UploadFile, HTTPException
4
 
5
  from ocr.api.message import ocr_router
6
+ from ocr.api.message.openai_request import generate_report, extract_original_text
7
  from ocr.api.message.schemas import OcrResponse
8
+ from ocr.api.message.utils import divide_images, clean_response, prepare_request_content
9
  from ocr.core.wrappers import OcrResponseWrapper
10
 
11
 
 
21
  images = [contents]
22
  else:
23
  raise HTTPException(status_code=400, detail='Unsupported file type.')
24
+ content = prepare_request_content(images)
25
+ original_text, response = await asyncio.gather(
26
+ extract_original_text(content),
27
+ generate_report(content)
28
+ )
29
+ return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
30
  finally:
31
  await file.close()