Istvan-Adem committed on
Commit
df4c46f
·
1 Parent(s): e966906

add pytesseract

Browse files
ocr/api/message/prompts.py CHANGED
@@ -27,8 +27,8 @@ The report must be structured as follows, with each section containing only rele
27
  [/INST]"""
28
  extract_original_text = """## Task
29
 
30
- You must extract all text from the attached images and return it in the **text** field. You must not include the patient's name, contact details, or demographic data.
31
 
32
  ## Important notes
33
 
34
- - You must extract all text but exclude any information related to the name, contact details, and demographic data."""
 
27
  [/INST]"""
28
  extract_original_text = """## Task
29
 
30
+ You must return ALL provided text, but not include the patient's name, contact details, or demographic data.
31
 
32
  ## Important notes
33
 
34
+ - You must return all text but exclude any information related to the name, contact details, and demographic data."""
ocr/api/message/utils.py CHANGED
@@ -2,6 +2,8 @@ import base64
2
  import io
3
  import re
4
 
 
 
5
  from pdf2image import convert_from_bytes
6
 
7
 
@@ -16,6 +18,16 @@ def divide_images(contents: bytes) -> list[bytes]:
16
  return image_bytes_list
17
 
18
 
 
 
 
 
 
 
 
 
 
 
19
  def prepare_request_content(images: list[bytes]) -> list:
20
  content = [
21
  {"type": "text", "text": "Generate a report on the attached document"},
 
2
  import io
3
  import re
4
 
5
+ import pytesseract
6
+ from PIL import Image
7
  from pdf2image import convert_from_bytes
8
 
9
 
 
18
  return image_bytes_list
19
 
20
 
21
def extract_text_from_images(images: list[bytes]) -> str:
    """Run OCR over each encoded image and return the combined text.

    Args:
        images: Raw encoded image bytes, one entry per image.

    Returns:
        The text produced by ``pytesseract.image_to_string`` for every
        image, in input order, joined with a single newline. An empty
        input yields an empty string.
    """
    extracted_texts = []

    for image_bytes in images:
        # Open as a context manager so Pillow releases the image and its
        # underlying buffer as soon as OCR for this entry has finished,
        # instead of waiting for garbage collection.
        with Image.open(io.BytesIO(image_bytes)) as image:
            extracted_texts.append(pytesseract.image_to_string(image))

    return '\n'.join(extracted_texts)
30
+
31
  def prepare_request_content(images: list[bytes]) -> list:
32
  content = [
33
  {"type": "text", "text": "Generate a report on the attached document"},
ocr/api/message/views.py CHANGED
@@ -5,7 +5,7 @@ from fastapi import File, UploadFile, HTTPException
5
  from ocr.api.message import ocr_router
6
  from ocr.api.message.openai_request import generate_report, extract_original_text
7
  from ocr.api.message.schemas import OcrResponse
8
- from ocr.api.message.utils import divide_images, clean_response, prepare_request_content
9
  from ocr.core.wrappers import OcrResponseWrapper
10
 
11
 
@@ -21,10 +21,10 @@ async def get_all_chat_messages(
21
  images = [contents]
22
  else:
23
  raise HTTPException(status_code=400, detail='Unsupported file type.')
24
- content = prepare_request_content(images)
25
  original_text, response = await asyncio.gather(
26
- extract_original_text(content),
27
- generate_report(content)
28
  )
29
  return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
30
  finally:
 
5
  from ocr.api.message import ocr_router
6
  from ocr.api.message.openai_request import generate_report, extract_original_text
7
  from ocr.api.message.schemas import OcrResponse
8
+ from ocr.api.message.utils import divide_images, clean_response, extract_text_from_images
9
  from ocr.core.wrappers import OcrResponseWrapper
10
 
11
 
 
21
  images = [contents]
22
  else:
23
  raise HTTPException(status_code=400, detail='Unsupported file type.')
24
+ text_content = extract_text_from_images(images)
25
  original_text, response = await asyncio.gather(
26
+ extract_original_text(text_content),
27
+ generate_report(text_content)
28
  )
29
  return OcrResponseWrapper(data=OcrResponse(text=clean_response(response), originalText=original_text))
30
  finally: