File size: 1,965 Bytes
7e8e988
0e48a80
b0d3425
 
df4c46f
 
150c3f8
0e48a80
22379c6
150c3f8
 
22379c6
0e48a80
 
 
 
 
 
 
 
 
 
 
df4c46f
 
 
 
 
 
 
 
 
 
7e8e988
 
 
 
 
 
 
 
 
 
 
 
 
 
0e48a80
 
2cace27
0e48a80
2cace27
 
0e48a80
150c3f8
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import base64
import io
import re

import pytesseract
from PIL import Image
from flair.data import Sentence
from pdf2image import convert_from_bytes

from ocr.core.config import settings


def divide_images(contents: bytes) -> list[bytes]:
    """Rasterize a PDF byte string into one PNG byte string per page.

    Args:
        contents: Raw PDF file contents.

    Returns:
        PNG-encoded page images, in page order.
    """
    pages = convert_from_bytes(contents, dpi=250)
    encoded_pages: list[bytes] = []
    for page in pages:
        buffer = io.BytesIO()
        page.save(buffer, format='PNG')
        # getvalue() returns the full buffer contents without needing a seek.
        encoded_pages.append(buffer.getvalue())
    return encoded_pages


def extract_text_from_images(images: list[bytes]) -> str:
    """Run OCR on each image and concatenate the per-image text.

    Args:
        images: Encoded image byte strings (any format PIL can open).

    Returns:
        The OCR output of every image, joined with newlines.
    """
    return '\n'.join(
        pytesseract.image_to_string(Image.open(io.BytesIO(raw)))
        for raw in images
    )

def prepare_request_content(images: list[bytes]) -> list:
    """Build a multimodal chat-completion content payload from page images.

    Args:
        images: Encoded image byte strings (PNG, as produced by
            ``divide_images``).

    Returns:
        A content list with one leading text instruction followed by one
        base64 data-URL ``image_url`` entry per image.
    """
    content: list = [
        {"type": "text", "text": "Generate a report on the attached document"},
    ]
    for image in images:
        encoded = base64.b64encode(image).decode('utf-8')
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    # divide_images() emits PNG bytes, so label the data URL
                    # accordingly (previously mislabelled as image/jpeg).
                    "url": f"data:image/png;base64,{encoded}",
                },
            }
        )
    return content

def clean_response(text: str) -> str:
    """Extract the body of a ```markdown fenced block from an LLM response.

    Args:
        text: Raw model output, possibly wrapped in a ```markdown fence.

    Returns:
        The fenced content with surrounding whitespace stripped when a fence
        is present; otherwise *text* unchanged.
    """
    # Check the match explicitly instead of letting .group() raise
    # AttributeError into a broad except that silently swallowed everything.
    match = re.search(r'```markdown\s*(.*?)\s*```', text, re.DOTALL)
    if match:
        return match.group(1)
    return text


def clean_text(text: str) -> str:
    """Strip person-name (PER) entity spans from *text*.

    Runs the NER tagger configured in ``settings.TAGGER`` over the text and
    removes every span tagged ``PER``.

    Args:
        text: The text to anonymize.

    Returns:
        The text with all detected person-name spans deleted.
    """
    sentence = Sentence(text)
    settings.TAGGER.predict(sentence)
    # Delete spans back-to-front so earlier character offsets stay valid.
    person_spans = sorted(
        (span for span in sentence.get_spans('ner') if span.tag == 'PER'),
        key=lambda span: span.start_position,
        reverse=True,
    )
    result = text
    for span in person_spans:
        result = result[:span.start_position] + result[span.end_position:]
    return result