Spaces:
Running
Running
import base64 | |
import io | |
import re | |
import pytesseract | |
from PIL import Image | |
from flair.data import Sentence | |
from pdf2image import convert_from_bytes | |
from ocr.core.config import settings | |
def divide_images(contents: bytes) -> list[bytes]:
    """Convert a document into a list of PNG-encoded images.

    The raw bytes are handed to ``convert_from_bytes`` (rendered at 250 DPI)
    and every image it yields is re-encoded as PNG in memory.

    Args:
        contents: Raw bytes of the document to rasterise (presumably a PDF,
            given the ``pdf2image`` import; confirm against the caller).

    Returns:
        One PNG byte string per rendered image, in the order produced by
        ``convert_from_bytes``.
    """
    png_pages = []
    for page in convert_from_bytes(contents, dpi=250):
        buffer = io.BytesIO()
        page.save(buffer, format='PNG')
        # getvalue() yields the whole buffer regardless of the stream position.
        png_pages.append(buffer.getvalue())
    return png_pages
def extract_text_from_images(images: list[bytes]) -> str:
    """Run OCR over each encoded image and join the recognised text.

    Args:
        images: Encoded image files (for example PNG bytes), one per entry.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for every image,
        joined with newlines in input order. An empty ``images`` list yields
        an empty string.
    """
    page_texts = []
    for image_bytes in images:
        # Open the image as a context manager so Pillow drops its handle on
        # the underlying stream as soon as OCR for this image has finished.
        with Image.open(io.BytesIO(image_bytes)) as image:
            page_texts.append(pytesseract.image_to_string(image))
    return '\n'.join(page_texts)
# Leading "magic" byte sequences for the image formats we can label reliably.
_IMAGE_SIGNATURES = (
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
)


def _guess_image_mime(image: bytes) -> str:
    """Return the MIME type implied by the leading bytes of ``image``."""
    for signature, mime in _IMAGE_SIGNATURES:
        if image.startswith(signature):
            return mime
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if image[:4] == b"RIFF" and image[8:12] == b"WEBP":
        return "image/webp"
    # JPEG and anything unrecognised keep the label this module always used.
    return "image/jpeg"


def prepare_request_content(images: list[bytes]) -> list:
    """Build the multi-part message content for a report-generation request.

    The result starts with a fixed text instruction, followed by one
    ``image_url`` part per image. Each image is embedded as a base64 data
    URL whose media type is derived from the image bytes, so PNG input is no
    longer mislabelled as JPEG.

    Args:
        images: Encoded image files, one per entry.

    Returns:
        A list of dictionaries: the text part first, then one ``image_url``
        part per input image, in input order. With no images, only the text
        part is returned.
    """
    content = [
        {"type": "text", "text": "Generate a report on the attached document"},
    ]
    for image in images:
        encoded = base64.b64encode(image).decode('utf-8')
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{_guess_image_mime(image)};base64,{encoded}",
                },
            }
        )
    return content
# Matches the first ```markdown ... ``` fenced block; DOTALL lets it span lines.
_MARKDOWN_FENCE_RE = re.compile(r'```markdown\s*(.*?)\s*```', re.DOTALL)


def clean_response(text: str) -> str:
    """Extract the body of the first ```` ```markdown ```` fenced block.

    Args:
        text: Response text that may wrap its payload in a markdown fence.

    Returns:
        The content of the first fenced block with surrounding whitespace
        removed, or ``text`` unchanged when no such block is present.
        Non-string input is returned as-is.
    """
    if not isinstance(text, str):
        # The earlier implementation swallowed the resulting TypeError and
        # returned the input untouched; keep that best-effort contract.
        return text
    fenced = _MARKDOWN_FENCE_RE.search(text)
    if fenced is None:
        return text
    return fenced.group(1)
def clean_text(text: str) -> str:
    """Remove every span tagged ``PER`` by the configured NER tagger.

    The text is wrapped in a ``Sentence``, tagged with ``settings.TAGGER``,
    and each ``PER`` span (presumably person names; confirm against the
    tagger's label set) is cut out of the string by character offset.

    Args:
        text: The text to scrub.

    Returns:
        ``text`` with the characters of every ``PER`` span removed.
        Surrounding whitespace is left as it was.
    """
    tagged = Sentence(text)
    settings.TAGGER.predict(tagged)

    person_spans = [span for span in tagged.get_spans('ner') if span.tag == 'PER']
    # Cut from the end of the string backwards so that the offsets of spans
    # not yet processed remain valid after each removal.
    person_spans.sort(key=lambda span: span.start_position, reverse=True)

    scrubbed = text
    for span in person_spans:
        scrubbed = scrubbed[:span.start_position] + scrubbed[span.end_position:]
    return scrubbed