import base64
import io
import re

import pytesseract
from PIL import Image
from flair.data import Sentence
from pdf2image import convert_from_bytes

from ocr.core.config import settings


def divide_images(contents: bytes) -> list[bytes]:
    """Render each page of a PDF (raw bytes) to PNG bytes."""
    images = convert_from_bytes(contents, dpi=250)
    image_bytes_list = []
    for image in images:
        img_byte_array = io.BytesIO()
        image.save(img_byte_array, format='PNG')
        img_byte_array.seek(0)
        image_bytes_list.append(img_byte_array.read())
    return image_bytes_list


def extract_text_from_images(images: list[bytes]) -> str:
    """Run Tesseract OCR on each page image and join the page texts."""
    extracted_texts = []
    for image_bytes in images:
        image = Image.open(io.BytesIO(image_bytes))
        text = pytesseract.image_to_string(image)
        extracted_texts.append(text)
    return '\n'.join(extracted_texts)


def prepare_request_content(images: list[bytes]) -> list:
    """Build multimodal message content: a text prompt followed by the page
    images embedded as base64 data URLs."""
    content = [
        {"type": "text", "text": "Generate a report on the attached document"},
        *[
            {
                "type": "image_url",
                "image_url": {
                    # divide_images produces PNG bytes, so label the data URL accordingly.
                    "url": f"data:image/png;base64,{base64.b64encode(image).decode('utf-8')}",
                },
            }
            for image in images
        ],
    ]
    return content


def clean_response(text: str) -> str:
    """Strip a surrounding ```markdown ... ``` fence from the model response;
    return the text unchanged if no fence is present."""
    match = re.search(r'```markdown\s*(.*?)\s*```', text, re.DOTALL)
    if match:
        return match.group(1)
    return text


def clean_text(text: str) -> str:
    """Remove person (PER) entities detected by the Flair NER tagger.

    Entities are removed back-to-front so that earlier character offsets
    stay valid while slicing.
    """
    sentence = Sentence(text)
    settings.TAGGER.predict(sentence)
    per_entities = [entity for entity in sentence.get_spans('ner') if entity.tag == 'PER']
    per_entities = sorted(per_entities, key=lambda x: x.start_position, reverse=True)
    cleaned_text = text
    for entity in per_entities:
        start = entity.start_position
        end = entity.end_position
        cleaned_text = cleaned_text[:start] + cleaned_text[end:]
    return cleaned_text
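

# Minimal usage sketch (illustrative only): chains the helpers above over a
# hypothetical local PDF path ("sample.pdf" is an assumption, not part of this
# module). The chat-completion call that would consume prepare_request_content()
# lives elsewhere and is only assumed here.
if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:  # hypothetical input file
        pdf_bytes = f.read()
    pages = divide_images(pdf_bytes)                       # PDF pages -> PNG bytes
    ocr_text = clean_text(extract_text_from_images(pages))  # OCR, then strip PER entities
    request_content = prepare_request_content(pages)        # prompt + base64 page images
    print(f"{len(pages)} page(s), {len(ocr_text)} characters of anonymised OCR text")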