test / app /chunks.py
Andrchest's picture
final try 1
e53c2d7
import uuid
class Chunk:
    """A piece of text taken from a source document, with its location data.

    Attributes:
        id: unique identifier in UUID format (sample values can be generated
            at https://www.uuidgenerator.net/).
        filename: name or path of the source file; ``__str__`` shows only the
            part after the last "/".
        page_number: page number associated with the chunk (presumably the
            page in the source document -- confirm against the caller).
        start_index: index of the chunk's first character, counted from the
            beginning of the original document.
        start_line: first line of the chunk (numbering base not visible
            here -- confirm against the caller).
        end_line: last line of the chunk (inclusivity not visible here --
            confirm against the caller).
        text: the chunk's text content.

    TODO: implement access modifiers and a set of getters and setters.
    """

    def __init__(
        self,
        id: uuid.UUID,
        filename: str,
        page_number: int,
        start_index: int,
        start_line: int,
        end_line: int,
        text: str,
    ):
        """Store every constructor argument on the instance unchanged."""
        # Identity and origin of the chunk.
        self.id = id
        self.filename = filename
        # Position of the chunk inside the source document.
        self.page_number = page_number
        self.start_index = start_index
        self.start_line = start_line
        self.end_line = end_line
        # The content itself.
        self.text = text

    def get_raw_text(self) -> str:
        """Return the chunk's text exactly as it was stored."""
        raw = self.text
        return raw

    def get_splitted_text(self) -> list[str]:
        """Return the text split on single space characters.

        Consecutive spaces yield empty strings, and other whitespace
        (tabs, newlines) is not treated as a separator.
        """
        separator = " "
        pieces = self.text.split(separator)
        return pieces

    def get_metadata(self) -> dict:
        """Return every stored attribute except the text, keyed by name."""
        field_names = (
            "id",
            "filename",
            "page_number",
            "start_index",
            "start_line",
            "end_line",
        )
        return {name: getattr(self, name) for name in field_names}

    # TODO: remove the hacks (workarounds) used here.
    def __str__(self):
        """Return a one-line summary followed by a newline.

        The summary shows the file's base name, page, line range, the first
        100 characters of the text followed by "...", and the full text
        length in parentheses.
        """
        base_name = self.filename.rsplit("/", 1)[-1]
        preview = self.text[:100]
        total_length = len(self.text)
        return (
            f"Chunk from {base_name}, page - {self.page_number}, "
            f"start - {self.start_line}, end - {self.end_line}, "
            f"and text - {preview}... ({total_length})\n"
        )