Spaces:
Sleeping
Sleeping
import uuid | |
class Chunk:
    """A piece of text cut from a document, together with its location data.

    Attributes:
        id: unique identifier in UUID format (sample values can be generated
            at https://www.uuidgenerator.net/).
        filename: name or path of the source document; ``__str__`` shows only
            the part after the last ``/``.
        page_number: page of the source document the chunk is associated with
            (whether it is 0- or 1-based is not established in this class).
        start_index: index of the chunk's first character, counted from the
            beginning of the original document.
        start_line: line on which the chunk starts.
        end_line: line on which the chunk ends.
        text: the chunk's text content.

    TODO: implement access modifiers and a set of getters and setters.
    """

    # Maximum number of characters of ``text`` shown by ``__str__``.
    _PREVIEW_LENGTH = 100

    def __init__(self, id: uuid.UUID, filename: str, page_number: int, start_index: int,
                 start_line: int, end_line: int, text: str):
        # NOTE: the parameter name ``id`` shadows the builtin, but it is part
        # of the public constructor signature and is kept for compatibility.
        self.id: uuid.UUID = id
        self.filename: str = filename
        self.page_number: int = page_number
        self.start_index: int = start_index
        self.start_line: int = start_line
        self.end_line: int = end_line
        self.text: str = text

    def get_raw_text(self) -> str:
        """Return the chunk text unchanged."""
        return self.text

    def get_splitted_text(self) -> list[str]:
        """Return the chunk text split on single space characters.

        Only ``" "`` is used as the separator, so consecutive spaces yield
        empty strings and newlines/tabs stay inside the resulting tokens.
        """
        return self.text.split(" ")

    def get_metadata(self) -> dict:
        """Return every attribute except ``text`` as a dictionary.

        ``id`` is returned as stored (not converted to a string).
        """
        return {
            "id": self.id,
            "filename": self.filename,
            "page_number": self.page_number,
            "start_index": self.start_index,
            "start_line": self.start_line,
            "end_line": self.end_line,
        }

    # TODO: remove the remaining workaround (manual "/" split of the filename).
    def __str__(self) -> str:
        """Return a one-line summary ending with a newline.

        The summary contains the file's base name, the page, the start/end
        lines, a preview of at most ``_PREVIEW_LENGTH`` characters of the text
        and the full text length in parentheses. The ``...`` marker is added
        only when the preview is actually shorter than the text.
        """
        base_name = self.filename.split('/')[-1]
        preview = self.text[:self._PREVIEW_LENGTH]
        # Show the ellipsis only for real truncation, so that a short chunk
        # is not presented as if part of it had been cut off.
        ellipsis = "..." if len(self.text) > self._PREVIEW_LENGTH else ""
        return (f"Chunk from {base_name}, "
                f"page - {self.page_number}, "
                f"start - {self.start_line}, "
                f"end - {self.end_line}, "
                f"and text - {preview}{ellipsis} ({len(self.text)})\n"
                )