File size: 1,536 Bytes
365de9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import uuid


class Chunk:
    """
    A piece of text cut out of a source document, together with its location.

    Attributes:
        id -> unique identifier in uuid format, can be tried https://www.uuidgenerator.net/
        filename -> name (or '/'-separated path) of the document the chunk came from
        page_number -> page of the chunk (presumably within the source document; confirm with the producer)
        start_index -> the index of the first char from the beginning of the original document
        start_line -> first line covered by the chunk (numbering base is not fixed here; confirm with the producer)
        end_line -> last line covered by the chunk (same caveat as start_line)
        text -> the raw text of the chunk

    TODO: implement access modifiers and set of getters and setters
    """

    # Maximum number of characters of ``text`` shown by ``__str__``.
    _PREVIEW_LENGTH: int = 100

    def __init__(
        self,
        id: uuid.UUID,
        filename: str,
        page_number: int,
        start_index: int,
        start_line: int,
        end_line: int,
        text: str,
    ):
        """Store the given values as attributes; no validation is performed."""
        self.id: uuid.UUID = id
        self.filename: str = filename
        self.page_number: int = page_number
        self.start_index: int = start_index
        self.start_line: int = start_line
        self.end_line: int = end_line
        self.text: str = text

    def get_raw_text(self) -> str:
        """Return the chunk text unchanged."""
        return self.text

    def get_splitted_text(self) -> list[str]:
        """
        Return the chunk text split on single space characters.

        Only ' ' is a separator: newlines and tabs stay inside the tokens, and
        consecutive spaces produce empty strings (``"a  b"`` -> ``["a", "", "b"]``).
        An empty text yields ``[""]``.
        """
        return self.text.split(" ")

    def get_metadata(self) -> dict:
        """
        Return every attribute except ``text`` as a new dict.

        ``id`` is returned as stored (not converted to ``str``), so the result
        is not directly JSON-serializable when ``id`` is a ``uuid.UUID``.
        """
        return {
            "id": self.id,
            "filename": self.filename,
            "page_number": self.page_number,
            "start_index": self.start_index,
            "start_line": self.start_line,
            "end_line": self.end_line,
        }

    def __repr__(self) -> str:
        """Return an unambiguous debug representation (text is summarized by its length)."""
        return (
            f"{type(self).__name__}("
            f"id={self.id!r}, "
            f"filename={self.filename!r}, "
            f"page_number={self.page_number!r}, "
            f"start_index={self.start_index!r}, "
            f"start_line={self.start_line!r}, "
            f"end_line={self.end_line!r}, "
            f"text_length={len(self.text)})"
        )

    # TODO: replace this ad-hoc formatting with a proper presentation layer
    def __str__(self) -> str:
        """
        Return a one-line, human-readable summary terminated by a newline.

        Shows the last '/'-separated component of ``filename``, the page and
        line range, a preview of at most ``_PREVIEW_LENGTH`` characters of the
        text, and the full text length in parentheses. The ``...`` marker is
        emitted only when the preview is actually shorter than the text.
        """
        preview = self.text[: self._PREVIEW_LENGTH]
        # Only signal truncation when something was really cut off.
        ellipsis = "..." if len(self.text) > self._PREVIEW_LENGTH else ""
        return (
            f"Chunk from {self.filename.split('/')[-1]}, "
            f"page - {self.page_number}, "
            f"start - {self.start_line}, "
            f"end - {self.end_line}, "
            f"and text - {preview}{ellipsis} ({len(self.text)})\n"
        )