File size: 3,951 Bytes
9145e48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field

class DocumentType(str, Enum):
    """Document type identifiers used by ``Document.doc_type``.

    The ``str`` mixin makes every member a string as well as an enum member,
    so a member compares equal to its literal value
    (``DocumentType.PDF == "pdf"``).
    """
    PDF = "pdf"
    # The member name and its value differ here: the value is "txt".
    TEXT = "txt"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"

class ProcessingStatus(str, Enum):
    """Status values used by ``ProcessingTask.status``.

    The ``str`` mixin makes every member a string as well as an enum member,
    so a member compares equal to its lowercase literal value.
    """
    PENDING = "pending"  # the default status of a new ProcessingTask
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

class Document(BaseModel):
    """A document record: identity, extracted text and optional annotations.

    ``id``, ``filename``, ``content``, ``doc_type`` and ``file_size`` are
    required. ``created_at`` defaults to the current UTC time as a naive
    ``datetime`` (no ``tzinfo``). ``metadata`` and ``tags`` default to a fresh
    empty container per instance; ``summary``, ``category`` and ``language``
    default to ``None``.
    """

    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    # ``datetime.utcnow`` is deprecated since Python 3.12. This factory yields
    # the same naive-UTC value, so stored and serialized timestamps keep their
    # existing shape (no UTC offset suffix in ``isoformat()``).
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a plain dict with a shortened content preview.

        ``content`` longer than 500 characters is cut to its first 500
        characters followed by ``"..."``; content of 500 characters or fewer
        is returned unchanged. ``created_at`` is rendered with
        ``isoformat()``. ``metadata`` and ``tags`` are the model's own
        objects, not copies.

        Returns:
            A dict keyed by field name, in field declaration order.
        """
        preview_limit = 500
        if len(self.content) > preview_limit:
            preview = self.content[:preview_limit] + "..."
        else:
            preview = self.content

        return {
            "id": self.id,
            "filename": self.filename,
            "content": preview,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language,
        }

class Chunk(BaseModel):
    """A piece of a document's text, linked to its parent by ``document_id``.

    All fields except ``embedding`` and ``metadata`` are required.
    """
    id: str = Field(..., description="Unique chunk identifier")
    document_id: str = Field(..., description="Parent document ID")
    content: str = Field(..., description="Chunk text content")
    chunk_index: int = Field(..., description="Position in document")
    # NOTE(review): the unit of start_pos/end_pos (presumably character
    # offsets) and whether end_pos is inclusive are not defined here, and
    # nothing validates start_pos <= end_pos -- confirm against the producer.
    start_pos: int = Field(..., description="Start position in original document")
    end_pos: int = Field(..., description="End position in original document")
    # None until an embedding vector is attached; the length is not constrained.
    embedding: Optional[List[float]] = None
    # Fresh empty dict per instance.
    metadata: Dict[str, Any] = Field(default_factory=dict)

class SearchResult(BaseModel):
    """One search hit: the matching chunk, its source document and a score.

    ``metadata`` defaults to a fresh empty dict per instance; every other
    field is required.
    """

    chunk_id: str = Field(..., description="Matching chunk ID")
    document_id: str = Field(..., description="Source document ID")
    content: str = Field(..., description="Matching content")
    score: float = Field(..., description="Similarity score")
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the five fields as a plain dict, in declaration order.

        Values are passed through untouched, so ``metadata`` is the same
        object the model holds, not a copy.
        """
        exported = ("chunk_id", "document_id", "content", "score", "metadata")
        return {name: getattr(self, name) for name in exported}

class ProcessingTask(BaseModel):
    """Status record for a processing task.

    Only ``task_id`` is required. ``status`` starts as
    ``ProcessingStatus.PENDING`` and ``progress`` as ``0.0``; ``progress`` is
    validated to lie in the inclusive range 0.0-100.0. ``document_id``,
    ``message`` and ``error`` default to ``None``.

    ``created_at`` and ``updated_at`` each default to the current UTC time as
    a naive ``datetime``. Each default comes from its own factory call, so the
    two values may differ by a few microseconds.
    """

    task_id: str = Field(..., description="Unique task identifier")
    document_id: Optional[str] = None
    status: ProcessingStatus = ProcessingStatus.PENDING
    progress: float = Field(default=0.0, ge=0.0, le=100.0)
    message: Optional[str] = None
    error: Optional[str] = None
    # ``datetime.utcnow`` is deprecated since Python 3.12. These factories
    # yield the same naive-UTC value without the deprecation warning.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )

class SummaryRequest(BaseModel):
    """Request payload for summarization.

    Every field is optional or has a default.
    """
    # NOTE(review): both content and document_id may be None; this model does
    # not require that either one is supplied -- presumably the handler
    # checks this, confirm.
    content: Optional[str] = None
    document_id: Optional[str] = None
    # Free-form string; the set of accepted styles is not constrained here.
    style: str = Field(default="concise", description="Summary style")
    # No bounds are declared; the unit is not defined here.
    max_length: Optional[int] = None

class TagGenerationRequest(BaseModel):
    """Request payload for tag generation.

    Every field is optional or has a default.
    """
    # NOTE(review): both content and document_id may be None; this model does
    # not require that either one is supplied -- presumably the handler
    # checks this, confirm.
    content: Optional[str] = None
    document_id: Optional[str] = None
    # Defaults to 5; validated to lie in the inclusive range 1-20.
    max_tags: int = Field(default=5, ge=1, le=20)

class QuestionAnswerRequest(BaseModel):
    """Request payload for question answering; only ``question`` is required."""
    question: str = Field(..., description="Question to answer")
    # Optional dict; its expected keys are not defined in this model.
    context_filter: Optional[Dict[str, Any]] = None
    # NOTE(review): defaults to 2000 with no lower or upper bound declared, so
    # zero and negative values pass validation; the unit (characters or
    # tokens) is not defined here -- confirm with the consumer.
    max_context_length: int = Field(default=2000)

class CategorizationRequest(BaseModel):
    """Request payload for categorization; every field defaults to ``None``."""
    # NOTE(review): both content and document_id may be None; this model does
    # not require that either one is supplied -- presumably the handler
    # checks this, confirm.
    content: Optional[str] = None
    document_id: Optional[str] = None
    # Optional list of category names; presumably the candidate set to choose
    # from, with None meaning no restriction -- confirm with the handler.
    categories: Optional[List[str]] = None