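"""Pydantic models for a document processing and search pipeline.

Covers ingested documents, text chunks with optional embeddings, search
results, background processing tasks, and the request payloads for
summarization, tag generation, question answering, and categorization.
"""
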
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
from enum import Enum
class DocumentType(str, Enum):
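    """Supported source document formats."""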
    PDF = "pdf"
    TEXT = "txt"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"


class ProcessingStatus(str, Enum):
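    """Lifecycle states of a background processing task."""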
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class Document(BaseModel):
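    """An ingested document with its extracted text, metadata, and enrichment fields."""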
    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
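        """Serialize for API responses, truncating long content to 500 characters."""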
        return {
            "id": self.id,
            "filename": self.filename,
            "content": (self.content[:500] + "...") if len(self.content) > 500 else self.content,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language,
        }


class Chunk(BaseModel):
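    """A contiguous slice of a document's text, optionally carrying its embedding vector."""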
    id: str = Field(..., description="Unique chunk identifier")
    document_id: str = Field(..., description="Parent document ID")
    content: str = Field(..., description="Chunk text content")
    chunk_index: int = Field(..., description="Position in document")
    start_pos: int = Field(..., description="Start position in original document")
    end_pos: int = Field(..., description="End position in original document")
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)


class SearchResult(BaseModel):
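    """A single chunk-level match returned by a similarity search."""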
    chunk_id: str = Field(..., description="Matching chunk ID")
    document_id: str = Field(..., description="Source document ID")
    content: str = Field(..., description="Matching content")
    score: float = Field(..., description="Similarity score")
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
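        """Serialize the search result for API responses."""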
        return {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "content": self.content,
            "score": self.score,
            "metadata": self.metadata,
        }


class ProcessingTask(BaseModel):
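    """Tracks the status and progress of an asynchronous document-processing job."""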
    task_id: str = Field(..., description="Unique task identifier")
    document_id: Optional[str] = None
    status: ProcessingStatus = ProcessingStatus.PENDING
    progress: float = Field(default=0.0, ge=0.0, le=100.0)
    message: Optional[str] = None
    error: Optional[str] = None
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)


class SummaryRequest(BaseModel):
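    """Summarization request; supply raw content or the ID of a stored document."""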
    content: Optional[str] = None
    document_id: Optional[str] = None
    style: str = Field(default="concise", description="Summary style")
    max_length: Optional[int] = None


class TagGenerationRequest(BaseModel):
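    """Tag-generation request; supply raw content or the ID of a stored document."""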
    content: Optional[str] = None
    document_id: Optional[str] = None
    max_tags: int = Field(default=5, ge=1, le=20)


class QuestionAnswerRequest(BaseModel):
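    """Question-answering request over indexed documents."""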
    question: str = Field(..., description="Question to answer")
    context_filter: Optional[Dict[str, Any]] = None
    max_context_length: int = Field(default=2000)


class CategorizationRequest(BaseModel):
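    """Categorization request; optionally restricted to a caller-supplied list of candidate categories."""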
    content: Optional[str] = None
    document_id: Optional[str] = None
    categories: Optional[List[str]] = None
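

# Illustrative usage sketch (an addition, not part of the original module).
# All identifiers and values below are hypothetical example data.
if __name__ == "__main__":
    doc = Document(
        id="doc-1",
        filename="report.pdf",
        content="Example extracted text...",
        doc_type=DocumentType.PDF,
        file_size=1024,
    )
    task = ProcessingTask(task_id="task-1", document_id=doc.id)
    print(doc.to_dict())
    print(task.status)  # ProcessingStatus.PENDING by default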