"""Pydantic models for documents, chunks, search results, and request payloads."""

from datetime import datetime, timezone
from enum import Enum
from typing import List, Optional, Dict, Any

from pydantic import BaseModel, Field

# Maximum number of content characters included by ``Document.to_dict``
# before the text is cut off and suffixed with "...".
_CONTENT_PREVIEW_CHARS = 500


def _utc_now() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Produces the same kind of value as the deprecated ``datetime.utcnow()``
    (UTC wall-clock time with ``tzinfo`` unset), so existing comparisons and
    ``isoformat()`` output keep their previous shape.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


class DocumentType(str, Enum):
    """Document formats, stored as their lowercase string value."""

    PDF = "pdf"
    TEXT = "txt"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"


class ProcessingStatus(str, Enum):
    """Status values a ``ProcessingTask`` can carry."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class Document(BaseModel):
    """A document together with its extracted text and descriptive fields."""

    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    # Naive UTC timestamp taken when the model instance is created.
    created_at: datetime = Field(default_factory=_utc_now)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a plain dict with a shortened content preview.

        ``content`` longer than 500 characters is cut to its first 500
        characters followed by "..."; shorter content is returned unchanged.
        ``created_at`` is rendered with ``isoformat()``. ``doc_type`` is
        returned as the ``DocumentType`` member itself, not its raw value.
        """
        if len(self.content) > _CONTENT_PREVIEW_CHARS:
            preview = self.content[:_CONTENT_PREVIEW_CHARS] + "..."
        else:
            preview = self.content

        return {
            "id": self.id,
            "filename": self.filename,
            "content": preview,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language,
        }


class Chunk(BaseModel):
    """A slice of a document's text, optionally carrying an embedding vector."""

    id: str = Field(..., description="Unique chunk identifier")
    document_id: str = Field(..., description="Parent document ID")
    content: str = Field(..., description="Chunk text content")
    chunk_index: int = Field(..., description="Position in document")
    start_pos: int = Field(..., description="Start position in original document")
    end_pos: int = Field(..., description="End position in original document")
    # Unset (None) by default; no dimensionality is enforced here.
    embedding: Optional[List[float]] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)


class SearchResult(BaseModel):
    """A single search hit: the matching chunk, its source document, and a score."""

    chunk_id: str = Field(..., description="Matching chunk ID")
    document_id: str = Field(..., description="Source document ID")
    content: str = Field(..., description="Matching content")
    score: float = Field(..., description="Similarity score")
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return all fields of the result as a plain dict, without truncation."""
        return {
            "chunk_id": self.chunk_id,
            "document_id": self.document_id,
            "content": self.content,
            "score": self.score,
            "metadata": self.metadata,
        }


class ProcessingTask(BaseModel):
    """State record for a processing task."""

    task_id: str = Field(..., description="Unique task identifier")
    document_id: Optional[str] = None
    status: ProcessingStatus = ProcessingStatus.PENDING
    # Validation constrains progress to the inclusive range 0.0-100.0.
    progress: float = Field(default=0.0, ge=0.0, le=100.0)
    message: Optional[str] = None
    error: Optional[str] = None
    # Both timestamps default to the (naive UTC) creation time of the instance;
    # nothing in this model refreshes ``updated_at`` automatically.
    created_at: datetime = Field(default_factory=_utc_now)
    updated_at: datetime = Field(default_factory=_utc_now)


class SummaryRequest(BaseModel):
    """Request payload for summarization.

    ``content`` and ``document_id`` are both optional at the model level;
    no validation here enforces that one of them is supplied.
    """

    content: Optional[str] = None
    document_id: Optional[str] = None
    style: str = Field(default="concise", description="Summary style")
    max_length: Optional[int] = None


class TagGenerationRequest(BaseModel):
    """Request payload for tag generation.

    ``content`` and ``document_id`` are both optional at the model level;
    no validation here enforces that one of them is supplied.
    """

    content: Optional[str] = None
    document_id: Optional[str] = None
    # Validation constrains max_tags to the inclusive range 1-20.
    max_tags: int = Field(default=5, ge=1, le=20)


class QuestionAnswerRequest(BaseModel):
    """Request payload for question answering."""

    question: str = Field(..., description="Question to answer")
    context_filter: Optional[Dict[str, Any]] = None
    # NOTE(review): unit (characters vs. tokens) is not defined in this
    # module - confirm against the consumer of this field.
    max_context_length: int = Field(default=2000)


class CategorizationRequest(BaseModel):
    """Request payload for categorization.

    ``content`` and ``document_id`` are both optional at the model level;
    no validation here enforces that one of them is supplied.
    """

    content: Optional[str] = None
    document_id: Optional[str] = None
    categories: Optional[List[str]] = None