|
from datetime import datetime, timezone
from enum import Enum
from typing import List, Optional, Dict, Any

from pydantic import BaseModel, Field
|
|
|
class DocumentType(str, Enum):
    """Kinds of document, each identified by a short string value.

    Members are also ``str`` instances, so a member compares equal to its
    raw value (``DocumentType.PDF == "pdf"``).
    """

    PDF = "pdf"
    TEXT = "txt"  # note: the value is "txt", not "text"
    DOCX = "docx"
    IMAGE = "image"
    HTML = "html"
|
|
|
class ProcessingStatus(str, Enum):
    """Status values for a processing task.

    Members are also ``str`` instances, so a member compares equal to its
    raw value (``ProcessingStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
class Document(BaseModel):
    """A document record: identity, extracted text, and descriptive attributes."""

    id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    content: str = Field(..., description="Extracted text content")
    doc_type: DocumentType = Field(..., description="Document type")
    file_size: int = Field(..., description="File size in bytes")
    # datetime.utcnow() is deprecated since Python 3.12. This factory yields
    # the same naive (tzinfo-less) UTC timestamp, so stored values and
    # isoformat() output keep their previous shape.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    # Factories give every instance its own empty container.
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tags: List[str] = Field(default_factory=list)
    # Optional attributes; all default to None.
    summary: Optional[str] = None
    category: Optional[str] = None
    language: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the document as a plain dict with the content shortened.

        Content longer than 500 characters is cut to its first 500
        characters followed by ``"..."``; shorter content is returned
        unchanged. ``created_at`` is rendered with ``isoformat()``. All other
        fields are passed through as-is, so ``metadata`` and ``tags`` are the
        same objects held by the model (not copies) and ``doc_type`` is
        whatever the model stores for it.
        """
        text = self.content
        if len(text) > 500:
            preview = text[:500] + "..."
        else:
            preview = text

        return {
            "id": self.id,
            "filename": self.filename,
            "content": preview,
            "doc_type": self.doc_type,
            "file_size": self.file_size,
            "created_at": self.created_at.isoformat(),
            "metadata": self.metadata,
            "tags": self.tags,
            "summary": self.summary,
            "category": self.category,
            "language": self.language,
        }
|
|
|
class Chunk(BaseModel):
    """A piece of a document's text, with its position and optional embedding."""

    id: str = Field(
        ...,
        description="Unique chunk identifier",
    )
    document_id: str = Field(
        ...,
        description="Parent document ID",
    )
    content: str = Field(
        ...,
        description="Chunk text content",
    )
    chunk_index: int = Field(
        ...,
        description="Position in document",
    )
    start_pos: int = Field(
        ...,
        description="Start position in original document",
    )
    end_pos: int = Field(
        ...,
        description="End position in original document",
    )
    # Left as None until a vector is supplied.
    embedding: Optional[List[float]] = None
    # The factory gives every instance its own empty dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
|
|
class SearchResult(BaseModel):
    """One search hit: the matching chunk, its source document, and a score."""

    chunk_id: str = Field(
        ...,
        description="Matching chunk ID",
    )
    document_id: str = Field(
        ...,
        description="Source document ID",
    )
    content: str = Field(
        ...,
        description="Matching content",
    )
    score: float = Field(
        ...,
        description="Similarity score",
    )
    # The factory gives every instance its own empty dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the result's five fields as a plain dict.

        Values are passed through untouched, so ``metadata`` is the same
        dict object held by the model rather than a copy.
        """
        exported = ("chunk_id", "document_id", "content", "score", "metadata")
        return {field_name: getattr(self, field_name) for field_name in exported}
|
|
|
class ProcessingTask(BaseModel):
    """Tracking record for a processing task: status, progress, and timestamps."""

    task_id: str = Field(..., description="Unique task identifier")
    document_id: Optional[str] = None
    # New tasks start out as pending.
    status: ProcessingStatus = ProcessingStatus.PENDING
    # Validation restricts progress to the range 0.0 to 100.0 inclusive.
    progress: float = Field(default=0.0, ge=0.0, le=100.0)
    message: Optional[str] = None
    error: Optional[str] = None
    # datetime.utcnow() is deprecated since Python 3.12. These factories yield
    # the same naive (tzinfo-less) UTC timestamp. Each field calls its own
    # factory, so the two defaults may differ by a few microseconds.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
|
|
|
class SummaryRequest(BaseModel):
    """Request payload for producing a summary."""

    # NOTE(review): presumably the caller supplies either `content` or
    # `document_id`; nothing in this model enforces that.
    content: Optional[str] = None
    document_id: Optional[str] = None
    style: str = Field(
        "concise",
        description="Summary style",
    )
    # No length limit is set by default.
    max_length: Optional[int] = None
|
|
|
class TagGenerationRequest(BaseModel):
    """Request payload for generating tags."""

    # NOTE(review): presumably the caller supplies either `content` or
    # `document_id`; nothing in this model enforces that.
    content: Optional[str] = None
    document_id: Optional[str] = None
    # Validation restricts max_tags to 1..20 inclusive; the default is 5.
    max_tags: int = Field(5, ge=1, le=20)
|
|
|
class QuestionAnswerRequest(BaseModel):
    """Request payload for answering a question."""

    question: str = Field(
        ...,
        description="Question to answer",
    )
    # Optional free-form filter mapping; None when no filter is given.
    context_filter: Optional[Dict[str, Any]] = None
    # Defaults to 2000; no bounds are validated here.
    max_context_length: int = Field(2000)
|
|
|
class CategorizationRequest(BaseModel):
    """Request payload for categorizing content."""

    # NOTE(review): presumably the caller supplies either `content` or
    # `document_id`; nothing in this model enforces that.
    content: Optional[str] = None
    document_id: Optional[str] = None
    # Optional list of category names; None when the caller provides none.
    categories: Optional[List[str]] = None