File size: 3,817 Bytes
fdc5d7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import logging
from typing import Dict, List, Optional, Any
from datetime import datetime
from pydantic import BaseModel, Field

from app.schemas.dataset_common import ImpactLevel, DatasetMetrics

# Module-level logger, named after this module.
log = logging.getLogger(name=__name__)

# Strategy names listed as supported for combining datasets.
SUPPORTED_STRATEGIES = [
    "merge",
    "intersect",
    "filter",
]

class ImpactAssessment(BaseModel):
    """Impact assessment recorded for a single dataset.

    Holds the dataset ID, the assigned impact level, the name of the method
    that produced it, the metrics it was based on, and the thresholds kept
    for reference.
    """

    dataset_id: str = Field(
        ...,
        description="The ID of the dataset being assessed",
    )
    impact_level: ImpactLevel = Field(
        ...,
        description="The impact level: low, medium, or high",
    )
    # Falls back to "unknown" when no method name is supplied.
    assessment_method: str = Field(
        default="unknown",
        description="Method used to determine impact level (e.g., size_based, downloads_and_likes_based)",
    )
    metrics: DatasetMetrics = Field(..., description="Metrics used for impact assessment")
    # Falls back to an empty mapping when no thresholds are supplied.
    thresholds: Dict[str, Dict[str, str]] = Field(
        default={},
        description="Thresholds used for determining impact levels (for reference)",
    )

class DatasetInfo(BaseModel):
    """Dataset record keyed by ``id`` with optional impact information.

    Only ``id`` is required. The model is configured to allow extra fields,
    so further fields can be declared here as they become needed.
    """

    class Config:
        # Allow extra fields coming from the API.
        extra = "allow"

    id: str
    impact_level: Optional[ImpactLevel] = Field(default=None)
    impact_assessment: Optional[Dict] = Field(default=None)

class DatasetBase(BaseModel):
    """Common dataset fields: a required name plus optional description and tags."""

    name: str
    description: Optional[str] = Field(default=None)
    tags: Optional[List[str]] = Field(default=None)

class DatasetCreate(DatasetBase):
    """Dataset creation schema: the base fields plus an optional ``files`` list."""

    files: Optional[List[str]] = Field(default=None)

class DatasetUpdate(DatasetBase):
    """Dataset update schema in which every field, including ``name``, is optional."""

    # Re-declared so that ``name``, required on DatasetBase, is optional for updates.
    name: Optional[str] = Field(default=None)

class Dataset(DatasetBase):
    """Dataset schema adding an ID, an owner ID and optional timestamps to the base fields.

    ``created_at`` and ``updated_at`` are typed as strings here.
    """

    class Config:
        # Left empty on purpose: orm_mode = True was removed since ORM is not used.
        pass

    id: int  # or str, depending on the ID format in use
    owner_id: str  # user IDs are assumed to be strings
    created_at: Optional[str] = Field(default=None)
    updated_at: Optional[str] = Field(default=None)

class DatasetCombineRequest(BaseModel):
    """Request body for combining several datasets into a new one.

    ``source_datasets`` and ``name`` are required. The strategy defaults to
    "merge", and the description and filter criteria may be omitted.
    """

    source_datasets: List[str] = Field(
        ...,
        description="List of dataset IDs to combine",
    )
    name: str = Field(
        ...,
        description="Name for the combined dataset",
    )
    description: Optional[str] = Field(
        default=None,
        description="Description for the combined dataset",
    )
    # Declared as a plain string: this model does not restrict the value.
    combination_strategy: str = Field(
        default="merge",
        description="Strategy to use when combining datasets (e.g., 'merge', 'intersect', 'filter')",
    )
    filter_criteria: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Criteria for filtering when combining datasets",
    )

class CombinedDataset(BaseModel):
    """Description of a dataset produced by combining source datasets.

    Carries identity, provenance (source IDs, creator, creation time and
    strategy), a status that defaults to "processing", and optional impact
    level, metrics and storage location. Extra fields are allowed.
    """

    class Config:
        # Allow extra fields for flexibility.
        extra = "allow"

    id: str = Field(
        ...,
        description="ID of the combined dataset",
    )
    name: str = Field(
        ...,
        description="Name of the combined dataset",
    )
    description: Optional[str] = Field(
        default=None,
        description="Description of the combined dataset",
    )
    source_datasets: List[str] = Field(
        ...,
        description="IDs of the source datasets",
    )
    created_at: datetime = Field(
        ...,
        description="Creation timestamp",
    )
    created_by: str = Field(
        ...,
        description="ID of the user who created this combined dataset",
    )
    impact_level: Optional[ImpactLevel] = Field(
        default=None,
        description="Calculated impact level of the combined dataset",
    )
    status: str = Field(
        default="processing",
        description="Status of the dataset combination process",
    )
    combination_strategy: str = Field(
        ...,
        description="Strategy used when combining datasets",
    )
    metrics: Optional[DatasetMetrics] = Field(
        default=None,
        description="Metrics for the combined dataset",
    )
    storage_bucket_id: Optional[str] = Field(
        default=None,
        description="ID of the storage bucket containing dataset files",
    )
    storage_folder_path: Optional[str] = Field(
        default=None,
        description="Path to the dataset files within the bucket",
    )

# Names exported by ``from <this module> import *``.
__all__ = [
    "ImpactLevel",
    "ImpactAssessment",
    "DatasetInfo",
    "DatasetMetrics",
    "Dataset",
    "DatasetCreate",
    "DatasetUpdate",
    "DatasetCombineRequest",
    "CombinedDataset",
]