import json
import logging
import os
from collections import defaultdict

import huggingface_hub
from huggingface_hub import ModelCard, HfApi
from huggingface_hub.hf_api import ModelInfo
from transformers import AutoConfig
from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # Only show INFO and above, hide DEBUG messages

def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Checks if the model card and license exist and have been filled"""
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata
    if card.data.license is None:
        if not ("license_name" in card.data and "license_link" in card.data):
            return False, (
                "License not found. Please add a license to your model card using the `license` metadata or a"
                " `license_name`/`license_link` pair."
            )

    # Enforce card content
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""

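# Example usage (hypothetical repo id):
#   ok, message = check_model_card("org/model")
#   if not ok:
#       print(message)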

def is_model_on_hub(
    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
) -> tuple[bool, str, AutoConfig]:
    """Makes sure the model is on the hub and uses a valid configuration (in the latest transformers version)."""
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
        if test_tokenizer:
            # Prefer the tokenizer class declared in tokenizer_config.json, falling back to the model config.
            tokenizer_config = get_tokenizer_config(model_name)
            if tokenizer_config is not None:
                tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
            else:
                tokenizer_class_candidate = config.tokenizer_class

            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                return (
                    False,
                    f"uses {tokenizer_class_candidate}, which is not in a transformers release, therefore not supported at the moment.",
                    None,
                )
        return True, None, config

    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
            None,
        )

    except Exception as e:
        return False, f"was not found on the hub: {e}", None

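# Example usage (hypothetical repo id), unpacking the three-element return value:
#   on_hub, error, config = is_model_on_hub("org/model", revision="main", test_tokenizer=True)
#   if not on_hub:
#       print(f"Model org/model {error}")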

def get_model_size(model_info: ModelInfo, precision: str):
    """Gets the model size (in billions of parameters) from the safetensors metadata; returns 0 if unavailable."""
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

    # Heuristic for quantized checkpoints: GPTQ models get an 8x size factor.
    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    return size_factor * model_size

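# Worked example (hypothetical metadata): a checkpoint whose safetensors report
# {"total": 7_241_732_096} parameters yields round(7_241_732_096 / 1e9, 3) == 7.242;
# submitted with precision "GPTQ", it would be reported as 8 * 7.242 = 57.936.
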
def get_model_arch(model_info: ModelInfo):
    """Gets the model architecture from the configuration"""
    return model_info.config.get("architectures", "Unknown")

def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
    """Collects identifiers of already-submitted models and, per organisation, their submission dates."""
    depth = 1
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth == depth:
            for file in files:
                if not file.endswith(".json"):
                    continue
                with open(os.path.join(root, file), "r") as f:
                    info = json.load(f)
                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")

                    # Track submission dates per organisation (skip models without an org or a date)
                    if info["model"].count("/") == 0 or "submitted_time" not in info:
                        continue
                    organisation, _ = info["model"].split("/")
                    users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates

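# Sketch of the expected layout (hypothetical paths): each first-level directory of
# requested_models_dir holds JSON request files, e.g.
#   requested_models_dir/org/org_model_eval_request.json
# with at least {"model": "org/model", "revision": "main", "precision": "float16"},
# which is recorded as the identifier "org/model_main_float16".
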
def check_adapter_config_and_base_model(model_name: str, base_model: str, token: str = None) -> tuple[bool, str]:
    """
    Checks if the model exists on HuggingFace and is accessible.

    Args:
        model_name: Name of the model to check
        base_model: Expected base model name (kept for API compatibility, no longer used)
        token: HuggingFace API token (optional)

    Returns:
        Tuple[bool, str]: A tuple containing:
            - is_valid: Whether the model exists and is accessible
            - error_message: Error message if the model is invalid
    """
    # First try without a token (covers public models)
    try:
        HfApi().model_info(repo_id=model_name)
        logger.debug(f"Successfully accessed model {model_name}")
        return True, None
    except Exception as public_error:
        logger.debug(f"Could not access model {model_name} without token: {public_error}")
        # If that fails, retry with the token (covers private models)
        if token:
            try:
                HfApi(token=token).model_info(repo_id=model_name)
                logger.debug(f"Successfully accessed model {model_name} with authentication")
                return True, None
            except Exception as private_error:
                return False, f"Model {model_name} not found or not accessible: {private_error}"
        return False, f"Model {model_name} not found or not accessible: {public_error}"

def has_adapter_config(model_name: str, token: str = None) -> tuple[bool, str]:
    """
    Checks if the model repository contains adapter configuration files.
    
    Args:
        model_name: Name of the model to check
        token: HuggingFace API token (optional)
        
    Returns:
        Tuple[bool, str]: A tuple containing:
            - has_adapter: Whether the model contains adapter configuration
            - message: Additional information or error message
    """
    try:
        # Initialize API with or without token
        api = HfApi(token=token) if token else HfApi()
        
        # Get the list of files in the repository
        repo_files = api.list_repo_files(repo_id=model_name)
        
        # Check for specific adapter configuration files
        adapter_files = [
            "adapter_config.json",
            "adapter_model.bin",
            "adapter_model.safetensors",
            "adapter.json",
            "adapter.safetensors",
            "adapter.bin"
        ]
        
        # Collect files whose path contains one of the known adapter file names
        found_adapter_files = [
            file for file in repo_files
            if any(adapter_file in file.lower() for adapter_file in adapter_files)
        ]

        if found_adapter_files:
            return True, f"Found adapter configuration: {', '.join(found_adapter_files)}"
        else:
            return False, "No adapter configuration found"
        
    except Exception as e:
        return False, f"Error checking for adapter configuration: {str(e)}"

def has_safetensor_model(model_name: str, token: str = None) -> tuple[bool, str]:
    """
    Checks if the model repository contains safetensor model files.
    
    Args:
        model_name: Name of the model to check
        token: HuggingFace API token (optional)
        
    Returns:
        Tuple[bool, str]: A tuple containing:
            - has_safetensor: Whether the model contains safetensor model files
            - message: Additional information or error message
    """
    try:
        # Initialize API with or without token
        api = HfApi(token=token) if token else HfApi()
        
        # Get the list of files in the repository
        repo_files = api.list_repo_files(repo_id=model_name)
        
        # Look for standard weight files: "model.safetensors" or sharded
        # "model-00001-of-00002.safetensors"
        safetensor_extension = ".safetensors"
        safetensor_files = [
            file for file in repo_files
            if os.path.basename(file).lower().startswith("model") and file.lower().endswith(safetensor_extension)
        ]

        if safetensor_files:
            return True, f"Found safetensor model files: {', '.join(safetensor_files)}"

        # Otherwise, fall back to any .safetensors file in the repository
        any_safetensor_files = [file for file in repo_files if file.lower().endswith(safetensor_extension)]
        if any_safetensor_files:
            return True, f"Found safetensor files: {', '.join(any_safetensor_files)}"
        return False, "No safetensor model files found"
        
    except Exception as e:
        return False, f"Error checking for safetensor model files: {str(e)}"

def determine_model_type(model_name: str, token: str = None) -> tuple[str, str]:
    """
    Determines the type of model based on the files in the repository.
    
    Args:
        model_name: Name of the model to check
        token: HuggingFace API token (optional)
        
    Returns:
        Tuple[str, str]: A tuple containing:
            - model_type: Type of model (adapter, merged_model, unknown)
            - message: Additional information or details
    """
    try:
        # Check for adapter configuration
        has_adapter, adapter_message = has_adapter_config(model_name, token)
        
        # Check for safetensor model files
        has_safetensor, safetensor_message = has_safetensor_model(model_name, token)
        
        # Determine model type based on checks
        if has_adapter:
            return "adapter", adapter_message
        elif has_safetensor:
            return "merged_model", safetensor_message
        else:
            return "unknown", "Could not determine model type: no adapter config or safetensor model files found"
        
    except Exception as e:
        return "unknown", f"Error determining model type: {str(e)}"