Commit d0726f5 · 1 Parent(s): dfc02f9
now fetching entire repo instead of specific files
app.py CHANGED
@@ -11,14 +11,10 @@ HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
 if not HF_TOKEN:
     raise RuntimeError("Missing HUGGINGFACE_HUB_TOKEN in env")
 
-# ─── 1) Download
+# ─── 1) Download the full repo ───
 local_cache = snapshot_download(
     repo_id=REPO_ID,
     token=HF_TOKEN,
-    allow_patterns=[
-        f"{SUBFOLDER}/*.json",
-        f"{SUBFOLDER}/*.safetensors",
-    ],
 )
 print("[DEBUG] snapshot_download → local_cache:", local_cache)
 import pathlib
@@ -28,20 +24,31 @@ print(
     list(pathlib.Path(local_cache).glob(f"{SUBFOLDER}/*")),
 )
 
-# ─── 2)
-MODEL_DIR =
+# ─── 2) Repo root contains tokenizer.json; model shards live in the checkpoint subfolder ───
+MODEL_DIR = local_cache
+MODEL_SUBFOLDER = SUBFOLDER
 print("[DEBUG] MODEL_DIR:", MODEL_DIR)
 print("[DEBUG] MODEL_DIR files:", os.listdir(MODEL_DIR))
+print("[DEBUG] Checkpoint files:", os.listdir(os.path.join(MODEL_DIR, MODEL_SUBFOLDER)))
 
 # ─── 3) Load tokenizer & model from disk ───
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_DIR,
-    use_fast=
+    use_fast=True,
 )
-print("[DEBUG] Loaded tokenizer object:", tokenizer, "type:", type(tokenizer))
+print("[DEBUG] Loaded fast tokenizer object:", tokenizer, "type:", type(tokenizer))
+# Confirm tokenizer files are present
+import os
+print("[DEBUG] Files in MODEL_DIR for tokenizer:", os.listdir(MODEL_DIR))
+# Inspect tokenizer's initialization arguments
+try:
+    print("[DEBUG] Tokenizer init_kwargs:", tokenizer.init_kwargs)
+except AttributeError:
+    print("[DEBUG] No init_kwargs attribute on tokenizer.")
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_DIR,
+    subfolder=MODEL_SUBFOLDER,
     device_map="auto",
     torch_dtype=torch.float16,
 )
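Why the filter was dropped: the removed allow_patterns only matched files under {SUBFOLDER}, so the repo-root tokenizer files would never land in the snapshot, which is consistent with the new "Repo root contains tokenizer.json" comment. If download size mattered, a narrower alternative (hypothetical, not what this commit does) would be to widen the patterns instead of removing them:

# Hypothetical alternative: keep a filtered download but also pull repo-root JSON,
# so tokenizer.json/config.json come down alongside the checkpoint shards.
local_cache = snapshot_download(
    repo_id=REPO_ID,
    token=HF_TOKEN,
    allow_patterns=[
        "*.json",                        # repo-root tokenizer/config files
        f"{SUBFOLDER}/*.json",
        f"{SUBFOLDER}/*.safetensors",
    ],
)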
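Taken together, the pattern this commit lands on is: download the whole snapshot once, load the tokenizer from the snapshot root, and point the model loader at the checkpoint subfolder. A minimal self-contained sketch, with hypothetical REPO_ID and SUBFOLDER values standing in for the ones the Space defines above the visible hunks:

import os

import torch
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer

REPO_ID = "your-org/your-model"    # hypothetical; the Space defines its own
SUBFOLDER = "checkpoint-1000"      # hypothetical checkpoint subfolder
HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")

# Download the entire repo; the returned path is the local snapshot root.
local_cache = snapshot_download(repo_id=REPO_ID, token=HF_TOKEN)

# tokenizer.json sits at the repo root and the model shards in the subfolder,
# so the two from_pretrained calls target different levels of the tree.
tokenizer = AutoTokenizer.from_pretrained(local_cache, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    local_cache,
    subfolder=SUBFOLDER,
    device_map="auto",             # requires accelerate to be installed
    torch_dtype=torch.float16,
)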