CodCodingCode committed on
Commit
d0726f5
·
1 Parent(s): dfc02f9

now fetching entire repo instead of specific files

Browse files
Files changed (1) hide show
  1. app.py +16 -9
app.py CHANGED
@@ -11,14 +11,10 @@ HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
11
  if not HF_TOKEN:
12
  raise RuntimeError("Missing HUGGINGFACE_HUB_TOKEN in env")
13
 
14
- # ——— 1) Download only the files in checkpoint-45000/ ———
15
  local_cache = snapshot_download(
16
  repo_id=REPO_ID,
17
  token=HF_TOKEN,
18
- allow_patterns=[
19
- f"{SUBFOLDER}/*.json",
20
- f"{SUBFOLDER}/*.safetensors",
21
- ],
22
  )
23
  print("[DEBUG] snapshot_download → local_cache:", local_cache)
24
  import pathlib
@@ -28,20 +24,31 @@ print(
28
  list(pathlib.Path(local_cache).glob(f"{SUBFOLDER}/*")),
29
  )
30
 
31
- # ——— 2) Point MODEL_DIR at that subfolder ———
32
- MODEL_DIR = os.path.join(local_cache, SUBFOLDER)
 
33
  print("[DEBUG] MODEL_DIR:", MODEL_DIR)
34
  print("[DEBUG] MODEL_DIR files:", os.listdir(MODEL_DIR))
 
35
 
36
  # ——— 3) Load tokenizer & model from disk ———
37
  tokenizer = AutoTokenizer.from_pretrained(
38
  MODEL_DIR,
39
- use_fast=False,
40
  )
41
- print("[DEBUG] Loaded tokenizer object:", tokenizer, "type:", type(tokenizer))
 
 
 
 
 
 
 
 
42
 
43
  model = AutoModelForCausalLM.from_pretrained(
44
  MODEL_DIR,
 
45
  device_map="auto",
46
  torch_dtype=torch.float16,
47
  )
 
11
  if not HF_TOKEN:
12
  raise RuntimeError("Missing HUGGINGFACE_HUB_TOKEN in env")
13
 
14
+ # ——— 1) Download the full repo ———
15
  local_cache = snapshot_download(
16
  repo_id=REPO_ID,
17
  token=HF_TOKEN,
 
 
 
 
18
  )
19
  print("[DEBUG] snapshot_download → local_cache:", local_cache)
20
  import pathlib
 
24
  list(pathlib.Path(local_cache).glob(f"{SUBFOLDER}/*")),
25
  )
26
 
27
+ # ——— 2) Repo root contains tokenizer.json; model shards live in the checkpoint subfolder ———
28
+ MODEL_DIR = local_cache
29
+ MODEL_SUBFOLDER = SUBFOLDER
30
  print("[DEBUG] MODEL_DIR:", MODEL_DIR)
31
  print("[DEBUG] MODEL_DIR files:", os.listdir(MODEL_DIR))
32
+ print("[DEBUG] Checkpoint files:", os.listdir(os.path.join(MODEL_DIR, MODEL_SUBFOLDER)))
33
 
34
  # ——— 3) Load tokenizer & model from disk ———
35
  tokenizer = AutoTokenizer.from_pretrained(
36
  MODEL_DIR,
37
+ use_fast=True,
38
  )
39
+ print("[DEBUG] Loaded fast tokenizer object:", tokenizer, "type:", type(tokenizer))
40
+ # Confirm tokenizer files are present
41
+ import os
42
+ print("[DEBUG] Files in MODEL_DIR for tokenizer:", os.listdir(MODEL_DIR))
43
+ # Inspect tokenizer's initialization arguments
44
+ try:
45
+ print("[DEBUG] Tokenizer init_kwargs:", tokenizer.init_kwargs)
46
+ except AttributeError:
47
+ print("[DEBUG] No init_kwargs attribute on tokenizer.")
48
 
49
  model = AutoModelForCausalLM.from_pretrained(
50
  MODEL_DIR,
51
+ subfolder=MODEL_SUBFOLDER,
52
  device_map="auto",
53
  torch_dtype=torch.float16,
54
  )