CodCodingCode committed on
Commit
d0726f5
·
1 Parent(s): dfc02f9

now fetching entire repo instead of specific files

Browse files
Files changed (1) hide show
  1. app.py +16 -9
app.py CHANGED
@@ -11,14 +11,10 @@ HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
11
  if not HF_TOKEN:
12
  raise RuntimeError("Missing HUGGINGFACE_HUB_TOKEN in env")
13
 
14
- # ——— 1) Download only the files in checkpoint-45000/ ———
15
  local_cache = snapshot_download(
16
  repo_id=REPO_ID,
17
  token=HF_TOKEN,
18
- allow_patterns=[
19
- f"{SUBFOLDER}/*.json",
20
- f"{SUBFOLDER}/*.safetensors",
21
- ],
22
  )
23
  print("[DEBUG] snapshot_download → local_cache:", local_cache)
24
  import pathlib
@@ -28,20 +24,31 @@ print(
28
  list(pathlib.Path(local_cache).glob(f"{SUBFOLDER}/*")),
29
  )
30
 
31
- # ——— 2) Point MODEL_DIR at that subfolder ———
32
- MODEL_DIR = os.path.join(local_cache, SUBFOLDER)
 
33
  print("[DEBUG] MODEL_DIR:", MODEL_DIR)
34
  print("[DEBUG] MODEL_DIR files:", os.listdir(MODEL_DIR))
 
35
 
36
  # ——— 3) Load tokenizer & model from disk ———
37
  tokenizer = AutoTokenizer.from_pretrained(
38
  MODEL_DIR,
39
- use_fast=False,
40
  )
41
- print("[DEBUG] Loaded tokenizer object:", tokenizer, "type:", type(tokenizer))
 
 
 
 
 
 
 
 
42
 
43
  model = AutoModelForCausalLM.from_pretrained(
44
  MODEL_DIR,
 
45
  device_map="auto",
46
  torch_dtype=torch.float16,
47
  )
 
11
  if not HF_TOKEN:
12
  raise RuntimeError("Missing HUGGINGFACE_HUB_TOKEN in env")
13
 
14
+ # ——— 1) Download the full repo ———
15
  local_cache = snapshot_download(
16
  repo_id=REPO_ID,
17
  token=HF_TOKEN,
 
 
 
 
18
  )
19
  print("[DEBUG] snapshot_download → local_cache:", local_cache)
20
  import pathlib
 
24
  list(pathlib.Path(local_cache).glob(f"{SUBFOLDER}/*")),
25
  )
26
 
27
+ # ——— 2) Repo root contains tokenizer.json; model shards live in the checkpoint subfolder ———
28
+ MODEL_DIR = local_cache
29
+ MODEL_SUBFOLDER = SUBFOLDER
30
  print("[DEBUG] MODEL_DIR:", MODEL_DIR)
31
  print("[DEBUG] MODEL_DIR files:", os.listdir(MODEL_DIR))
32
+ print("[DEBUG] Checkpoint files:", os.listdir(os.path.join(MODEL_DIR, MODEL_SUBFOLDER)))
33
 
34
  # ——— 3) Load tokenizer & model from disk ———
35
  tokenizer = AutoTokenizer.from_pretrained(
36
  MODEL_DIR,
37
+ use_fast=True,
38
  )
39
+ print("[DEBUG] Loaded fast tokenizer object:", tokenizer, "type:", type(tokenizer))
40
+ # Confirm tokenizer files are present
41
+ import os
42
+ print("[DEBUG] Files in MODEL_DIR for tokenizer:", os.listdir(MODEL_DIR))
43
+ # Inspect tokenizer's initialization arguments
44
+ try:
45
+ print("[DEBUG] Tokenizer init_kwargs:", tokenizer.init_kwargs)
46
+ except AttributeError:
47
+ print("[DEBUG] No init_kwargs attribute on tokenizer.")
48
 
49
  model = AutoModelForCausalLM.from_pretrained(
50
  MODEL_DIR,
51
+ subfolder=MODEL_SUBFOLDER,
52
  device_map="auto",
53
  torch_dtype=torch.float16,
54
  )