Spaces:
No application file
No application file
File size: 2,192 Bytes
668bf5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
def inspect_tokenized_dataset(dataset, num_samples=3):
    """Print the first `num_samples` tokenized samples and sanity-check their fields.

    Args:
        dataset: sequence of dicts with "input_ids", "attention_mask", "labels"
            (assumed list-like per sample — TODO confirm against tokenizer output).
        num_samples: how many leading samples to print and length-compare.
    """
    print(f"\n[inspect] Inspecting first {num_samples} samples in tokenized dataset...")
    for i in range(min(num_samples, len(dataset))):
        sample = dataset[i]
        print(f"\n=== Sample {i} ===")
        print("input_ids:", sample.get("input_ids"))
        print("attention_mask:", sample.get("attention_mask"))
        print("labels:", sample.get("labels"))
        # Type check: labels must be a flat list of ints for loss computation.
        labels = sample.get("labels")
        if not isinstance(labels, list):
            print(f"[warn] Sample {i}: labels is not a list -> {type(labels)}")
        elif not all(isinstance(x, int) for x in labels):
            print(f"[warn] Sample {i}: labels contains non-int values")
        else:
            print(f"[ok] Sample {i}: labels structure is valid")
    # Length comparison: input_ids and labels should usually line up per sample.
    input_lens = [len(s["input_ids"]) for s in dataset[:num_samples]]
    label_lens = [len(s["labels"]) for s in dataset[:num_samples]]
    print("\n[stats] input_ids lengths:", input_lens)
    print("[stats] labels lengths:   ", label_lens)
def print_label_lengths(dataset):
    """Print min/max/mean of per-sample "labels" lengths, plus the first 10 raw lengths.

    Args:
        dataset: non-empty iterable of dicts, each with a sized "labels" field
            (min/max/mean would raise on an empty dataset).
    """
    lengths = [len(sample["labels"]) for sample in dataset]
    print(
        f"[debug] labels lengths - min: {min(lengths)}, max: {max(lengths)}, "
        f"mean: {sum(lengths) / len(lengths):.2f}"
    )
    # Spot-check: show the first 10 per-sample lengths directly.
    print("[debug] per-sample labels lengths (first 10):", lengths[:10])
def print_field_lengths(dataset, n=10, stage=""):
    """Print length distribution (min/max/mean) and first `n` sample lengths
    for the input_ids, attention_mask, and labels fields of a dataset.

    Args:
        dataset: iterable of dict-like samples.
        n: number of leading per-sample lengths to display per field.
        stage: free-form label (e.g. pipeline stage name) shown in the header.
    """
    print(f"\n[debug][{stage}] length stats ================================")
    for key in ["input_ids", "attention_mask", "labels"]:
        # Best-effort per field: a missing key or unsized value is reported,
        # not raised, so one bad field doesn't abort the whole report.
        try:
            lengths = [len(x[key]) for x in dataset]
            print(
                f"{key} -> min: {min(lengths)}, max: {max(lengths)}, "
                f"mean: {sum(lengths)/len(lengths):.2f}"
            )
            print(f"{key} sample (first {n}):", lengths[:n])
        except Exception as e:
            print(f"{key}: (missing or error) {e}")
    print("====================================================\n")
|