Update app.py
app.py
CHANGED
@@ -214,14 +214,49 @@ def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq
         processed_features.append(feature)
 
     final_batch = {}
-    if processed_features:
-        for
-
-    else:
-        logger.warning(f"No features could be processed for example IDs: {examples.get('id', ['N/A'])}. Input q: {examples.get('question', ['N/A'])}, c: {examples.get('context', ['N/A'])}")
+    if not processed_features:
+        logger.warning(f"No features generated for example IDs: {examples.get('id', ['N/A'])}. Returning empty structure.")
+        # Make sure the returned structure matches what .map expects: keys are column names, values are empty lists
         for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
             final_batch[key_to_ensure] = []
+        return final_batch
+
+    # 1. First, convert processed_features (a list of dicts) into final_batch (a dict of lists)
+    for key in processed_features[0].keys():  # assumes all feature dicts share the same keys
+        final_batch[key] = [feature[key] for feature in processed_features]
+
+    # 2. Then run robustness checks and fixes on the fields that will be converted to tensors
+    keys_to_fix_for_tensor_conversion = ["input_ids", "attention_mask", "token_type_ids"]
+    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+    cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101
+    sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102
+
+    for key in keys_to_fix_for_tensor_conversion:
+        if key in final_batch:
+            # final_batch[key] is a list of lists, e.g. [[ids_for_feature1], [ids_for_feature2], ...]
+            corrected_list_of_lists = []
+            for i, single_feature_list in enumerate(final_batch[key]):
+                if single_feature_list is None:
+                    logger.warning(f"Feature list for '{key}' at index {i} is None. Replacing with default for max_seq_len {max_seq_len}.")
+                    if key == "input_ids":
+                        default_seq = [cls_token_id, sep_token_id] + [pad_token_id] * (max_seq_len - 2)
+                        corrected_list_of_lists.append(default_seq[:max_seq_len])
+                    elif key == "attention_mask":
+                        default_mask = [1, 1] + [0] * (max_seq_len - 2)
+                        corrected_list_of_lists.append(default_mask[:max_seq_len])
+                    elif key == "token_type_ids":
+                        corrected_list_of_lists.append([0] * max_seq_len)
+                elif not all(isinstance(x, int) for x in single_feature_list):
+                    logger.warning(f"Feature list for '{key}' at index {i} contains non-integers: {str(single_feature_list)[:50]}... Fixing Nones.")
+                    default_val = pad_token_id if key == "input_ids" else 0
+                    fixed_list = [default_val if not isinstance(x, int) else x for x in single_feature_list]
+                    corrected_list_of_lists.append(fixed_list)
+                else:
+                    corrected_list_of_lists.append(single_feature_list)  # list is already good
+            final_batch[key] = corrected_list_of_lists
 
+    # Before returning, an extra debug log can confirm the corrected final_batch structure
+    # logger.debug(f"Returning final_batch from prepare_features: { {k: str(v)[:200] + '...' for k,v in final_batch.items()} }")
     return final_batch
 
 # The postprocess_qa_predictions function also needs to be copied or imported from utils_qa.py
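Below is a minimal usage sketch of how the patched prepare_features_for_qa_inference could be wired into the inference path. It is not part of this diff: the dataset construction, the model call, and the names run_qa_inference, model, question, and context are hypothetical, and the function's exact argument list is an assumption because the hunk header truncates the signature after max_seq. Only the positional order examples, tokenizer, pad_on_right, max_seq_len is taken from the code above.

# Hypothetical wiring for the function patched above; not taken from app.py.
import torch
from datasets import Dataset

def run_qa_inference(question, context, tokenizer, model, max_seq_len=384):
    # Wrap a single example the way Dataset.map will present it: a dict of lists.
    examples = Dataset.from_dict({"id": ["0"], "question": [question], "context": [context]})
    pad_on_right = tokenizer.padding_side == "right"

    features = examples.map(
        # The real signature may take extra arguments (e.g. a doc stride); it is truncated in the hunk header.
        lambda batch: prepare_features_for_qa_inference(batch, tokenizer, pad_on_right, max_seq_len),
        batched=True,
        remove_columns=examples.column_names,
    )

    # The robustness checks in the patch are meant to guarantee these columns are
    # plain lists of ints, so tensor conversion no longer fails on None entries.
    input_ids = torch.tensor(features["input_ids"])
    attention_mask = torch.tensor(features["attention_mask"])

    with torch.no_grad():  # model is assumed to be an extractive-QA head returning start/end logits
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    return outputs.start_logits, outputs.end_logits

As the trailing comment in the hunk notes, the start/end logits together with the kept example_id and offset_mapping columns would then be handed to postprocess_qa_predictions, copied or imported from utils_qa.py.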