TheWeeeed committed on
Commit
2bb107e
·
verified ·
1 Parent(s): 43478fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -5
app.py CHANGED
@@ -214,14 +214,49 @@ def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq
214
  processed_features.append(feature)
215
 
216
  final_batch = {}
217
- if processed_features:
218
- for key in processed_features[0].keys():
219
- final_batch[key] = [feature[key] for feature in processed_features]
220
- else:
221
- logger.warning(f"No features could be processed for example IDs: {examples.get('id', ['N/A'])}. Input q: {examples.get('question', ['N/A'])}, c: {examples.get('context', ['N/A'])}")
222
  for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
223
  final_batch[key_to_ensure] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
 
 
225
  return final_batch
226
 
227
  # postprocess_qa_predictions 函數也需要從 utils_qa.py 複製或導入
 
214
  processed_features.append(feature)
215
 
216
  final_batch = {}
217
+ if not processed_features:
218
+ logger.warning(f"No features generated for example IDs: {examples.get('id', ['N/A'])}. Returning empty structure.")
219
+ # 確保返回的結構與 .map 期望的一致,即字典的鍵是列名,值是空列表
 
 
220
  for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
221
  final_batch[key_to_ensure] = []
222
+ return final_batch
223
+
224
+ # 1. 首先,將 processed_features (list of dicts) 轉換為 final_batch (dict of lists)
225
+ for key in processed_features[0].keys(): # 假設所有特徵字典有相同的鍵
226
+ final_batch[key] = [feature[key] for feature in processed_features]
227
+
228
+ # 2. 然後,對 final_batch 中需要轉換為張量的字段進行健壯性檢查和修正
229
+ keys_to_fix_for_tensor_conversion = ["input_ids", "attention_mask", "token_type_ids"]
230
+ pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
231
+ cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101
232
+ sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102
233
+
234
+ for key in keys_to_fix_for_tensor_conversion:
235
+ if key in final_batch:
236
+ # final_batch[key] 是一個列表的列表,例如 [[ids_for_feature1], [ids_for_feature2], ...]
237
+ corrected_list_of_lists = []
238
+ for i, single_feature_list in enumerate(final_batch[key]):
239
+ if single_feature_list is None:
240
+ logger.warning(f"Feature list for '{key}' at index {i} is None. Replacing with default for max_seq_len {max_seq_len}.")
241
+ if key == "input_ids":
242
+ default_seq = [cls_token_id, sep_token_id] + [pad_token_id] * (max_seq_len - 2)
243
+ corrected_list_of_lists.append(default_seq[:max_seq_len])
244
+ elif key == "attention_mask":
245
+ default_mask = [1, 1] + [0] * (max_seq_len - 2)
246
+ corrected_list_of_lists.append(default_mask[:max_seq_len])
247
+ elif key == "token_type_ids":
248
+ corrected_list_of_lists.append([0] * max_seq_len)
249
+ elif not all(isinstance(x, int) for x in single_feature_list):
250
+ logger.warning(f"Feature list for '{key}' at index {i} contains non-integers: {str(single_feature_list)[:50]}... Fixing Nones.")
251
+ default_val = pad_token_id if key == "input_ids" else 0
252
+ fixed_list = [default_val if not isinstance(x, int) else x for x in single_feature_list]
253
+ corrected_list_of_lists.append(fixed_list)
254
+ else:
255
+ corrected_list_of_lists.append(single_feature_list) # List is already good
256
+ final_batch[key] = corrected_list_of_lists
257
 
258
+ # 在返回前,可以再加一層打印,確認修正後的 final_batch 結構
259
+ # logger.debug(f"Returning final_batch from prepare_features: { {k: str(v)[:200] + '...' for k,v in final_batch.items()} }")
260
  return final_batch
261
 
262
  # postprocess_qa_predictions 函數也需要從 utils_qa.py 複製或導入