TheWeeeed committed on
Commit 43478fe · verified · 1 Parent(s): 525e038

Update app.py

Files changed (1)
  1. app.py +86 -77
app.py CHANGED
@@ -13,7 +13,13 @@ import numpy as np
 from datasets import Dataset
 
 from utils_qa import postprocess_qa_predictions
-
+import logging
+logger = logging.getLogger(__name__)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,  # or logging.DEBUG for more verbose output
+)
 # Assumes utils_qa.py is in the same directory (otherwise copy its functions over or make sure they are importable)
 # from utils_qa import postprocess_qa_predictions  # you may need the full path, or add it to requirements.txt
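For reference, the logging setup added above produces timestamped records in a fixed format. A minimal sketch of what the configured logger emits (the message text and timestamp are illustrative only):

    import logging

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger = logging.getLogger(__name__)

    logger.warning("Skipping tokenization for example index %d", 3)
    # -> 05/21/2024 14:02:11 - WARNING - __main__ - Skipping tokenization for example index 3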
 
@@ -100,118 +106,121 @@ def prepare_features_for_qa_inference_gradio(question_id, question_text, selecte
 # The prepare_features_for_qa_inference function from your inference_pipeline.py needs to be copied here,
 # or be importable from there
 def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq_len, doc_stride):
+    # Initial stripping and assignment
     examples["question"] = [q.lstrip() if isinstance(q, str) else "" for q in examples["question"]]
-    questions = examples["question" if pad_on_right else "context"]
-    contexts = examples["context" if pad_on_right else "question"]
+    questions_to_tokenize = examples["question" if pad_on_right else "context"]
+    contexts_to_tokenize = examples["context" if pad_on_right else "question"]
+
+    questions_to_tokenize = [q if isinstance(q, str) else "" for q in questions_to_tokenize]
+    contexts_to_tokenize = [c if isinstance(c, str) else "" for c in contexts_to_tokenize]
+
+    # Handle cases where either question or context might be empty after processing.
+    # The tokenizer might handle empty strings, but be explicit when one field is vital.
+    valid_inputs_for_tokenizer_q = []
+    valid_inputs_for_tokenizer_c = []
+    original_indices_for_valid_inputs = []
+
+    for i in range(len(questions_to_tokenize)):
+        q_str = questions_to_tokenize[i]
+        c_str = contexts_to_tokenize[i]
+        # Basic check: an empty context makes tokenization problematic for QA
+        if q_str.strip() and c_str.strip():  # ensure both have content after stripping
+            valid_inputs_for_tokenizer_q.append(q_str)
+            valid_inputs_for_tokenizer_c.append(c_str)
+            original_indices_for_valid_inputs.append(i)
+        else:
+            logger.warning(f"Skipping tokenization for example index {i} due to empty question or context. Q: '{q_str}', C: '{c_str}'")
+
+    if not valid_inputs_for_tokenizer_q:  # no valid (q, c) pairs to tokenize
+        logger.error(f"No valid question/context pairs to tokenize for examples with IDs: {examples.get('id', ['N/A'])}. Returning empty features.")
+        # Return a structure that .map expects (a dictionary of empty lists for all expected keys)
+        return {key: [] for key in ["input_ids", "attention_mask", "token_type_ids", "example_id", "offset_mapping"]}
 
-    # Ensure questions and contexts are lists of strings, handle None by converting to empty string
-    questions = [q if isinstance(q, str) else "" for q in questions]
-    contexts = [c if isinstance(c, str) else "" for c in contexts]
 
     tokenized_output = tokenizer(
-        questions,
-        contexts,
+        valid_inputs_for_tokenizer_q,
+        valid_inputs_for_tokenizer_c,
         truncation="only_second" if pad_on_right else "only_first",
         max_length=max_seq_len,
         stride=doc_stride,
         return_overflowing_tokens=True,
         return_offsets_mapping=True,
-        padding="max_length",  # this ensures all primary outputs are fixed-length lists of numbers
+        padding="max_length",
     )
 
-    # The tokenizer with padding="max_length" should already produce lists of integers
-    # for input_ids, attention_mask, token_type_ids.
-    # The main risk of 'None' would be if the input strings were so problematic
-    # that the tokenizer failed internally in a way not producing standard padded output.
-    # However, standard tokenizers are quite robust with empty strings when padding is enabled.
+    # Robustness check and fix for tokenizer outputs
+    keys_to_fix = ["input_ids", "attention_mask", "token_type_ids"]
+    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+    cls_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101  # common default
+    sep_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102  # common default
+
+    for key in keys_to_fix:
+        if key in tokenized_output:
+            for i in range(len(tokenized_output[key])):  # iterate over each feature's list for this key
+                feature_list = tokenized_output[key][i]
+                if feature_list is None:  # the entire list for a feature is None
+                    logger.warning(f"Tokenizer produced None for '{key}' at feature index {i}. Replacing with default.")
+                    if key == "input_ids":
+                        default_seq = [cls_id, sep_id] + [pad_id] * (max_seq_len - 2)
+                        tokenized_output[key][i] = default_seq[:max_seq_len]
+                    elif key == "attention_mask":
+                        default_mask = [1, 1] + [0] * (max_seq_len - 2)
+                        tokenized_output[key][i] = default_mask[:max_seq_len]
+                    elif key == "token_type_ids":
+                        tokenized_output[key][i] = [0] * max_seq_len
+                elif not all(isinstance(x, int) for x in feature_list):  # check for non-integers (like None)
+                    logger.warning(f"Tokenizer produced non-integers in '{key}' at feature index {i}: {str(feature_list)[:100]}... Fixing.")
+                    default_val = pad_id if key == "input_ids" else 0
+                    tokenized_output[key][i] = [default_val if not isinstance(x, int) else x for x in feature_list]
 
-    # Let's directly create the structure we need for the output Dataset.
-    # `tokenized_output` is a BatchEncoding (dict-like).
-    # If `return_overflowing_tokens=True` and N features are generated from one example,
-    # then `tokenized_output['input_ids']` is a list of N lists.
 
     processed_features = []
-    num_generated_features = len(tokenized_output["input_ids"])  # number of features due to overflow
+    num_generated_features = len(tokenized_output["input_ids"])
+
+    # sample_mapping from tokenized_output might be incorrect if we filtered inputs.
+    # Reconstruct it from original_indices_for_valid_inputs and the overflow mapping.
+    # This gets tricky when return_overflowing_tokens is True and inputs were filtered;
+    # for now, assume that if valid_inputs_for_tokenizer_q is non-empty, the tokenizer
+    # processes all of it. The harder case is the tokenizer itself handling only a subset.
 
-    # `sample_mapping` maps each generated feature back to its original example index in the input `examples`
-    sample_mapping = tokenized_output.pop("overflow_to_sample_mapping", list(range(len(examples["id"]))))
+    # `overflow_to_sample_mapping` maps generated features to indices in the *input to the tokenizer*,
+    # which here was `valid_inputs_for_tokenizer_q/c`.
+    overflow_mapping = tokenized_output.pop("overflow_to_sample_mapping")
 
 
     for i in range(num_generated_features):
         feature = {}
-        original_example_index = sample_mapping[i]  # index of the original example this feature came from
+        # Map the index from the tokenizer's output (based on valid_inputs)
+        # back to the index in the original `examples` batch.
+        idx_in_valid_inputs = overflow_mapping[i]
+        original_example_batch_index = original_indices_for_valid_inputs[idx_in_valid_inputs]
 
-        # These should always be lists of integers due to padding="max_length"
         feature["input_ids"] = tokenized_output["input_ids"][i]
         if "attention_mask" in tokenized_output:
             feature["attention_mask"] = tokenized_output["attention_mask"][i]
         if "token_type_ids" in tokenized_output:
             feature["token_type_ids"] = tokenized_output["token_type_ids"][i]
 
-        # These might not be strictly needed by the model's forward pass but are used by postprocessing
-        feature["example_id"] = examples["id"][original_example_index]
+        feature["example_id"] = examples["id"][original_example_batch_index]
 
         current_offset_mapping = tokenized_output["offset_mapping"][i]
-        sequence_ids = tokenized_output.sequence_ids(i)  # pass the index of the feature
+        sequence_ids = tokenized_output.sequence_ids(i)
         context_idx_in_pair = 1 if pad_on_right else 0
 
         feature["offset_mapping"] = [
-            offset if sequence_ids[k] == context_idx_in_pair else None
+            offset if sequence_ids is not None and k < len(sequence_ids) and sequence_ids[k] == context_idx_in_pair else None
             for k, offset in enumerate(current_offset_mapping)
         ]
         processed_features.append(feature)
 
-    # The .map function expects a dictionary where keys are column names
-    # and values are lists of features for those columns.
-    # Since we are processing one original example at a time (batched=True on a Dataset of 1 row),
-    # and this one example can produce multiple features, `processed_features` is a list of dicts.
-    # We need to return a dictionary of lists.
-    if not processed_features:  # should not happen if the tokenizer works, but as a safeguard
-        # Return a structure with empty lists to match the features expected by .map().
-        # This case indicates an issue with tokenizing the input example.
-        logger.error(f"No features generated for example ID {examples['id'][0]}. Input q: {examples['question'][0]}, c: {examples['context'][0]}")
-        return {
-            "input_ids": [], "token_type_ids": [], "attention_mask": [],
-            "offset_mapping": [], "example_id": []
-        }
-
-    # Transpose the list of feature dictionaries into a dictionary of feature lists.
-    # This is what the .map(batched=True) function expects as a return value.
     final_batch = {}
-    for key in processed_features[0].keys():
-        final_batch[key] = [feature[key] for feature in processed_features]
-
-    for key_to_check in ["input_ids", "attention_mask", "token_type_ids"]:
-        if key_to_check in final_batch:
-            for i, lst in enumerate(final_batch[key_to_check]):
-                if lst is None:
-                    raise ValueError(f"In prepare_features_for_qa_inference, feature list {i} for {key_to_check} is None!")
-                if any(x is None for x in lst):
-                    raise ValueError(f"In prepare_features_for_qa_inference, feature list {i} for {key_to_check} contains None! Contents: {lst[:20]}")
-
-    for key_to_check in ["input_ids", "attention_mask", "token_type_ids"]:
-        if key_to_check in final_batch:
-            new_list_of_lists = []
-            for single_feature_list in final_batch[key_to_check]:
-                if single_feature_list is None:  # this field of the entire feature is None
-                    # logger.error(f"Critical error: {key_to_check} list for a feature is None. Reconstructing a default.")
-                    # Build a safe default value based on the key type
-                    if key_to_check == "input_ids":
-                        safe_list = [tokenizer.cls_token_id or 101, tokenizer.sep_token_id or 102] + \
-                                    [tokenizer.pad_token_id or 0] * (max_seq_len - 2)
-                        new_list_of_lists.append(safe_list[:max_seq_len])
-                    elif key_to_check == "attention_mask":
-                        safe_list = [1, 1] + [0] * (max_seq_len - 2)
-                        new_list_of_lists.append(safe_list[:max_seq_len])
-                    elif key_to_check == "token_type_ids":
-                        new_list_of_lists.append([0] * max_seq_len)
-                elif not all(isinstance(x, int) for x in single_feature_list):  # the list contains non-integers
-                    # logger.error(f"Critical error: {key_to_check} list for a feature contains non-integers: {single_feature_list[:10]}. Fixing.")
-                    default_val = tokenizer.pad_token_id if key_to_check == "input_ids" else 0
-                    new_list_of_lists.append([default_val if not isinstance(x, int) else x for x in single_feature_list])
-                else:
-                    new_list_of_lists.append(single_feature_list)  # already valid
-            final_batch[key_to_check] = new_list_of_lists
+    if processed_features:
+        for key in processed_features[0].keys():
+            final_batch[key] = [feature[key] for feature in processed_features]
+    else:
+        logger.warning(f"No features could be processed for example IDs: {examples.get('id', ['N/A'])}. Input q: {examples.get('question', ['N/A'])}, c: {examples.get('context', ['N/A'])}")
+        for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
+            final_batch[key_to_ensure] = []
 
     return final_batch
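The index remapping above is the subtle part of this change: after filtering out empty pairs, `overflow_to_sample_mapping` points into the *filtered* tokenizer input, not the original batch, so each feature index has to be translated back through `original_indices_for_valid_inputs`. A minimal sketch of that remapping, assuming a BERT-style fast tokenizer (the checkpoint name, strings, and lengths are hypothetical, not taken from this commit):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")  # example checkpoint

    questions = ["", "Where is the capital?"]             # index 0 gets filtered out
    contexts = ["some context", "A long context. " * 300]

    # Mirror the commit's filtering: keep only pairs where both sides are non-empty
    original_indices_for_valid_inputs = [
        i for i, (q, c) in enumerate(zip(questions, contexts))
        if q.strip() and c.strip()
    ]  # -> [1]

    enc = tokenizer(
        [questions[i] for i in original_indices_for_valid_inputs],
        [contexts[i] for i in original_indices_for_valid_inputs],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # The long context overflows into several features, all of which map back to
    # index 0 of the *filtered* input, i.e. index 1 of the original batch
    for feat_idx, idx_in_valid in enumerate(enc["overflow_to_sample_mapping"]):
        original_batch_index = original_indices_for_valid_inputs[idx_in_valid]
        assert original_batch_index == 1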
 
 
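The offset-mapping mask in the loop relies on `BatchEncoding.sequence_ids`, which labels each token position as None (special token), 0 (first sequence), or 1 (second sequence). A minimal sketch of that masking step, continuing the `enc` object from the sketch above:

    seq_ids = enc.sequence_ids(0)        # e.g. [None, 0, 0, ..., None, 1, 1, ..., None]
    offsets = enc["offset_mapping"][0]

    context_idx_in_pair = 1              # with pad_on_right, the context is the second sequence
    masked = [
        off if seq_ids[k] == context_idx_in_pair else None
        for k, off in enumerate(offsets)
    ]
    # Only context tokens keep their (start, end) character spans;
    # postprocess_qa_predictions uses exactly these spans to map predicted
    # token indices back to the original text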
226