Update app.py
app.py
CHANGED
@@ -13,7 +13,13 @@ import numpy as np
Old lines 13-19:
 from datasets import Dataset

 from utils_qa import postprocess_qa_predictions
-
 # Assumes utils_qa.py is in the same directory (otherwise copy its functions here, or make sure it is importable)
 # from utils_qa import postprocess_qa_predictions  # you may need the full path, or add it to requirements.txt

@@ -100,118 +106,121 @@ def prepare_features_for_qa_inference_gradio(question_id, question_text, selecte
Old lines 100-217:
 # The prepare_features_for_qa_inference function from your inference_pipeline.py needs to be copied here
 # (or make sure it can be imported)
 def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq_len, doc_stride):
     examples["question"] = [q.lstrip() if isinstance(q, str) else "" for q in examples["question"]]
-
-

-    # Ensure questions and contexts are lists of strings, handle None by converting to empty string
-    questions = [q if isinstance(q, str) else "" for q in questions]
-    contexts = [c if isinstance(c, str) else "" for c in contexts]

     tokenized_output = tokenizer(
-
-
         truncation="only_second" if pad_on_right else "only_first",
         max_length=max_seq_len,
         stride=doc_stride,
         return_overflowing_tokens=True,
         return_offsets_mapping=True,
-        padding="max_length",
     )

-    #
-
-
-
-

-    # Let's directly create the structure we need for the output Dataset.
-    # `tokenized_output` is a BatchEncoding (dict-like).
-    # If `return_overflowing_tokens=True` and N features are generated from one example,
-    # then `tokenized_output['input_ids']` is a list of N lists.

     processed_features = []
-    num_generated_features = len(tokenized_output["input_ids"])

-    # `
-

     for i in range(num_generated_features):
         feature = {}
-

-        # These should always be lists of integers due to padding="max_length"
         feature["input_ids"] = tokenized_output["input_ids"][i]
         if "attention_mask" in tokenized_output:
             feature["attention_mask"] = tokenized_output["attention_mask"][i]
         if "token_type_ids" in tokenized_output:
             feature["token_type_ids"] = tokenized_output["token_type_ids"][i]

-
-        feature["example_id"] = examples["id"][original_example_index]

         current_offset_mapping = tokenized_output["offset_mapping"][i]
-        sequence_ids = tokenized_output.sequence_ids(i)
         context_idx_in_pair = 1 if pad_on_right else 0

         feature["offset_mapping"] = [
-            offset if sequence_ids[k] == context_idx_in_pair else None
             for k, offset in enumerate(current_offset_mapping)
         ]
         processed_features.append(feature)

-    # The .map function expects a dictionary where keys are column names
-    # and values are lists of features for those columns.
-    # Since we are processing one original example at a time (batched=True on a Dataset of 1 row),
-    # and this one example can produce multiple features, `processed_features` is a list of dicts.
-    # We need to return a dictionary of lists.
-    if not processed_features:  # Should not happen if tokenizer works, but as a safeguard
-        # Return structure with empty lists to match expected features by .map()
-        # This case indicates an issue with tokenizing the input example.
-        logger.error(f"No features generated for example ID {examples['id'][0]}. Input q: {examples['question'][0]}, c: {examples['context'][0]}")
-        return {
-            "input_ids": [], "token_type_ids": [], "attention_mask": [],
-            "offset_mapping": [], "example_id": []
-        }
-
-    # Transpose the list of feature dictionaries into a dictionary of feature lists
-    # This is what the .map(batched=True) function expects as a return value
     final_batch = {}
-
-
-
-
-
-
-
-                raise ValueError(f"In prepare_features_for_qa_inference, feature list {i} for {key_to_check} is None!")
-            if any(x is None for x in lst):
-                raise ValueError(f"In prepare_features_for_qa_inference, feature list {i} for {key_to_check} contains None! Content: {lst[:20]}")
-
-    for key_to_check in ["input_ids", "attention_mask", "token_type_ids"]:
-        if key_to_check in final_batch:
-            new_list_of_lists = []
-            for single_feature_list in final_batch[key_to_check]:
-                if single_feature_list is None:  # the whole field for this feature is None
-                    # logger.error(f"Critical error: {key_to_check} list for a feature is None. Reconstructing a default.")
-                    # build a safe default value depending on key_to_check
-                    if key_to_check == "input_ids":
-                        safe_list = [tokenizer.cls_token_id or 101, tokenizer.sep_token_id or 102] + \
-                                    [tokenizer.pad_token_id or 0] * (max_seq_len - 2)
-                        new_list_of_lists.append(safe_list[:max_seq_len])
-                    elif key_to_check == "attention_mask":
-                        safe_list = [1, 1] + [0] * (max_seq_len - 2)
-                        new_list_of_lists.append(safe_list[:max_seq_len])
-                    elif key_to_check == "token_type_ids":
-                        new_list_of_lists.append([0] * max_seq_len)
-                elif not all(isinstance(x, int) for x in single_feature_list):  # the list contains non-integers
-                    # logger.error(f"Critical error: {key_to_check} list for a feature contains non-integers: {single_feature_list[:10]}. Fixing.")
-                    default_val = tokenizer.pad_token_id if key_to_check == "input_ids" else 0
-                    new_list_of_lists.append([default_val if not isinstance(x, int) else x for x in single_feature_list])
-                else:
-                    new_list_of_lists.append(single_feature_list)  # already fine
-            final_batch[key_to_check] = new_list_of_lists

     return final_batch

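Both the old and the new version of this function rely on the overflow behaviour of a Hugging Face fast tokenizer: one long context is split into several overlapping features, `overflow_to_sample_mapping` points each generated feature back to the example it came from, and `sequence_ids(i)` separates question tokens from context tokens, which is exactly the check used to blank out non-context entries of `offset_mapping`. The standalone sketch below illustrates that behaviour; the checkpoint name and the toy strings are illustrative only and are not taken from app.py.

from transformers import AutoTokenizer

# Illustrative checkpoint; any fast BERT-style tokenizer behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese", use_fast=True)

encoded = tokenizer(
    ["台灣的首都是哪裡?"],               # one question
    ["台北是台灣的首都。" * 100],         # one context long enough to overflow max_length
    truncation="only_second",
    max_length=128,
    stride=32,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)

# One (question, context) pair can yield several features, all mapping back to input index 0.
print(len(encoded["input_ids"]))
print(encoded["overflow_to_sample_mapping"])

# sequence_ids(i) is None for special tokens, 0 for question tokens, 1 for context tokens.
print(encoded.sequence_ids(0)[:8])
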
New lines 13-25:
 from datasets import Dataset

 from utils_qa import postprocess_qa_predictions
+import logging
+logger = logging.getLogger(__name__)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,  # or logging.DEBUG for more verbose output
+)
 # Assumes utils_qa.py is in the same directory (otherwise copy its functions here, or make sure it is importable)
 # from utils_qa import postprocess_qa_predictions  # you may need the full path, or add it to requirements.txt

New lines 106-226:
 # The prepare_features_for_qa_inference function from your inference_pipeline.py needs to be copied here
 # (or make sure it can be imported)
 def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq_len, doc_stride):
+    # Initial stripping and assignment
     examples["question"] = [q.lstrip() if isinstance(q, str) else "" for q in examples["question"]]
+    questions_to_tokenize = examples["question" if pad_on_right else "context"]
+    contexts_to_tokenize = examples["context" if pad_on_right else "question"]
+
+    questions_to_tokenize = [q if isinstance(q, str) else "" for q in questions_to_tokenize]
+    contexts_to_tokenize = [c if isinstance(c, str) else "" for c in contexts_to_tokenize]
+
+    # Handle cases where either question or context might be empty after processing.
+    # The tokenizer might handle empty strings, but let's be explicit if one is vital.
+    valid_inputs_for_tokenizer_q = []
+    valid_inputs_for_tokenizer_c = []
+    original_indices_for_valid_inputs = []
+
+    for i in range(len(questions_to_tokenize)):
+        q_str = questions_to_tokenize[i]
+        c_str = contexts_to_tokenize[i]
+        # Basic check: if the context is empty, tokenization might be problematic for QA
+        if q_str.strip() and c_str.strip():  # ensure both have content after stripping
+            valid_inputs_for_tokenizer_q.append(q_str)
+            valid_inputs_for_tokenizer_c.append(c_str)
+            original_indices_for_valid_inputs.append(i)
+        else:
+            logger.warning(f"Skipping tokenization for example index {i} due to empty question or context. Q: '{q_str}', C: '{c_str}'")
+
+    if not valid_inputs_for_tokenizer_q:  # no valid (q, c) pairs to tokenize
+        logger.error(f"No valid question/context pairs to tokenize for examples with IDs: {examples.get('id', ['N/A'])}. Returning empty features.")
+        # Return a structure that .map expects (dictionary of empty lists for all expected keys)
+        return {key: [] for key in ["input_ids", "attention_mask", "token_type_ids", "example_id", "offset_mapping"]}

     tokenized_output = tokenizer(
+        valid_inputs_for_tokenizer_q,
+        valid_inputs_for_tokenizer_c,
         truncation="only_second" if pad_on_right else "only_first",
         max_length=max_seq_len,
         stride=doc_stride,
         return_overflowing_tokens=True,
         return_offsets_mapping=True,
+        padding="max_length",
     )

+    # Robustness check and fix for tokenizer outputs
+    keys_to_fix = ["input_ids", "attention_mask", "token_type_ids"]
+    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+    cls_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101  # common default
+    sep_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102  # common default
+
+    for key in keys_to_fix:
+        if key in tokenized_output:
+            for i in range(len(tokenized_output[key])):  # iterate over each feature's list for this key
+                feature_list = tokenized_output[key][i]
+                if feature_list is None:  # the entire list for a feature is None
+                    logger.warning(f"Tokenizer produced None for '{key}' at feature index {i}. Replacing with default.")
+                    if key == "input_ids":
+                        default_seq = [cls_id, sep_id] + [pad_id] * (max_seq_len - 2)
+                        tokenized_output[key][i] = default_seq[:max_seq_len]
+                    elif key == "attention_mask":
+                        default_mask = [1, 1] + [0] * (max_seq_len - 2)
+                        tokenized_output[key][i] = default_mask[:max_seq_len]
+                    elif key == "token_type_ids":
+                        tokenized_output[key][i] = [0] * max_seq_len
+                elif not all(isinstance(x, int) for x in feature_list):  # check for non-integers (like None)
+                    logger.warning(f"Tokenizer produced non-integers in '{key}' at feature index {i}: {str(feature_list)[:100]}... Fixing.")
+                    default_val = pad_id if key == "input_ids" else 0
+                    tokenized_output[key][i] = [default_val if not isinstance(x, int) else x for x in feature_list]

     processed_features = []
+    num_generated_features = len(tokenized_output["input_ids"])
+
+    # sample_mapping from tokenized_output might be incorrect if we filtered inputs.
+    # Reconstruct sample_mapping based on original_indices_for_valid_inputs and overflow.
+    # This part gets tricky if return_overflowing_tokens is True and we filtered.
+    # For simplicity, assume for now that if valid_inputs_for_tokenizer_q is not empty,
+    # the tokenizer works on all of them. The more complex case is if the tokenizer itself only processes a subset.

+    # The `overflow_to_sample_mapping` maps generated features to the indices in the *input to the tokenizer*.
+    # Our input to the tokenizer was `valid_inputs_for_tokenizer_q/c`.
+    overflow_mapping = tokenized_output.pop("overflow_to_sample_mapping")

     for i in range(num_generated_features):
         feature = {}
+        # Map the index from the tokenizer's output (which is based on valid_inputs)
+        # back to the index in the original `examples` batch.
+        idx_in_valid_inputs = overflow_mapping[i]
+        original_example_batch_index = original_indices_for_valid_inputs[idx_in_valid_inputs]

         feature["input_ids"] = tokenized_output["input_ids"][i]
         if "attention_mask" in tokenized_output:
             feature["attention_mask"] = tokenized_output["attention_mask"][i]
         if "token_type_ids" in tokenized_output:
             feature["token_type_ids"] = tokenized_output["token_type_ids"][i]

+        feature["example_id"] = examples["id"][original_example_batch_index]

         current_offset_mapping = tokenized_output["offset_mapping"][i]
+        sequence_ids = tokenized_output.sequence_ids(i)
         context_idx_in_pair = 1 if pad_on_right else 0

         feature["offset_mapping"] = [
+            offset if sequence_ids is not None and k < len(sequence_ids) and sequence_ids[k] == context_idx_in_pair else None
             for k, offset in enumerate(current_offset_mapping)
         ]
         processed_features.append(feature)

     final_batch = {}
+    if processed_features:
+        for key in processed_features[0].keys():
+            final_batch[key] = [feature[key] for feature in processed_features]
+    else:
+        logger.warning(f"No features could be processed for example IDs: {examples.get('id', ['N/A'])}. Input q: {examples.get('question', ['N/A'])}, c: {examples.get('context', ['N/A'])}")
+        for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
+            final_batch[key_to_ensure] = []

     return final_batch

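For orientation, the sketch below shows one way a function with this signature is typically wired into the Gradio inference path that the surrounding comments describe: the single question/context pair is wrapped in a one-row Dataset, mapped with batched=True (so one example may expand into several overlapping features), the features are run through the model, and utils_qa.postprocess_qa_predictions maps the logits back to an answer span. This is only a sketch: names such as answer_question and qa_model are placeholders rather than code from app.py, and the postprocess_qa_predictions call assumes the utils_qa.py from the transformers question-answering examples.

import torch
from datasets import Dataset

def answer_question(question_id, question_text, context_text, tokenizer, qa_model,
                    max_seq_len=384, doc_stride=128):
    # Wrap the single (question, context) pair in a one-row Dataset.
    examples = Dataset.from_dict({
        "id": [question_id],
        "question": [question_text],
        "context": [context_text],
    })
    pad_on_right = tokenizer.padding_side == "right"

    # One example may expand into several features (overflowing windows).
    features = examples.map(
        lambda batch: prepare_features_for_qa_inference(
            batch, tokenizer, pad_on_right, max_seq_len, doc_stride
        ),
        batched=True,
        remove_columns=examples.column_names,
    )

    # The model only consumes the token columns; keep example_id / offset_mapping for post-processing.
    model_inputs = features.remove_columns(["example_id", "offset_mapping"])
    model_inputs.set_format("torch")
    with torch.no_grad():
        outputs = qa_model(**{name: model_inputs[name] for name in model_inputs.column_names})

    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=(outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy()),
    )
    return predictions[question_id]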