Update app.py
app.py
CHANGED
@@ -13,7 +13,13 @@ import numpy as np
Old lines 13-19:
 from datasets import Dataset

 from utils_qa import postprocess_qa_predictions
-
 # Assumes utils_qa.py is in the same directory (otherwise copy its functions here, or make sure it is importable)
 # from utils_qa import postprocess_qa_predictions  # you may need the full path, or add it to requirements.txt

@@ -100,118 +106,121 @@ def prepare_features_for_qa_inference_gradio(question_id, question_text, selecte
Old lines 100-217:
 # The prepare_features_for_qa_inference function from your inference_pipeline.py needs to be copied here
 # (or make sure it can be imported)
 def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq_len, doc_stride):
     examples["question"] = [q.lstrip() if isinstance(q, str) else "" for q in examples["question"]]
-
-

-    # Ensure questions and contexts are lists of strings, handle None by converting to empty string
-    questions = [q if isinstance(q, str) else "" for q in questions]
-    contexts = [c if isinstance(c, str) else "" for c in contexts]

     tokenized_output = tokenizer(
-
-
         truncation="only_second" if pad_on_right else "only_first",
         max_length=max_seq_len,
         stride=doc_stride,
         return_overflowing_tokens=True,
         return_offsets_mapping=True,
-        padding="max_length",
     )

-    #
-
-
-
-

-    # Let's directly create the structure we need for the output Dataset.
-    # `tokenized_output` is a BatchEncoding (dict-like).
-    # If `return_overflowing_tokens=True` and N features are generated from one example,
-    # then `tokenized_output['input_ids']` is a list of N lists.

     processed_features = []
-    num_generated_features = len(tokenized_output["input_ids"])

-    # `
-

     for i in range(num_generated_features):
         feature = {}
-

-        # These should always be lists of integers due to padding="max_length"
         feature["input_ids"] = tokenized_output["input_ids"][i]
         if "attention_mask" in tokenized_output:
             feature["attention_mask"] = tokenized_output["attention_mask"][i]
         if "token_type_ids" in tokenized_output:
             feature["token_type_ids"] = tokenized_output["token_type_ids"][i]

-
-        feature["example_id"] = examples["id"][original_example_index]

         current_offset_mapping = tokenized_output["offset_mapping"][i]
-        sequence_ids = tokenized_output.sequence_ids(i)
         context_idx_in_pair = 1 if pad_on_right else 0

         feature["offset_mapping"] = [
-            offset if sequence_ids[k] == context_idx_in_pair else None
             for k, offset in enumerate(current_offset_mapping)
         ]
         processed_features.append(feature)

-    # The .map function expects a dictionary where keys are column names
-    # and values are lists of features for those columns.
-    # Since we are processing one original example at a time (batched=True on a Dataset of 1 row),
-    # and this one example can produce multiple features, `processed_features` is a list of dicts.
-    # We need to return a dictionary of lists.
-    if not processed_features:  # Should not happen if tokenizer works, but as a safeguard
-        # Return structure with empty lists to match expected features by .map()
-        # This case indicates an issue with tokenizing the input example.
-        logger.error(f"No features generated for example ID {examples['id'][0]}. Input q: {examples['question'][0]}, c: {examples['context'][0]}")
-        return {
-            "input_ids": [], "token_type_ids": [], "attention_mask": [],
-            "offset_mapping": [], "example_id": []
-        }
-
-    # Transpose the list of feature dictionaries into a dictionary of feature lists
-    # This is what the .map(batched=True) function expects as a return value
     final_batch = {}
-
-
-
-
-
-
-
-                raise ValueError(f"In prepare_features_for_qa_inference, feature list {i} for {key_to_check} is None!")
-            if any(x is None for x in lst):
-                raise ValueError(f"In prepare_features_for_qa_inference, feature list {i} for {key_to_check} contains None! Content: {lst[:20]}")
-
-    for key_to_check in ["input_ids", "attention_mask", "token_type_ids"]:
-        if key_to_check in final_batch:
-            new_list_of_lists = []
-            for single_feature_list in final_batch[key_to_check]:
-                if single_feature_list is None:  # the whole field for this feature is None
-                    # logger.error(f"Critical error: {key_to_check} list for a feature is None. Reconstructing a default.")
-                    # build a safe default value depending on key_to_check
-                    if key_to_check == "input_ids":
-                        safe_list = [tokenizer.cls_token_id or 101, tokenizer.sep_token_id or 102] + \
-                                    [tokenizer.pad_token_id or 0] * (max_seq_len - 2)
-                        new_list_of_lists.append(safe_list[:max_seq_len])
-                    elif key_to_check == "attention_mask":
-                        safe_list = [1, 1] + [0] * (max_seq_len - 2)
-                        new_list_of_lists.append(safe_list[:max_seq_len])
-                    elif key_to_check == "token_type_ids":
-                        new_list_of_lists.append([0] * max_seq_len)
-                elif not all(isinstance(x, int) for x in single_feature_list):  # the list contains non-integers
-                    # logger.error(f"Critical error: {key_to_check} list for a feature contains non-integers: {single_feature_list[:10]}. Fixing.")
-                    default_val = tokenizer.pad_token_id if key_to_check == "input_ids" else 0
-                    new_list_of_lists.append([default_val if not isinstance(x, int) else x for x in single_feature_list])
-                else:
-                    new_list_of_lists.append(single_feature_list)  # already fine
-            final_batch[key_to_check] = new_list_of_lists

     return final_batch

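Both the old and the new version of this function rely on the overflow behaviour of a Hugging Face fast tokenizer: one long context is split into several overlapping features, `overflow_to_sample_mapping` points each generated feature back to the example it came from, and `sequence_ids(i)` separates question tokens from context tokens, which is exactly the check used to blank out non-context entries of `offset_mapping`. The standalone sketch below illustrates that behaviour; the checkpoint name and the toy strings are illustrative only and are not taken from app.py.

from transformers import AutoTokenizer

# Illustrative checkpoint; any fast BERT-style tokenizer behaves the same way.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese", use_fast=True)

encoded = tokenizer(
    ["台灣的首都是哪裡?"],               # one question
    ["台北是台灣的首都。" * 100],         # one context long enough to overflow max_length
    truncation="only_second",
    max_length=128,
    stride=32,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)

# One (question, context) pair can yield several features, all mapping back to input index 0.
print(len(encoded["input_ids"]))
print(encoded["overflow_to_sample_mapping"])

# sequence_ids(i) is None for special tokens, 0 for question tokens, 1 for context tokens.
print(encoded.sequence_ids(0)[:8])
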
New lines 13-25:
 from datasets import Dataset

 from utils_qa import postprocess_qa_predictions
+import logging
+logger = logging.getLogger(__name__)
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,  # or logging.DEBUG for more verbose output
+)
 # Assumes utils_qa.py is in the same directory (otherwise copy its functions here, or make sure it is importable)
 # from utils_qa import postprocess_qa_predictions  # you may need the full path, or add it to requirements.txt

New lines 106-226:
 # The prepare_features_for_qa_inference function from your inference_pipeline.py needs to be copied here
 # (or make sure it can be imported)
 def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq_len, doc_stride):
+    # Initial stripping and assignment
     examples["question"] = [q.lstrip() if isinstance(q, str) else "" for q in examples["question"]]
+    questions_to_tokenize = examples["question" if pad_on_right else "context"]
+    contexts_to_tokenize = examples["context" if pad_on_right else "question"]
+
+    questions_to_tokenize = [q if isinstance(q, str) else "" for q in questions_to_tokenize]
+    contexts_to_tokenize = [c if isinstance(c, str) else "" for c in contexts_to_tokenize]
+
+    # Handle cases where either question or context might be empty after processing.
+    # The tokenizer might handle empty strings, but let's be explicit if one is vital.
+    valid_inputs_for_tokenizer_q = []
+    valid_inputs_for_tokenizer_c = []
+    original_indices_for_valid_inputs = []
+
+    for i in range(len(questions_to_tokenize)):
+        q_str = questions_to_tokenize[i]
+        c_str = contexts_to_tokenize[i]
+        # Basic check: if the context is empty, tokenization might be problematic for QA
+        if q_str.strip() and c_str.strip():  # ensure both have content after stripping
+            valid_inputs_for_tokenizer_q.append(q_str)
+            valid_inputs_for_tokenizer_c.append(c_str)
+            original_indices_for_valid_inputs.append(i)
+        else:
+            logger.warning(f"Skipping tokenization for example index {i} due to empty question or context. Q: '{q_str}', C: '{c_str}'")
+
+    if not valid_inputs_for_tokenizer_q:  # no valid (q, c) pairs to tokenize
+        logger.error(f"No valid question/context pairs to tokenize for examples with IDs: {examples.get('id', ['N/A'])}. Returning empty features.")
+        # Return a structure that .map expects (dictionary of empty lists for all expected keys)
+        return {key: [] for key in ["input_ids", "attention_mask", "token_type_ids", "example_id", "offset_mapping"]}

     tokenized_output = tokenizer(
+        valid_inputs_for_tokenizer_q,
+        valid_inputs_for_tokenizer_c,
         truncation="only_second" if pad_on_right else "only_first",
         max_length=max_seq_len,
         stride=doc_stride,
         return_overflowing_tokens=True,
         return_offsets_mapping=True,
+        padding="max_length",
     )

+    # Robustness check and fix for tokenizer outputs
+    keys_to_fix = ["input_ids", "attention_mask", "token_type_ids"]
+    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+    cls_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101  # common default
+    sep_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102  # common default
+
+    for key in keys_to_fix:
+        if key in tokenized_output:
+            for i in range(len(tokenized_output[key])):  # iterate over each feature's list for this key
+                feature_list = tokenized_output[key][i]
+                if feature_list is None:  # the entire list for a feature is None
+                    logger.warning(f"Tokenizer produced None for '{key}' at feature index {i}. Replacing with default.")
+                    if key == "input_ids":
+                        default_seq = [cls_id, sep_id] + [pad_id] * (max_seq_len - 2)
+                        tokenized_output[key][i] = default_seq[:max_seq_len]
+                    elif key == "attention_mask":
+                        default_mask = [1, 1] + [0] * (max_seq_len - 2)
+                        tokenized_output[key][i] = default_mask[:max_seq_len]
+                    elif key == "token_type_ids":
+                        tokenized_output[key][i] = [0] * max_seq_len
+                elif not all(isinstance(x, int) for x in feature_list):  # check for non-integers (like None)
+                    logger.warning(f"Tokenizer produced non-integers in '{key}' at feature index {i}: {str(feature_list)[:100]}... Fixing.")
+                    default_val = pad_id if key == "input_ids" else 0
+                    tokenized_output[key][i] = [default_val if not isinstance(x, int) else x for x in feature_list]

     processed_features = []
+    num_generated_features = len(tokenized_output["input_ids"])
+
+    # sample_mapping from tokenized_output might be incorrect if we filtered inputs.
+    # Reconstruct sample_mapping based on original_indices_for_valid_inputs and overflow.
+    # This part gets tricky if return_overflowing_tokens is True and we filtered.
+    # For simplicity, assume for now that if valid_inputs_for_tokenizer_q is not empty,
+    # the tokenizer works on all of them. The more complex case is if the tokenizer itself only processes a subset.

+    # The `overflow_to_sample_mapping` maps generated features to the indices in the *input to the tokenizer*.
+    # Our input to the tokenizer was `valid_inputs_for_tokenizer_q/c`.
+    overflow_mapping = tokenized_output.pop("overflow_to_sample_mapping")

     for i in range(num_generated_features):
         feature = {}
+        # Map the index from the tokenizer's output (which is based on valid_inputs)
+        # back to the index in the original `examples` batch.
+        idx_in_valid_inputs = overflow_mapping[i]
+        original_example_batch_index = original_indices_for_valid_inputs[idx_in_valid_inputs]

         feature["input_ids"] = tokenized_output["input_ids"][i]
         if "attention_mask" in tokenized_output:
             feature["attention_mask"] = tokenized_output["attention_mask"][i]
         if "token_type_ids" in tokenized_output:
             feature["token_type_ids"] = tokenized_output["token_type_ids"][i]

+        feature["example_id"] = examples["id"][original_example_batch_index]

         current_offset_mapping = tokenized_output["offset_mapping"][i]
+        sequence_ids = tokenized_output.sequence_ids(i)
         context_idx_in_pair = 1 if pad_on_right else 0

         feature["offset_mapping"] = [
+            offset if sequence_ids is not None and k < len(sequence_ids) and sequence_ids[k] == context_idx_in_pair else None
             for k, offset in enumerate(current_offset_mapping)
         ]
         processed_features.append(feature)

     final_batch = {}
+    if processed_features:
+        for key in processed_features[0].keys():
+            final_batch[key] = [feature[key] for feature in processed_features]
+    else:
+        logger.warning(f"No features could be processed for example IDs: {examples.get('id', ['N/A'])}. Input q: {examples.get('question', ['N/A'])}, c: {examples.get('context', ['N/A'])}")
+        for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
+            final_batch[key_to_ensure] = []

     return final_batch

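For orientation, the sketch below shows one way a function with this signature is typically wired into the Gradio inference path that the surrounding comments describe: the single question/context pair is wrapped in a one-row Dataset, mapped with batched=True (so one example may expand into several overlapping features), the features are run through the model, and utils_qa.postprocess_qa_predictions maps the logits back to an answer span. This is only a sketch: names such as answer_question and qa_model are placeholders rather than code from app.py, and the postprocess_qa_predictions call assumes the utils_qa.py from the transformers question-answering examples.

import torch
from datasets import Dataset

def answer_question(question_id, question_text, context_text, tokenizer, qa_model,
                    max_seq_len=384, doc_stride=128):
    # Wrap the single (question, context) pair in a one-row Dataset.
    examples = Dataset.from_dict({
        "id": [question_id],
        "question": [question_text],
        "context": [context_text],
    })
    pad_on_right = tokenizer.padding_side == "right"

    # One example may expand into several features (overflowing windows).
    features = examples.map(
        lambda batch: prepare_features_for_qa_inference(
            batch, tokenizer, pad_on_right, max_seq_len, doc_stride
        ),
        batched=True,
        remove_columns=examples.column_names,
    )

    # The model only consumes the token columns; keep example_id / offset_mapping for post-processing.
    model_inputs = features.remove_columns(["example_id", "offset_mapping"])
    model_inputs.set_format("torch")
    with torch.no_grad():
        outputs = qa_model(**{name: model_inputs[name] for name in model_inputs.column_names})

    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=(outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy()),
    )
    return predictions[question_id]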