Looooooong committed
Commit daca86f · 1 Parent(s): 076d29b

Add Gradio application file

Files changed (2)
  1. app.py +280 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,280 @@
import gradio as gr
import torch
import numpy as np
import logging
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMultipleChoice,
    AutoModelForQuestionAnswering,
    default_data_collator,
)
import json         # may be needed by postprocess_qa_predictions
import collections  # may be needed by postprocess_qa_predictions

# utils_qa.py is assumed to sit next to this file (copy it into the Space or make
# sure it is importable); postprocess_qa_predictions is required further below.
from utils_qa import postprocess_qa_predictions

logger = logging.getLogger(__name__)

# --- Load the models and tokenizer ---
# Loading the uploaded models from the Hugging Face Hub keeps the Space
# lightweight: no model files have to be bundled with the app itself.
TOKENIZER_PATH = "TheWeeeed/bert-base-chinese"                # or the path of your uploaded tokenizer
SELECTOR_MODEL_PATH = "TheWeeeed/chinese-paragraph-selector"  # your uploaded paragraph-selection model ID
QA_MODEL_PATH = "TheWeeeed/chinese-extractive-qa"             # your uploaded answer-extraction model ID

try:
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
    selector_model = AutoModelForMultipleChoice.from_pretrained(SELECTOR_MODEL_PATH)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_PATH)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    selector_model.to(device)
    selector_model.eval()
    qa_model.to(device)
    qa_model.eval()
    models_loaded_successfully = True
    print(f"模型和分詞器加載成功,使用設備: {device}")
except Exception as e:
    models_loaded_successfully = False
    error_message = f"加載模型或分詞器時出錯: {e}"
    print(error_message)
    # The error message is surfaced in the Gradio interface below.

# --- The functions below are adapted from inference_pipeline.py ---
def select_relevant_paragraph_gradio(question_text, candidate_paragraph_texts_str, model, tokenizer, device, max_seq_len):
    # candidate_paragraph_texts_str is a newline-separated string of paragraphs
    candidate_paragraph_texts = [p.strip() for p in candidate_paragraph_texts_str.split('\n') if p.strip()]
    if not candidate_paragraph_texts:
        return "請至少提供一個候選段落。", -1

    model.eval()
    # Encode each (question, paragraph) pair separately, then stack the encodings
    # into a single multiple-choice input of shape (1, num_choices, seq_len).
    inputs_mc = []
    for p_text in candidate_paragraph_texts:
        inputs_mc.append(
            tokenizer(
                question_text, p_text, add_special_tokens=True, max_length=max_seq_len,
                padding="max_length", truncation=True, return_tensors="pt"
            )
        )
    input_ids = torch.stack([inp["input_ids"].squeeze(0) for inp in inputs_mc]).unsqueeze(0).to(device)
    attention_mask = torch.stack([inp["attention_mask"].squeeze(0) for inp in inputs_mc]).unsqueeze(0).to(device)
    token_type_ids = None
    if "token_type_ids" in inputs_mc[0]:
        token_type_ids = torch.stack([inp["token_type_ids"].squeeze(0) for inp in inputs_mc]).unsqueeze(0).to(device)

    with torch.no_grad():
        if token_type_ids is not None:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predicted_index = torch.argmax(outputs.logits, dim=1).item()
    if predicted_index < len(candidate_paragraph_texts):
        return candidate_paragraph_texts[predicted_index], predicted_index
    else:
        return "段落選擇索引錯誤。", -1

def prepare_features_for_qa_inference_gradio(question_id, question_text, selected_context, tokenizer, max_seq_len, doc_stride):
    # Adapted from inference_pipeline.py: wrap the single (question, context) pair
    # in a Dataset and return the tokenized features the QA model expects.
    qa_example_for_processing = {"id": [question_id], "question": [question_text], "context": [selected_context]}
    temp_dataset = Dataset.from_dict(qa_example_for_processing)

    pad_on_right = tokenizer.padding_side == "right"

    qa_features = temp_dataset.map(
        lambda examples: prepare_features_for_qa_inference(  # defined below, copied from inference_pipeline.py
            examples, tokenizer, pad_on_right, max_seq_len, doc_stride
        ),
        batched=True,
        remove_columns=temp_dataset.column_names
    )
    return qa_features  # a datasets.Dataset of QA features

# prepare_features_for_qa_inference from inference_pipeline.py is reproduced here
# (alternatively it could be imported).
def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq_len, doc_stride):
    examples["question"] = [q.lstrip() if isinstance(q, str) else "" for q in examples["question"]]
    questions = examples["question" if pad_on_right else "context"]
    contexts = examples["context" if pad_on_right else "question"]

    # Ensure questions and contexts are lists of strings; convert None to ""
    questions = [q if isinstance(q, str) else "" for q in questions]
    contexts = [c if isinstance(c, str) else "" for c in contexts]

    tokenized_output = tokenizer(
        questions,
        contexts,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_len,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",  # every feature becomes a fixed-length list of ids
    )

    # With padding="max_length" the tokenizer already returns plain lists of
    # integers for input_ids, attention_mask and token_type_ids; None values
    # would only appear if tokenization itself failed on malformed input.

    # `tokenized_output` is a dict-like BatchEncoding. Because
    # return_overflowing_tokens=True, one long example can yield N features, so
    # tokenized_output["input_ids"] is a list of N lists.
    processed_features = []
    num_generated_features = len(tokenized_output["input_ids"])  # number of features after overflow

    # `sample_mapping` maps each generated feature back to the index of the
    # original example it came from.
    sample_mapping = tokenized_output.pop("overflow_to_sample_mapping", list(range(len(examples["id"]))))

    for i in range(num_generated_features):
        feature = {}
        original_example_index = sample_mapping[i]  # original example this feature came from

        # Always lists of integers thanks to padding="max_length"
        feature["input_ids"] = tokenized_output["input_ids"][i]
        if "attention_mask" in tokenized_output:
            feature["attention_mask"] = tokenized_output["attention_mask"][i]
        if "token_type_ids" in tokenized_output:
            feature["token_type_ids"] = tokenized_output["token_type_ids"][i]

        # Not needed by the model's forward pass, but required by postprocessing
        feature["example_id"] = examples["id"][original_example_index]

        current_offset_mapping = tokenized_output["offset_mapping"][i]
        sequence_ids = tokenized_output.sequence_ids(i)  # sequence ids of feature i
        context_idx_in_pair = 1 if pad_on_right else 0

        # Keep character offsets only for context tokens; others become None
        feature["offset_mapping"] = [
            offset if sequence_ids[k] == context_idx_in_pair else None
            for k, offset in enumerate(current_offset_mapping)
        ]
        processed_features.append(feature)

    # Dataset.map(batched=True) expects a dict of column name -> list of values.
    # One original example may have produced several features, so the list of
    # per-feature dicts has to be transposed into a dict of lists.
    if not processed_features:  # safeguard; indicates the example could not be tokenized
        logger.error(f"No features generated for example ID {examples['id'][0]}. Input q: {examples['question'][0]}, c: {examples['context'][0]}")
        return {
            "input_ids": [], "token_type_ids": [], "attention_mask": [],
            "offset_mapping": [], "example_id": []
        }

    final_batch = {}
    for key in processed_features[0].keys():
        final_batch[key] = [feature[key] for feature in processed_features]

    # Sanity check: none of the id/mask lists may be or contain None
    for key_to_check in ["input_ids", "attention_mask", "token_type_ids"]:
        if key_to_check in final_batch:
            for i, lst in enumerate(final_batch[key_to_check]):
                if lst is None:
                    raise ValueError(f"在 prepare_features_for_qa_inference 中,{key_to_check} 的第 {i} 個特徵列表為 None!")
                if any(x is None for x in lst):
                    raise ValueError(f"在 prepare_features_for_qa_inference 中,{key_to_check} 的第 {i} 個特徵列表內部包含 None!內容: {lst[:20]}")

    return final_batch

# postprocess_qa_predictions comes from utils_qa.py; it is imported at the top of
# this file, so utils_qa.py must be available in the Space's environment.
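
# If utils_qa.py cannot be shipped with the Space, a much simpler fallback can
# recover an answer span from the logits of a single feature. This is only an
# illustrative sketch (naive_extract_answer is not part of the original
# pipeline): unlike postprocess_qa_predictions it ignores n-best aggregation
# across overflowed features and null answers.
def naive_extract_answer(context, start_logits, end_logits, offset_mapping, max_answer_length=100):
    best_score, best_span = float("-inf"), None
    for s, start_off in enumerate(offset_mapping):
        if start_off is None:
            continue  # not a context token
        # Only consider end positions within max_answer_length tokens of the start
        for e in range(s, min(s + max_answer_length, len(offset_mapping))):
            end_off = offset_mapping[e]
            if end_off is None:
                continue
            score = start_logits[s] + end_logits[e]
            if score > best_score:
                best_score, best_span = score, (start_off[0], end_off[1])
    return context[best_span[0]:best_span[1]] if best_span else ""
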
# --- Gradio interface function ---
def two_stage_qa(question, candidate_paragraphs_str, max_seq_len_mc=512, max_seq_len_qa=384, doc_stride_qa=128, n_best_size=20, max_answer_length=100):
    if not models_loaded_successfully:
        return f"錯誤: {error_message}", "N/A", "N/A"

    if not question.strip() or not candidate_paragraphs_str.strip():
        return "錯誤: 問題和候選段落不能為空。", "N/A", "N/A"

    # Stage 1: select the most relevant paragraph
    selected_paragraph, selected_idx = select_relevant_paragraph_gradio(
        question, candidate_paragraphs_str, selector_model, tokenizer, device, max_seq_len_mc
    )
    if selected_idx == -1:  # paragraph selection failed
        return f"段落選擇出錯: {selected_paragraph}", "N/A", selected_paragraph

    # Stage 2: prepare QA features for the selected paragraph
    qa_features_dataset = prepare_features_for_qa_inference_gradio(
        "temp_id", question, selected_paragraph, tokenizer, max_seq_len_qa, doc_stride_qa
    )

    if len(qa_features_dataset) == 0:
        return "錯誤: 無法為選定段落生成QA特徵 (可能段落太短或內容問題)。", f"選中的段落 (索引 {selected_idx}):\n{selected_paragraph}", "N/A"

    # Build the DataLoader. example_id (a string) and offset_mapping (contains
    # None) cannot be collated into tensors, so drop them before batching; they
    # remain available in qa_features_dataset for postprocessing.
    qa_features_for_model = qa_features_dataset.remove_columns(["example_id", "offset_mapping"])
    qa_dataloader = DataLoader(
        qa_features_for_model, collate_fn=default_data_collator, batch_size=8  # a small batch size is enough here
    )

    all_start_logits = []
    all_end_logits = []
    for batch in qa_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs_qa = qa_model(**batch)
        all_start_logits.append(outputs_qa.start_logits.cpu().numpy())
        all_end_logits.append(outputs_qa.end_logits.cpu().numpy())

    if not all_start_logits:
        return "錯誤: QA模型沒有產生logits。", f"選中的段落 (索引 {selected_idx}):\n{selected_paragraph}", "N/A"

    start_logits_np = np.concatenate(all_start_logits, axis=0)
    end_logits_np = np.concatenate(all_end_logits, axis=0)

    # postprocess_qa_predictions expects the original examples as a Dataset with
    # an "answers" column, so rebuild it with empty answers.
    def add_empty_answers(example):
        example["answers"] = {"text": [], "answer_start": []}
        return example

    original_example_for_postproc = {"id": ["temp_id"], "question": [question], "context": [selected_paragraph]}
    original_dataset_for_postproc = Dataset.from_dict(original_example_for_postproc).map(add_empty_answers)

    # Postprocessing: map the logits back to answer text via the offset mappings
    predictions_dict = postprocess_qa_predictions(
        examples=original_dataset_for_postproc,  # original examples with context and empty answers
        features=qa_features_dataset,            # features with offset_mapping and example_id
        predictions=(start_logits_np, end_logits_np),
        version_2_with_negative=False,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        null_score_diff_threshold=0.0,
        output_dir=None,
        prefix="gradio_predict",
        is_world_process_zero=True
    )

    final_answer = predictions_dict.get("temp_id", "未能提取答案。")

    return final_answer, f"選中的段落 (索引 {selected_idx}):\n{selected_paragraph}", predictions_dict

# --- Build the Gradio interface ---
iface = gr.Interface(
    fn=two_stage_qa,
    inputs=[
        gr.Textbox(lines=2, placeholder="輸入您的問題...", label="問題 (Question)"),
        gr.Textbox(lines=10, placeholder="在此處輸入候選段落,每段一行...", label="候選段落 (Candidate Paragraphs - One per line)")
    ],
    outputs=[
        gr.Textbox(label="預測答案 (Predicted Answer)"),
        gr.Textbox(label="選中的相關段落 (Selected Relevant Paragraph)"),
        gr.JSON(label="原始預測字典 (Raw Predictions Dict - for debugging)")  # optional debugging output
    ],
    title="兩階段中文抽取式問答系統",
    description="輸入一個問題和多個候選段落(每行一個段落)。系統會先選擇最相關的段落,然後從中抽取答案。",
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch()
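
# For a quick check without the UI, the pipeline function can also be called
# directly, e.g. from a Python shell (the question and paragraphs below are
# hypothetical placeholders):
#   from app import two_stage_qa
#   answer, paragraph, raw = two_stage_qa(
#       "問題文字...",
#       "候選段落一...\n候選段落二..."
#   )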
requirements.txt ADDED
@@ -0,0 +1,6 @@
transformers
torch
datasets
gradio
pandas
numpy