Update app.py
app.py CHANGED
@@ -214,14 +214,49 @@ def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq
         processed_features.append(feature)

     final_batch = {}
-    if processed_features:
-        for
-
-    else:
-        logger.warning(f"No features could be processed for example IDs: {examples.get('id', ['N/A'])}. Input q: {examples.get('question', ['N/A'])}, c: {examples.get('context', ['N/A'])}")
+    if not processed_features:
+        logger.warning(f"No features generated for example IDs: {examples.get('id', ['N/A'])}. Returning empty structure.")
+        # Make sure the returned structure matches what .map expects: dict keys are column names, values are empty lists
         for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
             final_batch[key_to_ensure] = []
+        return final_batch
+
+    # 1. First, convert processed_features (a list of dicts) into final_batch (a dict of lists)
+    for key in processed_features[0].keys():  # assumes all feature dicts share the same keys
+        final_batch[key] = [feature[key] for feature in processed_features]
+
+    # 2. Then run robustness checks and fixes on the fields that will be converted to tensors
+    keys_to_fix_for_tensor_conversion = ["input_ids", "attention_mask", "token_type_ids"]
+    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+    cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101
+    sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102
+
+    for key in keys_to_fix_for_tensor_conversion:
+        if key in final_batch:
+            # final_batch[key] is a list of lists, e.g. [[ids_for_feature1], [ids_for_feature2], ...]
+            corrected_list_of_lists = []
+            for i, single_feature_list in enumerate(final_batch[key]):
+                if single_feature_list is None:
+                    logger.warning(f"Feature list for '{key}' at index {i} is None. Replacing with default for max_seq_len {max_seq_len}.")
+                    if key == "input_ids":
+                        default_seq = [cls_token_id, sep_token_id] + [pad_token_id] * (max_seq_len - 2)
+                        corrected_list_of_lists.append(default_seq[:max_seq_len])
+                    elif key == "attention_mask":
+                        default_mask = [1, 1] + [0] * (max_seq_len - 2)
+                        corrected_list_of_lists.append(default_mask[:max_seq_len])
+                    elif key == "token_type_ids":
+                        corrected_list_of_lists.append([0] * max_seq_len)
+                elif not all(isinstance(x, int) for x in single_feature_list):
+                    logger.warning(f"Feature list for '{key}' at index {i} contains non-integers: {str(single_feature_list)[:50]}... Fixing Nones.")
+                    default_val = pad_token_id if key == "input_ids" else 0
+                    fixed_list = [default_val if not isinstance(x, int) else x for x in single_feature_list]
+                    corrected_list_of_lists.append(fixed_list)
+                else:
+                    corrected_list_of_lists.append(single_feature_list)  # list is already good
+            final_batch[key] = corrected_list_of_lists

+    # Before returning, an extra debug log can confirm the corrected final_batch structure
+    # logger.debug(f"Returning final_batch from prepare_features: { {k: str(v)[:200] + '...' for k,v in final_batch.items()} }")
     return final_batch

 # The postprocess_qa_predictions function also needs to be copied or imported from utils_qa.py
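For context, here is a minimal sketch of how a batched feature-preparation function like this is typically hooked up to the Hugging Face `datasets` library; the dataset variable, the `max_seq_len` value, and the `partial` wiring below are illustrative assumptions, not taken from app.py:

```python
# Hypothetical usage sketch — `examples_ds` is assumed to be a datasets.Dataset
# with "id", "question", and "context" columns.
from functools import partial

features_ds = examples_ds.map(
    partial(
        prepare_features_for_qa_inference,
        tokenizer=tokenizer,
        pad_on_right=tokenizer.padding_side == "right",  # assumption: pad_on_right derived from the tokenizer
        max_seq_len=384,                                 # assumed value
    ),
    batched=True,
    remove_columns=examples_ds.column_names,  # keep only the keys the function returns
)
```

Because `.map` with `batched=True` expects every batch to return the same column names, the empty-structure fallback in the diff (the same five keys mapped to empty lists) is what keeps a no-feature batch from raising a schema-mismatch error.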
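The repair pass exists because these lists are later stacked into tensors. A tiny standalone demo (an assumed scenario, using PyTorch) of why a single `None` breaks that step:

```python
# Demo: rectangular integer lists convert cleanly; a stray None does not.
import torch

rows = [[101, 7592, 102, 0], [101, 2088, 102, 0]]  # e.g. [CLS] ... [SEP] [PAD]
print(torch.tensor(rows).shape)  # torch.Size([2, 4])

broken = [[101, None, 102, 0], [101, 2088, 102, 0]]
try:
    torch.tensor(broken)
except (TypeError, RuntimeError) as err:  # error type varies by PyTorch version
    print(f"tensor conversion failed: {err}")
```

Replacing non-integers with `pad_token_id` and `None` rows with a fixed-length `[CLS] [SEP] [PAD]...` default keeps every entry an integer and every row at `max_seq_len`, so the batch stacks into a `(num_features, max_seq_len)` tensor.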