Update app.py
app.py CHANGED
@@ -214,14 +214,49 @@ def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq
         processed_features.append(feature)

     final_batch = {}
-    if processed_features:
-        for
-
-    else:
-        logger.warning(f"No features could be processed for example IDs: {examples.get('id', ['N/A'])}. Input q: {examples.get('question', ['N/A'])}, c: {examples.get('context', ['N/A'])}")
+    if not processed_features:
+        logger.warning(f"No features generated for example IDs: {examples.get('id', ['N/A'])}. Returning empty structure.")
+        # Make sure the returned structure matches what .map expects: dict keys are column names, values are empty lists
         for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
             final_batch[key_to_ensure] = []
+        return final_batch
+
+    # 1. First, convert processed_features (a list of dicts) into final_batch (a dict of lists)
+    for key in processed_features[0].keys():  # assumes all feature dicts share the same keys
+        final_batch[key] = [feature[key] for feature in processed_features]
+
+    # 2. Then run robustness checks and fixes on the fields that will be converted to tensors
+    keys_to_fix_for_tensor_conversion = ["input_ids", "attention_mask", "token_type_ids"]
+    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+    cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101
+    sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102
+
+    for key in keys_to_fix_for_tensor_conversion:
+        if key in final_batch:
+            # final_batch[key] is a list of lists, e.g. [[ids_for_feature1], [ids_for_feature2], ...]
+            corrected_list_of_lists = []
+            for i, single_feature_list in enumerate(final_batch[key]):
+                if single_feature_list is None:
+                    logger.warning(f"Feature list for '{key}' at index {i} is None. Replacing with default for max_seq_len {max_seq_len}.")
+                    if key == "input_ids":
+                        default_seq = [cls_token_id, sep_token_id] + [pad_token_id] * (max_seq_len - 2)
+                        corrected_list_of_lists.append(default_seq[:max_seq_len])
+                    elif key == "attention_mask":
+                        default_mask = [1, 1] + [0] * (max_seq_len - 2)
+                        corrected_list_of_lists.append(default_mask[:max_seq_len])
+                    elif key == "token_type_ids":
+                        corrected_list_of_lists.append([0] * max_seq_len)
+                elif not all(isinstance(x, int) for x in single_feature_list):
+                    logger.warning(f"Feature list for '{key}' at index {i} contains non-integers: {str(single_feature_list)[:50]}... Fixing Nones.")
+                    default_val = pad_token_id if key == "input_ids" else 0
+                    fixed_list = [default_val if not isinstance(x, int) else x for x in single_feature_list]
+                    corrected_list_of_lists.append(fixed_list)
+                else:
+                    corrected_list_of_lists.append(single_feature_list)  # list is already good
+            final_batch[key] = corrected_list_of_lists

+    # Before returning, an extra debug log can confirm the corrected final_batch structure
+    # logger.debug(f"Returning final_batch from prepare_features: { {k: str(v)[:200] + '...' for k,v in final_batch.items()} }")
     return final_batch

 # The postprocess_qa_predictions function also needs to be copied or imported from utils_qa.py
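For context, here is a minimal sketch of how a batched feature-preparation function like this is typically hooked up to the Hugging Face `datasets` library; the dataset variable, the `max_seq_len` value, and the `partial` wiring below are illustrative assumptions, not taken from app.py:

```python
# Hypothetical usage sketch — `examples_ds` is assumed to be a datasets.Dataset
# with "id", "question", and "context" columns.
from functools import partial

features_ds = examples_ds.map(
    partial(
        prepare_features_for_qa_inference,
        tokenizer=tokenizer,
        pad_on_right=tokenizer.padding_side == "right",  # assumption: pad_on_right derived from the tokenizer
        max_seq_len=384,                                 # assumed value
    ),
    batched=True,
    remove_columns=examples_ds.column_names,  # keep only the keys the function returns
)
```

Because `.map` with `batched=True` expects every batch to return the same column names, the empty-structure fallback in the diff (the same five keys mapped to empty lists) is what keeps a no-feature batch from raising a schema-mismatch error.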
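The repair pass exists because these lists are later stacked into tensors. A tiny standalone demo (an assumed scenario, using PyTorch) of why a single `None` breaks that step:

```python
# Demo: rectangular integer lists convert cleanly; a stray None does not.
import torch

rows = [[101, 7592, 102, 0], [101, 2088, 102, 0]]  # e.g. [CLS] ... [SEP] [PAD]
print(torch.tensor(rows).shape)  # torch.Size([2, 4])

broken = [[101, None, 102, 0], [101, 2088, 102, 0]]
try:
    torch.tensor(broken)
except (TypeError, RuntimeError) as err:  # error type varies by PyTorch version
    print(f"tensor conversion failed: {err}")
```

Replacing non-integers with `pad_token_id` and `None` rows with a fixed-length `[CLS] [SEP] [PAD]...` default keeps every entry an integer and every row at `max_seq_len`, so the batch stacks into a `(num_features, max_seq_len)` tensor.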