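"""Loader for the SciQ dataset (https://huggingface.co/datasets/allenai/sciq) into
the codebase's IRDataset format: support passages become documents, questions
become queries, and each question-support pair yields a relevance-1 QRel per split."""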
from typing import Dict, List
import joblib
import pandas as pd
from datasets import load_dataset
from nlp4web_codebase.ir.data_loaders import IRDataset, Split
from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel

# Disk cache producing the "[Memory] Calling ..." log lines below (the ".cache" location is an assumption):
memory = joblib.Memory(".cache")

@memory.cache
def load_sciq(verbose: bool = False) -> IRDataset:
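    """Load allenai/sciq and convert it into an IRDataset of documents, queries, and qrels."""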
    train = load_dataset("allenai/sciq", split="train")
    validation = load_dataset("allenai/sciq", split="validation")
    test = load_dataset("allenai/sciq", split="test")
    data = {Split.train: train, Split.dev: validation, Split.test: test}
    # Sanity check: records sharing the same question must have pairwise-distinct
    # supports and correct answers:
    df = pd.concat([train.to_pandas(), validation.to_pandas(), test.to_pandas()])
    for question, group in df.groupby("question"):
        assert len(set(group["support"].tolist())) == len(group)
        assert len(set(group["correct_answer"].tolist())) == len(group)
    # Build the corpus, queries, and qrels, deduplicating supports and questions:
    corpus = []
    queries = []
    split2qrels: Dict[Split, List[QRel]] = {}
    question2id = {}
    support2id = {}
    for split, rows in data.items():
        if verbose:
            print(f"|raw_{split}|", len(rows))
        split2qrels[split] = []
        for i, row in enumerate(rows):
            example_id = f"{split}-{i}"
            support: str = row["support"]
            if len(support.strip()) == 0:
                continue  # skip records without a support passage
            question = row["question"]
            if len(question.strip()) == 0:
                continue  # skip records without a question
            if support in support2id:
                continue  # keep only the first record for each support
            else:
                support2id[support] = example_id
            if question in question2id:
                continue  # keep only the first record for each question
            else:
                question2id[question] = example_id
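            # Note that support2id is updated before the question check, so a new
            # support paired with an already-seen question is recorded there but
            # never enters the corpus.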
            doc = {"collection_id": example_id, "text": support}
            query = {"query_id": example_id, "text": row["question"]}
            qrel = {
                "query_id": example_id,
                "collection_id": example_id,
                "relevance": 1,
                "answer": row["correct_answer"],
            }
            corpus.append(Document(**doc))
            queries.append(Query(**query))
            split2qrels[split].append(QRel(**qrel))

    # Assemble and return:
    return IRDataset(corpus=corpus, queries=queries, split2qrels=split2qrels)


if __name__ == "__main__":
    # python -m nlp4web_codebase.ir.data_loaders.sciq
    import ujson
    import time

    start = time.time()
    dataset = load_sciq(verbose=True)
    print(f"Loading costs: {time.time() - start}s")
    print(ujson.dumps(dataset.get_stats(), indent=4))
    # ________________________________________________________________________________
    # [Memory] Calling __main__--home-kwang-research-nlp4web-ir-exercise-nlp4web-nlp4web-ir-data_loaders-sciq.load_sciq...
    # load_sciq(verbose=True)
    # |raw_train| 11679
    # |raw_dev| 1000
    # |raw_test| 1000
    # ________________________________________________________load_sciq - 7.3s, 0.1min
    # Loading costs: 7.260092735290527s
    # {
    #     "|corpus|": 12160,
    #     "|queries|": 12160,
    #     "|qrels-train|": 10409,
    #     "|qrels-dev|": 875,
    #     "|qrels-test|": 876
    # }
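
# A minimal usage sketch (relies only on the fields constructed above;
# attribute-style access assumes Document/Query/QRel are dataclass-like):
#
#     dataset = load_sciq()
#     doc = dataset.corpus[0]                      # a support passage
#     query = dataset.queries[0]                   # the matching question
#     qrel = dataset.split2qrels[Split.train][0]   # relevance-1 judgment
#     print(doc.collection_id, query.text, qrel.answer)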