Skip to content

Commit d9da44a

Browse files
Add hotpot QA (huggingface#703)
* Add hotpot QA * Remove unused deps from hotpotqa * fix remote tests for new datasets (huggingface#704) * Update datasets/hotpot_qa/hotpot_qa.py Co-authored-by: Quentin Lhoest <[email protected]> * Add hotpot QA * Remove unused deps from hotpotqa Co-authored-by: Quentin Lhoest <[email protected]>
1 parent 1ac5ecd commit d9da44a

File tree

4 files changed

+148
-0
lines changed

4 files changed

+148
-0
lines changed

datasets/hotpot_qa/dataset_infos.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"distractor": {"description": "HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervisionand explain the predictions; (4) we offer a new type of factoid comparison questions to testQA systems\u2019 ability to extract relevant facts and perform necessary comparison.\n", "citation": "\n@inproceedings{yang2018hotpotqa,\n title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},\n author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},\n booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},\n year={2018}\n}\n", "homepage": "https://hotpotqa.github.io/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "supporting_facts": {"feature": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "context": {"feature": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sentences": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "hotpot_qa", "config_name": "distractor", "version": {"version_str": 
"1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 552949315, "num_examples": 90447, "dataset_name": "hotpot_qa"}, "validation": {"name": "validation", "num_bytes": 45716111, "num_examples": 7405, "dataset_name": "hotpot_qa"}}, "download_checksums": {"http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json": {"num_bytes": 566426227, "checksum": "26650cf50234ef5fb2e664ed70bbecdfd87815e6bffc257e068efea5cf7cd316"}, "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json": {"num_bytes": 46320117, "checksum": "4e9ecb5c8d3b719f624d66b60f8d56bf227f03914f5f0753d6fa1b359d7104ea"}}, "download_size": 612746344, "post_processing_size": null, "dataset_size": 598665426, "size_in_bytes": 1211411770}, "fullwiki": {"description": "HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowingQA systems to reason with strong supervisionand explain the predictions; (4) we offer a new type of factoid comparison questions to testQA systems\u2019 ability to extract relevant facts and perform necessary comparison.\n", "citation": "\n@inproceedings{yang2018hotpotqa,\n title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},\n author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. 
and Salakhutdinov, Ruslan and Manning, Christopher D.},\n booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},\n year={2018}\n}\n", "homepage": "https://hotpotqa.github.io/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "type": {"dtype": "string", "id": null, "_type": "Value"}, "level": {"dtype": "string", "id": null, "_type": "Value"}, "supporting_facts": {"feature": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sent_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "context": {"feature": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "sentences": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "hotpot_qa", "config_name": "fullwiki", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 552949315, "num_examples": 90447, "dataset_name": "hotpot_qa"}, "validation": {"name": "validation", "num_bytes": 46848601, "num_examples": 7405, "dataset_name": "hotpot_qa"}, "test": {"name": "test", "num_bytes": 46000102, "num_examples": 7405, "dataset_name": "hotpot_qa"}}, "download_checksums": {"http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json": {"num_bytes": 566426227, "checksum": "26650cf50234ef5fb2e664ed70bbecdfd87815e6bffc257e068efea5cf7cd316"}, "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json": {"num_bytes": 47454698, "checksum": "2f1f3e594a3066a3084cc57950ca2713c24712adaad03af6ccce18d1846d5618"}, "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json": {"num_bytes": 46213747, 
"checksum": "c61a5274b9aa6deca3f7d2dc4d7757684c158fbd2264f759307699fb53801c2b"}}, "download_size": 660094672, "post_processing_size": null, "dataset_size": 645798018, "size_in_bytes": 1305892690}}
Binary file not shown.
Binary file not shown.

datasets/hotpot_qa/hotpot_qa.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Lint as: python3
17+
"""HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering."""
18+
19+
from __future__ import absolute_import, division, print_function
20+
21+
import json
22+
import os
23+
import textwrap
24+
25+
import datasets
26+
27+
28+
# BibTeX entry for the paper that introduced HotpotQA (Yang et al., EMNLP 2018).
_CITATION = """
@inproceedings{yang2018hotpotqa,
  title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
  author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},
  booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  year={2018}
}
"""

# Human-readable summary of the dataset. The leading backslash suppresses the
# newline right after the opening quotes so the text starts on the first line.
_DESCRIPTION = """\
HotpotQA is a new dataset with 113k Wikipedia-based question-answer pairs with four key features:
(1) the questions require finding and reasoning over multiple supporting documents to answer;
(2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas;
(3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions;
(4) we offer a new type of factoid comparison questions to test QA systems’ ability to extract relevant facts and perform necessary comparison.
"""

# Common prefix of every data file URL. It ends with "/" so a file name can be
# appended directly to form the full download URL.
_URL_BASE = "http://curtis.ml.cmu.edu/datasets/hotpot/"
46+
47+
48+
class HotpotQA(datasets.GeneratorBasedBuilder):
    """HotpotQA is a Dataset for Diverse, Explainable Multi-hop Question Answering.

    Two configurations are available, selected by ``self.config.name``:

    * ``distractor`` -- provides the train and validation splits.
    * ``fullwiki`` -- provides the train, validation and test splits.

    Both configurations share the same training file; only the validation
    (and, for ``fullwiki``, test) files differ.
    """

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="distractor",
            version=datasets.Version("1.0.0"),
            description=textwrap.dedent(
                """
                In the distractor setting, a question-answering system reads 10 paragraphs to provide an answer to a question.
                They must also justify these answers with supporting facts. This setting challenges the model to find the true
                supporting facts in the presence of noise, for each example we employ bigram tf-idf (Chen et al., 2017) to retrieve
                8 paragraphs from Wikipedia as distractors, using the question as the query. We mix them with the 2 gold paragraphs
                (the ones used to collect the question and answer) to construct the distractor setting.
                """
            ),
        ),
        datasets.BuilderConfig(
            name="fullwiki",
            version=datasets.Version("1.0.0"),
            description=textwrap.dedent(
                """
                In the fullwiki setting, a question-answering system must find the answer to a question in the scope of the
                entire Wikipedia. We fully test the model’s ability to locate relevant facts as well as reasoning about them
                by requiring it to answer the question given the first paragraphs of all Wikipedia articles without the gold
                paragraphs specified. This full wiki setting truly tests the performance of the systems’ ability at multi-hop
                reasoning in the wild.
                """
            ),
        ),
    ]

    def _info(self):
        """Returns the dataset metadata: description, feature schema, homepage and citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "answer": datasets.Value("string"),
                    "type": datasets.Value("string"),
                    "level": datasets.Value("string"),
                    # One (title, sentence index) pair per supporting fact.
                    "supporting_facts": datasets.features.Sequence(
                        {
                            "title": datasets.Value("string"),
                            "sent_id": datasets.Value("int32"),
                        }
                    ),
                    # One entry per context paragraph: its title and its sentences.
                    "context": datasets.features.Sequence(
                        {
                            "title": datasets.Value("string"),
                            "sentences": datasets.features.Sequence(datasets.Value("string")),
                        }
                    ),
                }
            ),
            supervised_keys=None,
            homepage="https://hotpotqa.github.io/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        Args:
            dl_manager: download manager; its ``download`` method is given a
                ``{split: url}`` mapping and the result is read back as a
                ``{split: local file}`` mapping.

        Returns:
            A list with one ``datasets.SplitGenerator`` per downloaded split. Each
            generator passes the downloaded file to ``_generate_examples`` as
            ``data_file``.
        """
        # URLs are built by plain concatenation rather than os.path.join: the
        # latter inserts the platform path separator, which is "\" on Windows
        # and would produce an invalid URL. _URL_BASE already ends with "/".
        urls = {
            datasets.Split.TRAIN: _URL_BASE + "hotpot_train_v1.1.json",
            datasets.Split.VALIDATION: _URL_BASE + "hotpot_dev_" + self.config.name + "_v1.json",
        }
        # Only the fullwiki configuration has a test file.
        if self.config.name == "fullwiki":
            urls[datasets.Split.TEST] = _URL_BASE + "hotpot_test_fullwiki_v1.json"

        files = dl_manager.download(urls)

        return [
            datasets.SplitGenerator(name=split, gen_kwargs={"data_file": path})
            for split, path in files.items()
        ]

    def _generate_examples(self, data_file):
        """This function returns the examples.

        Args:
            data_file: path of a JSON file whose top-level value is a list of
                example objects.

        Yields:
            ``(idx, example)`` tuples, where ``idx`` is the position of the example
            in the file and ``example`` is a dict matching the features declared in
            ``_info``.
        """
        # Use a context manager so the handle is always closed, and decode as
        # UTF-8 explicitly instead of relying on the platform default encoding.
        with open(data_file, encoding="utf-8") as f:
            data = json.load(f)

        for idx, example in enumerate(data):
            yield idx, {
                "id": example["_id"],
                "question": example["question"],
                # Test set has missing keys: absent scalar fields become None
                # and absent supporting facts become an empty list.
                "answer": example.get("answer"),
                "type": example.get("type"),
                "level": example.get("level"),
                "supporting_facts": [
                    {"title": fact[0], "sent_id": fact[1]} for fact in example.get("supporting_facts", [])
                ],
                "context": [
                    {"title": paragraph[0], "sentences": paragraph[1]} for paragraph in example["context"]
                ],
            }

0 commit comments

Comments
 (0)