Skip to content

Commit 6a54c87

Browse files
Update data_lib.py
Add dataset variants with openmix_v1 vocab.
1 parent 7695cec commit 6a54c87

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

hero/data_lib.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,10 @@
3636
# Tokenizers / vocabularies.
3737

3838
T5_CC_VOCAB = 'vb32000_t5_cc.model'
39+
OPENMIX_V1_VOCAB = 'vb100864_openmix_v1.model'
40+
41+
ALL_VOCABS = [('vb32000_t5_cc', T5_CC_VOCAB),
42+
('vb100864_openmix_v1', OPENMIX_V1_VOCAB)]
3943

4044
################################################################################
4145
# PT datasets.
@@ -90,11 +94,9 @@ def add_lm1b_task():
9094
'train': 'train[:500]',
9195
'validation': 'train[500:1000]',
9296
'test': 'test'})
93-
vocabs = []
94-
vocabs += [('vb32000_t5_cc', T5_CC_VOCAB)]
9597
for name, source in [('lm1b', lm1b_source),
9698
('minilm1b', minilm1b_source)]:
97-
for vocab_name, vocab in vocabs:
99+
for vocab_name, vocab in ALL_VOCABS:
98100
task_name = f'{name}.{vocab_name}'
99101
add_pt_task_v1(task_name, source, vocab,
100102
use_reduce_concat_split=False)
@@ -105,9 +107,7 @@ def add_lm1b_task():
105107
def add_c4_task():
106108
"""Adds C4 tasks."""
107109
source = seqio.TfdsDataSource(tfds_name='c4:3.1.0')
108-
vocabs = []
109-
vocabs += [('vb32000_t5_cc', T5_CC_VOCAB)]
110-
for vocab_name, vocab in vocabs:
110+
for vocab_name, vocab in ALL_VOCABS:
111111
task_name = f'c4.{vocab_name}'
112112
add_pt_task_v1(task_name, source, vocab,
113113
use_reduce_concat_split=True)
@@ -123,10 +123,8 @@ def add_imdb_reviews_task():
123123
'train': 'train[:90%]',
124124
'validation': 'train[90%:]',
125125
'test': 'test'})
126-
vocabs = []
127-
vocabs += [('vb32000_t5_cc', T5_CC_VOCAB)]
128126
name = 'imdb_reviews'
129-
for vocab_name, vocab in vocabs:
127+
for vocab_name, vocab in ALL_VOCABS:
130128
task_name = f'{name}.{vocab_name}'
131129
add_pt_task_v1(task_name, source, vocab,
132130
use_reduce_concat_split=False)

0 commit comments

Comments
 (0)