################################################################################
# Tokenizers / vocabularies.

# SentencePiece model files for each supported vocabulary.
T5_CC_VOCAB = 'vb32000_t5_cc.model'
OPENMIX_V1_VOCAB = 'vb100864_openmix_v1.model'

# (short name, model file) pairs; tasks are registered once per entry.
ALL_VOCABS = [('vb32000_t5_cc', T5_CC_VOCAB),
              ('vb100864_openmix_v1', OPENMIX_V1_VOCAB)]

################################################################################
# PT datasets.
@@ -90,11 +94,9 @@ def add_lm1b_task():
9094 'train' : 'train[:500]' ,
9195 'validation' : 'train[500:1000]' ,
9296 'test' : 'test' })
93- vocabs = []
94- vocabs += [('vb32000_t5_cc' , T5_CC_VOCAB )]
9597 for name , source in [('lm1b' , lm1b_source ),
9698 ('minilm1b' , minilm1b_source )]:
97- for vocab_name , vocab in vocabs :
99+ for vocab_name , vocab in ALL_VOCABS :
98100 task_name = f'{ name } .{ vocab_name } '
99101 add_pt_task_v1 (task_name , source , vocab ,
100102 use_reduce_concat_split = False )
@@ -105,9 +107,7 @@ def add_lm1b_task():
def add_c4_task():
  """Registers a C4 pretraining task for every vocabulary in ALL_VOCABS.

  Each task is named 'c4.<vocab_name>' and built from the 'c4:3.1.0' TFDS
  dataset via add_pt_task_v1 with reduce-concat splitting enabled.
  """
  source = seqio.TfdsDataSource(tfds_name='c4:3.1.0')
  for vocab_name, vocab in ALL_VOCABS:
    add_pt_task_v1(f'c4.{vocab_name}', source, vocab,
                   use_reduce_concat_split=True)
@@ -123,10 +123,8 @@ def add_imdb_reviews_task():
123123 'train' : 'train[:90%]' ,
124124 'validation' : 'train[90%:]' ,
125125 'test' : 'test' })
126- vocabs = []
127- vocabs += [('vb32000_t5_cc' , T5_CC_VOCAB )]
128126 name = 'imdb_reviews'
129- for vocab_name , vocab in vocabs :
127+ for vocab_name , vocab in ALL_VOCABS :
130128 task_name = f'{ name } .{ vocab_name } '
131129 add_pt_task_v1 (task_name , source , vocab ,
132130 use_reduce_concat_split = False )
0 commit comments