
Commit c430e5b

Merge pull request megagonlabs#172 from megagonlabs/feature/spacy_v3
update configs
2 parents: 6e0e120 + 67badfe

11 files changed: +1201 −33 lines

config/ja_ginza.cfg

Lines changed: 23 additions & 14 deletions
@@ -1,6 +1,6 @@
 [paths]
-train = null
-dev = null
+train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy"
+dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy"
 vectors = null
 init_tok2vec = null
 
@@ -12,7 +12,7 @@ seed = 0
 lang = "ja"
 pipeline = ["tok2vec","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"]
 batch_size = 1000
-disabled = []
+disabled = ["attribute_ruler"]
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
@@ -23,15 +23,17 @@ split_mode = "C"
 
 [components]
 
+[components.attribute_ruler]
+factory = "attribute_ruler"
+validate = false
+
 [components.bunsetu_recognizer]
 factory = "bunsetu_recognizer"
+remain_bunsetu_suffix = true
 
 [components.compound_splitter]
 factory = "compound_splitter"
-
-[components.attribute_ruler]
-factory = "attribute_ruler"
-validate = false
+split_mode = null
 
 [components.morphologizer]
 factory = "morphologizer"
@@ -43,9 +45,11 @@ nO = null
 [components.morphologizer.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
 width = ${components.tok2vec.model.encode.width}
+upstream = "*"
 
 [components.ner]
 factory = "ner"
+incorrect_spans_key = null
 moves = null
 update_with_oracle_cut_size = 100
 
@@ -135,6 +139,7 @@ max_steps = 50000
 eval_frequency = 200
 frozen_components = []
 before_to_disk = null
+annotating_components = []
 
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
@@ -151,6 +156,7 @@ t = 0.0
 
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
 
 [training.optimizer]
 @optimizers = "Adam.v1"
@@ -164,17 +170,20 @@ eps = 0.00000001
 learn_rate = 0.001
 
 [training.score_weights]
+dep_uas = 0.25
+dep_las = 0.25
 dep_las_per_type = null
 sents_p = null
 sents_r = null
-ents_per_type = null
-dep_uas = 0.17
-dep_las = 0.17
-sents_f = 0.0
-ents_f = 0.33
+sents_f = 0.1
+ents_f = 0.25
 ents_p = 0.0
 ents_r = 0.0
-tag_acc = 0.33
+ents_per_type = null
+pos_acc = 0.15
+morph_acc = 0.0
+morph_per_feat = null
+tag_acc = 0.0
 
 [pretraining]
 
@@ -188,4 +197,4 @@ after_init = null
 
 [initialize.components]
 
-[initialize.tokenizer]
+[initialize.tokenizer]
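
Note on using the updated config: the corpus paths are now hard-coded, but spaCy v3 lets you override any dotted setting at train time (e.g. python -m spacy train config/ja_ginza.cfg --paths.train <path>). A minimal Python sketch of the same idea, assuming only spaCy v3 is installed; the override path below is hypothetical:

    from spacy.util import load_config

    # Load the updated config; any dotted setting can be overridden
    # without editing the file (the corpus path here is a placeholder).
    cfg = load_config(
        "config/ja_ginza.cfg",
        overrides={"paths.train": "corpus/other-train.spacy"},
    )
    print(cfg["nlp"]["pipeline"])  # attribute_ruler is still built into the pipeline...
    print(cfg["nlp"]["disabled"])  # ...but listed here, so it is skipped at runtime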

config/ja_ginza.meta.json

Lines changed: 3 additions & 0 deletions
@@ -49,6 +49,9 @@
     "compound_splitter",
     "bunsetu_recognizer"
   ],
+  "disabled": [
+    "attribute_ruler"
+  ],
   "requirements":[
     "sudachipy>=0.5.2,<0.6.0",
     "sudachidict_core>=20210608",
Lines changed: 201 additions & 0 deletions
@@ -0,0 +1,201 @@
+[paths]
+train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy"
+dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy"
+vectors = null
+init_tok2vec = null
+
+[system]
+gpu_allocator = "pytorch"
+seed = 0
+
+[nlp]
+lang = "ja"
+pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"]
+batch_size = 128
+disabled = ["attribute_ruler"]
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+
+[nlp.tokenizer]
+@tokenizers = "spacy.ja.JapaneseTokenizer"
+split_mode = "C"
+
+[components]
+
+[components.attribute_ruler]
+factory = "attribute_ruler"
+validate = false
+
+[components.bunsetu_recognizer]
+factory = "bunsetu_recognizer"
+remain_bunsetu_suffix = true
+
+[components.compound_splitter]
+factory = "compound_splitter"
+split_mode = null
+
+[components.morphologizer]
+factory = "morphologizer"
+
+[components.morphologizer.model]
+@architectures = "spacy.Tagger.v1"
+nO = null
+
+[components.morphologizer.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+upstream = "*"
+
+[components.ner]
+factory = "ner"
+moves = null
+update_with_oracle_cut_size = 100
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = false
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+upstream = "*"
+
+[components.parser]
+factory = "parser"
+learn_tokens = false
+min_action_freq = 30
+moves = null
+update_with_oracle_cut_size = 100
+
+[components.parser.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "parser"
+extra_state_tokens = false
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+nO = null
+
+[components.parser.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+upstream = "*"
+
+[components.transformer]
+factory = "transformer_custom"
+max_batch_items = 4096
+set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+[components.transformer.model]
+@architectures = "ginza-transformers.TransformerModel.v1"
+name = "megagonlabs/bert-base-japanese-char-v2-ginza"
+
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+
+[components.transformer.model.tokenizer_config]
+use_fast = false
+tokenizer_class = "BertJapaneseTokenizer"
+word_tokenizer_type = basic
+subword_tokenizer_type = character
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 500
+gold_preproc = false
+limit = 0
+augmenter = null
+
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+patience = 0
+max_epochs = 0
+max_steps = 50000
+eval_frequency = 200
+frozen_components = []
+before_to_disk = null
+annotating_components = []
+
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+get_length = null
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 50000
+initial_rate = 0.00005
+
+[training.score_weights]
+dep_uas = 0.25
+dep_las = 0.25
+dep_las_per_type = null
+sents_p = null
+sents_r = null
+sents_f = 0.1
+ents_f = 0.25
+ents_p = 0.0
+ents_r = 0.0
+ents_per_type = null
+pos_acc = 0.15
+morph_acc = 0.0
+morph_per_feat = null
+tag_acc = 0.0
+
+[pretraining]
+
+[initialize]
+vectors = null
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
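
The rebalanced score_weights in both configs sum to 1.0 (0.25 UAS + 0.25 LAS + 0.1 sentence F + 0.25 entity F + 0.15 POS accuracy), so best-model selection now weights parsing and NER ahead of tagging; zero-weighted metrics are still logged, they just no longer influence selection. A quick sketch to sanity-check the sum, assuming spaCy v3:

    from spacy.util import load_config

    cfg = load_config("config/ja_ginza.cfg")
    weights = cfg["training"]["score_weights"]
    # null entries load as None; only numeric weights count toward the total.
    total = sum(v for v in weights.values() if isinstance(v, (int, float)))
    print(total)  # expected: 1.0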
