Skip to content

Commit 49c93ad

Browse files
Merge pull request megagonlabs#250 from megagonlabs/develop
Release v5.1.2
2 parents 24dee81 + d94979d commit 49c93ad

File tree

11 files changed

+103
-18
lines changed

11 files changed

+103
-18
lines changed

.github/workflows/pytest.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
name: pytest
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
pull_request:
8+
branches:
9+
- develop
10+
11+
jobs:
12+
pytest:
13+
name: Run tests with pytest
14+
runs-on: ubuntu-latest
15+
strategy:
16+
matrix:
17+
python-version: [3.7, 3.8]
18+
steps:
19+
- name: Checkout
20+
uses: actions/checkout@v2
21+
- name: Set up Python ${{ matrix.python-version }}
22+
uses: actions/setup-python@v1
23+
with:
24+
python-version: ${{ matrix.python-version }}
25+
- name: Upgrade pip
26+
run: >-
27+
python -m
28+
pip install -U pip
29+
- name: Install dependencies
30+
run: >-
31+
python -m
32+
pip install . pytest pytest-mock ja-ginza ja-ginza-electra
33+
- name: Run Tests
34+
run: pytest

README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,16 @@ Please read the official documents to compile user dictionaries with `sudachipy`
234234

235235
### version 5.x
236236

237+
#### ginza-5.1.2
238+
- 2022-03-12
239+
- Migrate to spaCy v3.4
240+
241+
#### ginza-5.1.1
242+
- 2022-03-12
243+
- Improvements
244+
- auto deploy for pypi by @nimiusrd in #184
245+
- modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233
246+
237247
#### ginza-5.1.0
238248
- 2021-12-10, Euclase
239249
- Important changes
@@ -534,10 +544,10 @@ Ginza uses the pytest framework for testing, and you can run the tests via `setu
534544
Some tests depend on the ginza default models (`ja-ginza`, `ja-ginza-electra`), so these models need to be installed before running the tests.
535545

536546
```console
537-
pip install ja-ginza ja-ginza-electra
538-
pip install -e .
547+
$ pip install ja-ginza ja-ginza-electra
548+
$ pip install -e .
539549
# full test
540-
python setup.py test
550+
$ python setup.py test
541551
# test single file
542-
python setup.py test --addopts ginza/tests/test_analyzer.py
552+
$ python setup.py test --addopts ginza/tests/test_analyzer.py
543553
```

config/ja_ginza.meta.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"lang":"ja",
33
"name":"ginza",
4-
"version":"5.1.0",
4+
"version":"5.1.2",
55
"description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, attribute_ruler, compound_splitter, bunsetu_recognizer.",
66
"author":"Megagon Labs Tokyo.",
77
"email":"[email protected]",
@@ -34,7 +34,7 @@
3434
}
3535
],
3636
"parent_package":"spacy",
37-
"spacy_version":">=3.2.0,<3.3.0",
37+
"spacy_version":">=3.2.0,<3.5.0",
3838
"pipeline":[
3939
"tok2vec",
4040
"parser",

config/ja_ginza_electra.meta.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"lang":"ja",
33
"name":"ginza_electra",
4-
"version":"5.1.0",
4+
"version":"5.1.2",
55
"description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, attribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
66
"author":"Megagon Labs Tokyo.",
77
"email":"[email protected]",
@@ -41,7 +41,7 @@
4141
}
4242
],
4343
"parent_package":"spacy",
44-
"spacy_version":">=3.2.0,<3.3.0",
44+
"spacy_version":">=3.2.0,<3.5.0",
4545
"pipeline":[
4646
"transformer",
4747
"parser",

docs/index.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ $ pip uninstall ginza ja_ginza_electra
8080
$ pip uninstall ja_ginza
8181
```
8282

83-
旧バージョンの`j_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
83+
旧バージョンの`ja_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
8484
```console
8585
$ pip uninstall ja_ginza_electra
8686
```
@@ -261,6 +261,17 @@ Contains information from mC4 which is made available under the ODC Attribution
261261

262262
### version 5.x
263263

264+
#### ginza-5.1.2
265+
- 2022-03-12
266+
- Migrate to spaCy v3.4
267+
268+
#### ginza-5.1.1
269+
- 2022-03-12
270+
- Improvements
271+
- auto deploy for pypi by @nimiusrd in #184
272+
- modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233
273+
274+
264275
#### ginza-5.1.0
265276
- 2021-12-10, Euclase
266277
- 重要な変更

ginza/analyzer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def set_nlp(self) -> None:
7777
try:
7878
nlp = spacy.load("ja_ginza")
7979
except IOError as e:
80-
raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza`.')
80+
raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.')
8181

8282
if self.disable_sentencizer:
8383
nlp.add_pipe("disable_sentencizer", before="parser")

ginza/command_line.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def is_json(self):
2828

2929
def open(self):
3030
if self.output_path:
31-
self.output = open(self.output_path, "w")
31+
self.output = open(self.output_path, "w", encoding="utf-8")
3232
else:
3333
self.output = sys.stdout
3434

@@ -154,10 +154,17 @@ def _analyze_tty(analyzer: Analyzer, output: _OutputWrapper) -> None:
154154
def _analyze_single(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str]) -> None:
155155
try:
156156
analyzer.set_nlp()
157+
batch = []
157158
for path in files:
158-
with open(path, "r") as f:
159+
with open(path, "r", encoding="utf-8") as f:
159160
for line in f:
160-
output.write(analyzer.analyze_line(line))
161+
batch.append(line)
162+
if len(batch) < MINI_BATCH_SIZE:
163+
continue
164+
output.write(analyzer.analyze_batch(batch))
165+
batch.clear()
166+
if batch:
167+
output.write(analyzer.analyze_batch(batch))
161168
except KeyboardInterrupt:
162169
pass
163170

@@ -194,7 +201,7 @@ def _analyze_parallel(analyzer: Analyzer, output: _OutputWrapper, files: Iterabl
194201
def _data_loader(files: List[str], batch_size: int) -> Generator[List[str], None, None]:
195202
mini_batch = []
196203
for path in files:
197-
with open(path, "r") as f:
204+
with open(path, "r", encoding="utf-8") as f:
198205
for line in f:
199206
mini_batch.append(line)
200207
if len(mini_batch) == batch_size:

ginza/tests/test_analyzer.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,26 @@ def test_analyze_batch(self, output_format, input_batch, tokens_batch, tokens_fu
144144
analyzer.output_format = output_format
145145
ret = analyzer.analyze_batch(input_batch)
146146
assert tokens_func(ret) == sum(tokens_batch, [])
147+
148+
@pytest.mark.parametrize(
149+
"raises_analysis_before_set, tokens_func",
150+
[
151+
(TypeError, _tokens_conllu)
152+
],
153+
)
154+
@pytest.mark.parametrize(
155+
"split_mode, input_text, tokens",
156+
[
157+
("A", "機能性食品", ["機能", "性", "食品"]),
158+
("B", "機能性食品", ["機能性", "食品"]),
159+
("C", "機能性食品", ["機能性食品"]),
160+
],
161+
)
162+
def test_analyze_split(self, split_mode, input_text, tokens, raises_analysis_before_set, tokens_func, analyzer):
163+
analyzer.split_mode = split_mode
164+
with pytest.raises(raises_analysis_before_set):
165+
analyzer.analyze_line(input_text)
166+
167+
analyzer.set_nlp()
168+
ret = analyzer.analyze_line(input_text)
169+
assert tokens_func(ret) == tokens

ginza/tests/test_models.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,6 @@ def test_tokenize(nlp, text, expected_tokens):
176176
def test_compound_spliter(nlp, text, len_a, len_b, len_c):
177177
assert len(nlp(text)) == len_c
178178
for split_mode, l in zip(["A", "B", "C"], [len_a, len_b, len_c]):
179-
nlp = deepcopy(nlp)
180179
set_split_mode(nlp, split_mode)
181180
assert len(nlp(text)) == l
182181

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
spacy>=3.2.0,<3.3.0
1+
spacy>=3.2.0,<3.5.0
22
plac>=1.3.3
33
SudachiPy>=0.6.2,<0.7.0
44
SudachiDict-core>=20210802

setup.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
"ginzame = ginza.command_line:main_ginzame",
1717
],
1818
},
19+
python_requires=">=3.6",
1920
install_requires=[
20-
"spacy>=3.2.0,<3.3.0",
21+
"spacy>=3.2.0,<3.5.0",
2122
"plac>=1.3.3",
2223
"SudachiPy>=0.6.2,<0.7.0",
2324
"SudachiDict-core>=20210802",
@@ -28,5 +29,5 @@
2829
name="ginza",
2930
packages=find_packages(include=["ginza"]),
3031
url="https://github.com/megagonlabs/ginza",
31-
version='5.1.1',
32+
version='5.1.2',
3233
)

0 commit comments

Comments
 (0)