Merge pull request megagonlabs#250 from megagonlabs/develop

hiroshi-matsuda-rit · web-flow · commit 49c93adb98e9 · 2022-08-09T18:17:07.000+09:00
Release v5.1.2
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -0,0 +1,34 @@
+name: pytest
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - develop
+
+jobs:
+  pytest:
+    name: Run tests with pytest
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.7, 3.8]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Upgrade pip
+        run: >-
+          python -m
+          pip install -U pip
+      - name: Install dependencies
+        run: >-
+          python -m
+          pip install . pytest pytest-mock ja-ginza ja-ginza-electra
+      - name: Run Tests
+        run: pytest
diff --git a/README.md b/README.md
@@ -234,6 +234,16 @@ Please read the official documents to compile user dictionaries with `sudachipy`
 
 ### version 5.x
 
+#### ginza-5.1.2
+- 2022-08-09
+- Migrate to spaCy v3.4
+
+#### ginza-5.1.1
+- 2022-03-12
+- Improvements
+  - auto deploy for pypi by @nimiusrd in #184
+  - modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233
+
 #### ginza-5.1.0
 - 2021-12-10, Euclase
 - Important changes
@@ -534,10 +544,10 @@ Ginza uses the pytest framework for testing, and you can run the tests via `setu
 Some tests depends on the ginza default models (`ja-ginza`, `ja-ginza-electra`), so install them before the tests is needed.
 
 ```console
-pip install ja-ginza ja-ginza-electra
-pip install -e .
+$ pip install ja-ginza ja-ginza-electra
+$ pip install -e .
 # full test
-python setup.py test
+$ python setup.py test
 # test single file
-python setup.py test --addopts ginza/tests/test_analyzer.py
+$ python setup.py test --addopts ginza/tests/test_analyzer.py
 ```
diff --git a/config/ja_ginza.meta.json b/config/ja_ginza.meta.json
@@ -1,7 +1,7 @@
 {
   "lang":"ja",
   "name":"ginza",
-  "version":"5.1.0",
+  "version":"5.1.2",
   "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, atteribute_ruler, compound_splitter, bunsetu_recognizer.",
   "author":"Megagon Labs Tokyo.",
   "email":"ginza@megagon.ai",
@@ -34,7 +34,7 @@
     }
   ],
   "parent_package":"spacy",
-  "spacy_version":">=3.2.0,<3.3.0",
+  "spacy_version":">=3.2.0,<3.5.0",
   "pipeline":[
     "tok2vec",
     "parser",
diff --git a/config/ja_ginza_electra.meta.json b/config/ja_ginza_electra.meta.json
@@ -1,7 +1,7 @@
 {
   "lang":"ja",
   "name":"ginza_electra",
-  "version":"5.1.0",
+  "version":"5.1.2",
   "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
   "author":"Megagon Labs Tokyo.",
   "email":"ginza@megagon.ai",
@@ -41,7 +41,7 @@
     }
   ],
   "parent_package":"spacy",
-  "spacy_version":">=3.2.0,<3.3.0",
+  "spacy_version":">=3.2.0,<3.5.0",
   "pipeline":[
     "transformer",
     "parser",
diff --git a/docs/index.md b/docs/index.md
@@ -80,7 +80,7 @@ $ pip uninstall ginza ja_ginza_electra
 $ pip uninstall ja_ginza
 ```
 
-旧バージョンの`j_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
+旧バージョンの`ja_ginza_electra`をインストールしている場合は次のコマンドでアンインストールします。
 ```console
 $ pip uninstall ja_ginza_electra
 ```
@@ -261,6 +261,17 @@ Contains information from mC4 which is made available under the ODC Attribution
 
 ### version 5.x
 
+#### ginza-5.1.2
+- 2022-08-09
+- Migrate to spaCy v3.4
+
+#### ginza-5.1.1
+- 2022-03-12
+- Improvements
+  - auto deploy for pypi by @nimiusrd in #184
+  - modify github actions: trigger by tagging, stop uploading test pypi by @r-terada in #233
+
+
 #### ginza-5.1.0
 - 2021-12-10, Euclase
 - 重要な変更
diff --git a/ginza/analyzer.py b/ginza/analyzer.py
@@ -77,7 +77,7 @@ def set_nlp(self) -> None:
                     try:
                         nlp = spacy.load("ja_ginza")
                     except IOError as e:
-                        raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza`.')
+                        raise OSError("E050", 'You need to install "ja-ginza" or "ja-ginza-electra" by executing `pip install ja-ginza` or `pip install ja-ginza-electra`.')
 
             if self.disable_sentencizer:
                 nlp.add_pipe("disable_sentencizer", before="parser")
diff --git a/ginza/command_line.py b/ginza/command_line.py
@@ -28,7 +28,7 @@ def is_json(self):
 
     def open(self):
         if self.output_path:
-            self.output = open(self.output_path, "w")
+            self.output = open(self.output_path, "w", encoding="utf-8")
         else:
             self.output = sys.stdout
 
@@ -154,10 +154,17 @@ def _analyze_tty(analyzer: Analyzer, output: _OutputWrapper) -> None:
 def _analyze_single(analyzer: Analyzer, output: _OutputWrapper, files: Iterable[str]) -> None:
     try:
         analyzer.set_nlp()
+        batch = []
         for path in files:
-            with open(path, "r") as f:
+            with open(path, "r", encoding="utf-8") as f:
                 for line in f:
-                    output.write(analyzer.analyze_line(line))
+                    batch.append(line)
+                    if len(batch) < MINI_BATCH_SIZE:
+                        continue
+                    output.write(analyzer.analyze_batch(batch))
+                    batch.clear()
+        if batch:
+            output.write(analyzer.analyze_batch(batch))
     except KeyboardInterrupt:
         pass
 
@@ -194,7 +201,7 @@ def _analyze_parallel(analyzer: Analyzer, output: _OutputWrapper, files: Iterabl
 def _data_loader(files: List[str], batch_size: int) -> Generator[List[str], None, None]:
     mini_batch = []
     for path in files:
-        with open(path, "r") as f:
+        with open(path, "r", encoding="utf-8") as f:
             for line in f:
                 mini_batch.append(line)
                 if len(mini_batch) == batch_size:
diff --git a/ginza/tests/test_analyzer.py b/ginza/tests/test_analyzer.py
@@ -144,3 +144,26 @@ def test_analyze_batch(self, output_format, input_batch, tokens_batch, tokens_fu
         analyzer.output_format = output_format
         ret = analyzer.analyze_batch(input_batch)
         assert tokens_func(ret) == sum(tokens_batch, [])
+
+    @pytest.mark.parametrize(
+        "raises_analysis_before_set, tokens_func",
+        [
+            (TypeError, _tokens_conllu)
+        ],
+    )
+    @pytest.mark.parametrize(
+        "split_mode, input_text, tokens",
+        [
+            ("A", "機能性食品", ["機能", "性", "食品"]),
+            ("B", "機能性食品", ["機能性", "食品"]),
+            ("C", "機能性食品", ["機能性食品"]),
+        ],
+    )
+    def test_analyze_split(self, split_mode, input_text, tokens, raises_analysis_before_set, tokens_func, analyzer):
+        analyzer.split_mode = split_mode
+        with pytest.raises(raises_analysis_before_set):
+            analyzer.analyze_line(input_text)
+
+        analyzer.set_nlp()
+        ret = analyzer.analyze_line(input_text)
+        assert tokens_func(ret) == tokens
diff --git a/ginza/tests/test_models.py b/ginza/tests/test_models.py
@@ -176,7 +176,6 @@ def test_tokenize(nlp, text, expected_tokens):
 def test_compound_spliter(nlp, text, len_a, len_b, len_c):
     assert len(nlp(text)) == len_c
     for split_mode, l in zip(["A", "B", "C"], [len_a, len_b, len_c]):
-        nlp = deepcopy(nlp)
         set_split_mode(nlp, split_mode)
         assert len(nlp(text)) == l
 
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-spacy>=3.2.0,<3.3.0
+spacy>=3.2.0,<3.5.0
 plac>=1.3.3
 SudachiPy>=0.6.2,<0.7.0
 SudachiDict-core>=20210802
diff --git a/setup.py b/setup.py
@@ -16,8 +16,9 @@
             "ginzame = ginza.command_line:main_ginzame",
         ],
     },
+    python_requires=">=3.6",
     install_requires=[
-        "spacy>=3.2.0,<3.3.0",
+        "spacy>=3.2.0,<3.5.0",
         "plac>=1.3.3",
         "SudachiPy>=0.6.2,<0.7.0",
         "SudachiDict-core>=20210802",
@@ -28,5 +29,5 @@
     name="ginza",
     packages=find_packages(include=["ginza"]),
     url="https://github.com/megagonlabs/ginza",
-    version='5.1.1',
+    version='5.1.2',
 )