|
3 | 3 | import os |
4 | 4 | import warnings |
5 | 5 | from datetime import datetime |
| 6 | +import unicodedata |
6 | 7 |
|
7 | 8 | import editdistance |
8 | 9 | import spacy |
@@ -251,8 +252,8 @@ def misspell_identify(self, doc, query=""): |
251 | 252 | and (token.ent_type_ != "GPE") |
252 | 253 | and (token.ent_type_ != "ORG") |
253 | 254 | ): |
254 | | - misspell.append(token) |
255 | | - |
| 255 | + if self.deep_tokenize_in_vocab(token.text): |
| 256 | + misspell.append(token) |
256 | 257 | if self.debug: |
257 | 258 | print("misspell identified: ", misspell) |
258 | 259 | return misspell, doc |
@@ -574,6 +575,59 @@ def doc_outcome_spell_check(self, doc): |
574 | 575 |
|
575 | 576 | return update_query |
576 | 577 |
|
def deep_tokenize_in_vocab(self, text):
    """Re-tokenize *text* on punctuation and check every piece in the vocab.

    The token is split into its punctuation characters and the word
    segments between/around them (e.g. ``"don't"`` -> ``"don"``, ``"'"``,
    ``"t"``).  A token whose pieces are all known words is considered well
    formed and should NOT be flagged as a misspelling.

    Args:
        text (str): Token text to re-tokenize on punctuation characters
            (any Unicode category starting with "P").

    Returns:
        bool: ``False`` when the token contains punctuation and every
        sub-token is present in ``self.vocab``; ``True`` otherwise
        (no punctuation at all, or at least one sub-token is out of
        vocabulary).
    """
    sub_tokens = []
    # Index of the most recently seen punctuation char; -1 before any.
    prev_punct = -1
    for pos, char in enumerate(text):
        if unicodedata.category(char).startswith("P"):
            sub_tokens.append(char)
            # Word segment between the previous punctuation (or the start
            # of the string) and this punctuation character.  Note: this
            # also captures the leading segment (prev_punct == -1), which
            # the earlier implementation silently dropped.
            segment = text[prev_punct + 1 : pos]
            if segment:
                sub_tokens.append(segment)
            prev_punct = pos

    if not sub_tokens:
        # No punctuation found: nothing to deep-tokenize; keep the token
        # as a misspelling candidate.
        return True

    # Trailing word segment after the last punctuation character.
    tail = text[prev_punct + 1 :]
    if tail:
        sub_tokens.append(tail)

    # Flag the token only if some piece is missing from the vocabulary.
    return any(sub_token not in self.vocab for sub_token in sub_tokens)
577 | 631 |
|
578 | 632 | if __name__ == "__main__": |
579 | 633 | print("Code running...") |
|
0 commit comments