Commit c168650

Merge pull request #28 from R1j1t/dev

Fixes the separate tokenization logic of spaCy and transformers for `'s`

2 parents 96cb79e + 1e9d6a1 · commit c168650

File tree

6 files changed: +96 −4 lines
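Why this fix is needed: spaCy's English tokenizer keeps the possessive clitic `'s` as a single token, while BERT-style tokenizers split on punctuation first, so a valid surface form like "Norvig's" can fail a vocabulary lookup and get flagged as a misspelling. A minimal sketch of the mismatch (not part of this commit; `en_core_web_sm` and `bert-base-uncased` are stand-in model names):

```python
import spacy
from transformers import AutoTokenizer

nlp = spacy.load("en_core_web_sm")
wordpiece = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "Peter Norvig's blog post"
# spaCy keeps the clitic attached as one token: [..., 'Norvig', "'s", ...]
print([token.text for token in nlp(text)])
# The BERT tokenizer splits punctuation first, so "'" and "s" come out as
# separate pieces (the exact word pieces for "Norvig" depend on the vocab).
print(wordpiece.tokenize(text))
```

The change below therefore re-splits each candidate token on punctuation and only treats it as a misspelling if one of the resulting sub-tokens is missing from the vocabulary.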

.flake8

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 [flake8]
-ignore = W503
+ignore = W503, E203
 exclude = .git,__pycache__,build,peters_code,.ipynb_checkpoints,setup.py
 max-complexity = 15
 per-file-ignores =
```
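A note on the new ignore: Black formats slices with complex bounds as `text[pre_puct_position + 1 : char_position]` (exactly the style the new function below uses), and flake8 reports that spacing as E203 (whitespace before ':'). Ignoring E203 is the usual flake8 accommodation for Black-formatted code, which is presumably why it is added here.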

.github/workflows/python-package.yml

Lines changed: 3 additions & 0 deletions
```diff
@@ -30,7 +30,10 @@ jobs:
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: install spacy model
       run: |
+        # Download base English language model
         python -m spacy download en_core_web_sm
+        # Download large English language model
+        python -m spacy download en_core_web_lg
     - name: Black Code Formatter
       run: black . --check
     - name: Flake Code Checker
```
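The large model is pulled in CI presumably because the new test below loads `en_core_web_lg` explicitly.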

.gitignore

Lines changed: 1 addition & 1 deletion
```diff
@@ -138,7 +138,7 @@ dmypy.json
 # uncased_L-4_H-512_A-8/

 # research code
-peter's code/
+peters_code/
 *.pptx
 *.ipynb
 contextualSpellCheck/tests/debugFile.txt
```

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -182,6 +182,7 @@ Response:
 ## Task List

 - [ ] Add support for Real Word Error (RWE) (Big Task)
+- [ ] Include transformers deTokenizer to get better suggestions
 - [x] specify maximum edit distance for `candidateRanking`
 - [x] allow user to specify bert model
 - [ ] edit distance code optimisation
```

contextualSpellCheck/contextualSpellCheck.py

Lines changed: 56 additions & 2 deletions
```diff
@@ -3,6 +3,7 @@
 import os
 import warnings
 from datetime import datetime
+import unicodedata

 import editdistance
 import spacy
@@ -251,8 +252,8 @@ def misspell_identify(self, doc, query=""):
                 and (token.ent_type_ != "GPE")
                 and (token.ent_type_ != "ORG")
             ):
-                misspell.append(token)
-
+                if self.deep_tokenize_in_vocab(token.text):
+                    misspell.append(token)
         if self.debug:
             print("misspell identified: ", misspell)
         return misspell, doc
@@ -574,6 +575,59 @@ def doc_outcome_spell_check(self, doc):

         return update_query

+    def deep_tokenize_in_vocab(self, text):
+        """Check if the token contains punctuation:
+        if a char is punctuation, check it in vocab
+        and check the rest of the word in vocab.
+
+        Args:
+            text (str): Text to tokenize again for punct
+
+        Returns:
+            Bool: False if the punct and the rest of the
+                word are all in vocab, True otherwise
+        """
+        text_len = len(text)
+        sub_tokens = []
+        pre_puct_position = -1
+        for char_position in range(text_len):
+            if unicodedata.category(text[char_position]).startswith("P"):
+                # print("current_pos is {} and sub_token append {}"
+                #       .format(char_position, text[char_position]))
+                sub_tokens.append(text[char_position])
+                # print("pre_pos is {}, cur is {}, pre to current is {}"
+                #       .format(pre_puct_position, char_position,
+                #               text[pre_puct_position + 1 : char_position]))
+                if (
+                    pre_puct_position >= 0
+                    and text[pre_puct_position + 1 : char_position] != ""
+                ):
+                    sub_tokens.append(
+                        text[pre_puct_position + 1 : char_position]
+                    )
+                pre_puct_position = char_position
+
+            if (
+                (len(sub_tokens) > 0)
+                and (char_position + 1 == text_len)
+                and (text[pre_puct_position + 1 :] != "")
+            ):
+                # print("inside last token append {}"
+                #       .format(text[pre_puct_position + 1 :]))
+                sub_tokens.append(text[pre_puct_position + 1 :])
+
+        if len(sub_tokens) > 0:
+            for sub_token in sub_tokens:
+                print(sub_token in self.vocab)
+                if sub_token not in self.vocab:
+                    return True
+        else:
+            return True
+
+        return False
+

 if __name__ == "__main__":
     print("Code running...")
```

contextualSpellCheck/tests/test_contextualSpellCheck.py

Lines changed: 34 additions & 0 deletions
```diff
@@ -640,3 +640,37 @@ def test_max_edit_dist(max_edit_distance, expected_spell_check_flag):
     assert doc._.outcome_spellCheck == gold_outcome

     nlp.remove_pipe("contextual spellchecker")
+
+
+@pytest.mark.parametrize(
+    "input_sentence,expected_outcome,expected_score_doc,\
+expected_suggestion_doc,possible_misspel_index",
+    [
+        (
+            "This is not a pure Python Spell Checking based on Peter Norvig’s \
+blog post on setting up a simple spell checking algorithm.",
+            "",
+            None,
+            {},
+            8,
+        )
+    ],
+)
+def test_deep_tokenization(
+    input_sentence,
+    expected_outcome,
+    expected_score_doc,
+    expected_suggestion_doc,
+    possible_misspel_index,
+):
+    nlp_lg = spacy.load("en_core_web_lg")
+    checker_deep_tokenize = ContextualSpellCheck(max_edit_dist=4)
+    nlp_lg.add_pipe(checker_deep_tokenize)
+    doc = nlp_lg(input_sentence)
+
+    # No spelling correction should be performed on this clean sentence
+    assert doc._.outcome_spellCheck == expected_outcome
+    assert doc._.score_spellCheck == expected_score_doc
+    assert doc._.suggestions_spellCheck == expected_suggestion_doc
+
+    assert doc[possible_misspel_index]._.get_suggestion_spellCheck == ""
```
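For reference, an end-to-end sketch of what this test asserts (the constructor and spaCy v2-style `add_pipe` call mirror the test above; the import path is assumed from the package layout, and the sentence is shortened):

```python
import spacy
from contextualSpellCheck import ContextualSpellCheck

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(ContextualSpellCheck(max_edit_dist=4))

doc = nlp("Peter Norvig's blog post on a simple spell checking algorithm.")
# With the deep-tokenization guard, "Norvig's" is no longer a false positive.
print(doc._.outcome_spellCheck)  # "" -- no correction performed
```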
