|
3 | 3 | import os |
4 | 4 | import warnings |
5 | 5 | from datetime import datetime |
| 6 | +import unicodedata |
6 | 7 |
|
7 | 8 | import editdistance |
8 | 9 | import spacy |
@@ -251,8 +252,8 @@ def misspell_identify(self, doc, query=""): |
251 | 252 | and (token.ent_type_ != "GPE") |
252 | 253 | and (token.ent_type_ != "ORG") |
253 | 254 | ): |
254 | | - misspell.append(token) |
255 | | - |
| 255 | + if self.deep_tokenize_in_vocab(token.text): |
| 256 | + misspell.append(token) |
256 | 257 | if self.debug: |
257 | 258 | print("misspell identified: ", misspell) |
258 | 259 | return misspell, doc |
@@ -574,6 +575,59 @@ def doc_outcome_spell_check(self, doc): |
574 | 575 |
|
575 | 576 | return update_query |
576 | 577 |
|
def deep_tokenize_in_vocab(self, text):
    """Re-tokenize *text* on punctuation and check every piece in the vocab.

    The token is split into its punctuation characters and the word
    segments between/around them (e.g. ``"don't"`` -> ``"don"``, ``"'"``,
    ``"t"``).  A token whose pieces are all known words is considered well
    formed and should NOT be flagged as a misspelling.

    Args:
        text (str): Token text to re-tokenize on punctuation characters
            (any Unicode category starting with "P").

    Returns:
        bool: ``False`` when the token contains punctuation and every
        sub-token is present in ``self.vocab``; ``True`` otherwise
        (no punctuation at all, or at least one sub-token is out of
        vocabulary).
    """
    sub_tokens = []
    # Index of the most recently seen punctuation char; -1 before any.
    prev_punct = -1
    for pos, char in enumerate(text):
        if unicodedata.category(char).startswith("P"):
            sub_tokens.append(char)
            # Word segment between the previous punctuation (or the start
            # of the string) and this punctuation character.  Note: this
            # also captures the leading segment (prev_punct == -1), which
            # the earlier implementation silently dropped.
            segment = text[prev_punct + 1 : pos]
            if segment:
                sub_tokens.append(segment)
            prev_punct = pos

    if not sub_tokens:
        # No punctuation found: nothing to deep-tokenize; keep the token
        # as a misspelling candidate.
        return True

    # Trailing word segment after the last punctuation character.
    tail = text[prev_punct + 1 :]
    if tail:
        sub_tokens.append(tail)

    # Flag the token only if some piece is missing from the vocabulary.
    return any(sub_token not in self.vocab for sub_token in sub_tokens)
577 | 631 |
|
578 | 632 | if __name__ == "__main__": |
579 | 633 | print("Code running...") |
|
0 commit comments