Commit f054e80

diyclassics authored and kylepjohnson committed

Regex lemmatizer update (cltk#565)

* Refactor RegexpLemmatizer
* Update Regexp Lemmatizer test
* Update Regexp Lemmatizer test
* Refactor RomanNumeralLemmatizer
* Add test for BackoffLatinLemmatizer; fix test coverage in general

1 parent 46ddac9 commit f054e80

File tree

3 files changed: +477 -102 lines changed


cltk/lemmatize/latin/backoff.py

Lines changed: 72 additions & 100 deletions
@@ -8,7 +8,6 @@
 NLTK and repurposes several of the tagging classes for lemmatization
 tasks. See here for more info on sequential backoff tagging in NLTK:
 http://www.nltk.org/_modules/nltk/tag/sequential.html
-
 """
 
 __author__ = ['Patrick J. Burns <[email protected]>']
@@ -22,19 +21,14 @@
 from nltk.tag.sequential import SequentialBackoffTagger, ContextTagger, DefaultTagger, NgramTagger, UnigramTagger, RegexpTagger
 
 from cltk.utils.file_operations import open_pickle
+from cltk.lemmatize.latin.latin import latin_sub_patterns, latin_verb_patterns, latin_pps, rn_patterns
 
-rn_patterns = [(r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'),
-               (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')]
-
-__author__ = ['Patrick J. Burns <[email protected]>']
-__license__ = 'MIT License. See LICENSE.'
-
-
-def backoff_lemmatizer(train_sents, lemmatizer_classes, backoff=None):
-    """From Python Text Processing with NLTK Cookbook."""
-    for cls in lemmatizer_classes:
-        backoff = cls(train_sents, backoff=backoff)
-    return backoff
+# Unused for now
+#def backoff_lemmatizer(train_sents, lemmatizer_classes, backoff=None):
+#    """From Python Text Processing with NLTK Cookbook."""
+#    for cls in lemmatizer_classes:
+#        backoff = cls(train_sents, backoff=backoff)
+#    return backoff
 
 
 class LemmatizerI(TaggerI):
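
Note: the pattern data that used to live in backoff.py now comes from cltk.lemmatize.latin.latin. Below is a hedged sketch of the data shapes that module is expected to export, inferred from how the classes in this diff consume them; every entry is a hypothetical stand-in except rn_patterns, which reuses the two regexes deleted above.

# Sketch of the shapes cltk.lemmatize.latin.latin exports after this commit.
# All sample entries are illustrative, not the module's real data.

# (pattern, replacement) pairs applied with re.sub() by RegexpLemmatizer:
latin_sub_patterns = [(r'(.+)que$', r'\1')]  # hypothetical: strip enclitic -que

# (regex, principal-part index) pairs used by PPLemmatizer; group(1) of a
# match is compared against that principal part:
latin_verb_patterns = [(r'(\w+)(i|isti|it|imus|istis|erunt)$', 2)]  # hypothetical

# lemma -> tuple of stems, with the lemma itself at index 0:
latin_pps = {'amo': ('amo', 'am', 'amav', 'amat')}  # hypothetical entry

# Roman-numeral patterns, carried over verbatim from the lines deleted above:
rn_patterns = [(r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'),
               (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')]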
@@ -121,8 +115,7 @@ def choose_lemma(self, tokens, index, history):
         :param history: List with tokens that have already been lemmatized
         :return: String, spec. the token found at the current index.
         """
-        _lemma = tokens[index]
-        return _lemma
+        return tokens[index]
 
 
 class TrainLemmatizer(SequentialBackoffLemmatizer):
@@ -206,6 +199,7 @@ def __init__(self, train=None, model=None, backoff=None, cutoff=0):
 
 class RegexpLemmatizer(SequentialBackoffLemmatizer, RegexpTagger):
     """"""
+
     def __init__(self, regexps=None, backoff=None):
         """Setup for RegexpLemmatizer()
 
@@ -214,8 +208,7 @@ def __init__(self, regexps=None, backoff=None):
         """
        SequentialBackoffLemmatizer.__init__(self, backoff)
         RegexpTagger.__init__(self, regexps, backoff)
-        self._check = re.compile('|'.join('(?:%s)' % r[0] for r in regexps))
-        self._regexs = [(re.compile(regexp), pattern,) for regexp, pattern in regexps]
+        self._regexs = regexps
 
     def choose_lemma(self, tokens, index, history):
         """Use regular expressions for rules-based lemmatizing based on word endings;
@@ -226,12 +219,11 @@ def choose_lemma(self, tokens, index, history):
         :param history: List with tokens that have already been lemmatized
         :return: Str with concatenated lemma
         """
-        if self._check.match(tokens[index]):
-            for regexp, pattern in self._regexs:
-                m = re.match(regexp, tokens[index])
-                if m:
-                    return (m.group(1)) + pattern
-
+        for pattern, replace in self._regexs:
+            if re.search(pattern, tokens[index]):
+                return re.sub(pattern, replace, tokens[index])
+                break # pragma: no cover
+
 
 class PPLemmatizer(RegexpLemmatizer):
     """Customization of the RegexpLemmatizer for Latin. The RegexpLemmatizer is
@@ -248,10 +240,9 @@ def __init__(self, regexps=None, pps=None, backoff=None):
         # Note different compile to make use of principal parts dictionary structure; also, note
         # that the PP dictionary has been set up so that principal parts match their traditional
         # numbering, i.e. present stem is indexed as 1. The 0 index is used for the lemma.
-        self._check = re.compile('|'.join('(?:%s)' % r[0] for r in regexps))
-        self._regexs = [(re.compile(regexp), num) for regexp, num in
-                        regexps]
-        self.pps = pps
+        self._regexs = latin_verb_patterns
+        self.pps = latin_pps
+
 
     def choose_lemma(self, tokens, index, history):
         """Use regular expressions for rules-based lemmatizing based on
@@ -263,25 +254,25 @@ def choose_lemma(self, tokens, index, history):
         :param index: Int with current token
         :param history: List with tokens that have already been lemmatized
         :return: Str with index[0] from the dictionary value, see above about '0 index'
-        """
-        if self._check.match(tokens[index]):
-            for regexp in self._regexs:
-                m = re.match(regexp[0], tokens[index])
-                if m:
-                    root = m.group(1)
-                    match = [lemma for (lemma, pp) in self.pps.items() if root == pp[regexp[1]]]
-                    if not match:
-                        pass
-                    else:
-                        return match[0] # Lemma is indexed at zero in PP dictionary
-
-
+        """
+        for regexp in self._regexs:
+            m = re.match(regexp[0], tokens[index])
+            if m:
+                root = m.group(1)
+                match = [lemma for (lemma, pp) in self.pps.items() if root == pp[regexp[1]]]
+                if not match:
+                    pass
+                else:
+                    return match[0] # Lemma is indexed at zero in PP dictionary
+
+
 class RomanNumeralLemmatizer(RegexpLemmatizer):
     """"""
-    def __init__(self, regexps=rn_patterns, backoff=None):
+    def __init__(self, regexps=rn_patterns, default=None, backoff=None):
         """RomanNumeralLemmatizer"""
         RegexpLemmatizer.__init__(self, regexps, backoff)
         self._regexs = [(re.compile(regexp), pattern,) for regexp, pattern in regexps]
+        self.default = default
 
     def choose_lemma(self, tokens, index, history):
         """Test case for customized rules-based improvements to lemmatizer using regex; differs
@@ -292,11 +283,13 @@ def choose_lemma(self, tokens, index, history):
         :param history: List with tokens that have already been lemmatized
         :return: Str with replacement from pattern
         """
-        for regexp, pattern in self._regexs:
-            m = re.match(regexp, tokens[index])
-            if m:
-                return pattern
-        return None
+        for pattern, replace in self._regexs:
+            if re.search(pattern, tokens[index]):
+                if self.default:
+                    return self.default
+                else:
+                    return replace
+                break # pragma: no cover
 
 
 class ContextPOSLemmatizer(ContextLemmatizer):
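
The new default keyword lets callers force a single lemma for anything the Roman-numeral regexes match; without it, the pattern's own replacement string is returned. A hedged usage sketch, assuming the lemmatize() wrapper inherited from SequentialBackoffLemmatizer:

from cltk.lemmatize.latin.backoff import RomanNumeralLemmatizer

# With default set, every matching token gets that value as its lemma.
lemmatizer = RomanNumeralLemmatizer(default='NUM')
print(lemmatizer.lemmatize(['mdccclxvii']))  # expected: [('mdccclxvii', 'NUM')]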
@@ -503,34 +496,13 @@ def __init__(self, train, seed=3):
             print('The file %s is not available in cltk_data' % file)
 
         # Check for presence of misc_patterns
-        file = 'latin_misc_patterns.pickle'
-
-        misc_patterns_path = os.path.join(path, file)
-        if os.path.isfile(misc_patterns_path):
-            self.latin_misc_patterns = open_pickle(misc_patterns_path)
-        else:
-            self.latin_misc_patterns = {}
-            print('The file %s is not available in cltk_data' % file)
+        self.latin_sub_patterns = latin_sub_patterns
 
         # Check for presence of verb_patterns
-        file = 'latin_verb_patterns.pickle'
-
-        verb_patterns_path = os.path.join(path, file)
-        if os.path.isfile(verb_patterns_path):
-            self.latin_verb_patterns = open_pickle(verb_patterns_path)
-        else:
-            self.latin_verb_patterns = {}
-            print('The file %s is not available in cltk_data' % file)
+        self.latin_verb_patterns = latin_verb_patterns
 
         # Check for presence of latin_pps
-        file = 'latin_pps.pickle'
-
-        latin_pps_path = os.path.join(path, file)
-        if os.path.isfile(latin_pps_path):
-            self.latin_pps = open_pickle(latin_pps_path)
-        else:
-            self.latin_pps = {}
-            print('The file %s is not available in cltk_data' % file)
+        self.latin_pps = latin_pps
 
     def _randomize_data(train, seed):
         import random
@@ -551,7 +523,7 @@ def _define_lemmatizer(self):
         backoff2 = TrainLemmatizer(model=self.LATIN_OLD_MODEL, backoff=backoff1)
         backoff3 = PPLemmatizer(regexps=self.latin_verb_patterns, pps=self.latin_pps, backoff=backoff2)
         backoff4 = UnigramLemmatizer(self.train_sents, backoff=backoff3)
-        backoff5 = RegexpLemmatizer(self.latin_misc_patterns, backoff=backoff4)
+        backoff5 = RegexpLemmatizer(self.latin_sub_patterns, backoff=backoff4)
         backoff6 = TrainLemmatizer(model=self.LATIN_MODEL, backoff=backoff5)
         #backoff7 = BigramPOSLemmatizer(self.pos_train_sents, include=['cum'], backoff=backoff6)
         #lemmatizer = backoff7
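
_define_lemmatizer wires the lemmatizers into a chain in which each stage answers only for tokens it recognizes and defers everything else to its backoff. A minimal NLTK-free sketch of that pattern (class and method names are hypothetical, chosen only for illustration):

# Minimal sketch of sequential backoff, independent of NLTK's tagger classes.
class DictStage:
    def __init__(self, table, backoff=None):
        self.table = table      # token -> lemma known to this stage
        self.backoff = backoff  # next lemmatizer to consult

    def lemmatize_token(self, token):
        lemma = self.table.get(token)
        if lemma is None and self.backoff is not None:
            return self.backoff.lemmatize_token(token)
        return lemma

# Built innermost-first, like backoff1..backoff6 above:
old_model = DictStage({'uirum': 'uir'})
new_model = DictStage({'arma': 'arma'}, backoff=old_model)
print(new_model.lemmatize_token('uirum'))  # falls through to the older model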
@@ -567,32 +539,32 @@ def evaluate(self):
         lemmatizer = self._define_lemmatizer()
         return lemmatizer.evaluate(self.test_sents)
 
-
-if __name__ == "__main__":
-
-    # Set up training sentences
-    rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
-    path = os.path.expanduser(rel_path)
-
-    # Check for presence of latin_pos_lemmatized_sents
-    file = 'latin_pos_lemmatized_sents.pickle'
-
-    latin_pos_lemmatized_sents_path = os.path.join(path, file)
-    if os.path.isfile(latin_pos_lemmatized_sents_path):
-        latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
-    else:
-        latin_pos_lemmatized_sents = []
-        print('The file %s is not available in cltk_data' % file)
-
-
-    RUN = 10
-    ACCURACIES = []
-
-    for I in range(RUN):
-        LEMMATIZER = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
-        ACC = LEMMATIZER.evaluate()
-        ACCURACIES.append(ACC)
-        print('{:.2%}'.format(ACC))
-
-    print('\nTOTAL (Run %d) times' % RUN)
-    print('{:.2%}'.format(sum(ACCURACIES) / RUN))
+# Accuracy test available below——keep? delete?
+#if __name__ == "__main__":
+#
+#    # Set up training sentences
+#    rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
+#    path = os.path.expanduser(rel_path)
+#
+#    # Check for presence of latin_pos_lemmatized_sents
+#    file = 'latin_pos_lemmatized_sents.pickle'
+#
+#    latin_pos_lemmatized_sents_path = os.path.join(path, file)
+#    if os.path.isfile(latin_pos_lemmatized_sents_path):
+#        latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
+#    else:
+#        latin_pos_lemmatized_sents = []
+#        print('The file %s is not available in cltk_data' % file)
+#
+#
+#    RUN = 10
+#    ACCURACIES = []
+#
+#    for I in range(RUN):
+#        LEMMATIZER = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
+#        ACC = LEMMATIZER.evaluate()
+#        ACCURACIES.append(ACC)
+#        print('{:.2%}'.format(ACC))
+#
+#    print('\nTOTAL (Run %d) times' % RUN)
+#    print('{:.2%}'.format(sum(ACCURACIES) / RUN))
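
The accuracy harness above survives only as a comment. For reference, a hedged runnable sketch of the same single-run flow, assuming the training pickle has already been fetched into cltk_data:

import os
from cltk.utils.file_operations import open_pickle
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer

path = os.path.expanduser('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
sents_path = os.path.join(path, 'latin_pos_lemmatized_sents.pickle')
train_sents = open_pickle(sents_path) if os.path.isfile(sents_path) else []

# Train/test split and scoring happen inside BackoffLatinLemmatizer.evaluate().
lemmatizer = BackoffLatinLemmatizer(train_sents)
print('{:.2%}'.format(lemmatizer.evaluate()))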
