NLTK and repurposes several of the tagging classes for lemmatization
tasks. See here for more info on sequential backoff tagging in NLTK:
http://www.nltk.org/_modules/nltk/tag/sequential.html
-
"""
__author__ = ['Patrick J. Burns <[email protected]>']

from nltk.tag.sequential import SequentialBackoffTagger, ContextTagger, DefaultTagger, NgramTagger, UnigramTagger, RegexpTagger
from cltk.utils.file_operations import open_pickle
+ from cltk.lemmatize.latin.latin import latin_sub_patterns, latin_verb_patterns, latin_pps, rn_patterns
- rn_patterns = [(r'(?=^[MDCLXVUI]+$)(?=^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|IU|V?I{0,3}|U?I{0,3})$)', 'NUM'),
- (r'(?=^[mdclxvui]+$)(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)', 'NUM')]
-
- __author__ = ['Patrick J. Burns <[email protected]>']
- __license__ = 'MIT License. See LICENSE.'
-
-
- def backoff_lemmatizer(train_sents, lemmatizer_classes, backoff=None):
-     """From Python Text Processing with NLTK Cookbook."""
-     for cls in lemmatizer_classes:
-         backoff = cls(train_sents, backoff=backoff)
-     return backoff
+ # Unused for now
+ #def backoff_lemmatizer(train_sents, lemmatizer_classes, backoff=None):
+ #    """From Python Text Processing with NLTK Cookbook."""
+ #    for cls in lemmatizer_classes:
+ #        backoff = cls(train_sents, backoff=backoff)
+ #    return backoff
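
# A runnable sketch of the chaining loop above, applied to NLTK's taggers
# (this module's lemmatizer classes follow the same constructor pattern);
# again assumes the treebank corpus is available:
from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger

chain = None
for cls in (UnigramTagger, BigramTagger):  # last class wraps the chain
    chain = cls(treebank.tagged_sents()[:3000], backoff=chain)
# `chain` is now a BigramTagger that backs off to a UnigramTagger.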


class LemmatizerI(TaggerI):
@@ -121,8 +115,7 @@ def choose_lemma(self, tokens, index, history):
        :param history: List with tokens that have already been lemmatized
        :return: String, spec. the token found at the current index.
        """
-         _lemma = tokens[index]
-         return _lemma
+         return tokens[index]


class TrainLemmatizer(SequentialBackoffLemmatizer):
@@ -206,6 +199,7 @@ def __init__(self, train=None, model=None, backoff=None, cutoff=0):

class RegexpLemmatizer(SequentialBackoffLemmatizer, RegexpTagger):
    """"""
+
    def __init__(self, regexps=None, backoff=None):
        """Setup for RegexpLemmatizer()
@@ -214,8 +208,7 @@ def __init__(self, regexps=None, backoff=None):
        """
        SequentialBackoffLemmatizer.__init__(self, backoff)
        RegexpTagger.__init__(self, regexps, backoff)
-         self._check = re.compile('|'.join('(?:%s)' % r[0] for r in regexps))
-         self._regexs = [(re.compile(regexp), pattern,) for regexp, pattern in regexps]
+         self._regexs = regexps

    def choose_lemma(self, tokens, index, history):
        """Use regular expressions for rules-based lemmatizing based on word endings;
@@ -226,12 +219,11 @@ def choose_lemma(self, tokens, index, history):
        :param history: List with tokens that have already been lemmatized
        :return: Str with concatenated lemma
        """
-         if self._check.match(tokens[index]):
-             for regexp, pattern in self._regexs:
-                 m = re.match(regexp, tokens[index])
-                 if m:
-                     return (m.group(1)) + pattern
-
+         for pattern, replace in self._regexs:
+             if re.search(pattern, tokens[index]):
+                 return re.sub(pattern, replace, tokens[index])
+                 break  # pragma: no cover
+
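
# How choose_lemma above applies a (pattern, replacement) pair; the pair
# here is a made-up illustration, not one of the shipped latin_sub_patterns:
import re

pattern, replace = r'(?:tas|tatis)$', 'tas'  # hypothetical ending rule
token = 'civitatis'
if re.search(pattern, token):
    print(re.sub(pattern, replace, token))  # prints 'civitas'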


class PPLemmatizer(RegexpLemmatizer):
    """Customization of the RegexpLemmatizer for Latin. The RegexpLemmatizer is
@@ -248,10 +240,9 @@ def __init__(self, regexps=None, pps=None, backoff=None):
        # Note different compile to make use of principal parts dictionary structure; also, note
        # that the PP dictionary has been set up so that principal parts match their traditional
        # numbering, i.e. present stem is indexed as 1. The 0 index is used for the lemma.
-         self._check = re.compile('|'.join('(?:%s)' % r[0] for r in regexps))
-         self._regexs = [(re.compile(regexp), num) for regexp, num in regexps]
-         self.pps = pps
+         self._regexs = latin_verb_patterns
+         self.pps = latin_pps
+

    def choose_lemma(self, tokens, index, history):
        """Use regular expressions for rules-based lemmatizing based on
@@ -263,25 +254,25 @@ def choose_lemma(self, tokens, index, history):
        :param index: Int with current token
        :param history: List with tokens that have already been lemmatized
        :return: Str with index[0] from the dictionary value, see above about '0 index'
-         """
-         if self._check.match(tokens[index]):
-             for regexp in self._regexs:
-                 m = re.match(regexp[0], tokens[index])
-                 if m:
-                     root = m.group(1)
-                     match = [lemma for (lemma, pp) in self.pps.items() if root == pp[regexp[1]]]
-                     if not match:
-                         pass
-                     else:
-                         return match[0]  # Lemma is indexed at zero in PP dictionary
-
-
+         """
+         for regexp in self._regexs:
+             m = re.match(regexp[0], tokens[index])
+             if m:
+                 root = m.group(1)
+                 match = [lemma for (lemma, pp) in self.pps.items() if root == pp[regexp[1]]]
+                 if not match:
+                     pass
+                 else:
+                     return match[0]  # Lemma is indexed at zero in PP dictionary
+
+
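
# A worked example of the principal-parts lookup in choose_lemma, using a
# hypothetical entry shaped like latin_pps (index 0 = lemma, 1 = present
# stem, 2 = perfect stem, ...) and a made-up perfect-stem regex:
import re

pps = {'amo': ('amo', 'am', 'amav', 'amat')}  # hypothetical entry
regexp = (r'(\w*av)i$', 2)  # group 1 should equal the perfect stem
m = re.match(regexp[0], 'amavi')
if m:
    root = m.group(1)  # 'amav'
    print([lemma for (lemma, pp) in pps.items() if root == pp[regexp[1]]])  # ['amo']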

class RomanNumeralLemmatizer(RegexpLemmatizer):
    """"""
-     def __init__(self, regexps=rn_patterns, backoff=None):
+     def __init__(self, regexps=rn_patterns, default=None, backoff=None):
        """RomanNumeralLemmatizer"""
        RegexpLemmatizer.__init__(self, regexps, backoff)
        self._regexs = [(re.compile(regexp), pattern,) for regexp, pattern in regexps]
+         self.default = default

    def choose_lemma(self, tokens, index, history):
        """Test case for customized rules-based improvements to lemmatizer using regex; differs
@@ -292,11 +283,13 @@ def choose_lemma(self, tokens, index, history):
        :param history: List with tokens that have already been lemmatized
        :return: Str with replacement from pattern
        """
-         for regexp, pattern in self._regexs:
-             m = re.match(regexp, tokens[index])
-             if m:
-                 return pattern
-         return None
+         for pattern, replace in self._regexs:
+             if re.search(pattern, tokens[index]):
+                 if self.default:
+                     return self.default
+                 else:
+                     return replace
+                 break  # pragma: no cover
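
# What rn_patterns encodes: a character-class lookahead (only numeral
# letters allowed) plus a well-formedness lookahead. A quick check of the
# lower-case pattern as imported above:
import re

rn = (r'(?=^[mdclxvui]+$)'
      r'(?=^m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|iu|v?i{0,3}|u?i{0,3})$)')
print(bool(re.search(rn, 'mcmxciv')))  # True: a well-formed numeral (1994)
print(bool(re.search(rn, 'mimic')))    # False: letters valid, order is not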


class ContextPOSLemmatizer(ContextLemmatizer):
@@ -503,34 +496,13 @@ def __init__(self, train, seed=3):
            print('The file %s is not available in cltk_data' % file)

        # Check for presence of misc_patterns
-         file = 'latin_misc_patterns.pickle'
-
-         misc_patterns_path = os.path.join(path, file)
-         if os.path.isfile(misc_patterns_path):
-             self.latin_misc_patterns = open_pickle(misc_patterns_path)
-         else:
-             self.latin_misc_patterns = {}
-             print('The file %s is not available in cltk_data' % file)
+         self.latin_sub_patterns = latin_sub_patterns

        # Check for presence of verb_patterns
-         file = 'latin_verb_patterns.pickle'
-
-         verb_patterns_path = os.path.join(path, file)
-         if os.path.isfile(verb_patterns_path):
-             self.latin_verb_patterns = open_pickle(verb_patterns_path)
-         else:
-             self.latin_verb_patterns = {}
-             print('The file %s is not available in cltk_data' % file)
+         self.latin_verb_patterns = latin_verb_patterns

        # Check for presence of latin_pps
-         file = 'latin_pps.pickle'
-
-         latin_pps_path = os.path.join(path, file)
-         if os.path.isfile(latin_pps_path):
-             self.latin_pps = open_pickle(latin_pps_path)
-         else:
-             self.latin_pps = {}
-             print('The file %s is not available in cltk_data' % file)
+         self.latin_pps = latin_pps

        def _randomize_data(train, seed):
            import random
@@ -551,7 +523,7 @@ def _define_lemmatizer(self):
        backoff2 = TrainLemmatizer(model=self.LATIN_OLD_MODEL, backoff=backoff1)
        backoff3 = PPLemmatizer(regexps=self.latin_verb_patterns, pps=self.latin_pps, backoff=backoff2)
        backoff4 = UnigramLemmatizer(self.train_sents, backoff=backoff3)
-         backoff5 = RegexpLemmatizer(self.latin_misc_patterns, backoff=backoff4)
+         backoff5 = RegexpLemmatizer(self.latin_sub_patterns, backoff=backoff4)
        backoff6 = TrainLemmatizer(model=self.LATIN_MODEL, backoff=backoff5)
        #backoff7 = BigramPOSLemmatizer(self.pos_train_sents, include=['cum'], backoff=backoff6)
        #lemmatizer = backoff7
@@ -567,32 +539,32 @@ def evaluate(self):
        lemmatizer = self._define_lemmatizer()
        return lemmatizer.evaluate(self.test_sents)

-
- if __name__ == "__main__":
-
-     # Set up training sentences
-     rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
-     path = os.path.expanduser(rel_path)
-
-     # Check for presence of latin_pos_lemmatized_sents
-     file = 'latin_pos_lemmatized_sents.pickle'
-
-     latin_pos_lemmatized_sents_path = os.path.join(path, file)
-     if os.path.isfile(latin_pos_lemmatized_sents_path):
-         latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
-     else:
-         latin_pos_lemmatized_sents = []
-         print('The file %s is not available in cltk_data' % file)
-
-
-     RUN = 10
-     ACCURACIES = []
-
-     for I in range(RUN):
-         LEMMATIZER = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
-         ACC = LEMMATIZER.evaluate()
-         ACCURACIES.append(ACC)
-         print('{:.2%}'.format(ACC))
-
-     print('\nTOTAL (Run %d) times' % RUN)
-     print('{:.2%}'.format(sum(ACCURACIES) / RUN))
+ # Accuracy test available below; keep? delete?
+ # if __name__ == "__main__":
+ #
+ #     # Set up training sentences
+ #     rel_path = os.path.join('~/cltk_data/latin/model/latin_models_cltk/lemmata/backoff')
+ #     path = os.path.expanduser(rel_path)
+ #
+ #     # Check for presence of latin_pos_lemmatized_sents
+ #     file = 'latin_pos_lemmatized_sents.pickle'
+ #
+ #     latin_pos_lemmatized_sents_path = os.path.join(path, file)
+ #     if os.path.isfile(latin_pos_lemmatized_sents_path):
+ #         latin_pos_lemmatized_sents = open_pickle(latin_pos_lemmatized_sents_path)
+ #     else:
+ #         latin_pos_lemmatized_sents = []
+ #         print('The file %s is not available in cltk_data' % file)
+ #
+ #
+ #     RUN = 10
+ #     ACCURACIES = []
+ #
+ #     for I in range(RUN):
+ #         LEMMATIZER = BackoffLatinLemmatizer(latin_pos_lemmatized_sents)
+ #         ACC = LEMMATIZER.evaluate()
+ #         ACCURACIES.append(ACC)
+ #         print('{:.2%}'.format(ACC))
+ #
+ #     print('\nTOTAL (Run %d) times' % RUN)
+ #     print('{:.2%}'.format(sum(ACCURACIES) / RUN))