refactor prefix handling to correctly parse Portuguese prefixes #72

derek73 · derek73 · commit 10f34e450d64 · 2018-08-31T13:50:25.000-07:00
while continuing to support multiple names after a prefix #23
diff --git a/docs/customize.rst b/docs/customize.rst
@@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below).
 Editable attributes of nameparser.config.CONSTANTS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-* :py:class:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
-* :py:class:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David".
-* :py:class:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
-* :py:class:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
-* :py:class:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
-* :py:class:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
-* :py:class:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
-* :py:class:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.
+* :py:obj:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
+* :py:obj:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David".
+* :py:obj:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
+* :py:obj:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
+* :py:obj:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
+* :py:obj:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
+* :py:obj:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
+* :py:obj:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.
 
 Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning
 the constants for your project. These methods automatically lower case and
diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py
@@ -1,7 +1,15 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
-#: Name pieces that appear before a last name. They join to the piece that follows them to make one new piece.
+#: Name pieces that appear before a last name. Prefixes join to the piece
+#: that follows them to make one new piece. They can be chained together, e.g.
+#: "von der" and "de la". Because they only appear in middle or last names,
+#: they also signify that all following name pieces should be in the same name
+#: part. For example, "von" will be joined to all following pieces that are not
+#: prefixes or suffixes, allowing recognition of double last names when they
+#: appear after a prefix. So in "pennie von bergen wessels MD", "von" will
+#: join with all following name pieces until the suffix "MD", resulting in the
+#: correct parsing of the last name "von bergen wessels".
 PREFIXES = set([
     'abu',
     'bin',
diff --git a/nameparser/parser.py b/nameparser/parser.py
@@ -501,14 +501,6 @@ def parse_full_name(self):
                     self.last_list.append(piece)
                     self.suffix_list += pieces[i+1:]
                     break
-                if piece in self.prefix_joins:
-                    last_piece = pieces[-1:][0]
-                    if self.is_suffix(last_piece):
-                        self.last_list += pieces[i:-1]
-                        self.suffix = last_piece
-                    else:
-                        self.last_list += pieces[i:]
-                    break
                 if not nxt:
                     self.last_list.append(piece)
                     continue
@@ -548,14 +540,6 @@ def parse_full_name(self):
                         self.last_list.append(piece)
                         self.suffix_list = pieces[i+1:] + self.suffix_list
                         break
-                    if piece in self.prefix_joins:
-                        last_piece = pieces[-1:][0]
-                        if self.is_suffix(last_piece):
-                            self.last_list += pieces[i:-1]
-                            self.suffix_list.insert(0, last_piece)
-                        else:
-                            self.last_list += pieces[i:]
-                        break
                     if not nxt:
                         self.last_list.append(piece)
                         continue
@@ -596,9 +580,6 @@ def parse_full_name(self):
                     if self.is_suffix(piece):
                         self.suffix_list.append(piece)
                         continue
-                    if piece in self.prefix_joins:
-                        self.last_list += pieces[i:]
-                        break
                     self.middle_list.append(piece)
                 try:
                     if parts[2]:
@@ -685,27 +666,27 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
         # don't join on conjunctions if there's only 2 parts
         if length < 3:
             return pieces
-            
+
         rootname_pieces = [p for p in pieces if self.is_rootname(p)]
         total_length = len(rootname_pieces) + additional_parts_count
-        
+
         # find all the conjunctions, join any conjunctions that are next to each
         # other, then join those newly joined conjunctions and any single
         # conjunctions to the piece before and after it
-        conj_index = [i for i, piece in enumerate(pieces) 
+        conj_index = [i for i, piece in enumerate(pieces)
                                 if self.is_conjunction(piece)]
-        
+
         contiguous_conj_i = []
         for i, val in enumerate(conj_index):
             try:
                 if conj_index[i+1] == val+1:
                     contiguous_conj_i += [val]
             except IndexError:
                 pass
-        
+
         contiguous_conj_i = group_contiguous_integers(conj_index)
-        
-        delete_i = [] 
+
+        delete_i = []
         for i in contiguous_conj_i:
             if type(i) == tuple:
                 new_piece = " ".join(pieces[ i[0] : i[1]+1] )
@@ -717,7 +698,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
                 pieces[i] = new_piece
             #add newly joined conjunctions to constants to be found later
             self.C.conjunctions.add(new_piece)
-        
+
         for i in reversed(delete_i):
             # delete pieces in reverse order or the index changes on each delete
             del pieces[i]
@@ -728,15 +709,15 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
 
         # refresh conjunction index locations
         conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
-        
+
         for i in conj_index:
             if len(pieces[i]) == 1 and total_length < 4:
                 # if there are only 3 total parts (minus known titles, suffixes
                 # and prefixes) and this conjunction is a single letter, prefer
                 # treating it as an initial rather than a conjunction.
                 # http://code.google.com/p/python-nameparser/issues/detail?id=11
                 continue
-            
+
             if i is 0:
                 new_piece = " ".join(pieces[i:i+2])
                 if self.is_title(pieces[i+1]):
@@ -748,8 +729,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
                 for j,val in enumerate(conj_index):
                     if val > i:
                         conj_index[j]=val-1
-                
-            else:    
+
+            else:
                 new_piece = " ".join(pieces[i-1:i+2])
                 if self.is_title(pieces[i-1]):
                     # when joining to a title, make new_piece a title too
@@ -767,23 +748,51 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
                 for j,val in enumerate(conj_index):
                     if val > i:
                         conj_index[j] = val - rm_count
-        
-        
-        # join prefixes to following lastnames: ['de la Vega'], ['van Buren']
+
+
+        # join prefixes to following lastnames: ['de la Vega'], ['van Buren III']
         prefixes = list(filter(self.is_prefix, pieces))
         if prefixes:
-            i = pieces.index(prefixes[0])
-            # join everything after the prefix until the next non prefix
-            # store joined pieces in prefix_joins. When a prefix occurs in a last name,
-            # I think it means the rest of the name is part of the last name, so prefix_joins
-            # lets us do that in the parser flow.
-            non_suffixes = list(filter(lambda x: not self.is_prefix(x), pieces[i:]))
-            if non_suffixes:
-                j = pieces.index(non_suffixes[0])
-                new_piece = ' '.join(pieces[i:j + 1])
-                self.prefix_joins += [new_piece]
-                pieces = pieces[:i] + [new_piece] + pieces[j + 1:]
-            
+            for prefix in prefixes:
+                try:
+                    i = pieces.index(prefix)
+                except ValueError:
+                    # If the prefix is no longer in pieces, it's because it has been
+                    # combined with the prefix that appears right before (or before that when
+                    # chained together) in the last loop, so the index of that newly created
+                    # piece is the same as in the last loop, i==i still, and we want to join
+                    # it to the next piece.
+                    pass
+
+                new_piece = ''
+
+                # Join the prefix to everything that follows it, up to the next
+                # prefix or suffix. When a prefix occurs in a name, the pieces
+                # after it are assumed to belong to the same name part, so
+                # they are merged into a single piece here rather than being
+                # tracked separately in prefix_joins for the parser flow.
+
+                try:
+                    next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
+                    j = pieces.index(next_prefix)
+                    if j == i + 1:
+                        # if there are two prefixes in sequence, join to the following piece
+                        j += 1
+                    new_piece = ' '.join(pieces[i:j])
+                    pieces = pieces[:i] + [new_piece] + pieces[j:]
+                except StopIteration:
+                    try:
+                        # if there are no more prefixes, look for a suffix to stop at
+                        stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
+                        j = pieces.index(stop_at)
+                        new_piece = ' '.join(pieces[i:j])
+                        pieces = pieces[:i] + [new_piece] + pieces[j:]
+                    except StopIteration:
+                        # if there were no suffixes, nothing to stop at so join all
+                        # remaining pieces
+                        new_piece = ' '.join(pieces[i:])
+                        pieces = pieces[:i] + [new_piece]
+
         log.debug("pieces: {0}".format(pieces))
         return pieces
     
diff --git a/tests.py b/tests.py
@@ -1247,18 +1247,6 @@ def test_name_is_conjunctions(self):
         hn = HumanName("e and e")
         self.m(hn.first, "e and e", hn)
 
-    def test_portuguese_dos(self):
-        hn = HumanName("Rafael Sousa dos Anjos")
-        self.m(hn.first, "Rafael", hn)
-        self.m(hn.middle, "Sousa", hn)
-        self.m(hn.last, "dos Anjos", hn)
-
-    def test_portuguese_prefixes(self):
-        hn = HumanName("Joao da Silva do Amaral de Souza")
-        self.m(hn.first, "Joao", hn)
-        self.m(hn.middle, "", hn)
-        self.m(hn.last, "da Silva do Amaral de Souza", hn)
-
 
 class ConstantsCustomization(HumanNameTestBase):
 
@@ -1518,6 +1506,42 @@ def test_title_two_part_last_name_with_suffix_in_first_part(self):
         self.m(hn.last, "von bergen wessels", hn)
         self.m(hn.suffix, "MD, III", hn)
 
+    def test_portuguese_dos(self):
+        hn = HumanName("Rafael Sousa dos Anjos")
+        self.m(hn.first, "Rafael", hn)
+        self.m(hn.middle, "Sousa", hn)
+        self.m(hn.last, "dos Anjos", hn)
+
+    def test_portuguese_prefixes(self):
+        hn = HumanName("Joao da Silva do Amaral de Souza")
+        self.m(hn.first, "Joao", hn)
+        self.m(hn.middle, "da Silva do Amaral", hn)
+        self.m(hn.last, "de Souza", hn)
+
+    def test_three_conjunctions(self):
+        hn = HumanName("Dr. Juan Q. Xavier de la dos Vega III")
+        self.m(hn.first, "Juan", hn)
+        self.m(hn.last, "de la dos Vega", hn)
+        self.m(hn.title, "Dr.", hn)
+        self.m(hn.middle, "Q. Xavier", hn)
+        self.m(hn.suffix, "III", hn)
+
+    def test_lastname_three_conjunctions(self):
+        hn = HumanName("de la dos Vega, Dr. Juan Q. Xavier III")
+        self.m(hn.first, "Juan", hn)
+        self.m(hn.last, "de la dos Vega", hn)
+        self.m(hn.title, "Dr.", hn)
+        self.m(hn.middle, "Q. Xavier", hn)
+        self.m(hn.suffix, "III", hn)
+
+    def test_comma_three_conjunctions(self):
+        hn = HumanName("Dr. Juan Q. Xavier de la dos Vega, III")
+        self.m(hn.first, "Juan", hn)
+        self.m(hn.last, "de la dos Vega", hn)
+        self.m(hn.title, "Dr.", hn)
+        self.m(hn.middle, "Q. Xavier", hn)
+        self.m(hn.suffix, "III", hn)
+
 
 class SuffixesTestCase(HumanNameTestBase):