Skip to content

Commit 10f34e4

Browse files
committed
refactor prefix handling to correctly parse Portuguese prefixes #72
while continuing to support multiple names after a prefix #23
1 parent e9fd11e commit 10f34e4

File tree

4 files changed

+108
-67
lines changed

4 files changed

+108
-67
lines changed

docs/customize.rst

+8-8
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below).
3939
Editable attributes of nameparser.config.CONSTANTS
4040
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4141

42-
* :py:class:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
43-
* :py:class:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David".
44-
* :py:class:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
45-
* :py:class:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
46-
* :py:class:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
47-
* :py:class:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
48-
* :py:class:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
49-
* :py:class:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.
42+
* :py:obj:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
43+
* :py:obj:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David".
44+
* :py:obj:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
45+
* :py:obj:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
46+
* :py:obj:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
47+
* :py:obj:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
48+
* :py:obj:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
49+
* :py:obj:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.
5050

5151
Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning
5252
the constants for your project. These methods automatically lower case and

nameparser/config/prefixes.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import unicode_literals
33

4-
#: Name pieces that appear before a last name. They join to the piece that follows them to make one new piece.
4+
#: Name pieces that appear before a last name. Prefixes join to the piece
5+
# that follows them to make one new piece. They can be chained together, e.g.
6+
# "von der" and "de la". Because they only appear in middle or last names,
7+
# they also signify that all following name pieces should be in the same name
8+
# part, for example, "von" will be joined to all following pieces that are not
9+
# prefixes or suffixes, allowing recognition of double last names when they
10+
# appear after a prefix. So in "pennie von bergen wessels MD", "von" will
11+
# join with all following name pieces until the suffix "MD", resulting in the
12+
# correct parsing of the last name "von bergen wessels".
513
PREFIXES = set([
614
'abu',
715
'bin',

nameparser/parser.py

+55-46
Original file line numberDiff line numberDiff line change
@@ -501,14 +501,6 @@ def parse_full_name(self):
501501
self.last_list.append(piece)
502502
self.suffix_list += pieces[i+1:]
503503
break
504-
if piece in self.prefix_joins:
505-
last_piece = pieces[-1:][0]
506-
if self.is_suffix(last_piece):
507-
self.last_list += pieces[i:-1]
508-
self.suffix = last_piece
509-
else:
510-
self.last_list += pieces[i:]
511-
break
512504
if not nxt:
513505
self.last_list.append(piece)
514506
continue
@@ -548,14 +540,6 @@ def parse_full_name(self):
548540
self.last_list.append(piece)
549541
self.suffix_list = pieces[i+1:] + self.suffix_list
550542
break
551-
if piece in self.prefix_joins:
552-
last_piece = pieces[-1:][0]
553-
if self.is_suffix(last_piece):
554-
self.last_list += pieces[i:-1]
555-
self.suffix_list.insert(0, last_piece)
556-
else:
557-
self.last_list += pieces[i:]
558-
break
559543
if not nxt:
560544
self.last_list.append(piece)
561545
continue
@@ -596,9 +580,6 @@ def parse_full_name(self):
596580
if self.is_suffix(piece):
597581
self.suffix_list.append(piece)
598582
continue
599-
if piece in self.prefix_joins:
600-
self.last_list += pieces[i:]
601-
break
602583
self.middle_list.append(piece)
603584
try:
604585
if parts[2]:
@@ -685,27 +666,27 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
685666
# don't join on conjunctions if there's only 2 parts
686667
if length < 3:
687668
return pieces
688-
669+
689670
rootname_pieces = [p for p in pieces if self.is_rootname(p)]
690671
total_length = len(rootname_pieces) + additional_parts_count
691-
672+
692673
# find all the conjunctions, join any conjunctions that are next to each
693674
# other, then join those newly joined conjunctions and any single
694675
# conjunctions to the piece before and after it
695-
conj_index = [i for i, piece in enumerate(pieces)
676+
conj_index = [i for i, piece in enumerate(pieces)
696677
if self.is_conjunction(piece)]
697-
678+
698679
contiguous_conj_i = []
699680
for i, val in enumerate(conj_index):
700681
try:
701682
if conj_index[i+1] == val+1:
702683
contiguous_conj_i += [val]
703684
except IndexError:
704685
pass
705-
686+
706687
contiguous_conj_i = group_contiguous_integers(conj_index)
707-
708-
delete_i = []
688+
689+
delete_i = []
709690
for i in contiguous_conj_i:
710691
if type(i) == tuple:
711692
new_piece = " ".join(pieces[ i[0] : i[1]+1] )
@@ -717,7 +698,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
717698
pieces[i] = new_piece
718699
#add newly joined conjunctions to constants to be found later
719700
self.C.conjunctions.add(new_piece)
720-
701+
721702
for i in reversed(delete_i):
722703
# delete pieces in reverse order or the index changes on each delete
723704
del pieces[i]
@@ -728,15 +709,15 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
728709

729710
# refresh conjunction index locations
730711
conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]
731-
712+
732713
for i in conj_index:
733714
if len(pieces[i]) == 1 and total_length < 4:
734715
# if there are only 3 total parts (minus known titles, suffixes
735716
# and prefixes) and this conjunction is a single letter, prefer
736717
# treating it as an initial rather than a conjunction.
737718
# http://code.google.com/p/python-nameparser/issues/detail?id=11
738719
continue
739-
720+
740721
if i is 0:
741722
new_piece = " ".join(pieces[i:i+2])
742723
if self.is_title(pieces[i+1]):
@@ -748,8 +729,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
748729
for j,val in enumerate(conj_index):
749730
if val > i:
750731
conj_index[j]=val-1
751-
752-
else:
732+
733+
else:
753734
new_piece = " ".join(pieces[i-1:i+2])
754735
if self.is_title(pieces[i-1]):
755736
# when joining to a title, make new_piece a title too
@@ -767,23 +748,51 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
767748
for j,val in enumerate(conj_index):
768749
if val > i:
769750
conj_index[j] = val - rm_count
770-
771-
772-
# join prefixes to following lastnames: ['de la Vega'], ['van Buren']
751+
752+
753+
# join prefixes to following lastnames: ['de la Vega'], ['van Buren III']
773754
prefixes = list(filter(self.is_prefix, pieces))
774755
if prefixes:
775-
i = pieces.index(prefixes[0])
776-
# join everything after the prefix until the next non prefix
777-
# store joined pieces in prefix_joins. When a prefix occurs in a last name,
778-
# I think it means the rest of the name is part of the last name, so prefix_joins
779-
# lets us do that in the parser flow.
780-
non_suffixes = list(filter(lambda x: not self.is_prefix(x), pieces[i:]))
781-
if non_suffixes:
782-
j = pieces.index(non_suffixes[0])
783-
new_piece = ' '.join(pieces[i:j + 1])
784-
self.prefix_joins += [new_piece]
785-
pieces = pieces[:i] + [new_piece] + pieces[j + 1:]
786-
756+
for prefix in prefixes:
757+
try:
758+
i = pieces.index(prefix)
759+
except ValueError:
760+
# If the prefix is no longer in pieces, it's because it has been
761+
# combined with the prefix that appears right before (or before that when
762+
# chained together) in the last loop, so the index of that newly created
763+
# piece is the same as in the last loop, i==i still, and we want to join
764+
# it to the next piece.
765+
pass
766+
767+
new_piece = ''
768+
769+
# join everything after the prefix until the next non prefix
770+
# store joined pieces in prefix_joins. When a prefix occurs in a last name,
771+
# I think it means the rest of the name is part of the last name, so prefix_joins
772+
# lets us do that in the parser flow.
773+
# for prefix in prefixes:
774+
775+
try:
776+
next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
777+
j = pieces.index(next_prefix)
778+
if j == i + 1:
779+
# if there are two prefixes in sequence, join to the following piece
780+
j += 1
781+
new_piece = ' '.join(pieces[i:j])
782+
pieces = pieces[:i] + [new_piece] + pieces[j:]
783+
except StopIteration:
784+
try:
785+
# if there are no more prefixes, look for a suffix to stop at
786+
stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
787+
j = pieces.index(stop_at)
788+
new_piece = ' '.join(pieces[i:j])
789+
pieces = pieces[:i] + [new_piece] + pieces[j:]
790+
except StopIteration:
791+
# if there were no suffixes, nothing to stop at so join all
792+
# remaining pieces
793+
new_piece = ' '.join(pieces[i:])
794+
pieces = pieces[:i] + [new_piece]
795+
787796
log.debug("pieces: {0}".format(pieces))
788797
return pieces
789798

tests.py

+36-12
Original file line numberDiff line numberDiff line change
@@ -1247,18 +1247,6 @@ def test_name_is_conjunctions(self):
12471247
hn = HumanName("e and e")
12481248
self.m(hn.first, "e and e", hn)
12491249

1250-
def test_portuguese_dos(self):
1251-
hn = HumanName("Rafael Sousa dos Anjos")
1252-
self.m(hn.first, "Rafael", hn)
1253-
self.m(hn.middle, "Sousa", hn)
1254-
self.m(hn.last, "dos Anjos", hn)
1255-
1256-
def test_portuguese_prefixes(self):
1257-
hn = HumanName("Joao da Silva do Amaral de Souza")
1258-
self.m(hn.first, "Joao", hn)
1259-
self.m(hn.middle, "", hn)
1260-
self.m(hn.last, "da Silva do Amaral de Souza", hn)
1261-
12621250

12631251
class ConstantsCustomization(HumanNameTestBase):
12641252

@@ -1518,6 +1506,42 @@ def test_title_two_part_last_name_with_suffix_in_first_part(self):
15181506
self.m(hn.last, "von bergen wessels", hn)
15191507
self.m(hn.suffix, "MD, III", hn)
15201508

1509+
def test_portuguese_dos(self):
1510+
hn = HumanName("Rafael Sousa dos Anjos")
1511+
self.m(hn.first, "Rafael", hn)
1512+
self.m(hn.middle, "Sousa", hn)
1513+
self.m(hn.last, "dos Anjos", hn)
1514+
1515+
def test_portuguese_prefixes(self):
1516+
hn = HumanName("Joao da Silva do Amaral de Souza")
1517+
self.m(hn.first, "Joao", hn)
1518+
self.m(hn.middle, "da Silva do Amaral", hn)
1519+
self.m(hn.last, "de Souza", hn)
1520+
1521+
def test_three_conjunctions(self):
1522+
hn = HumanName("Dr. Juan Q. Xavier de la dos Vega III")
1523+
self.m(hn.first, "Juan", hn)
1524+
self.m(hn.last, "de la dos Vega", hn)
1525+
self.m(hn.title, "Dr.", hn)
1526+
self.m(hn.middle, "Q. Xavier", hn)
1527+
self.m(hn.suffix, "III", hn)
1528+
1529+
def test_lastname_three_conjunctions(self):
1530+
hn = HumanName("de la dos Vega, Dr. Juan Q. Xavier III")
1531+
self.m(hn.first, "Juan", hn)
1532+
self.m(hn.last, "de la dos Vega", hn)
1533+
self.m(hn.title, "Dr.", hn)
1534+
self.m(hn.middle, "Q. Xavier", hn)
1535+
self.m(hn.suffix, "III", hn)
1536+
1537+
def test_comma_three_conjunctions(self):
1538+
hn = HumanName("Dr. Juan Q. Xavier de la dos Vega, III")
1539+
self.m(hn.first, "Juan", hn)
1540+
self.m(hn.last, "de la dos Vega", hn)
1541+
self.m(hn.title, "Dr.", hn)
1542+
self.m(hn.middle, "Q. Xavier", hn)
1543+
self.m(hn.suffix, "III", hn)
1544+
15211545

15221546
class SuffixesTestCase(HumanNameTestBase):
15231547

0 commit comments

Comments
 (0)