Skip to content

Improve GB postcode regex #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 34 additions & 23 deletions pyap/source_GB/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,32 +353,43 @@

# Regular expression source matching UK-style postcodes. The whole match is
# exposed through the named group ``postal_code``.
#
# Alternatives, in the order they are tried:
#   1. the Girobank postcode (GIR 0AA),
#   2. British Overseas Territories written as "XXXX 1ZZ",
#   3. British Overseas Territories written in a zip-code layout,
#   4. British Forces Post Office numbers (BFPO nnnn),
#   5. mainland postcodes: outward code, optional spaces, inward code.
#
# The pattern carries inline "#" comments and layout whitespace, i.e. it is
# written for re.VERBOSE. In that mode a bare space is discarded, so every
# literal space below is spelled "\ " (or sits inside a character class).
postal_code = r"""
    (?P<postal_code>
        (?:
            # Girobank postcode
            (?:[gG][iI][rR]\ {0,}0[aA]{2})|
            (?:  # British Overseas Territories in usual format
                (?:
                    [aA][sS][cC][nN]|
                    [sS][tT][hH][lL]|
                    [tT][dD][cC][uU]|
                    [bB][bB][nN][dD]|
                    [bB][iI][qQ][qQ]|
                    [fF][iI][qQ][qQ]|
                    [pP][cC][rR][nN]|
                    [sS][iI][qQ][qQ]|
                    [tT][kK][cC][aA]
                )
                \ {0,}1[zZ]{2}
            )|
            (?:  # British Overseas Territories in zip-code format
                (KY[0-9]|MSR|VG|AI)[ -]{0,}[0-9]{4}
            )|
            # (?:  # Bermuda: including this causes too many false positives,
            #      # so it is excluded for now
            #     [a-zA-Z]{2}\ {0,}[0-9]{2}
            # )|
            (?:  # British Forces Post Office
                [Bb][Ff][Pp][Oo]\ {0,}[0-9]{1,4}
            )|
            (?:  # Mainland British postcodes: outward code ...
                (?:[Ww][Cc][0-9][abehmnprvwxyABEHMNPRVWXY])|
                (?:[Ee][Cc][1-4][abehmnprvwxyABEHMNPRVWXY])|
                (?:[Nn][Ww]1[Ww])|
                (?:[Ss][Ee]1[Pp])|
                (?:[Ss][Ww]1[abehmnprvwxyABEHMNPRVWXY])|
                (?:[EeNnWw]1[a-hjkpstuwA-HJKPSTUW])|
                (?:[BbEeGgLlMmNnSsWw][0-9][0-9]?)|
                (?:[a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][0-9]?)
            )
            # ... followed by the inward code (digit + two letters)
            \ {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2}
        )
    )  # end postal_code
    """
Expand Down
31 changes: 30 additions & 1 deletion test_parser_gb.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

import re
import pytest
import pandas as pd
import pathlib
import zipfile
import requests
import itertools
import pyap
import pyap.parser
Expand Down Expand Up @@ -392,6 +396,31 @@ def test_postal_code(input, expected):
execute_matching_test(input, expected, data_gb.postal_code)


def test_postal_code_extensive():
    """Test the post code regex against a list of all post codes.

    The Code-Point Open archive is downloaded once and cached as
    ``code_point_uk_post_codes.zip`` next to this test module. Every value
    in the first column of each ``Data/CSV*.csv`` member must then match
    ``data_gb.postal_code``.
    """
    zip_location = pathlib.Path(__file__).parent / 'code_point_uk_post_codes.zip'
    # Use the full path everywhere: the bare file name would be resolved
    # against the current working directory, not the test directory.
    zip_path = str(zip_location)

    # Download an extensive list of all postcodes (skipped when cached)
    if not zip_location.exists():
        url = 'https://api.os.uk/downloads/v1/products/CodePointOpen/downloads?area=GB&format=CSV&redirect'
        response = requests.get(url, allow_redirects=True, timeout=60)
        # Fail loudly instead of caching an HTTP error body as the archive.
        response.raise_for_status()
        with open(zip_path, 'wb') as f:
            f.write(response.content)

    # Run the detector against this list to ensure we pick up all post codes
    with zipfile.ZipFile(zip_path) as archive:
        data_file_names = [
            name for name in archive.namelist()
            if name.lower().endswith('.csv') and name.startswith('Data/CSV')
        ]
        for data_file_name in data_file_names:
            with archive.open(data_file_name) as data_file:
                df = pd.read_csv(data_file, header=None)
            # Column 0 of each CSV holds the post code itself.
            for post_code in df[0].tolist():
                execute_matching_test(post_code, True, data_gb.postal_code)


@pytest.mark.parametrize("input,expected", [
# positive assertions
("Montana", True),
Expand Down Expand Up @@ -432,7 +461,7 @@ def test_country(input, expected):
# positive assertions
("11-59 High Road, East Finchley London, N2 8AW", True),
("88 White parkway, Stanleyton, L2 3DB", True),
("Studio 96D, Graham roads, Westtown, L1A 3GP, Great Britain", True),
("Studio 96D, Graham roads, Westtown, L1 3GP, Great Britain", True),
("01 Brett mall, Lake Donna, W02 3JQ", True),
("Flat 05, Byrne shores, Howardshire, GL6 8EA, UK", True),
("12 Henry route, Clementsborough, W2 5DQ", True),
Expand Down
13 changes: 8 additions & 5 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
envlist = py27, py38

[testenv]
commands = py.test \
test_parser.py \
test_parser_ca.py \
test_parser_us.py \
test_parser_gb.py
commands =
py.test \
test_parser.py \
test_parser_ca.py \
test_parser_us.py \
test_parser_gb.py
deps =
pytest
pandas
requests