Source code for en_us_normalization.production.classify.address

"""
Copyright Balacoon 2022

tokenizes and classifies addresses
"""

import pynini
from en_us_normalization.production.english_utils import get_data_file_path
from pynini.lib import pynutil

from learn_to_normalize.grammar_utils.base_fst import BaseFst
from learn_to_normalize.grammar_utils.data_loader import load_mapping
from learn_to_normalize.grammar_utils.shortcuts import ALPHA, DIGIT, LOWER, TO_LOWER, TO_UPPER, UPPER


class AddressFst(BaseFst):
    """
    Finite state transducer for classifying address.

    Address consists of multiple slots, most of which are optional.
    Those slots are:

    1. house number - mandatory
    2. street, consisting of name, type (road, street, square, etc),
       pre- or post- directional (N for north) - mandatory
    3. suite - apartment or house number, consists of type and number
       (for ex. Apt #23) - optional
    4. town - possibly multi-word town (for ex. San-Francisco) - optional
    5. state - usually abbreviated state (for ex. CA) - optional
    6. zip-code - 5-digit number with optional dash-separated 4-digits
       extension (for ex. 45149-3214). Another option for zip-code is
       british format zip code, such as "SW1W 0NY". That one consists of
       outcode and incode separated by space - optional

    Examples of addresses and their parsing:

    - 1599 Curabitur Rd. Bandera South Dakota 45149 ->
      address { house: "1599" street_name: "Curabitur" street_type: "road"
      town: "Bandera" state: "South Dakota" zip: "45149"}
    - 123 N Malanyuka St. SE, Apt #23 San-Francisco CA 45149-3214 ->
      address { house: "123" pre_directional: "north"
      street_name: "Malanyuka" street_type: "street"
      post_directional: "south east" suite_type: "apartment"
      suite_number: "23" town: "San Francisco" state: "california"
      zip: "451493214"}
    """

    def __init__(self):
        """
        Builds the address grammar from the per-slot transducers and stores
        the optimized result in ``self._single_fst``.
        """
        super().__init__(name="address")

        # every slot lives in its own helper to keep the constructor readable
        house_number = self._get_house_number_fst()
        street = self._get_street_fst()
        suite = self._get_suite_fst()
        town = self._get_town_fst()
        state = self._get_state_fst()
        zipcode = self._get_zipcode_fst()

        # combine house number and street
        house_street = house_number + pynini.accep(" ") + street

        # combine suite, house number and street.
        # suite can go before house and street
        suite_house_street = suite + pynini.accep(" ") + house_street
        # or after, in which case commas following the street are dropped
        suite_house_street |= (
            house_street
            + pynini.closure(pynutil.delete(","))
            + pynini.accep(" ")
            + suite
        )
        # or it can be absent
        suite_house_street |= house_street

        # town, state and zip are optional, each introduced by a single space
        address = (
            suite_house_street
            + pynini.closure(pynini.accep(" ") + town, 0, 1)
            + pynini.closure(pynini.accep(" ") + state, 0, 1)
            + pynini.closure(pynini.accep(" ") + zipcode, 0, 1)
        )

        final_graph = self.add_tokens(address)
        self._single_fst = final_graph.optimize()

    @staticmethod
    def _get_house_number_fst():
        """
        house number - just digits with optional dash, for ex. 123, 928-3313.
        The dash is removed from the output.
        """
        # 1 to 5 digits
        house_number = pynini.closure(DIGIT, 1, 5)
        # or 1 to 3 digits, a dash (dropped) and 1 to 4 digits
        house_number |= (
            pynini.closure(DIGIT, 1, 3)
            + pynini.cross("-", "")
            + pynini.closure(DIGIT, 1, 4)
        )
        return pynutil.insert('house: "') + house_number + pynutil.insert('"')

    @staticmethod
    def _get_town_fst():
        """
        town is just letters. It can be multi-word, contain a dash and end
        with a comma.
        """
        word = pynini.closure(ALPHA, 1)
        # there can be a compositional town with dash, for ex. Wilkes-Barre;
        # the dash is replaced with a space
        word = word + pynini.closure(pynini.cross("-", " ") + word)
        # there can be up to two more words in town name (each extra word is
        # weighted), town can optionally have a comma at the end, which is
        # dropped
        town = (
            word
            + pynini.closure(pynini.accep(" ") + pynutil.add_weight(word, 2.1), 0, 2)
            + pynini.closure(pynutil.delete(","), 0, 1)
        )
        # a lower-case first letter is allowed but carries a large weight,
        # so a town starting from a capital letter is preferred
        town = (pynutil.add_weight(LOWER, 100) | UPPER) + town
        return pynutil.add_weight(
            pynutil.insert('town: "') + town + pynutil.insert('"'), 1.1
        )

    @staticmethod
    def _get_state_fst():
        """
        state just comes from predefined mapping file, for ex.: CA, california
        """
        state = load_mapping(
            get_data_file_path("address", "state.tsv"),
            key_case_agnostic=False,
            key_with_dot=False,
            val_case_agnostic=True,
        )
        return pynutil.insert('state: "') + state + pynutil.insert('"')

    @staticmethod
    def _get_zipcode_fst():
        """
        zip-code - either 5 digits with an optional 4-digit extension,
        or a british-format zip code such as "SW1W 0NY".
        """
        # regular zip is 5 digits
        zipcode = pynini.closure(DIGIT, 5, 5)
        # zip can have one more 4-digit extension; the dash is dropped
        zipcode = zipcode + pynini.closure(
            pynutil.delete("-") + pynini.closure(DIGIT, 4, 4), 0, 1
        )

        # british zip can have letters
        british_zip_outcode = (
            pynini.closure(UPPER, 1, 2)
            + pynini.closure(DIGIT, 1, 2)
            + pynini.closure(UPPER, 0, 1)
        )
        british_zip_incode = DIGIT + UPPER + UPPER
        # spaces between outcode and incode are dropped
        british_zip = (
            british_zip_outcode
            + pynini.closure(pynutil.delete(" "))
            + british_zip_incode
        )
        zipcode |= british_zip

        return pynutil.insert('zip: "') + zipcode + pynutil.insert('"')

    @staticmethod
    def _get_street_fst():
        """
        street consists of name, type, pre/post direction
        """
        # street name is a sequence of letters, for ex. Aqua
        word = pynini.closure(ALPHA, 1)
        # there can be a dot at the end of street, for ex. "Neque."
        word = word + pynini.closure(pynutil.delete("."))
        # there can be a compositional street name with dash,
        # the dash is replaced with a space
        word = word + pynini.closure(pynini.cross("-", " ") + word)
        # there can be street name from multiple words (up to two extra,
        # each of them weighted)
        street_name = word + pynini.closure(
            pynini.accep(" ") + pynutil.add_weight(word, 2.1), 0, 2
        )
        # having street name starting from capital letter is much more
        # probable, so a lower-case first letter gets a large weight
        street_name = (pynutil.add_weight(LOWER, 100) | UPPER) + street_name
        street_name = (
            pynutil.insert('street_name: "') + street_name + pynutil.insert('"')
        )

        # Rd or rd. or Road, load mapping from a file
        street_type = load_mapping(
            get_data_file_path("address", "street_type.tsv"),
            key_case_agnostic=True,
            key_with_dot=True,
            val_case_agnostic=True,
        )
        street_type = (
            pynutil.insert('street_type: "') + street_type + pynutil.insert('"')
        )

        # N or north
        directional = load_mapping(
            get_data_file_path("address", "directional.tsv"),
            key_case_agnostic=True,
            key_with_dot=True,
            val_case_agnostic=True,
        )
        # pre-directional is followed by a space, post-directional is
        # preceded by one
        pre_directional = (
            pynutil.insert('pre_directional: "')
            + directional
            + pynutil.insert('"')
            + pynini.accep(" ")
        )
        post_directional = (
            pynini.accep(" ")
            + pynutil.insert('post_directional: "')
            + directional
            + pynutil.insert('"')
        )

        # street is combination of street name and street type,
        # optionally wrapped into directionals
        street = street_name + pynini.accep(" ") + street_type
        street = (
            pynini.closure(pre_directional, 0, 1)
            + street
            + pynini.closure(post_directional, 0, 1)
        )
        return street

    @staticmethod
    def _get_suite_fst():
        """
        suite - some extra information that goes after house number and street.
        It could be apartment number or PO Box.
        suite consists of suite type and suite number.
        """
        # P.O. Box - special mark for just a post box, not actual address.
        # The union of letter cases is parenthesized on purpose: "+" binds
        # tighter than "|", so without the parentheses the optional dots
        # would only be accepted after an upper-case letter ("P.O." but
        # not "p.o.").
        dots = pynini.closure(pynutil.delete("."))
        p = (pynini.cross("p", "P") | pynini.accep("P")) + dots
        o = (pynini.cross("o", "O") | pynini.accep("O")) + dots
        # "box" in any letter case, lower-cased on the output
        box = pynini.closure(LOWER | TO_LOWER, 3, 3) @ pynini.accep("box")
        po_box = p + pynini.closure(pynutil.delete(" ")) + o + pynini.accep(" ") + box

        # STE or APT - load possible suite types from a file
        suite_type = load_mapping(
            get_data_file_path("address", "suite_type.tsv"),
            key_case_agnostic=True,
            key_with_dot=True,
            val_case_agnostic=True,
        )
        suite_type |= po_box
        suite_type = pynutil.insert('suite_type: "') + suite_type + pynutil.insert('"')

        # optional prefix of suite number "#", possibly followed by a space;
        # both are dropped
        suite_number_prefix = pynini.closure(
            pynutil.delete("#") + pynini.closure(pynutil.delete(" "), 0, 1), 0, 1
        )
        suite_number_digit = pynini.closure(DIGIT, 1)
        suite_number_alpha = UPPER | TO_UPPER
        # suite number can be digits, a single letter or digits followed by
        # a letter
        suite_number = (
            suite_number_digit
            | suite_number_alpha
            | (suite_number_digit + suite_number_alpha)
        )
        suite_number = suite_number_prefix + suite_number
        suite_number = (
            pynutil.insert('suite_number: "') + suite_number + pynutil.insert('"')
        )

        # combine suite type and suite number, for ex.: Ap #32
        suite = suite_type + pynini.accep(" ") + suite_number
        # suite can just contain number, but that path is weighted
        suite |= pynutil.add_weight(suite_number, 2.1)
        # it can be in reverse order: first number then type
        suite |= suite_number + pynini.accep(" ") + suite_type
        return suite