Source code for en_us_normalization.production.classify.electronic

Copyright 2022 Balacoon
Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
Copyright 2015 and onwards Google, Inc.

tokenize and classify electronic addresses

import pynini
from en_us_normalization.production.english_utils import get_data_file_path
from pynini.lib import pynutil

from learn_to_normalize.grammar_utils.base_fst import BaseFst
from learn_to_normalize.grammar_utils.data_loader import load_union
from learn_to_normalize.grammar_utils.shortcuts import ALPHA, CHAR, DIGIT, NOT_SPACE, TO_UPPER, NOT_PUNCT

class ElectronicFst(BaseFst):
    """
    Finite state transducer for classifying electronic addresses, such as
    URLs, email addresses, etc.

    Electronic semiotic class can contain multiple optional fields:

    - protocol - in front of urls there can be a protocol,
      such as "http://" or "mailto://"
    - username - before domain, there might be a user name, separated with "@".
      Most common case - username in email address (illustrative input:
      "user@example.com"). Within username there might be a password
      separated with a colon symbol (illustrative input:
      "user:secret@example.com")
    - domain - smth that goes after protocol and username. can represent the
      whole URL, the only non-optional field in electronic address. domain may
      have optional prefix, such as "www." and mandatory suffix that helps to
      identify domain. Suffixes are 2-3 letters long (such as "com" or "io"),
      can be repeated (illustrative input: "example.co.uk") and are separated
      with a dot. Usually suffixes should be spelled in pronunciation, but in
      some cases, they should be pronounced as a regular word (for ex. "com").
    - port - optional digits after domain separated with colon symbol
      (illustrative input: "example.com:8080")
    - path - something that follows domain after a slash "/".
      Could be any symbols.

    The fields are emitted in the following order, each one only if present
    in the input (values shown schematically)::

        protocol: "..." username: "..." password: "..." domain: "..." port: "..." path: "/..."

    NOTE(review): the resulting string is passed through ``self.add_tokens``,
    which is inherited from ``BaseFst``; presumably it wraps the fields into
    ``electronic { ... }`` - confirm against ``BaseFst``.
    """

    def __init__(self) -> None:
        """
        Compose the classification grammar from protocol, username, domain,
        port and path sub-grammars and store it in ``self._single_fst``.
        """
        super().__init__(name="electronic")
        protocol = self._get_protocol_fst()
        username = self._get_username_fst()
        domain = self._get_domain_fst()

        # options for domain:
        # 1. protocol + user + domain << should be more probable
        # 2. user + domain << should be less probable
        # 3. protocol + domain
        # 4. domain
        # NOTE(review): presumably a lower weight means a preferred path,
        # which is what makes option 1 (0.9) win over option 2 (2.0) - confirm.
        protocol_username_domain = (
            pynutil.add_weight(protocol + username + domain, 0.9)
            | (protocol + domain)
            | pynutil.add_weight(username + domain, 2.0)
            | domain
        )

        # port is optional after domain: one or more digits following ":"
        port = pynini.closure(DIGIT, 1)
        port = pynini.cross(":", ' port: "') + port + pynutil.insert('"')
        protocol_username_domain_port = protocol_username_domain + pynini.closure(
            port, 0, 1
        )

        # after domain there could be optional path.
        # should escape quote (and backslash) so the tagged output is parsed
        # properly; plain pass-through of a character carries extra weight.
        non_space_char = (
            pynutil.add_weight(NOT_SPACE, 1.01)
            | pynini.cross('"', '\\"')
            | pynini.cross('\\', '\\\\\\')
        )
        # should give up punctuation at the end to go to "right_punct"
        path = (
            pynini.closure(pynutil.add_weight(non_space_char, 1.01))
            + pynini.closure(NOT_PUNCT)
        )
        path = (
            pynutil.insert(' path: "') + pynini.accep("/") + path + pynutil.insert('"')
        )
        protocol_username_domain_port_path = (
            protocol_username_domain_port + pynini.closure(path, 0, 1)
        )

        final_graph = self.add_tokens(protocol_username_domain_port_path)
        self._single_fst = final_graph.optimize()

    @staticmethod
    def _get_protocol_fst() -> pynini.FstLike:
        """
        helper function that composes fst that accepts electronic protocols,
        such as "http://"

        Returns
        -------
        pynini.FstLike
            transducer that consumes a protocol name, the ":" after it and any
            number of "/" symbols, and emits ``protocol: "<name>" `` (with a
            trailing space).
        """
        # protocols read from "protocols.tsv" should be spelled out.
        # they are converted to upper case to signalize that
        with open(
            get_data_file_path("electronic", "protocols.tsv"), "r", encoding="utf-8"
        ) as fp:
            protocol_names = [line.strip() for line in fp]
        # blank lines are skipped: an empty name would compile into an acceptor
        # of the empty string and silently make the protocol name optional
        protocols = [
            pynini.accep(name) @ pynini.closure(TO_UPPER)
            for name in protocol_names
            if name
        ]
        protocols = pynini.union(*protocols)

        # protocols from "spoken_protocols.tsv" are added via load_union as is
        spoken_protocols = load_union(
            get_data_file_path("electronic", "spoken_protocols.tsv")
        )
        protocols |= spoken_protocols

        # slashes after the colon are optional and are dropped from the output
        remove_optional_slashes = pynutil.delete(pynini.closure("/"))
        protocols = (
            pynutil.insert('protocol: "')
            + protocols
            + pynini.cross(":", '" ')
            + remove_optional_slashes
        )
        return protocols

    @staticmethod
    def _get_username_fst() -> pynini.FstLike:
        """
        helper function that composes fst to accept username
        in electronic addresses. optional field after protocol
        and before domain separated with "@"

        Returns
        -------
        pynini.FstLike
            transducer that consumes ``<username>[:<password>]@`` and emits
            ``username: "<username>"``, optionally followed by
            `` password: "<password>"``, and a trailing space in place of "@".
        """
        alpha_or_digit = ALPHA | DIGIT
        accepted_symbols = load_union(get_data_file_path("symbols.tsv"), column=0)
        # slash, backslash and colon are not allowed inside the username
        except_symbols = "/\\:"
        except_symbols = [pynini.accep(x) for x in except_symbols]
        except_symbols = pynini.union(*except_symbols)
        accepted_symbols = pynini.difference(
            accepted_symbols, except_symbols
        ).optimize()

        # alpha characters, digits and limited set of symbols can appear in name;
        # the first character has to be a letter or a digit
        username = alpha_or_digit + pynini.closure(alpha_or_digit | accepted_symbols)
        username = pynutil.insert('username: "') + username + pynutil.insert('"')

        # password is optional part of username
        password = (
            pynini.cross(":", ' password: "')
            + pynini.closure(CHAR, 1)
            + pynutil.insert('"')
        )
        password = pynini.closure(password, 0, 1)

        username = username + password + pynini.cross("@", " ")
        return username

    @staticmethod
    def _get_domain_fst() -> pynini.FstLike:
        """
        helper function that composes fst to accept domains.
        domain can be recognized by suffix, such as "edu" or "com".

        Returns
        -------
        pynini.FstLike
            transducer that consumes ``[www.]<name>(.<suffix>)+`` and emits it
            wrapped as ``domain: "..."``.
        """
        alpha_or_digit = ALPHA | DIGIT
        # domain name [a-z][0-9]\-
        domain = alpha_or_digit + pynini.closure(alpha_or_digit | pynini.accep("-"))

        # domain prefix, i.e. www.
        # absence of the prefix is allowed too, with an extra weight
        domain_prefix = pynini.cross("www.", "WWW.") | pynutil.add_weight(
            pynini.accep(""), 1.1
        )

        # domain suffix has at least two letters, e.g. "eu".
        # generic 2-3 letter suffixes are upper-cased and carry extra weight,
        # suffixes from "spoken_domains.tsv" are added without it
        domain_suffix = pynutil.add_weight(
            pynini.closure(ALPHA, 2, 3) @ pynini.closure(TO_UPPER), 1.1
        )
        domain_suffix |= load_union(
            get_data_file_path("electronic", "spoken_domains.tsv")
        )
        # at least one dot-separated suffix is mandatory, more can follow
        domain_suffix = pynini.closure("." + domain_suffix, 1)

        domain = domain_prefix + domain + domain_suffix
        domain = pynutil.insert('domain: "') + domain + pynutil.insert('"')
        return domain