Source code for en_us_normalization.production.classify.shortening

Copyright 2022 Balacoon
Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
Copyright 2015 and onwards Google, Inc.

Expand shortenings (e.g. "Mrs.", "prof.") into their full written-out forms.

import pynini
from en_us_normalization.production.english_utils import get_data_file_path
from pynini.lib import pynutil

from learn_to_normalize.grammar_utils.base_fst import BaseFst
from learn_to_normalize.grammar_utils.data_loader import load_mapping
from learn_to_normalize.grammar_utils.shortcuts import LOWER, TO_LOWER

class ShorteningFst(BaseFst):
    """
    Finite state transducer for discovering shortenings, such as Mrs. or prof.
    All shortenings and their mappings are stored in:

    - shortenings/case_agnostic.tsv - shortenings that should be expanded
      for any case
    - shortenings/cased.tsv - shortenings that require precise writing
      as in the data file to be expanded

    On top of the data files, a context-dependent rule disambiguates "st"
    (optionally followed by a dot): it becomes "street" when it follows a word
    and "saint" when it precedes one. The neighbouring word and the separating
    space are part of the match and are kept in the output.

    Shortenings are expanded immediately, so no need to separate verbalization
    or dedicated semiotic class.

    Examples of input/output strings:

    - mrs. -> name: "misses"
    """

    def __init__(self) -> None:
        """Build the shortening grammar and store it as ``self._single_fst``."""
        super().__init__(name="shortening")

        # some custom shortenings that require context
        # 1. street vs saint. partially resolved when full address is provided.
        delete_optional_dot = pynini.closure(pynutil.delete("."), 0, 1)
        st = (
            pynini.accep("st") | pynini.accep("ST") | pynini.accep("St")
        ) + delete_optional_dot

        # Neighbouring word that provides the context: one symbol accepted by
        # TO_LOWER followed by at least one LOWER symbol.
        # NOTE(review): presumably TO_LOWER lower-cases a single letter and
        # LOWER accepts a lowercase letter - confirm in grammar_utils.shortcuts.
        word = TO_LOWER + pynini.closure(LOWER, 1)
        space = pynini.accep(" ")

        # "<word> st" -> "<word> street"
        st_street = pynini.cross(st, "street")
        graph = word + space + st_street

        # "st <word>" -> "saint <word>"
        st_saint = pynini.cross(st, "saint")
        graph |= st_saint + space + word

        # Shortenings loaded from the data files.
        # NOTE(review): key_with_dot=True presumably lets the key be matched
        # with a trailing dot as well - confirm against load_mapping.
        graph |= load_mapping(
            get_data_file_path("shortenings", "case_agnostic.tsv"),
            key_case_agnostic=True,
            key_with_dot=True,
        )
        graph |= load_mapping(
            get_data_file_path("shortenings", "cased.tsv"),
            key_case_agnostic=False,
        )

        # Wrap the expansion into the `name: "..."` field of the output.
        graph = pynutil.insert('name: "') + graph + pynutil.insert('"')
        self._single_fst = graph.optimize()