Source code for learn_to_normalize.grammar_utils.base_fst

"""
Copyright 2022 Balacoon

Base class for all the grammars.
"""

from typing import Union, List

import pynini
from pynini.lib import pynutil

from learn_to_normalize.grammar_utils.shortcuts import wrap_token, delete_space, insert_space


class BaseFst:
    """
    Base class for text normalization rules.
    Wrapper around a pynini FST, implements some common functions used in
    tokenization / verbalization.

    BaseFst implements the logic of connecting a transducer to itself, for
    example when a semiotic class is allowed to follow itself. Implementations
    of BaseFst are expected to first define ``self._single_fst`` and can then
    call :func:`.connect_to_self` multiple times. At usage (when merging all
    transducers together), one simply refers to ``fst``, which returns the
    multi- or single-token fst depending on what is available. When reusing
    an fst in other semiotic classes, you probably want to access
    ``single_fst`` instead.
    """
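    # Typical subclass pattern (an illustrative sketch, not prescribed by this
    # file): a subclass builds its classification graph in its own __init__, e.g.
    #     self._single_fst = self.add_tokens(graph).optimize()
    # and may afterwards call self.connect_to_self(...) to let the class match
    # several connected tokens.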
    def __init__(self, name: str):
        self._name = name
        self._single_fst = None
        self._multi_fst = None
    @property
    def fst(self) -> pynini.FstLike:
        if self._multi_fst is not None:
            return self._multi_fst
        assert self._single_fst is not None, "both single- and multi-token fsts are None for {}".format(self._name)
        return self._single_fst

    @property
    def single_fst(self) -> pynini.FstLike:
        return self._single_fst
    def add_tokens(self, fst: pynini.FstLike) -> pynini.FstLike:
        """
        Wraps fst into curly brackets and prepends the name of the grammar.
        Used in tokenization/classification.

        Parameters
        ----------
        fst: pynini.FstLike
            fst to wrap

        Returns
        -------
        fst: pynini.FstLike
            fst wrapped with the grammar name
        """
        return pynutil.insert("{} {{ ".format(self._name)) + fst + pynutil.insert(" }")
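    # Illustration (hypothetical values): if `fst` maps "12" to 'integer: "12"'
    # and the grammar is named "cardinal", the returned fst maps "12" to
    # 'cardinal { integer: "12" }', ready to be embedded into the token list
    # produced by the tokenize/classify step.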
    def delete_tokens(self, fst: pynini.FstLike) -> pynini.FstLike:
        """
        Removes the grammar name from a string passed for verbalization.

        Parameters
        ----------
        fst: pynini.FstLike
            fst to remove the grammar name from

        Returns
        -------
        fst: pynini.FstLike
            fst without the grammar name and the trailing vertical bar
        """
        return pynutil.delete("{}|".format(self._name)) + fst
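    # Illustration (hypothetical values): during verbalization, a token of the
    # "cardinal" grammar arrives prefixed with 'cardinal|'; delete_tokens strips
    # that prefix so that `fst` only needs to verbalize the remaining payload.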
    def connect_to_self(
        self,
        connector_in: Union[str, List[str]],
        connector_out: Union[str, List[str]],
        connector_spaces: str = "any",
        weight: float = 1.0,
        to_closure: bool = False,
        to_closure_connector: bool = False,
    ):
        """
        Helper function which connects self.fst to itself through an
        intermediate connector. Should be applied at the final stage of
        creating a classification transducer. For example, it allows
        connecting cardinals with a dash, i.e. "28 - 40", which denotes a
        range. It changes `self.fst` to
        `self.fst | (self.fst + connector + self.fst)`.

        Parameters
        ----------
        connector_in: Union[str, List[str]]
            which connector tokens to look for, either a single connector or
            multiple
        connector_out: Union[str, List[str]]
            the expansion of a connector. For example, "-" in case of a range
            is expanded to "to". If it is None, the transducer just deletes
            the strings from `connector_in`
        connector_spaces: str
            defines which spaces are allowed around the connector:
            `any` - no spaces or any number of spaces, both to the left and
            to the right of the connector;
            `none_or_one` - either no spaces around the connector or one on
            each side, e.g. "1:2" or "1 : 2";
            `none` - there should not be any spaces around the connector
        weight: float
            weight to add to the multi-token branch
        to_closure: bool
            if True, allows multiple repetitions of (connector + fst)
        to_closure_connector: bool
            if True, also applies closure to the connector, so multiple
            occurrences of the same connector between tokens are allowed
        """
        if isinstance(connector_in, str):
            connector_in = [connector_in]
        if connector_out is not None:
            if isinstance(connector_out, str):
                connector_out = [connector_out]
            assert len(connector_in) == len(connector_out), "Number of in/out connectors should be the same!"
        all_connectors = []
        if connector_out:
            for c_in, c_out in zip(connector_in, connector_out):
                connector = pynini.cross(c_in, c_out)
                connector = pynutil.insert('name: "') + connector + pynutil.insert('"')
                connector = wrap_token(connector)
                all_connectors.append(connector)
        else:
            all_connectors = [pynutil.delete(x) for x in connector_in]
        final_connector = pynini.union(*all_connectors)
        if to_closure_connector:
            closured_connector = final_connector
            if connector_out:
                closured_connector = insert_space + final_connector
            final_connector += pynini.closure(closured_connector)
        # define spaces and surround the connector with them
        if connector_spaces == "any":
            # remove all spaces (no matter how many, including 0) and insert just one
            space = delete_space + insert_space
        elif connector_spaces == "none_or_one":
            # either accept just one space, or expect no spaces and insert one
            space = pynini.accep(" ") | insert_space
        elif connector_spaces == "none":
            # no spaces around the connector are expected
            space = insert_space
        else:
            raise RuntimeError("Unexpected configuration of spaces around connector: {}".format(connector_spaces))
        if connector_out:
            final_connector = space + final_connector + space
        else:
            final_connector = space + final_connector + delete_space
        extra_fst = pynutil.insert(' }') + final_connector + pynutil.insert('tokens { ') + self.single_fst
        if to_closure:
            extra_fst = pynini.closure(extra_fst, 1)
        multi_fst = self.single_fst + extra_fst
        if weight != 1.0:
            multi_fst = pynutil.add_weight(multi_fst, weight)
        if self._multi_fst is not None:
            self._multi_fst |= multi_fst
        else:
            self._multi_fst = self._single_fst | multi_fst
        self._multi_fst.optimize()
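    # Worked example (a sketch; assumes wrap_token from shortcuts wraps its
    # argument as 'tokens { ... }'): for a grammar named "cardinal" with
    # connector_in="-" and connector_out="to", the input "28 - 40" becomes
    #     cardinal { ... } } tokens { name: "to" } tokens { cardinal { ... }
    # The seemingly unbalanced braces are intentional: the surrounding
    # tokenize-and-classify pipeline wraps each match in 'tokens { ... }',
    # which balances the output into
    #     tokens { cardinal { ... } } tokens { name: "to" } tokens { cardinal { ... } }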
    def apply(self, text: str) -> str:
        """
        Helper method to apply the grammar to input text.

        Parameters
        ----------
        text: str
            input string to apply the transducer to

        Returns
        -------
        res: str
            transduced string. In case of tokenize/classify, returns a string
            parsable into protobuf. In case of verbalization, converts the
            text into its spoken form.
        """
        lattice = text @ self.fst
        res = pynini.shortestpath(lattice, nshortest=1, unique=True).string()
        return res
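
# Minimal end-to-end sketch (illustrative only, not part of the library):
# a toy subclass that classifies digit strings and is connected to itself
# over a dash. The grammar name "toy" and the field name "count" are made up
# for this example.
if __name__ == "__main__":

    class ToyDigitFst(BaseFst):
        def __init__(self):
            super().__init__(name="toy")
            digit = pynini.union(*(str(d) for d in range(10)))
            graph = pynutil.insert('count: "') + pynini.closure(digit, 1) + pynutil.insert('"')
            self._single_fst = self.add_tokens(graph).optimize()

    toy = ToyDigitFst()
    # allow inputs like "12 - 15"; the multi-token branch relies on the outer
    # pipeline to wrap each match in 'tokens { ... }'
    toy.connect_to_self(connector_in="-", connector_out="to")
    print(toy.apply("12"))  # expected: toy { count: "12" }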