"""
Copyright 2022 Balacoon
base class for all the grammars
"""
from typing import Union, List
import pynini
from pynini.lib import pynutil
from learn_to_normalize.grammar_utils.shortcuts import wrap_token, delete_space, insert_space
class BaseFst:
    """
    Base class for text normalization rules. Wrapper around
    pynini FST, implements some common functions used in
    tokenization / verbalization.

    BaseFst implements a logic of connecting a transducer to itself, for ex. when it is
    allowed to connect a semiotic class to itself. It is expected that implementations of
    BaseFst would first define ``self._single_fst`` and then can call
    :func:`.connect_to_self` multiple times.
    At usage (when merging all transducers together), one just refers to ``fst`` which
    returns multi or single fst depending on what's available.
    When reusing fst in other semiotic classes you probably want to access
    ``single_fst`` though.
    """

    def __init__(self, name: str):
        self._name = name  # grammar name, used to tag tokens at classification
        self._single_fst = None  # transducer that matches a single token
        self._multi_fst = None  # transducer that additionally matches connected tokens

    @property
    def fst(self) -> pynini.FstLike:
        """
        Main transducer of the grammar: the multi-token fst when
        :func:`.connect_to_self` was applied, otherwise the single-token fst.
        """
        if self._multi_fst is not None:
            return self._multi_fst
        # NOTE: original code referenced non-existent `self.name` here, which raised
        # AttributeError instead of the intended AssertionError message.
        assert self._single_fst is not None, "both single- and multi-token fsts are None for {}".format(self._name)
        return self._single_fst

    @property
    def single_fst(self) -> pynini.FstLike:
        """Single-token transducer, for reuse inside other semiotic classes."""
        return self._single_fst

    def add_tokens(self, fst: pynini.FstLike) -> pynini.FstLike:
        """
        Wraps fst into curly brackets and prepends with name of grammar.
        Used in tokenization/classification.

        Parameters
        ----------
        fst: pynini.FstLike
            fst to wrap

        Returns
        -------
        fst: pynini.FstLike
            fst wrapped with grammar name
        """
        return pynutil.insert("{} {{ ".format(self._name)) + fst + pynutil.insert(" }")

    def delete_tokens(self, fst: pynini.FstLike) -> pynini.FstLike:
        """
        Removes grammar name from string passed for verbalization.

        Parameters
        ----------
        fst: pynini.FstLike
            fst to remove grammar name from

        Returns
        -------
        fst: pynini.FstLike
            fst without grammar name and the vertical-bar separator that follows it
        """
        return pynutil.delete("{}|".format(self._name)) + fst

    def connect_to_self(self, connector_in: Union[str, List[str]], connector_out: Union[str, List[str]],
                        connector_spaces: str = "any", weight: float = 1.0, to_closure: bool = False,
                        to_closure_connector: bool = False):
        """
        Helper function which connects self.fst to itself through intermediate connector.
        Should be applied at final stage of creating classification transducer.
        For example, allows to connect cardinals with a dash, i.e. "28 - 40" which means range.
        It changes `self.fst` to `self.fst | (self.fst + connector + self.fst)`

        Parameters
        ----------
        connector_in: Union[str, List[str]]
            which connector tokens to look for. either single connector or multiple
        connector_out: Union[str, List[str]]
            what is the expansion of a connector. For example "-" in case of range is expanded to "to".
            If it is None, transducer just deletes strings from `connector_in`
        connector_spaces: str
            defines which spaces are allowed around connector
            `any` - means can be no or any number of spaces both from left and right of connector
            `none_or_one` - means there is no spaces around connector or one from each side, for ex. 1:2 or 1 : 2.
            `none` - there shouldn't be any spaces around connector
        weight: float
            weight to add to multi-token branch
        to_closure: bool
            if True, allows multiple repetitions of (connector + fst)
        to_closure_connector: bool
            if True, also closure connector, so multiple occurrences of same connector between tokens are allowed

        Raises
        ------
        RuntimeError
            if `connector_spaces` is not one of the supported configurations
        """
        # normalize connectors to parallel lists
        if isinstance(connector_in, str):
            connector_in = [connector_in]
        if connector_out is not None:
            if isinstance(connector_out, str):
                connector_out = [connector_out]
            assert len(connector_in) == len(connector_out), "Number of in/out connectors should be the same!"
        all_connectors = []
        if connector_out:
            # connector is expanded into a token of its own, i.e. `tokens { name: "to" }`
            for c_in, c_out in zip(connector_in, connector_out):
                connector = pynini.cross(c_in, c_out)
                connector = pynutil.insert('name: "') + connector + pynutil.insert('"')
                connector = wrap_token(connector)
                all_connectors.append(connector)
        else:
            # connector is simply removed from the input
            all_connectors = [pynutil.delete(x) for x in connector_in]
        final_connector = pynini.union(*all_connectors)
        if to_closure_connector:
            # allow the same connector to repeat between tokens
            closured_connector = final_connector
            if connector_out:
                closured_connector = insert_space + final_connector
            final_connector += pynini.closure(closured_connector)
        # define spaces and surround connector with spaces
        if connector_spaces == "any":
            # remove all spaces (no matter how many, including 0) and insert just one.
            space = delete_space + insert_space
        elif connector_spaces == "none_or_one":
            # either accept just one space or expect no spaces and insert one
            space = pynini.accep(" ") | insert_space
        elif connector_spaces == "none":
            # no spaces around connector expected
            space = insert_space
        else:
            raise RuntimeError("Unexpected configuration of spaces around connector: {}".format(connector_spaces))
        if connector_out:
            final_connector = space + final_connector + space
        else:
            final_connector = space + final_connector + delete_space
        # close previous token, emit connector, open the next token
        extra_fst = pynutil.insert(' }') + final_connector + pynutil.insert('tokens { ') + self.single_fst
        if to_closure:
            extra_fst = pynini.closure(extra_fst, 1)
        multi_fst = self.single_fst + extra_fst
        if weight != 1.0:
            multi_fst = pynutil.add_weight(multi_fst, weight)
        if self._multi_fst is not None:
            # accumulate alternative multi-token branches from repeated calls
            self._multi_fst |= multi_fst
        else:
            self._multi_fst = self._single_fst | multi_fst
        self._multi_fst.optimize()

    def apply(self, text: str) -> str:
        """
        Helper method to apply the grammar to input text.

        Parameters
        ----------
        text: str
            input string to apply transducer to

        Returns
        -------
        res: str
            transduced string. In case of tokenize/classify - returns
            string parsable into protobuf. In case of verbalization,
            converts the text into spoken form
        """
        lattice = text @ self.fst
        res = pynini.shortestpath(lattice, nshortest=1, unique=True).string()
        return res