"""
Copyright 2022 Balacoon
class that loads grammars from directory with
normalization rules.
directory should have specific structure.
"""
import importlib
import os
import subprocess
import sys
from typing import Tuple
import pynini
from pynini.export import grm
from learn_to_normalize.grammar_utils.base_fst import BaseFst
class GrammarLoader:
"""
    Loads normalization grammars from a directory with a specific structure
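
    The minimal layout expected by the loader (these paths are verified in
    ``__init__`` and the entry points are loaded by ``get_tokenizer`` /
    ``get_verbalizer``)::

        <grammars_dir>/
            classify/classify.py    # defines ClassifyFst
            verbalize/verbalize.py  # defines VerbalizeFst
            configs/
                tokenizer.ascii_proto
                verbalizer.ascii_proto
                verbalizer_serialization_spec.ascii_proto

    Examples
    --------
    A hypothetical end-to-end sketch; the paths are illustrative and the
    grammars dir must live inside a git checkout with the layout above:

    >>> loader = GrammarLoader("path/to/grammars")  # doctest: +SKIP
    >>> tokenizer_far = loader.get_tokenizer("/tmp/export")  # doctest: +SKIP
    >>> verbalizer_far = loader.get_verbalizer("/tmp/export")  # doctest: +SKIP
    >>> configs = loader.get_configs()  # doctest: +SKIP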
"""
NORMALIZATION_MODES = ["classify", "verbalize"]
CONFIGURATION_NAMES = [
"tokenizer.ascii_proto",
"verbalizer.ascii_proto",
"verbalizer_serialization_spec.ascii_proto",
]
    def __init__(self, grammars_dir: str):
        # all imports within the grammars repo are done relative to the parent
        # dir of the repo, so <grammar_repo>/../ should be added to PYTHONPATH
        # and the suffix stored to be used during grammar loading. First, the
        # <grammar_repo> directory should be identified
grammars_dir = os.path.abspath(grammars_dir)
# find the repo location
        # use subprocess with an argument list so paths with spaces work and
        # failures raise instead of silently returning an empty string
        repo_dir = subprocess.check_output(
            ["git", "rev-parse", "--show-toplevel"], cwd=grammars_dir, text=True
        ).strip()
        # add the directory one level above the repo to PYTHONPATH
        above_repo_dir = os.path.dirname(repo_dir)
        sys.path.append(above_repo_dir)
        # figure out the location of the rules inside the grammars repo
        self._module_prefix = os.path.relpath(grammars_dir, above_repo_dir).replace(
            os.sep, "."
        )
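        # e.g. (hypothetical paths): grammars_dir=/work/learn_to_normalize/grammars
        # gives repo_dir=/work/learn_to_normalize and above_repo_dir=/work, so the
        # module prefix is "learn_to_normalize.grammars" and "classify.cardinal" is
        # later imported as "learn_to_normalize.grammars.classify.cardinal"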
self._grammars_dir = grammars_dir
        # check that the minimal entry points for export are present
for mode in self.NORMALIZATION_MODES:
if not os.path.isfile(os.path.join(grammars_dir, mode, mode + ".py")):
raise RuntimeError(
"Grammars directory is missing entry point: {}/{}.py".format(
mode, mode
)
)
for config_name in self.CONFIGURATION_NAMES:
config_path = os.path.join(grammars_dir, "configs", config_name)
if not os.path.isfile(config_path):
raise RuntimeError(
"Grammars directory is missing text_normalization proto config: {}".format(
config_path
)
)
    def get_grammar(self, module_str: str, class_name: str) -> BaseFst:
"""
        Loads a grammar from the grammars dir based on the module name and class name
        Parameters
----------
        module_str: str
            module to load, e.g. "classify.cardinal"
        class_name: str
            fst class to load from the module, e.g. "CardinalFst"
Returns
-------
        grammar: BaseFst
grammar loaded by the name and initialized
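
        Examples
        --------
        A hypothetical sketch, assuming the grammars repo provides
        classify/cardinal.py defining a CardinalFst class (the example
        names used above):

        >>> loader = GrammarLoader("path/to/grammars")  # doctest: +SKIP
        >>> cardinal = loader.get_grammar("classify.cardinal", "CardinalFst")  # doctest: +SKIP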
"""
grammar_path = os.path.join(
self._grammars_dir, module_str.replace(".", "/") + ".py"
)
if not os.path.isfile(grammar_path):
raise RuntimeError(
"Cant get grammar from {}, {} doesn't exist".format(
module_str, grammar_path
)
)
module = importlib.import_module(self._module_prefix + "." + module_str)
if not hasattr(module, class_name):
raise RuntimeError("{} doesn't have {}".format(grammar_path, class_name))
grammar_class = getattr(module, class_name)
grammar = grammar_class()
        if not isinstance(grammar, BaseFst):
            raise RuntimeError(
                "loaded grammar {} does not inherit BaseFst".format(class_name)
            )
return grammar
@staticmethod
def _serialize_fst(fst: pynini.FstLike, rule_name: str, out_path: str) -> bytes:
"""
        Exports a FAR and reads the exported FAR back as bytes.
        TODO: check if it is possible to serialize without saving to disk
Parameters
----------
fst: pynini.FstLike
fst of a grammar to serialize
rule_name: str
name under which to store fst in FAR
out_path: str
path to export FAR to during serialization
Returns
-------
res: bytes
serialized fst as bytes
"""
exporter = grm.Exporter(out_path)
exporter[rule_name] = fst
exporter.close()
with open(out_path, "rb") as fp:
res = fp.read()
return res
    def get_verbalizer(self, work_dir: str) -> bytes:
"""
Exports verbalizer, stores FAR on disk, returns serialized FAR
Parameters
----------
work_dir: str
directory to store verbalizer FAR to
Returns
-------
res: bytes
serialized verbalizer
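
        Examples
        --------
        Hypothetical usage; the paths are illustrative:

        >>> loader = GrammarLoader("path/to/grammars")  # doctest: +SKIP
        >>> far_bytes = loader.get_verbalizer("/tmp/export")  # doctest: +SKIP

        The same bytes are also left on disk as /tmp/export/verbalizer.far.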
"""
verb_path = os.path.join(work_dir, "verbalizer.far")
verb = self.get_grammar("verbalize.verbalize", "VerbalizeFst")
# should match rule name in configs/verbalizer.ascii_proto
return self._serialize_fst(verb.fst, "ALL", verb_path)
    def get_tokenizer(self, work_dir: str) -> bytes:
"""
Exports tokenizer/classifier, stores FAR on disk, returns serialized FAR
Parameters
----------
work_dir: str
directory to store tokenizer FAR to
Returns
-------
res: bytes
serialized tokenizer
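
        Examples
        --------
        Hypothetical usage, mirroring get_verbalizer; the paths are illustrative:

        >>> loader = GrammarLoader("path/to/grammars")  # doctest: +SKIP
        >>> far_bytes = loader.get_tokenizer("/tmp/export")  # doctest: +SKIP

        The same bytes are also left on disk as /tmp/export/tokenizer.far.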
"""
classify_path = os.path.join(work_dir, "tokenizer.far")
classify = self.get_grammar("classify.classify", "ClassifyFst")
# should match rule name in configs/tokenizer.ascii_proto
return self._serialize_fst(classify.fst, "TOKENIZE_AND_CLASSIFY", classify_path)
@staticmethod
def _read_text_file(path: str) -> str:
"""
        Reads a file from the given path. Helper function to read config files
Parameters
----------
path: str
path to read from
Returns
-------
content: str
read content of the file
"""
with open(path, "r", encoding="utf-8") as fp:
content = fp.read()
return content
    def get_configs(self) -> Tuple[str, str, str]:
"""
Loads configurations required by text_normalization
Returns
-------
configs: Tuple[str, str, str]
Loaded proto configurations as strings.
There are 3 configurations required by text_normalization package:
            tokenizer configuration - defines the grammar name and the main rule
            verbalizer configuration - defines the grammar name and the main rule
verbalizer serialization specification - fields of tokenized semiotic classes
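
        Examples
        --------
        The tuple follows the order of CONFIGURATION_NAMES:

        >>> loader = GrammarLoader("path/to/grammars")  # doctest: +SKIP
        >>> tok_cfg, verb_cfg, spec_cfg = loader.get_configs()  # doctest: +SKIP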
"""
configs = [
self._read_text_file(os.path.join(self._grammars_dir, "configs", x))
for x in self.CONFIGURATION_NAMES
]
        return tuple(configs)