Source code for learn_to_normalize.grammar_utils.grammar_loader

"""
Copyright 2022 Balacoon

Class that loads grammars from a directory with
normalization rules. The directory should have
a specific structure.
"""

import importlib
import os
import sys
from typing import Tuple

import pynini
from pynini.export import grm

from learn_to_normalize.grammar_utils.base_fst import BaseFst


class GrammarLoader:
    """
    Loads normalization grammars from directory of specific structure
    """

    NORMALIZATION_MODES = ["classify", "verbalize"]
    CONFIGURATION_NAMES = [
        "tokenizer.ascii_proto",
        "verbalizer.ascii_proto",
        "verbalizer_serialization_spec.ascii_proto",
    ]
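
    # Expected layout under grammars_dir, inferred from the checks in
    # __init__ and the entry points used by get_tokenizer/get_verbalizer
    # (a sketch, not an exhaustive listing):
    #
    #   <grammars_dir>/classify/classify.py    - defines ClassifyFst
    #   <grammars_dir>/verbalize/verbalize.py  - defines VerbalizeFst
    #   <grammars_dir>/configs/tokenizer.ascii_proto
    #   <grammars_dir>/configs/verbalizer.ascii_proto
    #   <grammars_dir>/configs/verbalizer_serialization_spec.ascii_proto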

    def __init__(self, grammars_dir: str):
        # All imports within the grammars repo are done relative to the parent
        # dir of the grammars repo, so <grammar_repo>/../ should be added to
        # PYTHONPATH and the suffix stored to be used during grammar loading.
        # First the <grammar_repo> directory should be identified.
        grammars_dir = os.path.abspath(grammars_dir)
        # find the repo location
        repo_dir = (
            os.popen("cd {} && git rev-parse --show-toplevel".format(grammars_dir))
            .read()
            .strip()
        )
        # add to PYTHONPATH the directory one level higher than the repo
        above_repo_dir = os.path.join(repo_dir, "..")
        sys.path.append(above_repo_dir)
        # find out the location of rules inside the grammars repo
        self._module_prefix = os.path.relpath(grammars_dir, above_repo_dir).replace(
            "/", "."
        )
        self._grammars_dir = grammars_dir
        # check that the minimal entry points for export are present
        for mode in self.NORMALIZATION_MODES:
            if not os.path.isfile(os.path.join(grammars_dir, mode, mode + ".py")):
                raise RuntimeError(
                    "Grammars directory is missing entry point: {}/{}.py".format(
                        mode, mode
                    )
                )
        for config_name in self.CONFIGURATION_NAMES:
            config_path = os.path.join(grammars_dir, "configs", config_name)
            if not os.path.isfile(config_path):
                raise RuntimeError(
                    "Grammars directory is missing text_normalization proto config: {}".format(
                        config_path
                    )
                )
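
    # Worked example (paths are illustrative): if git reports the repo root
    # as /home/user/grammars and grammars_dir is /home/user/grammars/en,
    # then above_repo_dir is /home/user and self._module_prefix becomes
    # "grammars.en", so a rule module is imported as e.g.
    # "grammars.en.classify.cardinal".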

    def get_grammar(self, module_str: str, class_name: str) -> BaseFst:
        """
        Loads a grammar from the grammars dir based on the module name
        and the class name of the grammar

        Parameters
        ----------
        module_str: str
            module to load, e.g. classify.cardinal
        class_name: str
            fst class to load from the module, e.g. CardinalFst

        Returns
        -------
        grammar: BaseFst
            grammar loaded by the name and initialized
        """
        grammar_path = os.path.join(
            self._grammars_dir, module_str.replace(".", "/") + ".py"
        )
        if not os.path.isfile(grammar_path):
            raise RuntimeError(
                "Can't get grammar from {}, {} doesn't exist".format(
                    module_str, grammar_path
                )
            )
        module = importlib.import_module(self._module_prefix + "." + module_str)
        if not hasattr(module, class_name):
            raise RuntimeError("{} doesn't have {}".format(grammar_path, class_name))
        grammar_class = getattr(module, class_name)
        grammar = grammar_class()
        assert isinstance(
            grammar, BaseFst
        ), "loaded grammar does not inherit base grammar"
        return grammar
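
    # Example (mirrors the docstring above): a grammar defined in
    # <grammars_dir>/classify/cardinal.py would be loaded with
    #     grammar = loader.get_grammar("classify.cardinal", "CardinalFst")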

    @staticmethod
    def _serialize_fst(fst: pynini.FstLike, rule_name: str, out_path: str) -> bytes:
        """
        Exports a FAR and reads the exported FAR back as bytes.
        TODO: check if it is possible to serialize without saving to disk

        Parameters
        ----------
        fst: pynini.FstLike
            fst of a grammar to serialize
        rule_name: str
            name under which to store the fst in the FAR
        out_path: str
            path to export the FAR to during serialization

        Returns
        -------
        res: bytes
            serialized fst as bytes
        """
        exporter = grm.Exporter(out_path)
        exporter[rule_name] = fst
        exporter.close()
        with open(out_path, "rb") as fp:
            res = fp.read()
        return res
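
    # A minimal sketch of inspecting an exported FAR from disk (assumes
    # pynini's dict-like, file-based Far reader; the path and rule name
    # mirror get_verbalizer below):
    #
    #     far = pynini.Far("/tmp/verbalizer.far", mode="r")
    #     verbalizer_fst = far["ALL"]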

    def get_verbalizer(self, work_dir: str) -> bytes:
        """
        Exports verbalizer, stores FAR on disk, returns serialized FAR

        Parameters
        ----------
        work_dir: str
            directory to store verbalizer FAR to

        Returns
        -------
        res: bytes
            serialized verbalizer
        """
        verb_path = os.path.join(work_dir, "verbalizer.far")
        verb = self.get_grammar("verbalize.verbalize", "VerbalizeFst")
        # should match rule name in configs/verbalizer.ascii_proto
        return self._serialize_fst(verb.fst, "ALL", verb_path)

    def get_tokenizer(self, work_dir: str) -> bytes:
        """
        Exports tokenizer/classifier, stores FAR on disk, returns serialized FAR

        Parameters
        ----------
        work_dir: str
            directory to store tokenizer FAR to

        Returns
        -------
        res: bytes
            serialized tokenizer
        """
        classify_path = os.path.join(work_dir, "tokenizer.far")
        classify = self.get_grammar("classify.classify", "ClassifyFst")
        # should match rule name in configs/tokenizer.ascii_proto
        return self._serialize_fst(classify.fst, "TOKENIZE_AND_CLASSIFY", classify_path)

    @staticmethod
    def _read_text_file(path: str) -> str:
        """
        Reads file from given path. Helper function to read config files

        Parameters
        ----------
        path: str
            path to read from

        Returns
        -------
        content: str
            read content of the file
        """
        with open(path, "r", encoding="utf-8") as fp:
            content = fp.read()
        return content

    def get_configs(self) -> Tuple[str, str, str]:
        """
        Loads configurations required by text_normalization

        Returns
        -------
        configs: Tuple[str, str, str]
            Loaded proto configurations as strings.
            There are 3 configurations required by the text_normalization package:

            - tokenizer configuration - defines name of the grammar and main rule
            - verbalizer configuration - defines name of grammar and main rule
            - verbalizer serialization specification - fields of tokenized semiotic classes
        """
        configs = tuple(
            self._read_text_file(os.path.join(self._grammars_dir, "configs", x))
            for x in self.CONFIGURATION_NAMES
        )
        return configs
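

if __name__ == "__main__":
    # Hypothetical smoke test, not part of the original module: the
    # command-line arguments and printed summary are illustrative only.
    # Usage: python grammar_loader.py <grammars_dir> <work_dir>
    grammars_dir, work_dir = sys.argv[1], sys.argv[2]
    loader = GrammarLoader(grammars_dir)
    tokenizer_far = loader.get_tokenizer(work_dir)
    verbalizer_far = loader.get_verbalizer(work_dir)
    configs = loader.get_configs()
    print("tokenizer FAR: {} bytes".format(len(tokenizer_far)))
    print("verbalizer FAR: {} bytes".format(len(verbalizer_far)))
    print("loaded {} proto configs".format(len(configs)))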