# Source code for en_us_normalization.production.classify.roman

"""
Copyright 2022 Balacoon
Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
Copyright 2015 and onwards Google, Inc.

tokenize and classify roman numbers
"""

import pynini
from en_us_normalization.production.classify.cardinal import CardinalFst
from en_us_normalization.production.english_utils import get_data_file_path
from pynini.lib import pynutil

from learn_to_normalize.grammar_utils.base_fst import BaseFst
from learn_to_normalize.grammar_utils.data_loader import load_union
from learn_to_normalize.grammar_utils.shortcuts import CHAR, SIGMA, insert_space, DIGIT


class RomanFst(BaseFst):
    """
    Finite state transducer for classifying romans (III, IV, etc).

    In order to convert roman numbers, mappings from data files are used:

    - roman/digit_teen.tsv - contains mapping for numbers from 1 to 49.
    - roman/ties.tsv - contains mapping for tens, i.e. 50, 60, ...
    - roman/hundreds.tsv - contains mapping for hundreds, i.e. 100, 200, ...

    Roman transducer reuses cardinal transducer to accept digits.
    Depending on the context, specifically the preceding word, it should be
    possible to define if the roman number is cardinal or ordinal.

    - roman/cardinal_prefixes.tsv - contains cardinal prefixes, such as "Chapter"
    - roman/ordinal_prefixes.tsv - contains ordinal prefixes, such as "George"

    In case roman number doesn't have a known prefix, i.e. standalone roman number,
    it should be treated carefully. Typical mistakes:

    - roman number can be confused with abbreviation
    - roman number that consists of a single character, such as "I".
    - "XXX" - denotes pornographic materials, should have bigger weight

    Examples of transducer input/output:

    - IV -> roman { cardinal { count: "4" } }
    - George I -> roman { prefix: "george" ordinal { order: "1" } }
    - CHAPTER XIX -> roman { prefix: "chapter" cardinal { count: "19" } }
    """

    def __init__(self, cardinal: CardinalFst = None):
        """
        constructor for roman numbers transducer

        Parameters
        ----------
        cardinal: CardinalFst
            transducer for cardinal numbers to reuse.
            if not provided, will be created from scratch
        """
        super().__init__(name="roman")
        if cardinal is None:
            cardinal = CardinalFst()

        # roman-numeral mappings loaded from the "roman" data directory
        digit_teen = pynini.string_file(get_data_file_path("roman", "digit_teen.tsv"))
        ties = pynini.string_file(get_data_file_path("roman", "ties.tsv"))
        hundreds = pynini.string_file(get_data_file_path("roman", "hundreds.tsv"))

        # any roman number: the parts are concatenated with an inserted space
        # in between, e.g. hundreds [+ " " + ties] [+ " " + digit_teen]
        roman = digit_teen | ties
        roman |= ties + insert_space + digit_teen
        roman |= (
            hundreds
            + pynini.closure(insert_space + ties, 0, 1)
            + pynini.closure(insert_space + digit_teen, 0, 1)
        )
        # Merge the space-separated parts into a single number by deleting the
        # placeholder zeros together with the separating space.
        # NOTE(review): the rewrites delete a fixed number of zeros regardless
        # of how many digits follow. If hundreds.tsv emits e.g. "100" and
        # digit_teen.tsv emits "3", then "100 3" would become "13"; likewise
        # "50 13" would become "513". Verify against the data files.
        roman = roman @ pynini.cdrewrite(
            pynini.cross("00 ", ""), "", "", SIGMA
        )  # remove zeros introduced by "hundreds"
        roman = roman @ pynini.cdrewrite(
            pynini.cross("0 ", ""), "", "", SIGMA
        )  # remove zero introduced by "ties"

        # `@` binds tighter than `+`: the prefix is concatenated with
        # (roman composed with the cardinal transducer)
        cardinal_roman_prefix = (
            self._load_prefixes("cardinal_prefixes.tsv") + roman @ cardinal.single_fst
        )

        # make ordinal from roman. allow only values < 10
        # (composition with DIGIT restricts the converted number)
        ordinal_numbers = roman @ DIGIT
        ordinal_numbers = ordinal_numbers @ cardinal.get_digits_fst()
        ordinal_roman = (
            pynutil.insert('ordinal { order: "')
            + ordinal_numbers
            + pynutil.insert('" }')
        )
        ordinal_roman_prefix = (
            self._load_prefixes("ordinal_prefixes.tsv") + ordinal_roman
        )

        # stand alone roman - should have at least two characters,
        # should be digits/teens at most
        standalone_roman = pynini.closure(CHAR, 2) @ digit_teen
        standalone_roman = standalone_roman @ cardinal.single_fst

        graph = standalone_roman | ordinal_roman_prefix | cardinal_roman_prefix
        self._single_fst = self.add_tokens(graph).optimize()
        # NOTE(review): presumably lets two roman tokens joined by "-" be read
        # with "to" in between (ranges) - confirm against BaseFst.connect_to_self
        self.connect_to_self(connector_in="-", connector_out="to")

    @staticmethod
    def _load_prefixes(name: str) -> pynini.FstLike:
        """
        helper function to load prefixes of roman numbers from a file

        Parameters
        ----------
        name: str
            name of data file in roman data directory

        Returns
        -------
        prefix_fst: pynini.FstLike
            transducer that wraps a prefix read from the file into
            `prefix: "<prefix>"` and then accepts a single space
        """
        prefixes = load_union(get_data_file_path("roman", name), case_agnostic=True)
        return (
            pynutil.insert('prefix: "')
            + prefixes
            + pynutil.insert('"')
            + pynini.accep(" ")
        )