Source code for learn_to_normalize.evaluation.google_data.parsed_utterance

Copyright 2022 Balacoon

A holder for tokens that are parsed
from a Google data file.

import re
import unidecode

class ParsedUtterance:
    """
    A data structure that contains unnormalized and normalized tokens
    parsed from a Google data file. This class also contains knowledge of
    how Google data conventions map to Balacoon text_normalization formats.
    """

    # punctuation marks that are glued to the token that follows them
    _OPENING_BRACKETS = ("(", "{", "[")
    # suffix that marks a character being spelled letter by letter
    _LETTER_SUFFIX = "_letter"

    def __init__(self):
        # semiotic class of every token added, punctuation included
        self._tags = []
        # unnormalized tokens with punctuation already attached
        self._unnormalized = []
        # normalized tokens (ground truth for text normalization)
        self._normalized = []
        # punctuation waiting to be prepended to the next non-punct token
        self._next_token_prefix = ""
        # True when the next double quote should be treated as an opening one
        # (attribute name keeps its historical spelling)
        self._is_first_qoute = True

    @staticmethod
    def _strip_letter_suffix(normalized: str) -> str:
        """
        ELECTRONIC semiotic class has fields being spelled letter by letter,
        where each character gets a special "_letter" suffix.
        This is a helper method to collapse that suffix: consecutive
        suffixed characters are upper-cased and merged into one word,
        a bare "_letter" token closes the current word.

        Parameters
        ----------
        normalized: str
            normalized token, possibly containing "_letter" suffixes

        Returns
        -------
        collapsed: str
            the input unchanged if it has no "_letter" substring,
            otherwise whitespace-separated parts with spelled words merged
        """
        suffix = ParsedUtterance._LETTER_SUFFIX
        if suffix not in normalized:
            return normalized

        new_parts = []
        cur_word = []
        for part in normalized.split():
            if part.endswith(suffix):
                letter = part[: -len(suffix)]
                if letter:
                    cur_word.append(letter.upper())
                elif cur_word:
                    # it's just the suffix, which indicates a whitespace
                    new_parts.append("".join(cur_word))
                    cur_word = []
            else:
                # a regular word terminates the spelled sequence, if any
                if cur_word:
                    new_parts.append("".join(cur_word))
                    cur_word = []
                new_parts.append(part)
        if cur_word:
            new_parts.append("".join(cur_word))
        return " ".join(new_parts)

    def _add_punctuation(self, mark: str):
        """
        Attaches a punctuation mark either to the previous unnormalized
        token or to the prefix of the upcoming one.

        Parameters
        ----------
        mark: str
            unnormalized punctuation mark
        """
        if mark == '"':
            # track whether the quote is an opening or a closing one. the state
            # is flipped for every quote, including one that starts the
            # utterance, otherwise its closing pair would be taken for
            # another opening quote and glued to the wrong token.
            is_opening = self._is_first_qoute
            self._is_first_qoute = not is_opening
            if is_opening or not self._unnormalized:
                # opening quote, or nothing to attach a closing quote to
                self._next_token_prefix += mark
            else:
                self._unnormalized[-1] += mark
        elif not self._unnormalized or mark in self._OPENING_BRACKETS:
            # there is no previous token or it's an opening bracket,
            # so it should be attached to the next token
            self._next_token_prefix += mark
        else:
            # other punctuation marks are attached to the previous token.
            # a pending prefix means no actual token followed it,
            # so it is flushed to the previous token as well.
            self._unnormalized[-1] += self._next_token_prefix + mark
            self._next_token_prefix = ""

    def _convert_normalized(self, tag: str, unnormalized: str, normalized: str) -> str:
        """
        Maps normalized field of a non-punctuation token from Google data
        conventions to the format stored in this utterance.

        Parameters
        ----------
        tag: str
            semiotic class of the token
        unnormalized: str
            token as written in the text
        normalized: str
            normalization of the token as given in the data file

        Returns
        -------
        converted: str
            normalization after all the conversions are applied
        """
        if normalized == "<self>":
            # token is normalized to itself
            normalized = unnormalized.lower()
        if tag == "LETTERS":
            normalized = normalized.replace(" ", "").upper()
        # NOTE(review): re.match anchors only at the start, so any string that
        # begins with a single letter, a space and another letter is collapsed
        # as a whole - confirm that fullmatch was not intended here
        if tag == "VERBATIM" and re.match(r"[a-z]( [a-z])+", normalized):
            # it's an abbreviation
            normalized = normalized.replace(" ", "").upper()
        # replace all non-ascii characters if any
        normalized = unidecode.unidecode(normalized)
        # strip all "_letter" suffixes if any
        return self._strip_letter_suffix(normalized)

    def add_token(self, tag: str, unnormalized: str, normalized: str):
        """
        once a line from data file is read, add that info
        into currently parsed utterance

        Parameters
        ----------
        tag: str
            semiotic class of the token, "PUNCT" for punctuation marks
        unnormalized: str
            token as written in the text
        normalized: str
            normalization of the token. "<self>" stands for
            lower-cased unnormalized token, empty or "sil" normalization
            is not stored. ignored for punctuation marks.
        """
        self._tags.append(tag)
        # most of the work is to handle punctuation marks,
        # they need to be attached properly
        if tag == "PUNCT":
            self._add_punctuation(unnormalized)
            return

        normalized = self._convert_normalized(tag, unnormalized, normalized)
        if normalized and normalized != "sil":
            self._normalized.append(normalized)
        # prepend punctuation accumulated for this token, if any
        self._unnormalized.append(self._next_token_prefix + unnormalized)
        self._next_token_prefix = ""

    def has_semiotic_class(self, tag: str) -> bool:
        """
        checks if this utterance has particular semiotic class

        Parameters
        ----------
        tag: str
            semiotic class to look for

        Returns
        -------
        flag: bool
            True if this utterance has requested semiotic class
        """
        return tag in self._tags

    def get_unnormalized(self) -> str:
        """
        getter to return unnormalized utterance as a single string.
        concatenates previously accumulated unnormalized tokens

        Returns
        -------
        unnorm: str
            string with unnormalized utterance
        """
        unnorm = " ".join(self._unnormalized)
        # remove space after slash if any
        return unnorm.replace("/ ", "/")

    def get_normalized(self) -> str:
        """
        getter to return normalized utterance as a single string.
        essentially a ground truth for text normalization.
        concatenates previously accumulated normalized tokens

        Returns
        -------
        norm: str
            string with normalized utterance
        """
        return " ".join(self._normalized)

    def get_tokens_num(self) -> int:
        """
        getter that returns number of tokens that were added
        to this utterance, punctuation marks included

        Returns
        -------
        num: int
            number of tokens added
        """
        return len(self._tags)

    def is_empty(self) -> bool:
        """
        checks if any tokens were added to the utterance

        Returns
        -------
        flag: bool
            True if no tokens were added to this utterance
        """
        return self.get_tokens_num() == 0