Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| """ adapted from https://github.com/keithito/tacotron """ | |
| import re | |
| import numpy as np | |
| from . import cleaners | |
| from python.common.text.symbols import get_symbols | |
| from .cmudict import CMUDict | |
| from python.common.text.numbers import _currency_re, _expand_currency | |
#########
# REGEX #
#########

# Splits text around the first "{...}" group: (text before, brace contents,
# text after). The brace contents are later encoded as ARPAbet symbols.
# re.DOTALL lets "." cross line breaks; without it, text following a newline
# after a brace group is silently dropped by the caller's match loop, and a
# brace group preceded by (or containing) a newline is not recognised at all.
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)', re.DOTALL)

# Tokenises text into (word, non_word) pairs for re.findall: group 1 is a run
# of letters (optionally followed by an apostrophe and 1-2 letters), group 2
# is either a whole "{...}" group or a run of non-letter characters.
_words_re = re.compile(r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)")

# Splits text on whitespace while keeping each "{...}" group as one token,
# so that brace-enclosed content can be excluded from cleaning.
_arpa_re = re.compile(r'{[^}]+}|\S+')
def lines_to_list(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Trailing whitespace (including the line terminator) is stripped from
    every line; leading whitespace is kept.
    """
    with open(filename, encoding='utf-8') as handle:
        return [row.rstrip() for row in handle]
class TextProcessing(object):
    """Turn raw text into a sequence of symbol IDs, optionally via ARPAbet.

    Args:
        symbol_set: Identifier passed to ``get_symbols`` to obtain the symbol
            table.
        cleaner_names: Iterable of attribute names looked up on the
            ``cleaners`` module; each cleaner is applied in order.
        p_arpabet: Probability (compared against ``np.random.uniform()``) of
            replacing text with its ARPAbet transcription. ``0`` disables it.
        handle_arpabet: ``'sentence'`` draws once per call and converts every
            word or none; ``'word'`` draws once per word; ``''`` disables
            conversion. Any other value raises when ``p_arpabet > 0``.
        handle_arpabet_ambiguous: What to do when a word has several
            pronunciations: ``'first'``, ``'random'`` or ``'ignore'`` (keep
            the plain word).
        expand_currency: When true, ``encode_text`` first rewrites currency
            expressions using ``_currency_re`` / ``_expand_currency``.
    """

    def __init__(self, symbol_set, cleaner_names, p_arpabet=0.0,
                 handle_arpabet='word', handle_arpabet_ambiguous='ignore',
                 expand_currency=True):
        self.symbols = get_symbols(symbol_set)
        self.cleaner_names = cleaner_names

        # Mappings from symbol to numeric ID and vice versa:
        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
        self.expand_currency = expand_currency

        # cmudict
        self.p_arpabet = p_arpabet
        self.handle_arpabet = handle_arpabet
        self.handle_arpabet_ambiguous = handle_arpabet_ambiguous

    def text_to_sequence(self, text):
        """Encode ``text`` as a list of symbol IDs.

        Text inside curly braces is treated as whitespace-separated ARPAbet
        symbols; everything else is encoded character by character. Symbols
        missing from the symbol table are skipped.
        """
        sequence = []

        # Check for curly braces and treat their contents as ARPAbet:
        while text:
            m = _curly_re.match(text)
            if not m:
                # No (further) brace group: the rest is plain characters.
                sequence += self.symbols_to_sequence(text)
                break
            sequence += self.symbols_to_sequence(m.group(1))
            sequence += self.arpabet_to_sequence(m.group(2))
            text = m.group(3)

        return sequence

    def sequence_to_text(self, sequence):
        """Decode a sequence of symbol IDs into a ``'|'``-joined string.

        IDs not present in the symbol table are skipped. Symbols stored with
        a leading ``'@'`` (ARPAbet) are rendered as ``{SYMBOL}``.
        """
        result = []
        for symbol_id in sequence:
            if symbol_id in self.id_to_symbol:
                s = self.id_to_symbol[symbol_id]
                # Enclose ARPAbet back in curly braces:
                if len(s) > 1 and s[0] == '@':
                    s = '{%s}' % s[1:]
                result.append(s)
        return "|".join(result)

    def clean_text(self, text):
        """Apply every cleaner named in ``self.cleaner_names`` to ``text``.

        Raises:
            Exception: If a name does not exist on the ``cleaners`` module.
        """
        for name in self.cleaner_names:
            # A default is required here: without it getattr raises
            # AttributeError and the explicit error below is unreachable.
            cleaner = getattr(cleaners, name, None)
            if not cleaner:
                raise Exception('Unknown cleaner: %s' % name)
            text = cleaner(text)

        return text

    def symbols_to_sequence(self, symbols):
        """Map an iterable of symbols to IDs, skipping unknown symbols."""
        return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id]

    def arpabet_to_sequence(self, text):
        """Map whitespace-separated ARPAbet symbols to IDs.

        Each symbol is looked up with an ``'@'`` prefix; unknown symbols are
        skipped.
        """
        return self.symbols_to_sequence(['@' + s for s in text.split()])

    def get_arpabet(self, word):
        """Return ``word`` as a ``{...}`` ARPAbet group, or unchanged.

        The word is returned unchanged when it is a heteronym, when no
        pronunciation can be found, or when it is ambiguous and
        ``handle_arpabet_ambiguous`` is ``'ignore'``. For words ending in
        ``'s`` or ``s`` that are not in the dictionary, the stem is looked up
        instead and a trailing ``Z`` phoneme is appended.
        """
        # NOTE(review): `cmudict` is not bound by the imports visible at the
        # top of this file (only the `CMUDict` class is imported). Confirm it
        # is defined elsewhere; otherwise this method raises NameError.
        if word.lower() in cmudict.heteronyms:
            return word

        arpabet_suffix = ''
        arpabet = cmudict.lookup(word)
        if arpabet is None:
            # Fall back to the stem of a possessive / plural form.
            if len(word) > 2 and word.endswith("'s"):
                arpabet = self.get_arpabet(word[:-2])
                arpabet_suffix = ' Z'
            elif len(word) > 1 and word.endswith("s"):
                arpabet = self.get_arpabet(word[:-1])
                arpabet_suffix = ' Z'

        return self._resolve_pronunciation(word, arpabet, arpabet_suffix)

    def _resolve_pronunciation(self, word, arpabet, arpabet_suffix=''):
        """Pick one pronunciation from a lookup result and wrap it in braces.

        Args:
            word: The original word, returned when nothing usable was found.
            arpabet: ``None``, a sequence of candidate pronunciations, or a
                string produced by a recursive ``get_arpabet`` call (either a
                ``{...}`` group or a plain, unresolved word).
            arpabet_suffix: Text appended inside the braces.
        """
        if arpabet is None or len(arpabet) == 0:
            return word

        if isinstance(arpabet, str):
            if arpabet[0] != '{':
                # The stem lookup came back as a plain word (dictionary miss
                # or heteronym). Treating that string as a list of
                # pronunciations would emit its first letter as a phoneme.
                return word
            arpabet = [arpabet[1:-1]]

        if len(arpabet) > 1:
            if self.handle_arpabet_ambiguous == 'first':
                arpabet = arpabet[0]
            elif self.handle_arpabet_ambiguous == 'random':
                arpabet = np.random.choice(arpabet)
            elif self.handle_arpabet_ambiguous == 'ignore':
                return word
            else:
                raise Exception(
                    "{} handle_arpabet_ambiguous is not supported".format(
                        self.handle_arpabet_ambiguous))
        else:
            arpabet = arpabet[0]

        return "{" + arpabet + arpabet_suffix + "}"

    def capitalize_repetitions(self, text):
        """Alternate the case of consecutive repeated characters.

        The first character is kept as is. Every later character is
        lowercased unless it repeats the previous output character
        (case-insensitively), in which case it takes the opposite case of
        that character, e.g. ``"Hello"`` -> ``"HelLo"``.
        """
        text_out = []
        for letter in text:
            if not text_out:
                text_out.append(letter)
                continue

            previous = text_out[-1]
            if previous.lower() != letter.lower():
                text_out.append(letter.lower())
            elif previous == letter.lower():
                text_out.append(letter.upper())
            elif previous == letter.upper():
                text_out.append(letter.lower())

        return "".join(text_out)

    def encode_text(self, text, return_all=False):
        """Clean ``text``, optionally convert it to ARPAbet, and encode it.

        Args:
            text: Input text; ``{...}`` groups are passed through uncleaned.
            return_all: When true, also return the intermediate strings.

        Returns:
            The list of symbol IDs, or ``(ids, text_clean, text_arpabet)``
            when ``return_all`` is true. ``text_arpabet`` is ``''`` if no
            ARPAbet conversion was performed.

        Raises:
            Exception: If ``p_arpabet > 0`` and ``handle_arpabet`` is not
                ``'sentence'``, ``'word'`` or ``''``.
        """
        if self.expand_currency:
            text = re.sub(_currency_re, _expand_currency, text)

        # Clean token by token so brace-enclosed ARPAbet is left untouched.
        text_clean = [self.clean_text(split) if split[0] != '{' else split
                      for split in _arpa_re.findall(text)]
        text_clean = ' '.join(text_clean)
        text = text_clean

        text_arpabet = ''
        if self.p_arpabet > 0:
            if self.handle_arpabet == 'sentence':
                # One draw decides for the whole input.
                if np.random.uniform() < self.p_arpabet:
                    words = _words_re.findall(text)
                    text_arpabet = ''.join(
                        self.get_arpabet(word) if word != '' else other
                        for word, other in words)
                    text = text_arpabet
            elif self.handle_arpabet == 'word':
                # One draw per word; non-word pieces are copied verbatim.
                pieces = []
                for word, other in _words_re.findall(text):
                    if word == '':
                        pieces.append(other)
                    elif np.random.uniform() < self.p_arpabet:
                        pieces.append(self.get_arpabet(word))
                    else:
                        pieces.append(word)
                text_arpabet = ''.join(pieces)
                text = text_arpabet
            elif self.handle_arpabet != '':
                raise Exception("{} handle_arpabet is not supported".format(
                    self.handle_arpabet))

        # NOTE(review): this also lowercases ARPAbet tokens inside braces;
        # confirm the symbol table expects that.
        text = self.capitalize_repetitions(text)
        text_encoded = self.text_to_sequence(text)

        if return_all:
            return text_encoded, text_clean, text_arpabet

        return text_encoded