Source code for wordseg.separator

# coding: utf-8

"""Manage token separation at phone, syllable and word levels"""

import itertools
import re


class Separator(object):
    """Token separation at phone, syllable and word levels

    A Separator is made of 3 entries *phone*, *syllable* and *word*
    defining the token separators for each of these levels within an
    utterance.

    A token separator can be a string or None. If not None, the entries
    'phone', 'syllable' and 'word' must all be different.

    The following characters are forbidden in separators:
    !#$%&'*+-.^`|~:\\\"

    """
    def __init__(self, phone=' ', syllable=';esyll', word=';eword'):
        # check we have different separators, None excluded
        g1 = list(sep for sep in (phone, syllable, word) if sep)
        g2 = set(sep for sep in (phone, syllable, word) if sep)
        if len(g1) != len(g2):
            raise ValueError(
                'cannot init separator: phone, syllable and word must be '
                'different, they are: "{}", "{}" and "{}"'
                .format(phone, syllable, word))

        self.phone = str(phone) if phone else None
        self.syllable = str(syllable) if syllable else None
        self.word = str(word) if word else None

        # ensure the separators are valid
        for sep in (self.phone, self.syllable, self.word):
            self.check_separator(sep)

        # store the tokens as precompiled regular expressions for
        # faster lookup in strings
        self._regexp = {
            'phone': re.compile(self.phone) if phone else None,
            'syllable': re.compile(self.syllable) if syllable else None,
            'word': re.compile(self.word) if word else None}

    forbidden_chars = "!#$%&'*+-.^`|~:\\\""
    """Characters forbidden in separators

    They interfere with regular expression processing

    """

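    # Usage sketch (illustrative, not part of the original source): with the
    # default separators an utterance looks like 'j uː ;esyll ;eword', and
    # two identical separators are rejected at construction time:
    #
    #   >>> sep = Separator()
    #   >>> print(sep)
    #   (phone: " ", syllable: ";esyll", word: ";eword")
    #   >>> Separator(phone=' ', syllable=' ', word=';eword')  # raises ValueError
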
    def check_separator(self, sep):
        """Raise a ValueError if the `sep` contains a forbidden character"""
        if sep is None:
            return

        for c in self.forbidden_chars:
            if c in sep:
                raise ValueError(
                    'the separator "{}" contains the illegal character "{}", '
                    'the following characters are illegal: {}'.format(
                        sep, c, self.forbidden_chars))

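    # Illustrative sketch (not part of the original source): a separator
    # containing a character with a special meaning in regular expressions
    # is rejected:
    #
    #   >>> Separator().check_separator('~')  # raises ValueError
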
    def __str__(self):
        """Returns a string representation of a separator

        Examples
        --------
        >>> sep = Separator(phone='_', syllable=None, word=' ')
        >>> print(sep)
        (phone: "_", word: " ")

        """
        return '({})'.format(', '.join(
            '{}: "{}"'.format(k, v)
            for k, v in self.iterate(type='pair') if v))

    def check_level(self, level):
        """Raises ValueError if `level` is not defined in the separator"""
        if level not in self.levels():
            raise ValueError(
                'level "{}" undefined, choose in: {}'.format(
                    level, ', '.join(self.levels())))

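    # Illustrative sketch (not part of the original source): with no syllable
    # separator defined, only 'phone' and 'word' are accepted levels:
    #
    #   >>> s = Separator(phone=' ', syllable=None, word=';eword')
    #   >>> s.check_level('word')  # no error
    #   >>> s.check_level('syllable')  # raises ValueError
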
    def strip(self, utterance, level=None):
        """Removes leading and trailing separators of an `utterance`

        Parameters
        ----------
        utterance : str
            The utterance to be stripped.
        level : str, optional
            Specify the level boundaries to strip. If not specified
            remove all the boundaries. If specified, must be 'phone',
            'syllable' or 'word'.

        Returns
        -------
        The stripped `utterance`

        """
        # order matters: remove word separators, then syllables and
        # finally phones
        to_remove = ['word', 'syllable', 'phone']
        if level:
            self.check_level(level)
            to_remove = [level]

        # build a regular expression for separator suppression,
        # considering also spaces within contiguous separators
        pattern = (
            r'((' + r'|'.join(
                '({})'.format(self.__dict__[l])
                for l in to_remove if self.__dict__[l]) + r')+\s*)+')

        # remove leading separators
        utterance = re.sub(r'^' + pattern, '', utterance)

        # remove trailing ones
        utterance = re.sub(pattern + r'$', '', utterance)

        return utterance.strip()

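    # Illustrative sketch (not part of the original source) with the default
    # separators; only leading and trailing boundaries are removed, inner
    # ones are preserved:
    #
    #   >>> sep = Separator()
    #   >>> sep.strip('j uː ;esyll ;eword')
    #   'j uː'
    #   >>> sep.strip('j uː ;esyll ;eword', level='word')
    #   'j uː ;esyll'
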
    def tokenize(self, utterance, level=None, keep_boundaries=True):
        """Return the tokens in `utterance` at the given `level`

        Iterates on phones, syllables or words within a given utterance,
        other levels being ignored.

        Parameters
        ----------
        utterance : str
            The utterance to be tokenized.
        level : str, optional
            The level to tokenize the utterance at, must be 'phone',
            'syllable' or 'word'. If not specified, tokenize at all the
            defined levels and return a nested list.
        keep_boundaries : bool, optional
            When True (default) preserve the sublevel token boundaries
            in the output. When False all token boundaries are removed.

        Returns
        -------
        token : list of (list of (list of)) str
            The successive phones, syllables or words tokenized from
            the utterance. From outer to inner levels in the returned
            nested list are words, syllables and phones. Empty tokens
            are ignored, tokens are stripped.

        Raises
        ------
        ValueError
            If the `level` is not 'phone', 'syllable' or 'word'.

        Examples
        --------
        >>> from wordseg.separator import Separator
        >>> s = Separator(phone=' ', syllable=None, word=';eword')
        >>> t = 'j uː ;eword n oʊ ;eword dʒ ʌ s t ;eword'
        >>> list(s.tokenize(t, level='word'))
        ['j uː', 'n oʊ', 'dʒ ʌ s t']
        >>> list(s.tokenize(t, level='word', keep_boundaries=False))
        ['juː', 'noʊ', 'dʒʌst']
        >>> list(s.tokenize(t, level='phone'))
        ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
        >>> list(s.tokenize(t))
        [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', 't']]

        """
        if level:
            self.check_level(level)

        # auxiliary function tokenizing at a given level
        def _tokenize(utterance, level):
            if not self._regexp[level]:
                return [utterance]
            return [
                self.strip(token, level)
                for token in re.split(self._regexp[level], utterance)
                if token]

        if level is None:
            # fully tokenize the utterance as a nested list. Whatever
            # the separator we have here a 3-levels list
            tokens = [
                [_tokenize(s, 'phone') for s in _tokenize(w, 'syllable')]
                for w in _tokenize(utterance, 'word')]

            # remove the undefined levels from the list
            if not self.phone:
                tokens = [[tt[0] for tt in t] for t in tokens]
            if not self.syllable:
                tokens = [t[0] for t in tokens]
            if not self.word:
                tokens = tokens[0]

            return tokens

        # word tokens
        if self.word:
            tokens = _tokenize(utterance, 'word')
        else:
            tokens = [utterance]

        # syllable tokens
        if level in ('phone', 'syllable') and self.syllable:
            tokens = itertools.chain(
                syll for word in tokens
                for syll in _tokenize(word, 'syllable'))

        # phone tokens
        if level == 'phone' and self.phone:
            tokens = itertools.chain(
                phn for syll in tokens
                for phn in _tokenize(syll, 'phone'))

        # strip the tokens
        tokens = (self.strip(t) for t in tokens)

        # delete intermediate token boundaries when asked
        if not keep_boundaries:
            tokens = (self.remove(t) for t in tokens)

        return [t for t in tokens if t]

    def split(self, utterance, level, keep_boundaries=False):
        """Split the `utterance` at a given token `level`

        This method is sensitive to whether the `utterance` is stripped
        or not. It may output empty tokens.

        Parameters
        ----------
        utterance : str
            The string to split in tokens.
        level : str
            Token level to split the string with. Must be 'phone',
            'syllable' or 'word'.
        keep_boundaries : bool, optional
            If False (default), remove all the separators for all
            levels from the returned sub-utterances.

        Returns
        -------
        tokens : generator
            The tokens extracted from `utterance`, may include empty
            tokens.

        Raises
        ------
        ValueError
            If the `level` is not 'phone', 'syllable' or 'word'.

        See Also
        --------
        tokenize : a higher-level method to split an utterance

        """
        self.check_level(level)

        sep = self._regexp[level]
        tokens = re.split(sep, utterance)

        if keep_boundaries:
            tokens = (re.sub(' +', ' ', u) for u in tokens)
        else:
            tokens = (self.remove(u) for u in tokens)

        # remove any leading ' '
        tokens = (t.lstrip(' ') for t in tokens)

        return tokens

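    # Illustrative sketch (not part of the original source): contrary to
    # tokenize(), empty tokens are preserved, so split() exposes trailing
    # boundaries:
    #
    #   >>> sep = Separator(phone=' ', syllable=None, word=';eword')
    #   >>> list(sep.split('j uː ;eword n oʊ ;eword', level='word'))
    #   ['juː', 'noʊ', '']
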
    def remove(self, utterance, level=None):
        """Returns the `utterance` with separators removed

        Parameters
        ----------
        utterance : str
            The string to remove the separators from
        level : str, optional
            If specified (must be 'phone', 'syllable' or 'word'),
            remove only the separators of the given `level`. Else
            remove all the separators.

        Returns
        -------
        The utterance with specified separators removed. Multiple
        spaces are removed as well.

        Raises
        ------
        ValueError
            If the `level` is specified and is not 'phone', 'syllable'
            or 'word'.

        """
        if level:
            self.check_level(level)

        to_remove = {'phone', 'syllable', 'word'}
        if level:
            to_remove = {level}

        if self.word and 'word' in to_remove:
            utterance = re.sub(self._regexp['word'], '', utterance)

        if self.syllable and 'syllable' in to_remove:
            utterance = re.sub(self._regexp['syllable'], '', utterance)

        if self.phone and 'phone' in to_remove:
            utterance = re.sub(self._regexp['phone'], '', utterance)

        return re.sub(' +', ' ', utterance)

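    # Illustrative sketch (not part of the original source) with the default
    # separators; remove every separator or only those of a given level:
    #
    #   >>> sep = Separator()
    #   >>> sep.remove('j uː ;esyll ;eword n oʊ')
    #   'juːnoʊ'
    #   >>> sep.remove('j uː ;esyll ;eword n oʊ', level='word')
    #   'j uː ;esyll n oʊ'
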
    def iterate(self, type='value'):
        """Yields on phone, syllable and word tokens, in that order

        Parameters
        ----------
        type : str, optional
            Type of separator representation to return, must be
            'value' or 'pair'.

        Yields
        ------
        token : str or tuple
            In the form **token_value** if `type` is 'value'. In the
            form **(token_name, token_value)** if `type` is 'pair'.

        Raises
        ------
        ValueError
            If the `type` is not 'value' or 'pair'.

        """
        if type == 'value':
            yield self.phone
            yield self.syllable
            yield self.word
        elif type == 'pair':
            yield ('phone', self.phone)
            yield ('syllable', self.syllable)
            yield ('word', self.word)
        else:
            raise ValueError(
                'iteration type must be "value" or "pair", it is "{}"'
                .format(type))

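    # Illustrative sketch (not part of the original source): iterate on the
    # separator values or on (name, value) pairs:
    #
    #   >>> s = Separator(phone='p', syllable=None, word='w')
    #   >>> list(s.iterate())
    #   ['p', None, 'w']
    #   >>> list(s.iterate(type='pair'))
    #   [('phone', 'p'), ('syllable', None), ('word', 'w')]
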
    def levels(self):
        """The list of defined token levels from inner to outer"""
        # curiously the levels order and the alphabetical order are the
        # same (phone < syllable < word)
        return sorted([k for k, v in self.iterate(type='pair') if v])

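    # Illustrative sketch (not part of the original source): only the
    # defined levels are reported:
    #
    #   >>> Separator(phone='p', syllable=None, word='w').levels()
    #   ['phone', 'word']
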
    def upper_levels(self, level):
        """Lists the defined levels higher than the given one

        Parameters
        ----------
        level : str
            Must be 'phone', 'syllable' or 'word'.

        Raises
        ------
        ValueError
            When `level` is not defined in the separator.

        Examples
        --------
        >>> from wordseg.separator import Separator
        >>> s = Separator(phone='p', syllable='s', word='w')
        >>> s.upper_levels('phone')
        ['syllable', 'word']
        >>> s.upper_levels('word')
        []
        >>> s = Separator(phone='p', syllable=None, word='w')
        >>> s.upper_levels('phone')
        ['word']

        """
        # ensure the required level exists
        self.check_level(level)

        # extract the upper levels
        index = self.levels().index(level)
        return self.levels()[index + 1:]