Source code for wordseg.separator

# coding: utf-8

"""Manage token separation at phone, syllable and word levels"""

import itertools
import re


class Separator(object):
    """Token separation at phone, syllable and word levels

    A Separator is made of 3 entries *phone*, *syllable* and *word*
    defining the token separators for each of these levels within an
    utterance.

    A token separator can be a string or None. If not None, the entries
    'phone', 'syllable' and 'word' must all be different.

    The following characters are forbidden in separators:
    !#$%&'*+-.^`|~:\\\"

    """
    def __init__(self, phone=' ', syllable=';esyll', word=';eword'):
        # check we have different separators, None excluded
        g1 = list(sep for sep in (phone, syllable, word) if sep)
        g2 = set(sep for sep in (phone, syllable, word) if sep)
        if len(g1) != len(g2):
            raise ValueError(
                'cannot init separator: phone, syllable and word must be '
                'different, they are: "{}", "{}" and "{}"'
                .format(phone, syllable, word))

        self.phone = str(phone) if phone else None
        self.syllable = str(syllable) if syllable else None
        self.word = str(word) if word else None

        # ensure the separators are valid
        for sep in (self.phone, self.syllable, self.word):
            self.check_separator(sep)

        # store the tokens as precompiled regular expressions for
        # faster lookup in strings
        self._regexp = {
            'phone': re.compile(self.phone) if phone else None,
            'syllable': re.compile(self.syllable) if syllable else None,
            'word': re.compile(self.word) if word else None}

    forbidden_chars = "!#$%&'*+-.^`|~:\\\""
    """Characters forbidden in separators

    They interfere with regular expression processing

    """

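    # Usage sketch (illustrative, not part of the original source): with the
    # default separators an utterance looks like 'j uː ;esyll ;eword', and
    # two identical separators are rejected at construction time:
    #
    #   >>> sep = Separator()
    #   >>> print(sep)
    #   (phone: " ", syllable: ";esyll", word: ";eword")
    #   >>> Separator(phone=' ', syllable=' ', word=';eword')  # raises ValueError
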
    def check_separator(self, sep):
        """Raise a ValueError if the `sep` contains a forbidden character"""
        if sep is None:
            return

        for c in self.forbidden_chars:
            if c in sep:
                raise ValueError(
                    'the separator "{}" contains the illegal character "{}", '
                    'the following characters are illegal: {}'.format(
                        sep, c, self.forbidden_chars))

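    # Illustrative sketch (not part of the original source): a separator
    # containing a character with a special meaning in regular expressions
    # is rejected:
    #
    #   >>> Separator().check_separator('~')  # raises ValueError
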
    def __str__(self):
        """Returns a string representation of a separator

        Examples
        --------
        >>> sep = Separator(phone='_', syllable=None, word=' ')
        >>> print(sep)
        (phone: "_", word: " ")

        """
        return '({})'.format(', '.join(
            '{}: "{}"'.format(k, v)
            for k, v in self.iterate(type='pair') if v))

    def check_level(self, level):
        """Raises ValueError if `level` is not defined in the separator"""
        if level not in self.levels():
            raise ValueError(
                'level "{}" undefined, choose in: {}'.format(
                    level, ', '.join(self.levels())))

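    # Illustrative sketch (not part of the original source): with no syllable
    # separator defined, only 'phone' and 'word' are accepted levels:
    #
    #   >>> s = Separator(phone=' ', syllable=None, word=';eword')
    #   >>> s.check_level('word')  # no error
    #   >>> s.check_level('syllable')  # raises ValueError
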
    def strip(self, utterance, level=None):
        """Removes leading and trailing separators of an `utterance`

        Parameters
        ----------
        utterance : str
            The utterance to be stripped.
        level : str, optional
            Specify the level boundaries to strip. If not specified
            remove all the boundaries. If specified, must be 'phone',
            'syllable' or 'word'.

        Returns
        -------
        The stripped `utterance`

        """
        # order matters: remove word separators, then syllables and
        # finally phones
        to_remove = ['word', 'syllable', 'phone']
        if level:
            self.check_level(level)
            to_remove = [level]

        # build a regular expression for separator suppression,
        # considering also spaces within contiguous separators
        pattern = (
            r'((' + r'|'.join(
                '({})'.format(self.__dict__[l])
                for l in to_remove if self.__dict__[l]) + r')+\s*)+')

        # remove leading separators
        utterance = re.sub(r'^' + pattern, '', utterance)

        # remove trailing ones
        utterance = re.sub(pattern + r'$', '', utterance)

        return utterance.strip()

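    # Illustrative sketch (not part of the original source) with the default
    # separators; only leading and trailing boundaries are removed, inner
    # ones are preserved:
    #
    #   >>> sep = Separator()
    #   >>> sep.strip('j uː ;esyll ;eword')
    #   'j uː'
    #   >>> sep.strip('j uː ;esyll ;eword', level='word')
    #   'j uː ;esyll'
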
    def tokenize(self, utterance, level=None, keep_boundaries=True):
        """Return the tokens in `utterance` at the given `level`

        Iterates on phones, syllables or words within a given utterance,
        other levels being ignored.

        Parameters
        ----------
        utterance : str
            The utterance to be tokenized.
        level : str, optional
            The level to tokenize the utterance at, must be 'phone',
            'syllable' or 'word'. If not specified, tokenize at all the
            defined levels and return a nested list.
        keep_boundaries : bool, optional
            When True (default) preserve the sublevel token boundaries
            in the output. When False all token boundaries are removed.

        Returns
        -------
        token : list of (list of (list of)) str
            The successive phones, syllables or words tokenized from
            the utterance. From outer to inner levels in the returned
            nested list are words, syllables and phones. Empty tokens
            are ignored, tokens are stripped.

        Raises
        ------
        ValueError
            If the `level` is not 'phone', 'syllable' or 'word'.

        Examples
        --------
        >>> from wordseg.separator import Separator
        >>> s = Separator(phone=' ', syllable=None, word=';eword')
        >>> t = 'j uː ;eword n oʊ ;eword dʒ ʌ s t ;eword'
        >>> list(s.tokenize(t, level='word'))
        ['j uː', 'n oʊ', 'dʒ ʌ s t']
        >>> list(s.tokenize(t, level='word', keep_boundaries=False))
        ['juː', 'noʊ', 'dʒʌst']
        >>> list(s.tokenize(t, level='phone'))
        ['j', 'uː', 'n', 'oʊ', 'dʒ', 'ʌ', 's', 't']
        >>> list(s.tokenize(t))
        [['j', 'uː'], ['n', 'oʊ'], ['dʒ', 'ʌ', 's', 't']]

        """
        if level:
            self.check_level(level)

        # auxiliary function tokenizing at a given level
        def _tokenize(utterance, level):
            if not self._regexp[level]:
                return [utterance]
            return [
                self.strip(token, level)
                for token in re.split(self._regexp[level], utterance)
                if token]

        if level is None:
            # fully tokenize the utterance as a nested list. Whatever
            # the separator we have here a 3-levels list
            tokens = [
                [_tokenize(s, 'phone') for s in _tokenize(w, 'syllable')]
                for w in _tokenize(utterance, 'word')]

            # remove the undefined levels from the list
            if not self.phone:
                tokens = [[tt[0] for tt in t] for t in tokens]
            if not self.syllable:
                tokens = [t[0] for t in tokens]
            if not self.word:
                tokens = tokens[0]

            return tokens

        # word tokens
        if self.word:
            tokens = _tokenize(utterance, 'word')
        else:
            tokens = [utterance]

        # syllable tokens
        if level in ('phone', 'syllable') and self.syllable:
            tokens = itertools.chain(
                syll for word in tokens
                for syll in _tokenize(word, 'syllable'))

        # phone tokens
        if level == 'phone' and self.phone:
            tokens = itertools.chain(
                phn for syll in tokens
                for phn in _tokenize(syll, 'phone'))

        # strip the tokens
        tokens = (self.strip(t) for t in tokens)

        # delete intermediate token boundaries when asked
        if not keep_boundaries:
            tokens = (self.remove(t) for t in tokens)

        return [t for t in tokens if t]

    def split(self, utterance, level, keep_boundaries=False):
        """Split the `utterance` at a given token `level`

        This method is sensitive to whether the `utterance` is stripped
        or not. It may output empty tokens.

        Parameters
        ----------
        utterance : str
            The string to split in tokens.
        level : str
            Token level to split the string with. Must be 'phone',
            'syllable' or 'word'.
        keep_boundaries : bool, optional
            If False (default), remove all the separators for all
            levels from the returned sub-utterances.

        Returns
        -------
        tokens : generator
            The tokens extracted from `utterance`, may include empty
            tokens.

        Raises
        ------
        ValueError
            If the `level` is not 'phone', 'syllable' or 'word'.

        See Also
        --------
        tokenize : a higher-level method to split an utterance

        """
        self.check_level(level)

        sep = self._regexp[level]
        tokens = re.split(sep, utterance)

        if keep_boundaries:
            tokens = (re.sub(' +', ' ', u) for u in tokens)
        else:
            tokens = (self.remove(u) for u in tokens)

        # remove any leading ' '
        tokens = (t.lstrip(' ') for t in tokens)

        return tokens

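    # Illustrative sketch (not part of the original source): contrary to
    # tokenize(), empty tokens are preserved, so split() exposes trailing
    # boundaries:
    #
    #   >>> sep = Separator(phone=' ', syllable=None, word=';eword')
    #   >>> list(sep.split('j uː ;eword n oʊ ;eword', level='word'))
    #   ['juː', 'noʊ', '']
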
    def remove(self, utterance, level=None):
        """Returns the `utterance` with separators removed

        Parameters
        ----------
        utterance : str
            The string to remove the separators from
        level : str, optional
            If specified (must be 'phone', 'syllable' or 'word'),
            remove only the separators of the given `level`. Else
            remove all the separators.

        Returns
        -------
        The utterance with specified separators removed. Multiple
        spaces are removed as well.

        Raises
        ------
        ValueError
            If the `level` is specified and is not 'phone', 'syllable'
            or 'word'.

        """
        if level:
            self.check_level(level)

        to_remove = {'phone', 'syllable', 'word'}
        if level:
            to_remove = {level}

        if self.word and 'word' in to_remove:
            utterance = re.sub(self._regexp['word'], '', utterance)

        if self.syllable and 'syllable' in to_remove:
            utterance = re.sub(self._regexp['syllable'], '', utterance)

        if self.phone and 'phone' in to_remove:
            utterance = re.sub(self._regexp['phone'], '', utterance)

        return re.sub(' +', ' ', utterance)

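    # Illustrative sketch (not part of the original source) with the default
    # separators; remove every separator or only those of a given level:
    #
    #   >>> sep = Separator()
    #   >>> sep.remove('j uː ;esyll ;eword n oʊ')
    #   'juːnoʊ'
    #   >>> sep.remove('j uː ;esyll ;eword n oʊ', level='word')
    #   'j uː ;esyll n oʊ'
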
    def iterate(self, type='value'):
        """Yields on phone, syllable and word tokens, in that order

        Parameters
        ----------
        type : str, optional
            Type of separator representation to return, must be
            'value' or 'pair'.

        Yields
        ------
        token : str or tuple
            In the form **token_value** if `type` is 'value'. In the
            form **(token_name, token_value)** if `type` is 'pair'.

        Raises
        ------
        ValueError
            If the `type` is not 'value' or 'pair'.

        """
        if type == 'value':
            yield self.phone
            yield self.syllable
            yield self.word
        elif type == 'pair':
            yield ('phone', self.phone)
            yield ('syllable', self.syllable)
            yield ('word', self.word)
        else:
            raise ValueError(
                'iteration type must be "value" or "pair", it is "{}"'
                .format(type))

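    # Illustrative sketch (not part of the original source): iterate on the
    # separator values or on (name, value) pairs:
    #
    #   >>> s = Separator(phone='p', syllable=None, word='w')
    #   >>> list(s.iterate())
    #   ['p', None, 'w']
    #   >>> list(s.iterate(type='pair'))
    #   [('phone', 'p'), ('syllable', None), ('word', 'w')]
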
    def levels(self):
        """The list of defined token levels from inner to outer"""
        # curiously the levels order and the alphabetical order are the
        # same (phone < syllable < word)
        return sorted([k for k, v in self.iterate(type='pair') if v])

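    # Illustrative sketch (not part of the original source): only the
    # defined levels are reported:
    #
    #   >>> Separator(phone='p', syllable=None, word='w').levels()
    #   ['phone', 'word']
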
    def upper_levels(self, level):
        """Lists the defined levels higher than the given one

        Parameters
        ----------
        level : str
            Must be 'phone', 'syllable' or 'word'.

        Raises
        ------
        ValueError
            When `level` is not defined in the separator.

        Examples
        --------
        >>> from wordseg.separator import Separator
        >>> s = Separator(phone='p', syllable='s', word='w')
        >>> s.upper_levels('phone')
        ['syllable', 'word']
        >>> s.upper_levels('word')
        []
        >>> s = Separator(phone='p', syllable=None, word='w')
        >>> s.upper_levels('phone')
        ['word']

        """
        # ensure the required level exists
        self.check_level(level)

        # extract the upper levels
        index = self.levels().index(level)
        return self.levels()[index + 1:]