Source code for wordseg.syllabification

"""Estimates syllable boundaries on a text using the maximal onset principle.

This algorithm fully syllabifies a text from a list of onsets and
vowels. Input text must be in orthographic form (with word separators
only) or in phonemized form (with both word and phone
separators). Output text has syllable separators added at estimated
syllable boundaries. For examples of vowels and onsets files, see the
directory `wordseg/data/syllabification`.

"""

# Created by Lawrence Phillips & Lisa Pearl (2013), adapted by Alex
# Cristia (2015), converted from Perl to Python and integrated into
# wordseg by Mathieu Bernard (2017). Credit is owed mainly to the
# original authors.


import codecs
import os
import re
import six

from wordseg import utils
from wordseg.separator import Separator


class Syllabifier(object):
    """Syllabify a text given in phonological or orthographic form

    Syllabification errors can occur when the onsets and/or vowels are
    not adapted to the input text (see the `tolerant` parameter of the
    `syllabify` method).

    Parameters
    ----------
    onsets : list
        The list of valid onsets in the `text`
    vowels : list
        The list of vowels in the `text`
    separator : Separator, optional
        Token separation in the `text`. When None (the default), a
        `Separator()` built with its default arguments is used.
    filling_vowel : bool, optional
        When True, append a silent vowel to the end of words without
        vowel (the vowel is removed after processing so the text is
        unchanged). When False those words cannot be syllabified.
    log : logging.Logger, optional
        Where to send log messages. When None (the default),
        `utils.null_logger()` is used.

    Raises
    ------
    ValueError
        If `onsets` or `vowels` are empty or are not lists.

    Notes
    -----
    The `onsets` and `vowels` lists are copied, so the instance never
    modifies the lists given by the caller (the silent vowel, if any,
    is appended to the private copy only).

    """
    def __init__(self, onsets, vowels, separator=None,
                 filling_vowel=False, log=None):
        # validate first so a failed construction stores nothing
        if not isinstance(vowels, list) or not vowels:
            raise ValueError('unvalid or empty vowels list')
        if not isinstance(onsets, list) or not onsets:
            raise ValueError('unvalid or empty onsets list')

        # private copies: the silent vowel appended below must not
        # leak into the caller's list
        self.onsets = list(onsets)
        self.vowels = list(vowels)

        # the defaults are resolved here rather than in the signature
        # so they are not evaluated once and for all at import time
        self.separator = Separator() if separator is None else separator
        self.log = utils.null_logger() if log is None else log

        # set of all the chars found in onsets and vowels (used to
        # detect any unknown char during syllabification)
        self.symbols = set(''.join(self.vowels)) | set(''.join(self.onsets))

        self.silent = None
        if filling_vowel:
            # the silent vowel is the first code point (starting at 1)
            # which is not already used by an onset or a vowel
            code = 1
            while six.unichr(code) in self.symbols:
                code += 1
            self.silent = six.unichr(code)
            self.symbols.add(self.silent)
            self.vowels.append(self.silent)

    def syllabify(self, text, strip=False, tolerant=False):
        """Returns the text with syllable boundaries added

        Parameters
        ----------
        text : sequence
            The input text to be syllabified. Each element of the sequence
            is assumed to be a single and complete utterance in valid
            phonological form.
        strip : bool, optional
            When True, removes the syllable boundary at the end of words.
        tolerant : bool, optional
            When False (the default), the function raises a ValueError on
            the first utterance that has not been correctly
            syllabified. When True, ignore the failed utterances in output
            but issue a log warning instead.

        Returns
        -------
        list
            The utterances with estimated syllable boundaries added. If
            `tolerant` is True some utterances may be missing in the
            output.

        Raises
        ------
        ValueError
            If `separator.syllable` is found in the text or, when
            `tolerant` is False, if an utterance has not been
            correctly syllabified.

        """
        syllabified_text = []
        nerrors = 0

        # utterances are processed one by one, numbered from 1 in
        # error messages
        for n, utt in enumerate(text, start=1):
            utt = utt.strip()

            # the input must be free of syllable separators, this one
            # is an error even in tolerant mode
            if self.separator.syllable in utt:
                raise ValueError(
                    'syllable separator "{}" found in text (line {}): {}'
                    .format(self.separator.syllable, n, utt))

            # if we have phone separators, remove them and store the
            # length of each phone to restore them afterwards
            utt, index = self._remove_phone_separators(utt)

            try:
                syllables = self._syllabify_utterance(utt, strip=strip)
            except RuntimeError as err:
                error = 'line {}, {}'.format(n, err)
                if not tolerant:
                    raise ValueError(error)

                # tolerant mode: warn and drop that utterance
                self.log.warning(error)
                nerrors += 1
                continue

            # restore the phone separators as they were before
            syllabified_text.append(
                self._restore_phone_separators(syllables, index, strip))

        if tolerant and nerrors > 0:
            self.log.error(
                'syllabification failed for {} utterances'.format(nerrors))

        return syllabified_text

    @staticmethod
    def open_datafile(data_file):
        """Read a vowel or onsets file as a list

        The file is read as UTF-8. Each line is stripped and empty
        lines are ignored.

        """
        # the context manager closes the file, even on decoding error
        with codecs.open(data_file, 'r', encoding='utf8') as stream:
            data = stream.readlines()

        # ignore empty lines in data file
        return [line for line in (raw.strip() for raw in data) if line]

    def _syllabify_utterance(self, utterance, strip=False):
        """Syllabify a single utterance

        Auxiliary function to syllabify().

        Raises
        ------
        RuntimeError
            If the syllabification of a word failed, the message is
            prefixed by the index of that word (starting at 1).

        """
        # split the utterance into words
        words = self.separator.tokenize(
            utterance, level='word', keep_boundaries=False)

        # estimate syllable boundaries word per word, reading them
        # from end to start
        output = ''
        for n, word in enumerate(words[::-1]):
            try:
                output_word = self._syllabify_word(word, strip)
            except RuntimeError as err:
                # forward the exception with the word index added
                # (words are read backward, hence the subtraction)
                raise RuntimeError(
                    'word {}: {}'.format(len(words) - n, err))

            # concatenate the syllabified word to the output, do not
            # append a word separator at the end if stripped
            if strip and not self.separator.remove(output):
                output = output_word
            else:
                output = output_word + self.separator.word + output

        return output

    def _syllabify_word(self, word, strip):
        """Return a single word with syllable boundaries added

        Auxiliary function to _syllabify_utterance().

        Raises
        ------
        RuntimeError
            If the word has no vowel (and no silent vowel is in use),
            contains an unknown symbol (not present in vowels or
            onsets) or if the syllabification failed.

        """
        # ensure all the chars in word are defined in vowels or onsets
        unknown = self._unknown_char(word)
        if unknown:
            raise RuntimeError(
                'unknown symbol "{}" in word "{}"'.format(unknown, word))

        # ensure the word contains at least a vowel, add the silent
        # one if allowed
        if not self._has_vowels(word):
            if not self.silent:
                raise RuntimeError(
                    'no vowel in word "{}"'.format(word))
            word += self.silent

        input_word = word
        output_word = ''
        syllable = ''

        # read the chars of the word from end to start
        remaining = word
        while remaining:
            char, remaining = remaining[-1], remaining[:-1]

            # the char belongs to the current syllable, whether it is
            # a vowel or part of a coda
            syllable = char + syllable
            if char not in self.vowels:
                continue

            # a vowel closes the syllable once its onset is attached
            remaining, syllable = self._build_onset(remaining, syllable)
            if strip and not output_word:
                output_word = syllable
            else:
                output_word = (
                    syllable + self.separator.syllable + output_word)
            syllable = ''

        # leading chars that form no valid onset are in no syllable,
        # in that case the output differs from the input
        if input_word != self.separator.remove(output_word, 'syllable'):
            raise RuntimeError(
                'onset not found in "{}"'.format(input_word))

        if self.silent:
            # plain replacement: the silent vowel is a literal char,
            # not a regular expression
            return output_word.replace(self.silent, '')
        return output_word

    def _build_onset(self, word, syllable):
        """Prepend to `syllable` the longest valid onset ending `word`

        Returns the pair (word, syllable) where the chars moved to the
        onset have been removed from the end of `word`. Both are
        returned unchanged if `word` is empty or ends with a vowel.

        """
        if not word or word[-1] in self.vowels:
            return word, syllable

        # take one char at a time (from the end) as long as the
        # concatenation is a valid onset
        onset = ''
        while word and word[-1] + onset in self.onsets:
            onset = word[-1] + onset
            word = word[:-1]

        return word, onset + syllable

    def _remove_phone_separators(self, utt):
        """Remove the phone separators from `utt`

        Returns the pair (utterance, index). The index is a list of
        lists giving, for each word having phones, the length of each
        of its phones. It is empty when there is no phone separator
        defined or none is found in the utterance.

        """
        # the separator is searched as a literal string, as done for
        # the syllable separator in syllabify()
        phone = self.separator.phone
        if not phone or phone not in utt:
            return utt, []

        # split the utterance in words and index the phone lengths
        index = []
        for word in self.separator.split(utt, 'word', keep_boundaries=True):
            phones = self.separator.split(word, 'phone', keep_boundaries=False)
            lengths = [len(p) for p in phones if p]
            if lengths:
                index.append(lengths)

        return self.separator.remove(utt, level='phone'), index

    def _restore_phone_separators(self, utt, index, strip):
        """Put back the phone separators in a syllabified utterance

        `index` is the one returned by _remove_phone_separators(), an
        empty index means there is nothing to restore.

        """
        if not index:
            return utt

        # restore the utterance word per word (index[i]) and within
        # words, phone per phone (index[i][j]).
        # NOTE(review): `i` counts every piece returned by split()
        # whereas the index skips the words without phones, this
        # assumes both stay aligned -- confirm against Separator.split
        restored = ''
        for i, word in enumerate(
                self.separator.split(utt, 'word', keep_boundaries=True)):
            if len(word) == 0 and strip is False:
                # coherent behavior for non stripped texts
                restored += self.separator.word
            else:
                j = 0  # position of the current phone in the word
                for syllable in self.separator.split(
                        word, 'syllable', keep_boundaries=True):
                    # cut the syllable in phones from their lengths
                    # and append a phone separator to each of them
                    k = 0
                    while k < len(syllable):
                        restored += (
                            syllable[k:k+index[i][j]] + self.separator.phone)
                        k += index[i][j]
                        j += 1

                    # end of the syllable, append a separator (when
                    # stripped the last phone separator is dropped)
                    if strip:
                        restored = restored[:-len(self.separator.phone)]
                    restored += self.separator.syllable

                # end of the word, remove the last syllable boundary
                # and append a word separator
                restored = (restored[:-len(self.separator.syllable)] +
                            self.separator.word)

        # remove the last word boundary of the utterance
        return restored[:-len(self.separator.word)]

    def _unknown_char(self, word):
        """Returns the first unknown char if any is found, else False"""
        return next(
            (char for char in word if char not in self.symbols), False)

    def _has_vowels(self, word):
        """True if the `word` contains any vowel, False otherwise"""
        return any(vowel in word for vowel in self.vowels)


def _add_arguments(parser):
    """Declare the arguments specific to wordseg-syll on `parser`

    Two positional arguments are declared first (the onsets file then
    the vowels file), followed by three boolean flags: -S/--strip,
    -t/--tolerant and -f/--filling-vowel.

    """
    # positional arguments, their order is the command line order
    positionals = (
        ('onsets_file', '<onsets-file>',
         'a file containing the list of valid onsets for the '
         'input text, one onset per line'),
        ('vowels_file', '<vowels-file>',
         'a file containing the list of vowels for '
         'the input text, one vowel per line'),
    )
    for dest, metavar, description in positionals:
        parser.add_argument(
            dest, type=str, metavar=metavar, help=description)

    # optional flags, all of them are False unless given
    flags = (
        ('-S', '--strip',
         'removes the end separators in syllabified output'),
        ('-t', '--tolerant',
         'tolerate syllabification failures and report them as warnings, '
         'default is to fail at the first error'),
        ('-f', '--filling-vowel',
         'add a silent filling vowel to groups of consonants '
         'with no vowel, by default words with no vowel cannot be '
         'syllabified'),
    )
    for short_name, long_name, description in flags:
        parser.add_argument(
            short_name, long_name, action='store_true', help=description)


@utils.CatchExceptions
def main():
    """Entry point of the 'wordseg-syll' command

    Loads the onsets and vowels files given on the command line,
    syllabifies the input stream utterance per utterance and writes
    the result to the output stream, one utterance per line.

    Raises
    ------
    RuntimeError
        If the onsets file or the vowels file does not exist.

    """
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-syll',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=_add_arguments)

    # loads the onsets
    if not os.path.isfile(args.onsets_file):
        raise RuntimeError(
            'unknown onsets file "{}"'.format(args.onsets_file))
    onsets = Syllabifier.open_datafile(args.onsets_file)

    # loads the vowels
    if not os.path.isfile(args.vowels_file):
        raise RuntimeError(
            'unknown vowels file "{}"'.format(args.vowels_file))
    vowels = Syllabifier.open_datafile(args.vowels_file)

    log.info('loaded %s onsets', len(onsets))
    log.debug('onsets are "%s"', ', '.join(onsets))
    log.info('loaded %s vowels', len(vowels))
    log.debug('vowels are "%s"', ', '.join(vowels))
    log.debug('separator is %s', separator)

    syllabifier = Syllabifier(
        onsets, vowels, separator=separator,
        filling_vowel=args.filling_vowel, log=log)

    # syllabify the input text
    sylls = utils.CountingIterator(syllabifier.syllabify(
        streamin, strip=args.strip, tolerant=args.tolerant))

    # consume the iterator BEFORE reading its counter.
    # NOTE(review): CountingIterator is defined in wordseg.utils (not
    # visible here); presumably `count` is only updated while the
    # object is iterated, in which case logging it before the join
    # would always report 0 utterances -- confirm.
    output = '\n'.join(sylls) + '\n'

    # display the output
    log.info('syllabified %s utterances', sylls.count)
    streamout.write(output)


# run the 'wordseg-syll' command only when this file is executed as a
# script, not when it is imported as a module
if __name__ == '__main__':
    main()