# Source code for wordseg.prepare

"""Prepare an input text for word segmentation

* The input text must be in a phonologized form (a sequence of phone,
  syllable or word tokens as specified by the token separators).

* The input text is checked for errors in formatting (presence of
  punctuation, missing separators, etc...).

* The output text contains space separated phones (or syllables
  according to the *unit* option).

* The program fails on the first encountered error, or ignores the
  badly formatted utterances if the *tolerant* option is used.

"""

import six
import string
import re

from wordseg import utils
from wordseg.separator import Separator


# string.punctuation contains regex metacharacters (']', '\\', '^', '-'),
# so it is escaped before being wrapped in a character class.
punctuation_re = re.compile(
    '[%s]' % (re.escape(string.punctuation),))
"""Compiled pattern matching any single ASCII punctuation character"""


def _pairwise(l):
    """Yields pairwise (consecutive) elements of an iterable

    Each element is paired with its direct successor. Nothing is
    yielded when `l` has fewer than two elements.

    Parameters
    ----------
    l : iterable
        The elements to pair. Any iterable is accepted (list, string,
        generator, ...) and it is consumed in a single pass, without
        being copied.

    Yields
    ------
    tuple
        The successive pairs (first, second), (second, third), ...

    Examples
    --------
    >>> list(_pairwise([1, 2, 3]))
    [(1, 2), (2, 3)]

    """
    iterator = iter(l)
    try:
        previous = next(iterator)
    except StopIteration:
        # empty input: there is nothing to pair
        return

    for current in iterator:
        yield previous, current
        previous = current


def check_utterance(utterance, separator=Separator(), check_punctuation=True):
    """Ensures an utterance is in a valid phonological form

    The checks are done in the order listed in the Raises section
    below, the function raises on the first one that fails.

    Parameters
    ----------
    utterance : str
        The utterance to be checked
    separator : Separator, optional
        The token separators used in the `utterance`
    check_punctuation : bool, optional
        When True (default), forbid any punctuation character in the
        utterance and raise ValueError if any punctuation is
        found. When False, do not check punctuation.

    Returns
    -------
    bool
        True if no error detected, raises otherwise

    Raises
    ------
    ValueError
        If one of the following errors is detected:

        * `utterance` is not a string
        * `utterance` is empty, or is empty once stripped by
          `separator.strip`
        * `utterance` contains any punctuation character (once the
          separators are removed), only if `check_punctuation` is
          True
        * `utterance` begins with a separator
        * `utterance` does not end with a word separator
        * `utterance` contains syllable tokens but a word does not end
          with a syllable separator

    """
    # utterance must be a string (or unicode for python2)
    if not isinstance(utterance, six.string_types):
        raise ValueError(
            'utterance is not a string ({}): {}'.format(
                type(utterance), utterance))

    # utterance must not be empty, neither before nor after being
    # stripped by the separator
    if not utterance or len(separator.strip(utterance)) == 0:
        raise ValueError('utterance is an empty string')

    # search any punctuation in utterance (take care to remove token
    # separators first, as they may themselves contain punctuation)
    if check_punctuation is True:
        if punctuation_re.search(separator.remove(utterance)):
            raise ValueError('punctuation found in utterance')

    # utterance must not begin with a separator (undefined or empty
    # separators are ignored)
    for sep in separator.iterate():
        if sep and utterance.startswith(sep):
            raise ValueError(
                'utterance begins with a separator: "{}"'.format(utterance))

    # utterance must end with a word separator
    if not utterance.endswith(separator.word):
        raise ValueError(
            'utterance does not end with a word separator: "{}"'
            .format(utterance))

    # when the utterance has syllable tokens, each word separator must
    # be directly preceded by a syllable separator. The utterance is
    # tokenized on the phone separator, so a separator is only seen
    # here when it stands as a token on its own.
    if separator.syllable and separator.syllable in utterance:
        tokens = utterance.split(separator.phone)
        if any(after == separator.word and before != separator.syllable
               for before, after in _pairwise(tokens)):
            raise ValueError(
                'a word does not end with a syllable separator: "{}"'
                .format(utterance))

    return True


def prepare(text, separator=Separator(), unit='phone',
            check_punctuation=True, tolerant=False,
            log=utils.null_logger()):
    """Prepares a text in phonological form for word segmentation

    The returned text is ready to be segmented. It consists of a
    sequence of phonological symbols (can be phones or syllables
    depending on `unit`) separated by spaces.

    The function removes the word separators from all the lines in
    `text`. If `unit` is 'phone' the syllable separators are removed
    as well. If `unit` is 'syllable' the spaces are discarded and the
    syllable separators are replaced by a space.

    This function is a generator: nothing is checked nor prepared
    (including the validity of `unit`) before the result is iterated.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`
    unit : str, optional
        The unit representation level to prepare the `text` at, must
        be 'syllable' or 'phone'.
    check_punctuation : bool, optional
        When True (default), forbid any punctuation character in the
        utterance and raise ValueError if any punctuation is
        found. When False, do not check punctuation.
    tolerant : bool, optional
        If False, raise ValueError on the first format error detected
        in the `text`. If True, the badly formatted utterances are
        filtered out from the output and a warning is issued.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    prepared_text : generator
        Utterances from the `text` with separators removed, prepared
        for segmentation at a syllable or phoneme representation level
        (separated by space). Empty lines are ignored.

    Raises
    ------
    ValueError
        If `unit` is not 'phone' or 'syllable'. On the first format
        error encountered in `text` (see the prepare.check_utterance
        function), only if `tolerant` is False.

    """
    # raise an error if unit is not valid
    if unit not in ('phone', 'syllable'):
        raise ValueError(
            "unit must be 'phone' or 'syllable', it is '{}'".format(unit))

    # define the function that prepare the text (removing requested
    # separators)
    if unit == 'phone':
        def func(line):
            # the syllable separator may be undefined (None) when the
            # text has no syllable level: there is nothing to remove
            return line.replace(separator.syllable or '', '')\
                       .replace(separator.word, '')
    else:  # syllable
        def func(line):
            # NOTE(review): the phones are assumed to be separated by
            # spaces here, whatever the value of separator.phone
            return line.replace(separator.word, '')\
                       .replace(' ', '')\
                       .replace(separator.syllable, ' ')

    # number of lines dropped from the output (empty lines and, in
    # tolerant mode, badly formatted utterances)
    nremoved = 0
    for index, line in enumerate(text, start=1):
        try:  # force the utf8 encoding
            line = line.encode('utf8').decode().strip()
        except ValueError:  # line is already in bytes, not str
            line = line.strip()

        # ignore empty lines
        if line == '':
            log.debug('ignoring empty line %d', index)
            nremoved += 1
            continue

        try:
            check_utterance(
                line, separator, check_punctuation=check_punctuation)
            prepared = utils.strip(func(line))
        except ValueError as err:
            if tolerant is True:
                log.info('removing line %d: "%s"', index, line)
                nremoved += 1
            else:
                # prefix the error with the index of the faulty line
                raise ValueError('line {}: {}'.format(index, err))
        else:
            yield prepared

    if nremoved > 0:
        log.warning('removed %d badly formatted utterances', nremoved)


def gold(text, separator=Separator()):
    """Returns a gold text from a phonologized one

    The returned gold text is the ground-truth segmentation. It has
    phone and syllable separators removed and word separators replaced
    by a single space ' '. It is used to evaluate the output of
    segmentation algorithms.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`. Undefined (None or empty)
        phone and syllable separators are ignored.

    Returns
    -------
    gold_text : generator
        Gold utterances with separators removed and words separated by
        spaces. The returned text is the gold version, against which
        the algorithms are evaluated. The utterances that are empty
        once processed are not part of the output.

    """
    def to_gold(line):
        # delete phone and syllable separators (when defined), replace
        # word boundaries by a space, then clean the result with
        # utils.strip
        return utils.strip(
            line.replace(separator.syllable or '', '')
            .replace(separator.phone or '', '')
            .replace(separator.word, ' '))

    # as for prepare, we ignore empty lines
    return (utterance
            for utterance in (to_gold(line) for line in text)
            if utterance)


@utils.CatchExceptions
def main():
    """Entry point of the 'wordseg-prep' command

    Reads the input text, writes the prepared text to the output
    stream and, if requested with the --gold option, writes the gold
    text to the specified file.

    """
    # add the command-specific arguments
    def add_arguments(parser):
        parser.add_argument(
            '-u', '--unit', type=str,
            choices=['phone', 'syllable'], default='phone', help='''
            output level representation, must be "phone" or "syllable"''')

        parser.add_argument(
            '-t', '--tolerant', action='store_true',
            help='''tolerate the badly formatted utterances in input,
            but ignore them in output (default is to exit on the first
            encountered error)''')

        parser.add_argument(
            '-P', '--punctuation', action='store_true',
            help='punctuation characters are not considered illegal')

        # the --gold option goes in the existing group of input/output
        # arguments of the parser
        group = [g for g in parser._action_groups
                 if g.title == 'input/output arguments'][0]
        group.add_argument(
            '-g', '--gold', type=str, metavar='<gold-file>',
            help='''generates the gold text to the specified file,
            do not generate gold if no file specified''')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-prep',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=add_arguments)

    # the input is iterated twice (prepared text and then gold text),
    # so it is loaded as a list
    streamin = list(streamin)

    log.debug('separator is %s', separator)
    log.info('preparing the text at %s level', args.unit)

    # check all the utterances are correctly formatted.
    prep = utils.CountingIterator(prepare(
        streamin, separator, unit=args.unit, log=log,
        check_punctuation=not args.punctuation, tolerant=args.tolerant))

    # write prepared text, one utterance a line, ending with a newline
    streamout.write('\n'.join(prep) + '\n')
    log.info('prepared %s utterances', prep.count)

    if args.gold:
        log.info('generating gold text to %s', args.gold)
        gold_text = gold(streamin, separator=separator)
        # the context manager ensures the gold file is flushed and
        # closed, even if the write fails
        with open(args.gold, 'w') as gold_file:
            gold_file.write('\n'.join(gold_text) + '\n')


if __name__ == '__main__':
    main()