Source code for wordseg.algos.baseline

"""Baseline algorithm for word segmentation

This algorithm randomly adds word boundaries after the input tokens
with a given probability.

"""

import codecs
import os
import random

from wordseg import utils
from wordseg.separator import Separator


def segment(text, probability=0.5, log=utils.null_logger()):
    """Random word segmentation given a boundary probability

    Given a probability :math:`p`, the probability :math:`P(t_i)` to
    add a word boundary after each token :math:`t_i` is:

    .. math::

        P(t_i) = P(X < p), X \\sim \\mathcal{U}(0, 1).

    The arguments are validated as soon as the function is called;
    only the segmentation itself is lazy.

    Parameters
    ----------
    text : sequence
        The input utterances to segment, tokens are assumed to be
        space separated.
    probability: float, optional
        The probability to append a word boundary after each token.
    log : logging.Logger
        Where to send log messages

    Returns
    -------
    segmented_text : generator
        The randomly segmented utterances.

    Raises
    ------
    ValueError
        If the probability is not a float in [0, 1].

    """
    # Validate here rather than inside the generator: a generator body
    # does not run until the first item is requested, which would
    # postpone the error far away from the faulty call.
    if not isinstance(probability, float):
        raise ValueError('probability must be a float')

    # The chained comparison also rejects NaN, which is not in [0, 1].
    if not 0 <= probability <= 1:
        raise ValueError(
            'probability must be in [0, 1], it is {}'.format(probability))

    log.info('P(word boundary) = %s', probability)
    return _segment_utterances(text, probability)


def _segment_utterances(text, probability):
    """Lazily yield each utterance of `text` with random word boundaries"""
    for utt in text:
        # One random draw per token: on success a space (the word
        # boundary) is appended to the token, otherwise the token is
        # glued to the following one.
        yield ''.join(
            token + ' ' if random.random() < probability else token
            for token in utt.strip().split(' '))
def segment_oracle(text, oracle_text, oracle_separator=Separator(),
                   oracle_level='phone', log=utils.null_logger()):
    """Random oracle word segmentation

    The probability of word boundary :math:`p` is estimated from an
    `oracle` text as the ratio ``nwords / (nphones or nsyllables)``,
    according to ``oracle_level``. The segmentation is then delegated
    to ``segment(text, p)``.

    Parameters
    ----------
    text : sequence of str
        The input utterances to segment, tokens are assumed to be
        space separated.
    oracle_text : iterable of str
        The text on which to estimate the probability of word
        boundary. Must be tokenized at word and at least phone or
        syllable levels (according to ``oracle_level``).
    oracle_separator : Separator, optional
        Token separation in the oracle text.
    oracle_level : str, optional
        The level to consider when estimating :math:`p`, must be
        'phone' or 'syllable', default to 'phone'.
    log : logging.Logger
        Where to send log messages

    Returns
    -------
    segmented_text : generator
        The randomly segmented utterances, as returned by ``segment``.

    Raises
    ------
    ValueError
        If the oracle text contains no token at ``oracle_level``, or
        if the estimated probability is rejected by ``segment``.

    """
    # The oracle is traversed twice below. Materialize it so a one-shot
    # iterable (generator, open file) is not exhausted by the first
    # pass, which would silently give nwords == 0.
    oracle_text = list(oracle_text)

    # estimate the word probability boundary in the text
    nphones = sum(
        len(list(oracle_separator.tokenize(utt, level=oracle_level)))
        for utt in oracle_text)

    nwords = sum(
        len(list(oracle_separator.tokenize(utt, level='word')))
        for utt in oracle_text)

    log.info('nwords = %s, n%ss = %s', nwords, oracle_level, nphones)

    # An empty oracle would otherwise fail with a bare ZeroDivisionError.
    if nphones == 0:
        raise ValueError(
            'oracle text has no {} tokens, cannot estimate the word '
            'boundary probability'.format(oracle_level))

    if nwords == nphones:
        log.warning(
            "nwords==nphones. Is the oracle's token separation correct?")

    probability = float(nwords) / float(nphones)
    return segment(text, probability, log=log)
def _add_arguments(parser):
    """Add algorithm specific options to the parser"""
    parser.add_argument(
        '-r', '--random', type=int, default=None, metavar='<int>',
        help='the seed for initializing the random number generator, '
        'default is based on system time')

    # --probability and --oracle are two exclusive ways of providing
    # the word boundary probability
    group = parser.add_argument_group('probability of word boundary')
    group = group.add_mutually_exclusive_group()
    group.add_argument(
        '-P', '--probability', type=float, default=0.5, metavar='<float>',
        help='the probability to have a word boundary after a phone, '
        'default is %(default)s')

    group.add_argument(
        '-O', '--oracle', type=str, metavar='<file>',
        help='the word boundary probability is estimated on this oracle text. '
        'Must be tokenized at word and at least phone or syllable levels')

    group = parser.add_argument_group(
        'oracle tokens separation',
        'to be used with the --oracle option '
        'to estimate word boundary probability\nas the ratio '
        'nwords / (nphones or nsyllables).')

    # a default-constructed Separator provides the default value of
    # each separator option
    separator = Separator()

    group.add_argument(
        '-l', '--level', choices=['phone', 'syllable'], default='phone',
        help='level to consider when computing pwb, default is %(default)s')

    group.add_argument(
        '-p', '--phone-separator', metavar='<str>',
        default=separator.phone,
        help='phone separator in oracle, default is "%(default)s"')

    group.add_argument(
        '-s', '--syllable-separator', metavar='<str>',
        default=separator.syllable,
        help='syllable separator in oracle, default is "%(default)s"')

    group.add_argument(
        '-w', '--word-separator', metavar='<str>',
        default=separator.word,
        help='word separator in oracle, default is "%(default)s"')


@utils.CatchExceptions
def main():
    """Entry point of the 'wordseg-baseline' command"""
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-baseline',
        description=__doc__,
        add_arguments=_add_arguments)

    # setup the seed for random number generation. Compare to None
    # (the option's default) so that an explicit seed of 0 is honored.
    if args.random is not None:
        log.info('setup random seed to %s', args.random)
        random.seed(args.random)

    if args.oracle:
        # load the oracle text
        if not os.path.isfile(args.oracle):
            raise ValueError('oracle file not found: {}'.format(args.oracle))

        # read all the lines at once and close the file right after
        with codecs.open(args.oracle, 'r') as oracle_file:
            oracle_text = list(oracle_file)
        log.info('loaded %s utterances from oracle text', len(oracle_text))

        # init the oracle tokens separator
        oracle_separator = Separator(
            phone=args.phone_separator,
            syllable=args.syllable_separator,
            word=args.word_separator)

        segmented = segment_oracle(
            streamin, oracle_text, oracle_separator, args.level, log=log)
    else:
        segmented = segment(
            streamin, probability=args.probability, log=log)

    # one segmented utterance per line, with a final newline
    streamout.write('\n'.join(segmented) + '\n')


if __name__ == '__main__':
    main()