"""Bayesian word segmentation algorithm
See Goldwater, Griffiths, Johnson (2010) and Phillips & Pearl (2014).
1. Uses a hierarchical Pitman-Yor process rather than a hierarchical
Dirichlet process model. The HDP model can be recovered by setting
the PY parameters appropriately (set --a1 and --a2 to 0, --b1 and
--b2 then correspond to the HDP parameters).
2. Implements several different estimation procedures, including the
original Gibbs sampler (*flip sampler*) as well as a sentence-based
Gibbs sampler that uses dynamic programming (*tree sampler*) and a
similar dynamic programming algorithm that chooses the best
segmentation of each utterance rather than a sample. The latter
two algorithms can be run either in batch mode or in online mode.
If in online mode, they can also be set to "forget" parts of the
previous analysis. This is described in more detail below.
3. Functionality for using separate training and testing files. If
you provide an evaluation file, the program will first run through
its full training procedure (i.e., using whichever algorithm for
however many iterations, annealing, etc.). After that, it will
freeze the lexicon in whatever state it is in and then make a
single pass through the evaluation data, segmenting each sentence
according to the probabilities computed from the frozen lexicon.
No new words/counts will be added to the lexicon during evaluation.
Evaluation can be set to either sample segmentations or choose the
maximum probability segmentation for each utterance. Scores will
be printed out at the end of the complete run based on either the
evaluation data (if provided) or the training data (if not).
"""
import joblib
import logging
import os
import re
import shlex
import six
import subprocess
import tempfile
import threading
from wordseg import utils, folding
# # a list of dpseg options we don't want to expose in wordseg-dpseg
# ['--help', '--data-file', '--data-start-index', '--data-num-sents',
# '--eval-start-index', '--eval-num-sents', '--output-file']
DPSEG_ARGUMENTS = [
    utils.Argument(
        short_name='-d', name='--debug-level', type=int, default=0,
        help='debugging level'),
    utils.Argument(
        short_name='-e', name='--eval-file', type='file',
        help=('testing data file. If no eval-file is listed, evaluation will '
              'be on the training file. Note that listing the same file for '
              'both training and testing has different functionality than '
              'not listing a test file, due to the way that the test file '
              'is segmented.')),
    utils.Argument(
        name='--eval-maximize', type=bool,
        help=('choose max probability segmentation of test sentences, '
              'default is sample')),
    utils.Argument(
        name='--eval-interval', type=int,
        # fixed missing space: adjacent literals previously rendered as
        # "after alliterations"
        help=('how many iterations are run before the test set is evaluated, '
              '0 (default) means to only evaluate the test set after all '
              'iterations are complete')),
    utils.Argument(
        short_name='-E', name='--estimator',
        type=['viterbi', 'flip', 'tree', 'decayed-flip'],
        help=('Viterbi does dynamic programming maximization, '
              'Tree does dynamic programming sampling, '
              'Flip does original Gibbs sampler (default)')),
    utils.Argument(
        short_name='-D', name='--decay-rate', type=float,
        help='decay rate for decayed-flip estimator, default = 1.0'),
    utils.Argument(
        short_name='-S', name='--samples-per-utt', type=int,
        help='samples per utterance for decayed-flip estimator, '
        'default is 1000'),
    utils.Argument(
        short_name='-m', name='--mode', type=['batch', 'online'],
        help='default is batch'),
    utils.Argument(
        short_name='-n', name='--ngram', type=['unigram', 'bigram'],
        help='default is bigram'),
    utils.Argument(
        name='--do-mbdp', type=bool,
        help='maximize using Brent ngram instead of DP'),
    utils.Argument(
        name='--a1', type=float,
        help='unigram Pitman-Yor a parameter, default = 0.0'),
    utils.Argument(
        name='--b1', type=float,
        help='unigram Pitman-Yor b parameter, default = 1.0'),
    utils.Argument(
        name='--a2', type=float,
        help='bigram Pitman-Yor a parameter, default = 0.0'),
    utils.Argument(
        name='--b2', type=float,
        # fixed copy/paste typo: --b2 is the b parameter, not the a parameter
        help='bigram Pitman-Yor b parameter, default = 1.0'),
    utils.Argument(
        short_name='-p', name='--Pstop', type=float,
        help='monkey model stop probability, default = 0.5'),
    utils.Argument(
        short_name='-H', name='--hypersamp-ratio', type=float,
        help='standard deviation for new hyperparameters proposals, '
        'default = 0.1'),
    utils.Argument(
        name='--nchartypes', type=int,
        help=('number of characters assumed in P_0, '
              '0 will compute from input, default = 0')),
    utils.Argument(
        name='--aeos', type=float,
        help='beta prior on end of sentence prob, default is 2'),
    utils.Argument(
        short_name='-b', name='--init-pboundary', type=float,
        help='initial segmentation boundary probability (-1 = gold, '
        'default = 0)'),
    utils.Argument(
        name='--pya-beta-a', type=float,
        help='if non-zero, a parameter of Beta prior on pya, default = 1.0'),
    utils.Argument(
        name='--pya-beta-b', type=float, default=1.0,
        help='if non-zero, b parameter of Beta prior on pya, default = 1.0'),
    utils.Argument(
        name='--pya-gamma-s', type=float, default=10.0,
        help='if non-zero, parameter of Gamma prior on pyb, default = 10.0'),
    utils.Argument(
        name='--pya-gamma-c', type=float, default=0.1,
        help='if non-zero, parameter of Gamma prior on pyb, default = 0.1'),
    utils.Argument(
        name='--trace-every', type=int, default=0,
        help=('epochs between printing out trace information '
              '(0 = don\'t trace), default = 0')),
    utils.Argument(
        short_name='-s', name='--nsubjects', type=int,
        help='number of subjects to simulate, default = 1'),
    utils.Argument(
        short_name='-F', name='--forget-rate', type=int,
        help='number of utterances whose words can be remembered, '
        'default = 1'),
    utils.Argument(
        short_name='-i', name='--burnin-iterations', type=int,
        help=('number of burn-in epochs. This is actually the total '
              'number of iterations through the training data, '
              'default = 2000')),
    utils.Argument(
        name='--anneal-iterations', type=int,
        help=('number of epochs to anneal for. So e.g. burn-in = 100 '
              'and anneal = 90 would leave 10 iters at the end at '
              'the final annealing temp, default = 0')),
    utils.Argument(
        name='--anneal-start-temperature', type=float,
        help='start annealing at this temperature, default = 1'),
    utils.Argument(
        name='--anneal-stop-temperature', type=float,
        help='stop annealing at this temperature, default = 1'),
    utils.Argument(
        name='--anneal-a', type=float,
        # fixed "deafult" typo and the unbalanced parenthesis
        help=('parameter in annealing temperature sigmoid function '
              '(0 = use ACL06 schedule), default = 0')),
    utils.Argument(
        name='--anneal-b', type=float,
        help='parameter in annealing temperature sigmoid function, '
        'default = 0.2'),
    utils.Argument(
        name='--result-field-separator', type=str,
        help='Field separator used to print results, default is "\t"'),
    utils.Argument(
        name='--forget-method', type=['uniformly', 'proportional'],
        help='method for deleting lexical items, default is uniformly'),
    utils.Argument(
        short_name='-N', name='--token-memory', type=int,
        help='number of tokens that can be remembered, default = 0'),
    utils.Argument(
        short_name='-L', name='--type-memory', type=int,
        help='number of types that can be remembered, default = 0')
]
class UnicodeGenerator(object):
    """Generate successive unicode characters, skipping spaces

    Each call to an instance returns a new unicode character, starting
    at code point `start` and increasing from there. Characters
    matching the ``\\s`` regexp (whitespace) are never returned. Used
    to build a (unit -> char) mapping, because the actual dpseg
    implementation requires that all units (phones or syllables) are
    encoded as a single unicode char.

    Parameters
    ----------
    start : int
        Code point of the first unicode character to be generated

    Notes
    -----
    This class is a perl to python simplified transcription of the
    original script create-unicode-dict-flexible.pl

    Examples
    --------
    This shows a basic usage mapping a list of strings to unicode.

    >>> units = ['unit1', 'unit2', 'unit3']
    >>> unicode_gen = UnicodeGenerator()
    >>> unicode_mapping = {unit: unicode_gen() for unit in units}

    """
    def __init__(self, start=3001):
        # code point of the next candidate character
        self.index = start

    def __call__(self):
        """Returns the next non-whitespace unicode character"""
        while True:
            symbol = six.unichr(self.index)
            self.index += 1
            # whitespace characters are excluded from the mapping
            if not re.match(r'\s', symbol):
                return symbol
def _dpseg(text, args,
           log_level=logging.ERROR,
           log_name='wordseg-dpseg',
           binary=utils.get_binary('dpseg')):
    """Run the dpseg `binary` on a sequence of utterances

    The utterances in `text` are piped to the binary's stdin (one per
    line, utf8-encoded) and the segmentation is read back from a
    temporary output file given to the binary as ``--output-file``.

    Parameters
    ----------
    text : sequence of str
        Utterances to segment, one per element
    args : str
        Extra command line arguments forwarded to the dpseg binary
    log_level : int
        Level of the logger receiving the binary's stdout/stderr
    log_name : str
        Name of that logger
    binary : str
        Path to the dpseg executable

    Returns
    -------
    list of str
        The lines of the binary's output file

    Raises
    ------
    RuntimeError
        If the dpseg process exits with a non-zero return code

    """
    log = utils.get_logger(name=log_name, level=log_level)
    # the binary writes its result to this temporary file (passed as
    # --output-file); we read it back once the process has finished.
    # NOTE(review): reopening a NamedTemporaryFile by name while it is
    # still open works on POSIX but not on Windows -- confirm the
    # supported platforms.
    with tempfile.NamedTemporaryFile() as tmp_output:
        command = '{binary} --output-file {output} {args}'.format(
            binary=binary,
            output=tmp_output.name,
            args=args)

        log.debug('running "%s"', command)

        process = subprocess.Popen(
            shlex.split(command), stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # feed stdin from a separate thread so that the main thread can
        # drain stdout/stderr at the same time (avoids blocking on a
        # full pipe buffer)
        def writer():
            for utt in text:
                process.stdin.write((utt.strip() + '\n').encode('utf8'))
            process.stdin.close()

        thread = threading.Thread(target=writer)
        thread.start()

        # Send stdout and stderr to logger, break if EOF reached
        # NOTE(review): stdout and stderr are read in lockstep, one line
        # each per iteration; if one stream produces much more output
        # than the other this could block -- confirm dpseg's output
        # pattern makes this safe.
        while True:
            line_out = process.stdout.readline().decode('utf8')
            line_err = process.stderr.readline().decode('utf8')
            if line_out == "" and line_err == "":
                break
            if line_out != "":
                log.debug(line_out.strip())
            if line_err != "":
                log.debug(line_err.strip())

        thread.join()
        process.wait()

        if process.returncode:
            raise RuntimeError(
                'failed with error code {}'.format(process.returncode))

        # rewind and read back the segmentation produced by the binary
        tmp_output.seek(0)
        return tmp_output.read().decode('utf8').split('\n')
def segment(text, nfolds=5, njobs=1,
            args='--ngram 1 --a1 0 --b1 1',
            log=utils.null_logger(),
            binary=utils.get_binary('dpseg')):
    """Run the 'dpseg' binary on `nfolds` folds

    The input `text` is converted to a one-unicode-char-per-unit
    representation (as required by the dpseg binary), split into
    `nfolds` folds which are segmented in parallel over `njobs` jobs,
    then unfolded and converted back to the original units.

    Parameters
    ----------
    text : sequence of str
        Utterances to segment, with units (phones or syllables)
        separated by spaces
    nfolds : int
        Number of folds the text is split into
    njobs : int
        Number of parallel jobs running the folds
    args : str
        Command line arguments forwarded to the dpseg binary
    log : logging.Logger
        Logger for progress messages
    binary : str
        Path to the dpseg executable

    Returns
    -------
    generator of str
        The non-empty segmented utterances

    """
    # force the text to be a list of utterances
    text = list(text)

    # set of unique units (syllables or phones) present in the text
    units = set(unit for utt in text for unit in utt.split())
    log.info('%s units found in %s utterances', len(units), len(text))

    # create a unicode equivalent for each unit and convert the text
    # to that unicode version
    log.debug('converting input to unicode')
    unicode_gen = UnicodeGenerator()
    unicode_mapping = {unit: unicode_gen() for unit in units}
    unicode_text = [''.join(unicode_mapping[unit] for unit in utt.split())
                    for utt in text]

    # move fold boundaries when needed so that no fold starts with a
    # single-symbol line (see _dpseg_bugfix)
    log.debug('building %s folds', nfolds)
    fold_boundaries = _dpseg_bugfix(
        unicode_text, folding.boundaries(unicode_text, nfolds), log)
    folded_texts, fold_index = folding.fold(
        unicode_text, nfolds, fold_boundaries=fold_boundaries)

    # run dpseg on each fold in parallel
    segmented_texts = joblib.Parallel(n_jobs=njobs, verbose=0)(
        joblib.delayed(_dpseg)(
            fold, args,
            log_level=log.getEffectiveLevel(),
            log_name='wordseg-dpseg - fold {}'.format(n+1),
            binary=binary)
        for n, fold in enumerate(folded_texts))

    log.debug('unfolding the %s folds', nfolds)
    output_text = folding.unfold(segmented_texts, fold_index)

    # convert the text back to unit level (from unicode level); spaces
    # (word boundaries added by dpseg) are kept as is
    log.debug('converting output back from unicode')
    unit_mapping = {v: k for k, v in unicode_mapping.items()}
    unit_mapping[' '] = ' '
    segmented_text = (
        ''.join(unit_mapping[char] for char in utt) for utt in output_text)

    # filter out empty lines
    return (utt for utt in segmented_text if utt)
def _find_first_line_with_min_len(lines, min_len=2):
"""Return the index of first line in `lines` with len >= `min_len`
Or return None when no line found.
"""
for i, l in enumerate(lines):
if len(l) >= min_len:
return i
return None
def _dpseg_bugfix(text, boundaries, log=utils.null_logger()):
    """Ensure all folds have their first line with more than one symbol

    There is a bug in the C++ implementation of dpseg: when the first
    input line is composed of a single character (i.e. a unicode
    symbol), the program fails. To avoid this, this method ensures all
    the folds begin with at least two symbols. If this is not the
    case, the boundary position is moved to the next line containing
    at least two symbols. The `boundaries` list is updated in place
    and returned.

    Raises
    ------
    ValueError if the bugfix is needed and cannot be applied

    Notes
    -----
    This implementation differs with the one in CDSWordSeg. In wordseg
    we modify the fold index and thus the fold size, whereas in
    CDSWordSeg we permute lines in the text (and thus to the gold
    file). In wordseg we don't want to expose the gold file at this
    level.

    """
    # length of the first line of each fold: a fold needs fixing when
    # that length is 1
    head_lengths = [len(text[boundary]) for boundary in boundaries]
    if 1 not in head_lengths:
        log.debug('folds boundaries are OK for dpseg')
        return boundaries

    # the very first fold cannot be moved, bail out
    if head_lengths[0] == 1:
        raise ValueError(
            'The input text\'s first line has a single symbol, '
            'this will cause wordseg-dpseg to bug. '
            'Please re-arrange your text manually and try again.')

    broken_folds = [n for n, size in enumerate(head_lengths) if size == 1]
    log.debug(
        'dpseg bugfix: need to fix folds {}'
        .format([n+1 for n in broken_folds]))

    for n in broken_folds:
        # shift the boundary to the first following line with len >= 2
        offset = _find_first_line_with_min_len(
            text[boundaries[n]:], min_len=2)
        if offset is None:
            raise ValueError(
                'dpseg bugfix failed: all lines in the fold {} have len == 1'
                .format(n+1))

        log.debug(
            'dpseg bugfix: fixing fold {} index from {} to {}'
            .format(n+1, boundaries[n], boundaries[n] + offset))
        boundaries[n] += offset

    # the shifted boundaries must still be strictly increasing
    if boundaries != sorted(set(boundaries)):
        raise ValueError(
            'dpseg bugfix failed: broke the folds order. '
            'Please re-arrange (shuffle) your text manually and try again.')

    return boundaries
def _add_arguments(parser):
    """Add algorithm specific options to the `parser`

    Adds the wordseg-level options (folds, jobs, seed, config file)
    and, in a dedicated argument group, all the dpseg binary options
    declared in DPSEG_ARGUMENTS.

    """
    parser.add_argument(
        '-f', '--nfolds', type=int, metavar='<int>', default=5,
        help='number of folds to segment the text on, default is %(default)s')

    parser.add_argument(
        '-j', '--njobs', type=int, metavar='<int>', default=1,
        help='number of parallel jobs to use, default is %(default)s')

    # fixed a stray trailing comma that turned this statement into a
    # discarded 1-tuple
    parser.add_argument(
        '-r', '--randseed', type=int, metavar='<int>',
        help=('seed for random numbers generation, '
              'default is based on current time'))

    parser.add_argument(
        '-c', '--config-file', metavar='<file>',
        help=('configuration file to read the algorithm parameters from, '
              'for example configuration files see {}'.format(
                  os.path.dirname(utils.get_config_files('dpseg')[0]))))

    group = parser.add_argument_group('algorithm options')
    for arg in DPSEG_ARGUMENTS:
        arg.add_to_parser(group)
@utils.CatchExceptions
def main():
    """Entry point of the 'wordseg-dpseg' command

    Parses the command line arguments, translates them into a dpseg
    argument string, runs the segmentation and writes the segmented
    utterances to the output stream.

    """
    streamin, streamout, _, log, args = utils.prepare_main(
        name='wordseg-dpseg',
        description=__doc__,
        add_arguments=_add_arguments)

    # NOTE(review): assert is stripped when python runs with -O, an
    # explicit check raising ValueError would be more robust
    assert args.nfolds > 0

    # build the dpseg command arguments
    dpseg_args = {}
    excluded_args = ['verbose', 'quiet', 'input', 'output', 'nfolds', 'njobs']
    for k, v in vars(args).items():
        # ignored arguments: wordseg-level options and unset (None)
        # values. Since False == 0 in Python, the condition below
        # reduces to "v is None": boolean False values pass through and
        # are handled by the string replacements further down.
        if k in excluded_args or (v in (None, False) and v != 0):
            continue

        # map the symbolic choice names to the single-char / numeric
        # codes expected by the dpseg binary
        if k == 'estimator':
            v = {'viterbi': 'V', 'flip': 'F',
                 'decayed-flip': 'D', 'tree': 'T'}[v]
        if k == 'ngram':
            v = {'unigram': 1, 'bigram': 2}[v]
        if k == 'forget_method':
            v = {'uniformly': 'U', 'proportional': 'P'}[v]

        # argparse stores options with underscores, dpseg wants dashes
        dpseg_args[k.replace('_', '-')] = v

    dpseg_args = ' '.join('--{} {}'.format(k, v)
                          for k, v in dpseg_args.items())

    # adapt boolean values: dpseg expects 1 for true, and false flags
    # are simply removed from the command line
    for orig, new in (
            ('--eval-maximize True', '--eval-maximize 1'),
            ('--eval-maximize False', ''),
            ('--do-mbdp True', '--do-mbdp 1'),
            ('--do-mbdp False', '')):
        dpseg_args = dpseg_args.replace(orig, new)

    segmented = segment(
        streamin, nfolds=args.nfolds, njobs=args.njobs,
        args=dpseg_args, log=log)

    streamout.write('\n'.join(segmented) + '\n')


if __name__ == '__main__':
    main()