Source code for wordseg.folding

"""Folding and unfolding texts for use in iterative word segmenters

Iterative algorithms pass through the input text only once, the model
is learned online. Thus only the end of the text is relevent for the
algorithm evaluation. To use the whole input for evaluation, the
folding module create "folded" versions of a text to be used in
iterative text based algorithms.

Let "A B C" be a text made of three blocks A, B and C having roughly
the same number of lines. Folding that text in 3 generates a list of 3
versions ["A1 B1 C1", "C2 A2 B2", and "B3 C3 A3"]. The algorithm is
ran over the 3 versions and their outputs are then unfolded to
retrieve the original text "A3 B2 C1".

"""

import itertools


def _permute(l):
    """Pops the last element of a list an push it at beginning"""
    return [l[-1]] + l[0:-1]


def _flatten(l):
    """Flattens a list of lists in a single list"""
    return list(itertools.chain(*l))


def _cumsum(l):
    """Return the cumulative sum from a list `l`"""
    return [sum(l[:i+1]) for i in range(len(l))]

[docs]def boundaries(text, nfolds): """Returns `nfolds` boundaries as a list of line indices in `text` Parameters ---------- text : list The input text as a list of utterances. nfolds : int The number of fold boundaries to compute. Returns ------- boundaries : list The list of indices in `text` corresponding to the computed fold boundaries. Raises ------ ValueError If the `text` has not enought lines to build the requested `nfolds`, or if `nfolds` is not strictly positive. """ if nfolds < 1: raise ValueError( 'nfolds must be a stricly positive interger, it is {}' .format(nfolds)) if len(text) < nfolds: raise ValueError( 'not enought lines in text to make {} folds' .format(nfolds)) return [i * int(len(text)/nfolds) for i in range(nfolds)]
[docs]def fold(text, nfolds, fold_boundaries=None): """Create `nfolds` versions of an input `text` In order to serve the unfold operation, this functions also build the index of the beginning of the last block in each fold. Parameters ---------- text : list The input text as a list of utterances. nfolds : int The number of folds to build on `text`. fold_boundaries : list, optional An increasing list of length `nfolds` with the start index of each fold in `text`. By default, use the boundaries() function. Returns ------- folds : list a list of folded versions of the `text` index : list a list of index positions for text unfolding """ if nfolds == 1: return [text], [0] # compute boundaries if not specified in arguments if fold_boundaries is None: fold_boundaries = boundaries(text, nfolds) # create data blocks from boundaries b = fold_boundaries blocks = [text[b[i]:b[i+1]] for i in range(len(b)-1)] blocks.append(text[b[-1]:]) # build the folds from the blocks and store index of the last # block in each fold nfolds = len(b) folds = [] index = [] idx = list(range(nfolds)) for _ in range(nfolds): folds += [_flatten([blocks[idx[j]] for j in range(nfolds)])] index += [_cumsum( [len(blocks[idx[j]]) for j in range(nfolds)])[-2]] idx = _permute(idx) assert all([len(f) == len(text) for f in folds]) return folds, index
[docs]def unfold(folds, index): """Concatenate the last block of each fold to form the unfolded text This is the reverse operation of the fold() method. Parameters ---------- folds, index As outputed by the fold() method. Returns ------- unfolded_text : list of str The unfolded utterances composed of ending blocks of the input `folds`. """ last_blocks = [f[index[i]:] for i, f in enumerate(folds)] return _flatten(last_blocks[::-1])