Source code for wordseg.statistics

# coding: utf-8

"""Extract statistics relevant for word segmentation corpora

To analyze a segmented text or a text in orthographic form (i.e. with
word separators only), you must define empty phone and syllable
separators (see the token separation arguments below).

"""

import collections
import json
import math

from wordseg import utils
from wordseg.separator import Separator


class CorpusStatistics(object):
    """Estimates descriptive statistics from a text corpus

    Parameters
    ----------
    corpus : sequence of str
        The text to describe, as a sequence of tokenized utterances.
    separator : Separator
        The token separators used in the `corpus`.
    log : logging.Logger
        Where to send log messages, disabled by default.

    Attributes
    ----------
    tokens : dict
        For all levels defined in `separator`, tokens[level] is the
        `corpus` utterances tokenized at that level. Each utterance is
        a list of tokens without any separator.
    unigram : dict
        For all levels defined in `separator`, unigram[level] is the
        token frequencies as a dict (token: frequency).

    """
    def __init__(self, corpus, separator, log=utils.null_logger()):
        self.log = log

        # check the separator has words and possibly phones/syllables
        self.separator = separator
        if not self.separator.word:
            raise ValueError('word separator not defined')
        if not self.separator.phone:
            log.warning('phone separator not defined, some stats ignored')
        if not self.separator.syllable:
            log.warning('syllable separator not defined, some stats ignored')
        self.log.info('token separator is %s', self.separator)

        # force to list and ignore empty lines
        self.corpus = [
            utt for utt in (utt.strip() for utt in corpus) if len(utt)]
        self.log.info('loaded %s utterances', len(self.corpus))
        if len(self.corpus) == 0:
            raise ValueError('no text to load')

        # tokenize the entire text at each defined level ('word',
        # 'syllable' and/or 'phone'). TODO can be optimized: we are
        # tokenizing the entire text up to 3 times (implement nested
        # tokenization).
        self.tokens = {}
        for level in self.separator.levels()[::-1]:
            self.log.debug('tokenizing %s', level)
            self.tokens[level] = [
                self.separator.tokenize(utt, level, keep_boundaries=False)
                for utt in self.corpus]

            ntokens = sum(len(t) for t in self.tokens[level])
            self.log.info('parsed %s %ss', ntokens, level)
            if ntokens == 0:
                raise ValueError('{}s expected but 0 parsed'.format(level))

        # estimate token frequencies at each level
        self.unigram = {}
        for level in self.separator.levels()[::-1]:
            self.unigram[level] = self._unigram(level)

    def _mattr(self, level, size=10):
        """Return the mean ratio of unique tokens per chunk of `size`"""
        # the list of all the tokens in the corpus
        tokens = [w for u in self.tokens[level] for w in u]

        # ratio of unique tokens per chunk of `size` tokens
        nuniques = [
            float(len(set(tokens[x:x + size]))) / size
            for x in range(len(tokens) - size)]

        return float(sum(nuniques)) / len(nuniques)

    def _unigram(self, level):
        """Return a dictionary of (token: frequency) items"""
        count = self.most_common_tokens(level)
        self.log.info('5 most common {}s: {}'.format(
            level, [t for t, c in count[:5]]))

        total_count = float(sum(c[1] for c in count))
        return {c[0]: c[1] / total_count for c in count}
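    # A hand-checked sketch of the _mattr computation above on a toy
    # token list (the windows slide one token at a time, starting at
    # offsets 0 .. len(tokens) - size - 1):
    #
    #   tokens = ['a', 'a', 'b', 'a'], size = 2
    #   windows: ['a', 'a'] -> 1/2, ['a', 'b'] -> 2/2
    #   mattr = (0.5 + 1.0) / 2 = 0.75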
    def describe_all(self):
        """Full description of the corpus at utterance and token levels

        This method is a simple wrapper on the other statistical
        methods. It calls all the methods available for the defined
        separator (some of them require 'phone' tokens) and wraps the
        results in an ordered dictionary.

        """
        # store the output statistics in a dictionary
        results = collections.OrderedDict()

        # corpus description at utterance level
        results['corpus'] = self.describe_corpus()

        # if phones are defined, compute the corpus entropy
        if self.separator.phone:
            results['corpus']['entropy'] \
                = self.normalized_segmentation_entropy()

        # for each defined token level (from word to phone), describe
        # the corpus at that level
        for level in self.separator.levels()[::-1]:
            results[level + 's'] = self.describe_tokens(level)

        return results
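    # The returned mapping has one entry per defined token level (keys
    # depend on the separator; the values below are illustrative):
    #
    #   OrderedDict([
    #       ('corpus', OrderedDict([('nutts', 10), ('nutts_single_word', 3),
    #                               ('mattr', 0.9), ('entropy', 0.06)])),
    #       ('words', OrderedDict([('tokens', 42), ('types', 20),
    #                              ('hapaxes', 9)])),
    #       ...])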
    def describe_corpus(self):
        """Basic description of the corpus at word level

        Returns
        -------
        stats : ordered dict
            A dictionary made of the following entries (all counts
            being on the entire corpus):

            - 'nutts': number of utterances
            - 'nutts_single_word': number of utterances made of a single word
            - 'mattr': mean ratio of unique words per chunk of 10 words

        Notes
        -----
        This method is a Python implementation of `this script`_ from
        CDSWordSeg.

        .. _this script: https://github.com/alecristia/CDSwordSeg/blob/master/recipes/CatalanSpanish/_describe_gold.sh

        """
        # length of utterances in number of words
        wlen = [len(utt) for utt in self.tokens['word']]

        stats = collections.OrderedDict((k, v) for k, v in (
            # number of utterances
            ('nutts', len(self.corpus)),
            # number of single word utterances
            ('nutts_single_word', wlen.count(1)),
            # mean ratio of unique words per chunk of 10 words
            ('mattr', self._mattr('word', size=10))))

        return stats
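    # For example, on the toy corpus ['okay', 'the dog barks'] taken
    # at the word level, nutts = 2 and nutts_single_word = 1 ('okay'
    # is the only single-word utterance).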
    def describe_tokens(self, level):
        """Basic description of the corpus at tokens level

        Parameters
        ----------
        level : str
            The tokens level to describe. Must be 'phone', 'syllable'
            or 'word'.

        Returns
        -------
        stats : ordered dict
            A dictionary made of the following entries (all counts
            being on the entire corpus):

            - 'tokens': number of tokens
            - 'types': number of types
            - 'hapaxes': number of types occurring only once in the corpus

        """
        stats = collections.OrderedDict()

        # length of utterances in number of tokens
        tokens_len = [len(utt) for utt in self.tokens[level]]

        # number of tokens
        stats['tokens'] = sum(tokens_len)

        # token types and their occurrence counts
        types_count = self.most_common_tokens(level)

        # number of types
        stats['types'] = len(types_count)

        # number of types occurring only once in the corpus
        stats['hapaxes'] = len([k for k, v in types_count if v == 1])

        return stats
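    # For example, on the toy corpus ['the dog', 'the cat'] at the
    # word level: tokens = 4, types = 3 ('the', 'dog', 'cat') and
    # hapaxes = 2 ('dog' and 'cat' each occur only once).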
    def most_common_tokens(self, level, n=None):
        """Return the most common tokens and their count

        Parameters
        ----------
        level : str
            Must be 'phone', 'syllable' or 'word'.
        n : int, optional
            When specified, returns only the `n` most common tokens;
            when omitted or None, returns all the tokens.

        Returns
        -------
        counts : list
            The list of (token, count) pairs sorted in decreasing
            count order.

        """
        return collections.Counter(
            (t for utt in self.tokens[level] for t in utt)).most_common(n)
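    # Illustrative call (the output values are made up):
    #
    #   >>> stats.most_common_tokens('word', n=2)
    #   [('the', 120), ('a', 85)]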
    def normalized_segmentation_entropy(self):
        """Return the Normalized Segmentation Entropy computed on `text`

        Token separators must be defined for phones and words.

        Returns
        -------
        entropy : float
            The estimated NSE in bits.

        Raises
        ------
        KeyError
            If the corpus is not tokenized at 'phone' and 'word' levels.

        Notes
        -----
        As explained in [1]_ we are interested in the ambiguity
        generated by the different possible parses that result from a
        segmentation. In order to quantify this idea in general, we
        define a Normalized Segmentation Entropy. To do this, we need
        to assign a probability to every possible segmentation. To
        this end, we use a unigram model where the probability of a
        lexical item is its normalized frequency in the corpus and
        the probability of a parse is the product of the
        probabilities of its terms.

        In order to obtain a measure that does not depend on the
        utterance length, we normalize by the number of possible
        boundaries in the utterance. So for an utterance of length N,
        the Normalized Segmentation Entropy (NSE) is computed using
        the Shannon formula (Shannon, 1948) as follows:

        .. math::

            NSE = - \\sum_i P_i \\log_2(P_i) / (N - 1),

        where :math:`P_i` is the probability of the word :math:`i`
        and :math:`N` the number of phonemes in the text.

        .. [1] A. Fourtassi, B. Börschinger, M. Johnson and
           E. Dupoux, "Whyisenglishsoeasytosegment". In Proceedings
           of the Fourth Annual Workshop on Cognitive Modeling and
           Computational Linguistics (pp. 1-10), 2013.

        """
        # count the number of phones in the text
        N = sum(len(utt) for utt in self.tokens['phone'])

        # word lexicon with probabilities
        P = self.unigram['word']

        # the probability of each word in the text
        probs = (P[word] for utt in self.tokens['word'] for word in utt)

        # compute the entropy
        return -1 * sum(p * math.log(p, 2) / float(N - 1) for p in probs)
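    # A hand-checked sketch of the computation: for a corpus of one
    # utterance with word tokens ['aa', 'bb'] and phone tokens
    # ['a', 'a', 'b', 'b'], we have N = 4 phones and unigram word
    # probabilities P('aa') = P('bb') = 0.5, so
    #
    #   NSE = -(0.5 * log2(0.5) + 0.5 * log2(0.5)) / (4 - 1)
    #       = 1 / 3 ≈ 0.333 bits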
@utils.CatchExceptions
def main():
    """Entry point of the 'wordseg-stats' command"""
    # options description
    def add_arguments(parser):
        parser.add_argument(
            '--json', action='store_true',
            help='print the results in JSON format, else print in raw text')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-stats',
        description=__doc__,
        add_arguments=add_arguments,
        separator=Separator())

    # compute the statistics
    stats = CorpusStatistics(streamin, separator, log=log)
    results = stats.describe_all()

    # display the results either as a JSON string or in raw text
    if args.json:
        streamout.write(json.dumps(results, indent=4) + '\n')
    else:
        out = (' '.join((name, k, str(v)))
               for name, level_stats in results.items()
               for k, v in level_stats.items())
        streamout.write('\n'.join(out) + '\n')


if __name__ == '__main__':
    main()
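# Typical command-line usage of the 'wordseg-stats' entry point (the
# file names are illustrative; utterances are read from stdin and
# results written to stdout):
#
#   wordseg-stats --json < segmented.txt > stats.json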