# coding: utf-8
"""Extract statistics relevant for word segmentation corpora
To analyze a segmented text or a text in orthographic form (i.e. with
word separators only), you must define empty phone and syllable
separators (see the token separation arguments below).
"""
import collections
import json
import math
from wordseg import utils
from wordseg.separator import Separator
class CorpusStatistics(object):
    """Estimates descriptive statistics from a text corpus

    Parameters
    ----------
    corpus : sequence of str
        The text to describe is a suite of tokenized utterances.
    separator : Separator
        The token separators used in the `corpus`.
    log : logging.Logger, optional
        Where to send log messages. When omitted or None, the logger
        returned by `utils.null_logger()` is used.

    Raises
    ------
    ValueError
        If the word separator is not defined, if the corpus contains
        no non-empty utterance, or if no token is parsed at one of
        the levels defined in `separator`.

    Attributes
    ----------
    tokens : dict
        For all levels defined in `separator`, tokens[level] is the
        `corpus` utterances tokenized at that level. Each utterance is
        a list of tokens without any separator.
    unigram : dict
        For all levels defined in `separator`, unigram[level] is the
        tokens frequency as a dict (token: frequency).

    """
    def __init__(self, corpus, separator, log=None):
        # build the default logger at call time instead of once at
        # function definition
        self.log = utils.null_logger() if log is None else log

        # check the separator have words and possibly phones
        self.separator = separator
        if not self.separator.word:
            raise ValueError('word separator not defined')
        if not self.separator.phone:
            self.log.warning(
                'phone separator not defined, some stats ignored')
        if not self.separator.syllable:
            self.log.warning(
                'syllable separator not defined, some stats ignored')
        self.log.info('token separator is %s', self.separator)

        # force to list and ignore empty lines
        self.corpus = [
            utt for utt in (utt.strip() for utt in corpus) if utt]
        self.log.info('loaded %s utterances', len(self.corpus))
        if not self.corpus:
            raise ValueError('no text to load')

        # tokenize the entire text at each defined level ('word',
        # 'syllable' and/or 'phone') TODO can be optimized we are
        # tokenizing the entire text up to 3 times (implement nested
        # tokenization).
        self.tokens = {}
        for level in self.separator.levels()[::-1]:
            self.log.debug('tokenizing %s', level)
            self.tokens[level] = [
                self.separator.tokenize(utt, level, keep_boundaries=False)
                for utt in self.corpus]

            ntokens = sum(len(t) for t in self.tokens[level])
            self.log.info('parsed %s %ss', ntokens, level)
            if ntokens == 0:
                raise ValueError('{}s expected but 0 parsed'.format(level))

        # estimates token frequencies
        self.unigram = {}
        for level in self.separator.levels()[::-1]:
            self.unigram[level] = self._unigram(level)

    def _mattr(self, level, size=10):
        """Return the mean ratio of unique tokens per chunk of `size`

        Parameters
        ----------
        level : str
            The tokens level to consider, a key of `self.tokens`.
        size : int, optional
            The number of consecutive tokens in each chunk.

        Returns
        -------
        mattr : float
            The mean, over the sliding chunks, of the number of unique
            tokens in the chunk divided by `size`. When the corpus is
            too short to yield any chunk, the ratio of unique tokens
            over the whole corpus is returned instead (0.0 if there
            is no token at all).

        """
        # the list of all the tokens
        tokens = [w for u in self.tokens[level] for w in u]
        if not tokens:
            return 0.0

        # ratio of uniques words per chunk of `size` words
        # NOTE(review): the range stops at len(tokens) - size, so the
        # last full chunk is not visited. Kept as is to preserve the
        # values computed on existing corpora -- confirm if intended.
        nuniques = [float(len(set(tokens[x:x + size]))) / size
                    for x in range(len(tokens) - size)]

        if not nuniques:
            # corpus of `size` tokens or less: no chunk was produced,
            # fall back to a single chunk spanning the whole corpus
            # instead of dividing by zero
            return float(len(set(tokens))) / len(tokens)

        return float(sum(nuniques)) / len(nuniques)

    def _unigram(self, level):
        """Return dictionary of (token: frequency) items

        The frequency of a token is its count divided by the total
        number of tokens at that `level`.

        """
        count = self.most_common_tokens(level)
        self.log.info(
            '5 most common %ss: %s', level, [t for t, _ in count[:5]])

        total_count = float(sum(c for _, c in count))
        return {t: c / total_count for t, c in count}

    def describe_all(self):
        """Full description of the corpus at utterance and token levels

        This method is a simple wrapper on the other statistical
        methods. It calls all the methods available for the defined
        separator (some of them require 'phone' tokens) and wraps the
        results in an ordered dictionary.

        Returns
        -------
        results : ordered dict
            The entry 'corpus' is the output of `describe_corpus`
            (with an additional 'entropy' entry when the phone
            separator is defined). For each level defined in the
            separator, the entry `level + 's'` is the output of
            `describe_tokens` at that level.

        """
        # store the output statistics in a dictionary
        results = collections.OrderedDict()

        # corpus description at utterance level
        results['corpus'] = self.describe_corpus()

        # if phone are defined, compute the corpus entropy
        if self.separator.phone:
            results['corpus']['entropy'] \
                = self.normalized_segmentation_entropy()

        # for each defined token level (from word to phone),
        # describe the corpus at that level
        for level in self.separator.levels()[::-1]:
            results[level + 's'] = self.describe_tokens(level)

        return results

    def describe_corpus(self):
        """Basic description of the corpus at word level

        Returns
        -------
        stats : ordered dict
            A dictionary made of the following entries (all counts
            being on the entire corpus):

            - 'nutts': number of utterances
            - 'nutts_single_word': number of utterances made of a
              single word
            - 'mattr': mean ratio of unique words per chunk of 10 words

        Notes
        -----
        This method is a Python implementation of `this script`_ from
        CDSWordSeg.

        .. _this script: https://github.com/alecristia/CDSwordSeg/blob/master/
            recipes/CatalanSpanish/_describe_gold.sh

        """
        # length of utterances in number of words
        wlen = [len(utt) for utt in self.tokens['word']]

        return collections.OrderedDict((
            # number of utterances
            ('nutts', len(self.corpus)),
            # number of single word utterances
            ('nutts_single_word', wlen.count(1)),
            # mean ratio of unique words per chunk of 10 words
            ('mattr', self._mattr('word', size=10))))

    def describe_tokens(self, level):
        """Basic description of the corpus at tokens level

        Parameters
        ----------
        level : str
            The tokens level to describe. Must be 'phone', 'syllable'
            or 'word'.

        Returns
        -------
        stats : ordered dict
            A dictionary made of the following entries (all counts
            being on the entire corpus):

            - 'tokens': number of tokens
            - 'types': number of types
            - 'hapaxes': number of types occurring only once in the
              corpus

        """
        stats = collections.OrderedDict()

        # number of tokens, summed over all the utterances
        stats['tokens'] = sum(len(utt) for utt in self.tokens[level])

        # the (type, count) pairs
        types_count = self.most_common_tokens(level)

        # number of types
        stats['types'] = len(types_count)

        # number of types occurring only once in the corpus
        stats['hapaxes'] = sum(1 for _, c in types_count if c == 1)

        return stats

    def most_common_tokens(self, level, n=None):
        """Return the most common tokens and their count

        Parameters
        ----------
        level : str
            Must be 'phone', 'syllable' or 'word'.
        n : int, optional
            When specified returns only the `n` most commons tokens,
            when omitted or None returns all the tokens.

        Returns
        -------
        counts : list
            The list of (token, count) values sorted in decreasing
            count order.

        """
        return collections.Counter(
            t for utt in self.tokens[level] for t in utt).most_common(n)

    def normalized_segmentation_entropy(self):
        """Return the Normalized Segmentation Entropy computed on `text`

        Token separators must be defined for phones and words.

        Returns
        -------
        entropy : float
            The estimated NSE in bits. When the corpus has a single
            phone (or none) there is no possible boundary to
            normalize by and 0.0 is returned.

        Raises
        ------
        KeyError
            If the corpus is not tokenized at 'phone' and 'word' levels.

        Notes
        -----
        As explained in [1]_ we are interested in the ambiguity generated
        by the different possible parses that result from a
        segmentation. In order to quantify this idea in general, we define
        a Normalized Segmentation Entropy. To do this, we need to assign a
        probability to every possible segmentation. To this end, we use a
        unigram model where the probability of a lexical item is its
        normalized frequency in the corpus and the probability of a parse
        is the product of the probabilities of its terms. In order to
        obtain a measure that does not depend on the utterance length, we
        normalize by the number of possible boundaries in the
        utterance. So for an utterance of length N, the Normalized
        Segmentation Entropy (NSE) is computed using Shannon formula
        (Shannon, 1948) as follows:

        .. math::

            NSE = -\\sum_i P_ilog_2(P_i) / (N-1),

        where :math:`P_i` is the probability of the word :math:`i` and
        :math:`N` the number of phonemes in the text.

        .. [1] A. Fourtassi, B. Börschinger, M. Johnson and E. Dupoux,
           "Whyisenglishsoeasytosegment". In Proceedings of the Fourth Annual
           Workshop on Cognitive Modeling and Computational Linguistics
           (pp. 1-10), 2013.

        """
        # count the number of phones in the text
        N = sum(len(utt) for utt in self.tokens['phone'])

        # word lexicon with probabilities
        P = self.unigram['word']

        # with a single phone the normalization term N - 1 is zero,
        # avoid the division by zero
        if N <= 1:
            return 0.0

        # the probability of each word in the text
        probs = (P[word] for utt in self.tokens['word'] for word in utt)

        # compute the entropy
        return -1 * sum(p * math.log(p, 2) / float(N - 1) for p in probs)
@utils.CatchExceptions
def main():
    """Run the 'wordseg-stats' command

    Computes the statistics of the input corpus with
    `CorpusStatistics.describe_all` and writes them to the output
    stream, either as a JSON string (when the --json option is given)
    or as raw text with one 'section key value' line per statistic.

    """
    def add_arguments(parser):
        # the single option specific to this command
        parser.add_argument(
            '--json', action='store_true',
            help='print the results in JSON format, else print in raw text')

    # initialize the command: the streams, separator, logger and
    # parsed arguments all come from utils.prepare_main
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-stats',
        description=__doc__,
        add_arguments=add_arguments,
        separator=Separator())

    # describe the corpus read from the input stream
    results = CorpusStatistics(streamin, separator, log=log).describe_all()

    # render the results as JSON or as raw text
    if args.json:
        text = json.dumps(results, indent=4)
    else:
        lines = []
        for section, entries in results.items():
            for key, value in entries.items():
                lines.append(' '.join((section, key, str(value))))
        text = '\n'.join(lines)

    streamout.write(text + '\n')
# call the command entry point only when this module is executed as a
# script, not when it is imported
if __name__ == '__main__':
    main()