"""Prepare an input text for word segmentation
* The input text must be in a phonologized form (a suite of phones,
syllables or words tokens as specified by the token separator).
* The input text is checked for errors in formatting (presence of
punctuation, missing separators, etc...).
* The output text contains space separated phones (or syllables
according to the *unit* option).
* The program fails on the first encountered error, or ignores errors if
the *tolerant* option is used.
"""
import six
import string
import re
from wordseg import utils
from wordseg.separator import Separator
# character class built from every character of string.punctuation,
# each one escaped so that it is matched literally
punctuation_re = re.compile(
    '[' + re.escape(string.punctuation) + ']')
"""A regular expression matching any single punctuation character"""
def _pairwise(l):
    """Yields pairwise consecutive elements of an iterable

    Any iterable is accepted: sequences (lists, strings, ...) as well
    as one-shot iterators and generators. Nothing is yielded when `l`
    has fewer than two elements.

    Parameters
    ----------
    l : iterable
        The elements to be paired

    Yields
    ------
    pair : tuple
        The pairs (l[0], l[1]), (l[1], l[2]), etc...

    Examples
    --------
    >>> list(_pairwise([1, 2, 3]))
    [(1, 2), (2, 3)]

    """
    iterator = iter(l)

    try:
        previous = next(iterator)
    except StopIteration:
        # empty input, there is no pair to yield
        return

    for current in iterator:
        yield previous, current
        previous = current
def check_utterance(utterance, separator=Separator(), check_punctuation=True):
    """Validates the phonological format of a single utterance

    The checks are run in the order they are listed in the *Raises*
    section, the function raises on the first one that fails.

    Parameters
    ----------
    utterance : str
        The utterance to be checked
    separator : Separator, optional
        The token separators used in the `utterance`
    check_punctuation : bool, optional
        When True (default), any punctuation character found in the
        utterance is an error. When False, punctuation is not
        checked.

    Returns
    -------
    bool
        Always True, the function raises when an error is detected

    Raises
    ------
    ValueError
        If one of the following errors is detected:

        * `utterance` is empty or is not a string
        * `utterance` contains any punctuation character (once the
          separators are removed), only if `check_punctuation` is
          True
        * `utterance` begins with a separator
        * `utterance` does not end with a word separator
        * `utterance` contains syllable tokens but a word does not
          end with a syllable separator

    """
    # reject empty values and anything which is not a string (or
    # unicode for python2)
    if not (utterance and isinstance(utterance, six.string_types)):
        raise ValueError(
            'utterance is not a string ({}): {}'.format(
                type(utterance), utterance))

    # nothing is left once the separators are stripped
    if len(separator.strip(utterance)) == 0:
        raise ValueError('utterance is an empty string')

    # punctuation is searched in the utterance with all the token
    # separators removed, as the separators themselves may contain
    # punctuation characters
    if check_punctuation is True:
        without_separators = separator.remove(utterance)
        if punctuation_re.search(without_separators) is not None:
            raise ValueError('punctuation found in utterance')

    # no separator is allowed at the very beginning of the utterance
    # (undefined or empty separators are skipped)
    for token in separator.iterate():
        if token and utterance.startswith(token):
            raise ValueError(
                'utterance begins with a separator: "{}"'.format(utterance))

    # the last token of the utterance must be a word separator
    if not utterance.endswith(separator.word):
        raise ValueError(
            'utterance does not end with a word separator: "{}"'
            .format(utterance))

    # when the utterance has syllable separators, each word separator
    # must be directly preceded by one of them (tokens being obtained
    # by splitting the utterance on the phone separator)
    syllable = separator.syllable
    if syllable and syllable in utterance:
        tokens = utterance.split(separator.phone)
        for before, token in _pairwise(tokens):
            if token == separator.word and before != syllable:
                raise ValueError(
                    'a word does not end with a syllable separator: "{}"'
                    .format(utterance))

    return True
def prepare(text, separator=Separator(), unit='phone',
            check_punctuation=True, tolerant=False,
            log=utils.null_logger()):
    """Prepares a text in phonological form for word segmentation

    The returned text is ready to be segmented. It consists in a suite
    of phonological symbols (can be phones or syllable depending on
    `unit`) separated by spaces.

    The function removes the word separators from all the lines in
    `text`. If `unit` is 'phone' the syllable separators are removed
    as well (a missing syllable separator is accepted). If `unit` is
    'syllable' the spaces are discarded and each syllable separator
    is replaced by a space.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`
    unit : str, optional
        The unit representation level to prepare the `text` at, must
        be 'syllable' or 'phone'.
    check_punctuation : bool, optional
        When True (default), forbid any punctuation character in the
        utterance and raise ValueError if any punctuation is
        found. When False, do not check punctuation.
    tolerant : bool, optional
        If False, raise ValueError on the first format error detected
        in the `text`. If True, the badly formatted utterances are
        filtered out from the output and a warning is issued.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    prepared_text : generator
        Utterances from the `text` with separators removed, prepared
        for segmentation at a syllable or phoneme representation level
        (separated by space). Empty lines are ignored.

    Raises
    ------
    ValueError
        If `unit` is not 'phone' or 'syllable'. On the first format
        error encountered in `text` (see the prepare.check_utterance
        function), only if `tolerant` is False. As this function is a
        generator, errors are raised while iterating on the returned
        value, not when calling the function.

    """
    # raise an error if unit is not valid
    if unit not in ('phone', 'syllable'):
        raise ValueError(
            "unit must be 'phone' or 'syllable', it is '{}'".format(unit))

    # define the function that prepare the text (removing requested
    # separators)
    if unit == 'phone':
        def func(line):
            # the syllable separator may be undefined (None or empty),
            # replacing '' by '' leaves the line unchanged
            return line.replace(separator.syllable or '', '')\
                       .replace(separator.word, '')
    else:  # syllable
        def func(line):
            return line.replace(separator.word, '')\
                       .replace(' ', '')\
                       .replace(separator.syllable, ' ')

    # count the ignored lines (both empty and badly formatted ones)
    nremoved = 0
    for n, line in enumerate(text):
        try:  # force the utf8 encoding
            line = line.encode('utf8').decode().strip()
        except (ValueError, AttributeError):
            # line is already in bytes, not str: encoding it raises
            # ValueError (UnicodeDecodeError) on python2 and
            # AttributeError on python3
            line = line.strip()

        # ignore empty lines
        if line == '':
            log.debug('ignoring empty line %d', n+1)
            nremoved += 1
            continue

        try:
            check_utterance(
                line, separator, check_punctuation=check_punctuation)
            prepared = utils.strip(func(line))
        except ValueError as err:
            if tolerant is True:
                log.info('removing line %d: "%s"', n + 1, line)
                nremoved += 1
            else:
                # prefix the error message with the line number
                raise ValueError('line {}: {}'.format(n + 1, err))
        else:
            yield prepared

    if nremoved > 0:
        log.warning('removed %d badly formatted utterances', nremoved)
def gold(text, separator=Separator()):
    """Returns a gold text from a phonologized one

    The returned gold text is the ground-truth segmentation. It has
    phone and syllable separators removed and word separators replaced
    by a single space ' '. It is used to evaluate the output of
    segmentation algorithms.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`. The phone and syllable
        separators may be undefined (None or empty), in which case
        they are simply ignored.

    Returns
    -------
    gold_text : generator
        Gold utterances with separators removed and words separated by
        spaces. The returned text is the gold version, against which
        the algorithms are evaluated. Empty lines are ignored.

    """
    # delete phone and syllable separators. Replace word boundaries by
    # a single space. Replacing '' by '' leaves a line unchanged, this
    # is how undefined separators are ignored.
    replaced = (
        line.replace(separator.syllable or '', '')
        .replace(separator.phone or '', '')
        .replace(separator.word, ' ')
        for line in text)

    # delete any duplicate, begin or end spaces
    stripped = (utils.strip(line) for line in replaced)

    # as for prepare, we ignore empty lines
    return (line for line in stripped if line)
@utils.CatchExceptions
def main():
    """Entry point of the 'wordseg-prep' command

    Reads the utterances from the input stream, writes the prepared
    text to the output stream (one utterance per line) and, when the
    --gold option is specified, writes the gold text to the requested
    file.

    """
    # add a command-specific argument
    def add_arguments(parser):
        parser.add_argument(
            '-u', '--unit', type=str,
            choices=['phone', 'syllable'], default='phone', help='''
            output level representation, must be "phone" or "syllable"''')

        parser.add_argument(
            '-t', '--tolerant', action='store_true',
            help='''tolerate the badly formated utterances in input,
            but ignore them in output (default is to exit on the first
            encountered error)''')

        parser.add_argument(
            '-P', '--punctuation', action='store_true',
            help='punctuation characters are not considered illegal')

        # the --gold option is attached to the existing group of
        # input/output arguments. NOTE(review): this relies on the
        # private `_action_groups` attribute of the parser and fails
        # with IndexError if no group has that title -- presumably the
        # group is created by utils.prepare_main, to be confirmed.
        group = [g for g in parser._action_groups
                 if g.title == 'input/output arguments'][0]
        group.add_argument(
            '-g', '--gold', type=str, metavar='<gold-file>',
            help='''generates the gold text to the specified file,
            do not generate gold if no file specified''')

    # command initialization
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-prep',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=add_arguments)

    # load the whole input in memory because it is iterated twice
    # (once by prepare and once by gold)
    streamin = list(streamin)

    log.debug('separator is %s', separator)
    log.info('preparing the text at %s level', args.unit)

    # check all the utterances are correctly formatted.
    prep = utils.CountingIterator(prepare(
        streamin, separator, unit=args.unit, log=log,
        check_punctuation=not args.punctuation, tolerant=args.tolerant))

    # write prepared text, one utterance a line, ending with a newline
    streamout.write('\n'.join(prep) + '\n')
    log.info('prepared %s utterances', prep.count)

    if args.gold:
        log.info('generating gold text to %s', args.gold)
        gold_text = gold(streamin, separator=separator)

        # the context manager ensures the gold file is flushed and
        # closed, even if an error occurs while writing
        with open(args.gold, 'w') as gold_file:
            gold_file.write('\n'.join(gold_text) + '\n')
# run the 'wordseg-prep' command when this module is executed as a script
if __name__ == '__main__':
    main()