"""Estimates syllable boundaries on a text using the maximal onset principle.
This algorithm fully syllabifies a text from a list of onsets and
vowels. Input text must be in orthographic form (with word separators
only) or in phonemized form (with both word and phone
separators). Output text has syllable separators added at estimated
syllable boundaries. For examples of vowels and onsets files, see the
directory `wordseg/data/syllabification`.
"""
# Created by Lawrence Phillips & Lisa Pearl (2013), adapted by Alex
# Cristia (2015), converted from perl to python and integrated in
# wordseg by Mathieu Bernard (2017). Credit is owed mainly to the
# original authors.
import codecs
import os
import re
import six
from wordseg import utils
from wordseg.separator import Separator
class Syllabifier(object):
    """Syllabify a text given in phonological or orthographic form

    Each word is read from its end to its start. Every time a vowel
    is met, an onset is grown leftwards, one symbol at a time, for as
    long as the grown sequence is listed in `onsets`; the onset, the
    vowel and the symbols read since the previous vowel form a
    syllable.

    Syllabification errors can occur when the onsets and/or vowels are
    not adapted to the input text (see the `tolerant` parameter of
    `syllabify`).

    Parameters
    ----------
    onsets : list
        The list of valid onsets in the `text`
    vowels : list
        The list of vowels in the `text`
    separator : Separator, optional
        Token separation in the `text`. When None (the default), a
        `Separator()` built without arguments is used.
    filling_vowel : bool, optional
        When True, append a silent vowel to the end of words without
        vowel (the vowel is removed after processing so the text is
        unchanged). When False (the default) those words cannot be
        syllabified.
    log : logging.Logger, optional
        Where to send log messages. When None (the default),
        `utils.null_logger()` is used.

    Raises
    ------
    ValueError
        If `onsets` or `vowels` are empty or are not lists.

    """
    def __init__(self, onsets, vowels, separator=None,
                 filling_vowel=False, log=None):
        # ensure onsets and vowels are not empty
        if not isinstance(vowels, list) or not vowels:
            raise ValueError('unvalid or empty vowels list')
        if not isinstance(onsets, list) or not onsets:
            raise ValueError('unvalid or empty onsets list')

        # work on private copies: the silent vowel appended below must
        # not leak into the lists owned by the caller
        self.onsets = list(onsets)
        self.vowels = list(vowels)

        # the defaults are resolved here, per instance, instead of
        # once at import time in the signature
        self.separator = Separator() if separator is None else separator
        self.log = utils.null_logger() if log is None else log

        # set of all the chars used in onsets and vowels (useful to
        # detect any unknown char during syllabification)
        self.symbols = set(''.join(self.vowels + self.onsets))

        if filling_vowel:
            # find a silent vowel: the first char, starting at code
            # point 1, which is not already present in the symbols
            code = 1
            while six.unichr(code) in self.symbols:
                code += 1

            self.silent = six.unichr(code)
            self.symbols.add(self.silent)
            self.vowels.append(self.silent)
        else:
            self.silent = None

    def syllabify(self, text, strip=False, tolerant=False):
        """Returns the text with syllable boundaries added

        Parameters
        ----------
        text : sequence
            The input text to be syllabified. Each element of the
            sequence is assumed to be a single and complete utterance
            in valid phonological form. Leading and trailing
            whitespaces of each utterance are stripped.
        strip : bool, optional
            When True, removes the syllable boundary at the end of words.
        tolerant : bool, optional
            When False (the default), the function raises a ValueError
            on the first utterance that has not been correctly
            syllabified. When True, ignore the failed utterances in
            output but issue a log warning instead.

        Returns
        -------
        list
            The utterances with estimated syllable boundaries added. If
            `tolerant` is True some utterances may be missing in the
            output.

        Raises
        ------
        ValueError
            If `separator.syllable` is found in the text, or if an
            utterance has not been correctly syllabified and
            `tolerant` is False.

        """
        # we are syllabifying utterance per utterance
        syllabified_text = []
        nerrors = 0

        # lines are numbered from 1 in error messages
        for lineno, utt in enumerate(text, 1):
            utt = utt.strip()

            # first ensure the utterance is compatible with the given
            # syllable separator
            if self.separator.syllable in utt:
                raise ValueError(
                    'syllable separator "{}" found in text (line {}): {}'
                    .format(self.separator.syllable, lineno, utt))

            # if we have phone separators, removes them and store
            # their positions
            utt, index = self._remove_phone_separators(utt)

            # estimate the syllable boundaries on the utterance
            try:
                syllables = self._syllabify_utterance(utt, strip=strip)
            except RuntimeError as err:
                error = 'line {}, {}'.format(lineno, err)
                if not tolerant:
                    # fail with error
                    raise ValueError(error)

                # issue a warning and ignore that utterance
                self.log.warning(error)
                nerrors += 1
                continue

            # restore the phones separators as they were before
            syllabified_text.append(
                self._restore_phone_separators(syllables, index, strip))

        # nerrors is only incremented in tolerant mode
        if nerrors:
            self.log.error(
                'syllabification failed for {} utterances'.format(nerrors))

        return syllabified_text

    @staticmethod
    def open_datafile(data_file):
        """Read a vowel or onsets file as a list

        The file is read as utf8. Each line is stripped of its
        surrounding whitespaces and empty lines are ignored.

        """
        # the context manager closes the file once it has been read
        with codecs.open(data_file, 'r', encoding='utf8') as stream:
            data = stream.readlines()

        # ignore empty lines in data file
        return [line for line in (line.strip() for line in data) if line]

    def _syllabify_utterance(self, utterance, strip=False):
        """Syllabify a single utterance

        Auxiliary function to syllabify().

        Raises
        ------
        RuntimeError
            If the syllabification failed

        """
        # split the utterances into words
        words = self.separator.tokenize(
            utterance, level='word', keep_boundaries=False)

        # estimate syllables boundaries word per word, read them from
        # end to start
        output = ''
        for n, word in enumerate(words[::-1]):
            try:
                output_word = self._syllabify_word(word, strip)
            except RuntimeError as err:
                # forward the exception with word id added (words are
                # read backward, the id counts from the first word)
                raise RuntimeError(
                    'word {}: {}'.format(len(words) - n, err))

            # concatenate the syllabified word to the output, do not
            # append a word separator at the end if stripped
            if strip and not self.separator.remove(output):
                output = output_word
            else:
                output = output_word + self.separator.word + output

        return output

    def _syllabify_word(self, word, strip):
        """Return a single word with syllable boundaries added

        Auxiliary function to _syllabify_utterance().

        Raises
        ------
        RuntimeError
            If the word has no vowel, contains an unknown symbol (not
            present in vowels or onsets) or if the syllabification
            failed.

        """
        # ensure all the chars in word are defined in vowels or onsets
        unknown = self._unknown_char(word)
        if unknown:
            raise RuntimeError(
                'unknown symbol "{}" in word "{}"'.format(unknown, word))

        # ensure the word contains at least a vowel
        if not self._has_vowels(word):
            if not self.silent:
                raise RuntimeError(
                    'no vowel in word "{}"'.format(word))
            word += self.silent

        input_word = word
        output_word = ''
        syllable = ''

        # read characters of the current word from end to start
        while word:
            char, word = word[-1], word[:-1]

            # append current char to current syllable - that will be
            # necessary regardless of whether it's a vowel or a coda
            syllable = char + syllable

            if char in self.vowels:
                word, syllable = self._build_onset(word, syllable)

                # add the syllable to words entry
                if strip and not output_word:
                    output_word = syllable
                else:
                    output_word = (
                        syllable + self.separator.syllable + output_word)
                syllable = ''

        # symbols left before the first vowel that do not form a valid
        # onset are never written to the output: detect that case
        if input_word != self.separator.remove(output_word, 'syllable'):
            raise RuntimeError(
                'onset not found in "{}"'.format(input_word))

        if self.silent:
            # plain replacement: the silent symbol is a literal char,
            # it must not be interpreted as a regular expression
            return output_word.replace(self.silent, '')
        return output_word

    def _build_onset(self, word, syllable):
        """Prepend to `syllable` the onset found at the end of `word`

        Returns the pair (word, syllable) where the onset symbols have
        been moved from the end of `word` to the start of
        `syllable`. Both are returned unchanged when `word` is empty
        or ends with a vowel.

        """
        # no previous char, or the previous char is a vowel: this is
        # already the start of the syllable
        if not word or word[-1] in self.vowels:
            return word, syllable

        # the previous char is not a vowel, then we need to make the
        # onset: start with nothing, take one letter at a time and
        # check whether their concatenation makes a good onset
        onset = ''
        while word and word[-1] + onset in self.onsets:
            onset = word[-1] + onset
            word = word[:-1]

        return word, onset + syllable

    def _remove_phone_separators(self, utt):
        """Return `utt` without phone separators, and the phones index

        The index is a list of lists: for each non-empty word, the
        length of each of its phones. It is empty when there is no
        phone separator in the utterance.

        """
        phone = self.separator.phone

        # special case when there is no phone separator defined, or
        # no phone separator in the utterance
        if not phone or not re.search(phone, utt):
            return utt, []

        # split the utterance in words and index the phones length
        index = []
        for word in self.separator.split(utt, 'word', keep_boundaries=True):
            phones = self.separator.split(word, 'phone', keep_boundaries=False)
            current_index = [len(p) for p in phones if len(p)]
            if current_index:
                index.append(current_index)

        return self.separator.remove(utt, level='phone'), index

    def _restore_phone_separators(self, utt, index, strip):
        """Put back in `utt` the phone separators described by `index`

        `index` is the value returned by _remove_phone_separators()
        for that utterance.

        """
        # NOTE(review): the nesting of the loops below was restored
        # from a listing without indentation, confirm it against the
        # upstream implementation.

        # special case when there is no phone separator in the
        # utterance
        if not index:
            return utt

        # restore the utterance word per word (index[i]) and within
        # words, phone per phone (index[i][j]).
        restored = ''
        for i, word in enumerate(
                self.separator.split(utt, 'word', keep_boundaries=True)):
            if len(word) == 0 and strip is False:
                # coherent behavior for non striped texts
                restored += self.separator.word
            else:
                j = 0  # iterate on the phones of the word
                for syllable in self.separator.split(
                        word, 'syllable', keep_boundaries=True):
                    # for each phone in the syllable, append a phone
                    # separator
                    k = 0
                    while k < len(syllable):
                        restored += (
                            syllable[k:k+index[i][j]] + self.separator.phone)
                        k += index[i][j]
                        j += 1

                    # end of the syllable, append a separator
                    if strip:
                        restored = restored[:-len(self.separator.phone)]
                    restored += self.separator.syllable

                # end of the word, remove the last syllable boundary
                # append a word separator
                restored = (restored[:-len(self.separator.syllable)] +
                            self.separator.word)

        # remove the last word boundary of the utterance
        return restored[:-len(self.separator.word)]

    def _unknown_char(self, word):
        """Returns the first unknown char if any is found, False otherwise"""
        return next((c for c in word if c not in self.symbols), False)

    def _has_vowels(self, word):
        """True if the `word` contains any vowel, False otherwise"""
        return any(v in word for v in self.vowels)
def _add_arguments(parser):
"""Add command line arguments for wordseg-syll"""
parser.add_argument(
'onsets_file', type=str, metavar='<onsets-file>',
help=('a file containing the list of valid onsets for the '
'input text, one onset per line'))
parser.add_argument(
'vowels_file', type=str, metavar='<vowels-file>',
help=('a file containing the list of vowels for '
'the input text, one vowel per line'))
parser.add_argument(
'-S', '--strip', action='store_true',
help='removes the end separators in syllabified output')
parser.add_argument(
'-t', '--tolerant', action='store_true',
help='tolerate syllabification failures and report them as warnings, '
'default is to fail at the first error')
parser.add_argument(
'-f', '--filling-vowel', action='store_true',
help='add a silent filling vowel to groups of consonants '
'with no vowel, by default words with no vowel cannot be syllabified')
@utils.CatchExceptions
def main():
    """Entry point of the 'wordseg-syll' command

    Loads the onsets and vowels files given on the command line,
    syllabifies the input stream and writes the result, one utterance
    per line, to the output stream.

    Raises
    ------
    RuntimeError
        If the onsets file or the vowels file does not exist.

    """
    streamin, streamout, separator, log, args = utils.prepare_main(
        name='wordseg-syll',
        description=__doc__,
        separator=utils.Separator(' ', ';esyll', ';eword'),
        add_arguments=_add_arguments)

    # loads the onsets
    if not os.path.isfile(args.onsets_file):
        raise RuntimeError(
            'unknown onsets file "{}"'.format(args.onsets_file))
    onsets = Syllabifier.open_datafile(args.onsets_file)

    # loads the vowels
    if not os.path.isfile(args.vowels_file):
        raise RuntimeError(
            'unknown vowels file "{}"'.format(args.vowels_file))
    vowels = Syllabifier.open_datafile(args.vowels_file)

    log.info('loaded %s onsets', len(onsets))
    log.debug('onsets are "%s"', ', '.join(onsets))
    log.info('loaded %s vowels', len(vowels))
    log.debug('vowels are "%s"', ', '.join(vowels))
    log.debug('separator is %s', separator)

    syllabifier = Syllabifier(
        onsets, vowels, separator=separator,
        filling_vowel=args.filling_vowel, log=log)

    # syllabify the input text
    sylls = utils.CountingIterator(syllabifier.syllabify(
        streamin, strip=args.strip, tolerant=args.tolerant))

    # consume the iterator before reading its count, so the logged
    # number is the final one.
    # NOTE(review): presumably CountingIterator updates `count` while
    # it is iterated - confirm in wordseg.utils
    output = '\n'.join(sylls) + '\n'

    # display the output
    log.info('syllabified %s utterances', sylls.count)
    streamout.write(output)
# run the command only when this file is executed as a script
if __name__ == '__main__':
    main()