"""Cepstral mean variance normalization (CMVN) on speech features
* The :class:`CmvnPostProcessor` class is used for accumulating CMVN
statistics and applying CMVN on features using accumulated
statistics. Uses the Kaldi implementation (see [kaldi-cmvn]_):
:class:`Features` --> CmvnPostProcessor --> :class:`Features`
* The :class:`SlidingWindowCmvnPostProcessor` class is used to apply sliding
window CMVN. With that class, each window is normalized independently. Uses
the Kaldi implementation:
:class:`Features` --> SlidingWindowCmvnPostProcessor
--> :class:`Features`
Examples
--------
Compute MFCC features:
>>> import numpy as np
>>> from shennong.audio import Audio
>>> from shennong.processor.mfcc import MfccProcessor
>>> from shennong.postprocessor.cmvn import CmvnPostProcessor
>>> audio = Audio.load('./test/data/test.wav')
>>> mfcc = MfccProcessor(sample_rate=audio.sample_rate).process(audio)
Accumulate CMVN statistics and normalize the features (in real life
you want to accumulate statistics over several features, for example
on all features belonging to one speaker, so as to obtain a
normalization per speaker):
>>> processor = CmvnPostProcessor(mfcc.ndims)
>>> processor.accumulate(mfcc)
>>> cmvn = processor.process(mfcc)
The normalized features have zero mean and unit variance:
>>> np.all(np.isclose(cmvn.data.mean(axis=0), np.zeros(cmvn.ndims), atol=1e-6))
True
>>> np.all(np.isclose(cmvn.data.var(axis=0), np.ones(cmvn.ndims), atol=1e-6))
True
This module also provides a high-level method for applying CMVN to a
whole :class:`~shennong.features_collection.FeaturesCollection` at once:
>>> from shennong import FeaturesCollection
>>> from shennong.postprocessor.cmvn import apply_cmvn
>>> feats = FeaturesCollection(utt1=mfcc)
>>> cmvns = apply_cmvn(feats)
As above, the features have zero mean and unit variance:
>>> cmvn = cmvns['utt1']
>>> np.all(np.isclose(cmvn.data.mean(axis=0), np.zeros(cmvn.ndims), atol=1e-6))
True
>>> np.all(np.isclose(cmvn.data.var(axis=0), np.ones(cmvn.ndims), atol=1e-6))
True
Apply sliding-window normalization to the features:
>>> from shennong.postprocessor.cmvn import SlidingWindowCmvnPostProcessor
>>> processor = SlidingWindowCmvnPostProcessor(normalize_variance=True)
>>> window_size = 40
>>> processor.cmn_window = window_size
>>> processor.min_window = window_size
>>> sliding_cmvn = processor.process(mfcc)
Each frame of the original features has been normalized with statistics
computed in the window:
>>> frame = 70
>>> window = mfcc.data[frame-window_size//2:frame+window_size//2, :]
>>> norm_mfcc = (mfcc.data[frame,:] - window.mean(axis=0)) / window.std(axis=0)
>>> np.all(np.isclose(sliding_cmvn.data[frame, :], norm_mfcc, atol=1e-6))
True
References
----------
.. [kaldi-cmvn] https://kaldi-asr.org/doc/transform.html#transform_cmvn
"""
import copy
import numpy as np
import kaldi.matrix
import kaldi.transform.cmvn
import kaldi.feat.functions
from shennong.postprocessor.base import FeaturesPostProcessor
from shennong import Features, FeaturesCollection
class CmvnPostProcessor(FeaturesPostProcessor):
    """Computes CMVN statistics on speech features

    Statistics are accumulated with :func:`accumulate` and then applied
    to features with :func:`process`. The computation is delegated to
    the pykaldi ``Cmvn`` class.

    Parameters
    ----------
    dim : int
        The features dimension, must be strictly positive
    stats : array, shape = [2, dim+1]
        Preaccumulated CMVN statistics (see :func:`CmvnPostProcessor:stats`)

    Raises
    ------
    ValueError
        If ``dim`` is not a strictly positive integer, or if ``stats``
        is specified but is not shaped as ``[2, dim+1]``.

    """
    def __init__(self, dim, stats=None):
        super().__init__()

        # init features dimension
        if not isinstance(dim, int) or dim <= 0:
            raise ValueError(
                'dimension must be a strictly positive integer, it is {}'
                .format(dim))
        self._dim = dim

        # init the pykaldi cmvn class
        self._cmvn = kaldi.transform.cmvn.Cmvn(dim=dim)

        # init the stats if specified
        if stats is not None:
            stats = np.asarray(stats)
            if stats.shape != (2, self.dim+1):
                raise ValueError(
                    'stats must be an array of shape {}, but is shaped as {}'
                    .format((2, self.dim+1), stats.shape))
            self._cmvn.stats = kaldi.matrix.SubMatrix(stats)

    @property
    def name(self):
        """The name of this processor, used as key in features properties"""
        return 'cmvn'

    @property
    def dim(self):
        """The dimension of features on which to compute CMVN"""
        return self._dim

    @property
    def stats(self):
        """The accumulated CMVN statistics

        Array of shape `[2, dim+1]` with the following format:

        * ``stats[0, :]`` represents the sum of accumulated feature
          frames, used to estimate the accumulated mean.

        * ``stats[1, :]`` represents the sum of element-wise squares
          of accumulated feature frames, used to estimate the
          accumulated variance.

        * ``stats[0, -1]`` represents the weighted total count of
          accumulated feature frames.

        * ``stats[1, -1]`` is initialized to zero but otherwise is not
          used.

        """
        return self._cmvn.stats.numpy()

    @property
    def count(self):
        """The weighted total count of accumulated features frames"""
        return self.stats[0, -1]

    @property
    def ndims(self):
        """The output dimension, identical to :func:`dim`"""
        return self.dim

    def get_properties(self, features):
        """Returns the parent class properties extended with the CMVN stats

        Parameters
        ----------
        features : :class:`~shennong.features.Features`
            The features forwarded to the parent ``get_properties``.

        Returns
        -------
        properties : dict
            The properties built by the parent class, in which the
            current :func:`stats` are stored under
            ``properties[self.name]['stats']``.

        """
        properties = super().get_properties(features)
        properties[self.name]['stats'] = self.stats
        return properties

    def accumulate(self, features, weights=None):
        """Accumulates CMVN statistics

        Computes the CMVN statistics for the given ``features`` and
        accumulates them for further processing.

        Parameters
        ----------
        features : :class:`~shennong.features.Features`
            The input features on which to accumulate statistics.
        weights : array-like, shape = [``features.nframes``], optional
            Weights to apply to each frame of the features (possibly
            zero to ignore silences or non-speech
            frames). Accumulation is non-weighted by default.

        Raises
        ------
        ValueError
            If ``weights`` have more than one dimension or if
            ``weights`` length does not fit ``features`` dimension.

        """
        # make sure weights have the expected dimension
        if weights is not None:
            # accept any array-like (list, tuple, ndarray): the checks
            # below rely on the ndarray interface
            weights = np.asarray(weights)

            if weights.ndim != 1:
                raise ValueError(
                    'weights must have a single dimension but have {}'
                    .format(weights.ndim))
            if weights.shape[0] != features.nframes:
                raise ValueError(
                    'there is {} weights but {} feature frames, must be equal'
                    .format(weights.shape[0], features.nframes))

            weights = kaldi.matrix.SubVector(weights)

        # delegate to pykaldi implementation
        self._cmvn.accumulate(
            kaldi.matrix.SubMatrix(features.data),
            weights=weights)

    def process(self, features, norm_vars=True, skip_dims=None, reverse=False):
        """Applies the accumulated CMVN statistics to the given ``features``

        Parameters
        ----------
        features : :class:`~shennong.features.features.Features`
            The input features on which to apply CMVN statistics.
        norm_vars : bool, optional
            If False, do not apply variance normalization (only mean),
            default to True.
        skip_dims : list of positive integers, optional
            Dimensions for which to skip normalization. Default is to
            not skip any dimension.
        reverse : bool, optional
            Whether to apply CMVN in a reverse sense, so as to
            transform zero-mean, unit-variance features into features
            with the desired mean and variance.

        Returns
        -------
        cmvn_features : :class:`~shennong.features.features.Features`
            The normalized features

        Raises
        ------
        ValueError
            If no stats have been accumulated (the accumulated count
            is below 1.0), or if ``skip_dims`` contains a dimension
            outside of ``[0, features.ndims - 1]``.

        """
        # make sure we have accumulated some stats
        if self.count < 1.0:
            raise ValueError(
                'insufficient accumulation of stats for CMVN, '
                'must be >= 1.0 but is {}'.format(self.count))

        # skip dims in pykaldi is a destructive operation (alteration
        # of self.stats), so we work by copy here, to avoid modifying
        # statistics.
        if not skip_dims:
            cmvn = self._cmvn
        else:
            # make sure all skipped dims are valid dims
            dmin, dmax = min(skip_dims), max(skip_dims)
            if dmin < 0 or dmax >= features.ndims:
                raise ValueError(
                    'skipped dimensions must be in [0, {}[ but are in [{}, {}['
                    .format(features.ndims, dmin, dmax))

            # work by copy to not alter self.stats
            cmvn = kaldi.transform.cmvn.Cmvn(dim=self.dim)
            cmvn.stats = kaldi.matrix.DoubleMatrix(self.stats)
            cmvn.skip_dims(skip_dims)

        # Normalize a private copy of the data rather than a view on it.
        # NOTE(review): pykaldi documents SubMatrix as a view sharing
        # memory with the wrapped array when it can, and the CMVN is
        # applied in place on the matrix it receives, so wrapping
        # ``features.data`` directly could overwrite the caller's
        # features -- confirm against the pykaldi documentation. An
        # owning Matrix keeps the input untouched in every case.
        data = kaldi.matrix.Matrix(features.data)
        cmvn.apply(data, norm_vars=norm_vars, reverse=reverse)

        return Features(
            data.numpy(), features.times,
            properties=self.get_properties(features))
def apply_cmvn(feats_collection, by_collection=True, norm_vars=True,
               weights=None, skip_dims=None):
    """CMVN normalization of a collection of features

    This function is a simple wrapper on the class
    :class:`~shennong.features.CmvnPostProcessor` that allows to
    accumulate and apply CMVN statistics over a whole collections of
    features.

    Warnings
    --------
    The features in the collection must have the same
    dimensionality. It is assumed they are all extracted from the same
    processor. If this is not the case, a ValueError is raised.

    Parameters
    ----------
    feats_collection : :class:`~shennong.FeaturesCollection`
        The collection of features on which to apply CMVN normalization.
        Each features in the collection is assumed to have consistent
        dimensions.
    by_collection : bool, optional
        When True, accumulate and apply CMVN over the entire
        collection. When False, do it independently for each features
        in the collection. Default to True.
    norm_vars : bool, optional
        If False, do not apply variance normalization (only mean),
        default to True.
    weights : dict of arrays, optional
        For each features in the collection, an array of weights to
        apply on the features frames, if specified we must have
        ``weights.keys() == feats_collections.keys()`` (see
        :func:`CmvnPostProcessor.accumulate`). Unweighted by default.
    skip_dims : list of integers
        The dimensions for which to skip the normalization (see
        :func:`CmvnPostProcessor.process`). Default is to normalize
        all dimensions. An empty list is equivalent to None.

    Returns
    -------
    cmvn_feats_collection : :class:`~shennong.features.FeaturesCollection`
        The normalized features, stored under the same keys as in
        ``feats_collection``.

    Raises
    ------
    ValueError
        If the features in the collection do not all share a single
        dimension (this includes an empty collection), if the keys of
        ``weights`` differ from those of ``feats_collection``, if
        ``skip_dims`` contains a dimension out of ``[0, dim-1]``, or
        if something goes wrong during CMVN processing.

    """
    # extract the features dimension
    dim = set(f.ndims for f in feats_collection.values())
    if not len(dim) == 1:
        raise ValueError(
            'features in the collection must have consistent dimensions '
            'but dimensions are: {}'.format(sorted(dim)))
    dim = list(dim)[0]

    # check weights
    if weights is not None and weights.keys() != feats_collection.keys():
        raise ValueError('keys differ for weights and features collection')

    # check skip_dims. An empty sequence means "skip nothing" (as in
    # CmvnPostProcessor.process), so there is nothing to validate and
    # min()/max() must not be called on it.
    if skip_dims is not None and len(skip_dims) > 0:
        sdmin, sdmax = min(skip_dims), max(skip_dims)
        if sdmin < 0 or sdmax >= dim:
            raise ValueError(
                'out of bounds dimensions in skip_dims, must be in [0, {}] '
                'but are in [{}, {}]'.format(dim-1, sdmin, sdmax))

    if by_collection:
        # accumulate CMVN stats over the whole collection
        cmvn = CmvnPostProcessor(dim)
        for k, f in feats_collection.items():
            cmvn.accumulate(
                f, weights=weights[k] if weights is not None else None)

        # apply CMVN stats
        return FeaturesCollection(
            {k: cmvn.process(f, norm_vars=norm_vars, skip_dims=skip_dims)
             for k, f in feats_collection.items()})

    # independently for each features in the collection,
    # accumulate and apply CMVN stats
    cmvn_collection = FeaturesCollection()
    for k, f in feats_collection.items():
        cmvn = CmvnPostProcessor(f.ndims)
        cmvn.accumulate(
            f, weights=weights[k] if weights is not None else None)
        cmvn_collection[k] = cmvn.process(
            f, norm_vars=norm_vars, skip_dims=skip_dims)

    return cmvn_collection
class SlidingWindowCmvnPostProcessor(FeaturesPostProcessor):
    """Sliding-window mean (and optionally variance) normalization

    All the parameters are stored in a pykaldi
    ``SlidingWindowCmnOptions`` instance and exposed as read/write
    properties of the processor.

    Parameters
    ----------
    center : bool, optional
        Whether to center the window on the current frame, default to True
    cmn_window : int, optional
        Window size for average CMN computation, default to 600
    min_window : int, optional
        Minimum CMN window used at start of decoding, default to 100
    max_warnings : int, optional
        Maximum warning to report per utterance, default to 5
    normalize_variance : bool, optional
        Whether to normalize variance to one, default to False

    """
    def __init__(self, center=True, cmn_window=600, min_window=100,
                 max_warnings=5, normalize_variance=False):
        super().__init__()

        # the options object backs every property defined below, it
        # must exist before the first setter is called
        self._options = kaldi.feat.functions.SlidingWindowCmnOptions()

        self.center = center
        self.cmn_window = cmn_window
        self.max_warnings = max_warnings
        self.min_window = min_window
        self.normalize_variance = normalize_variance

    @property
    def name(self):
        """The name of this processor, used as key in features properties"""
        return 'sliding_window_cmvn'

    @property
    def ndims(self):
        """Not available for this processor, always raises a ValueError"""
        raise ValueError(
            'output dimension for sliding window CMVN '
            'processor depends on input')

    @property
    def center(self):
        """Whether to center the window on the current frame"""
        return self._options.center

    @center.setter
    def center(self, value):
        self._options.center = value

    @property
    def cmn_window(self):
        """Window size for average CMN computation"""
        return self._options.cmn_window

    @cmn_window.setter
    def cmn_window(self, value):
        self._options.cmn_window = value

    @property
    def min_window(self):
        """Minimum CMN window used at start of decoding"""
        return self._options.min_window

    @min_window.setter
    def min_window(self, value):
        self._options.min_window = value

    @property
    def max_warnings(self):
        """Maximum warning to report per utterance"""
        return self._options.max_warnings

    @max_warnings.setter
    def max_warnings(self, value):
        self._options.max_warnings = value

    @property
    def normalize_variance(self):
        """Whether to normalize variance to one"""
        return self._options.normalize_variance

    @normalize_variance.setter
    def normalize_variance(self, value):
        self._options.normalize_variance = value

    def get_properties(self, features):
        """Builds the properties of the features returned by ``process``

        Parameters
        ----------
        features : :class:`~shennong.features.Features`
            The input features, their properties are copied (the
            original ones are left untouched).

        Returns
        -------
        properties : dict
            A deep copy of ``features.properties`` in which the
            processor parameters are stored under ``self.name`` and a
            new step is appended to the ``'pipeline'`` list (created
            when missing).

        """
        props = copy.deepcopy(features.properties)
        props[self.name] = self.get_params()

        if 'pipeline' in props:
            pipeline = props['pipeline']
        else:
            pipeline = []
            props['pipeline'] = pipeline

        # this step covers all the columns of the features
        pipeline.append({
            'name': self.name,
            'columns': [0, features.ndims - 1]})

        return props

    def process(self, features):
        """Applies sliding-window cepstral mean and/or variance normalization
        on `features` with the specified options

        Parameters
        ----------
        features : :class:`~shennong.features.Features`
            The input features.

        Returns
        -------
        slid_window_cmvn_feats : :class:`~shennong.features.Features`
            The normalized features.

        """
        # output buffer with the same shape as the input data, filled
        # by the pykaldi call below
        normalized = kaldi.matrix.Matrix(*features.data.shape)
        source = kaldi.matrix.SubMatrix(features.data)

        kaldi.feat.functions.sliding_window_cmn(
            self._options, source, normalized)

        return Features(
            normalized.numpy(),
            features.times,
            self.get_properties(features))