"""Compute Voice Activity Detection (VAD) on features log-energy
:class:`~shennong.features.features.Features` -->
VadPostProcessor -->
:class:`~shennong.features.features.Features`
Compute voice-activity detection for speech features using the Kaldi
implementation (see [kaldi-vad]_). The output is, for each input frame,
1 if we judge the frame as voiced, 0 otherwise. There are no
continuity constraints.
This method is a very simple energy-based method which only looks at
the first coefficient of the input features, which is assumed to be
**a log-energy or something similar**. If working from the raw signal,
extract the energy using
:class:`~shennong.features.processor.energy.EnergyProcessor`.
A cutoff is set, we use a formula of the general type:
.. math::
\\textrm{cutoff} = 5.0 + 0.5 * (\\textrm{average log-energy}),
and for each frame the decision is based on the proportion of frames
in a context window around the current frame, which are above this
cutoff.
.. note::
This code is geared toward speaker-id applications and is not
suitable for automatic speech recognition (ASR) because it makes
independent decisions for each frame without imposing any notion
of continuity.
Examples
--------
>>> import numpy as np
>>> from shennong.audio import Audio
>>> from shennong.features.processor.mfcc import MfccProcessor
>>> from shennong.features.postprocessor.vad import VadPostProcessor
>>> audio = Audio.load('./test/data/test.wav')
>>> mfcc = MfccProcessor().process(audio)
Computes the voice activity detection on the extracted MFCCs:
>>> processor = VadPostProcessor()
>>> vad = processor.process(mfcc)
For each frame of the MFCCs, vad is 1 if detected as a voiced frame,
0 otherwise:
>>> nframes = mfcc.shape[0]
>>> vad.shape == (nframes, 1)
True
>>> nvoiced = sum(vad.data[vad.data == 1])
>>> print('{} voiced frames out of {}'.format(nvoiced, nframes))
119 voiced frames out of 140
References
----------
.. [kaldi-vad] https://kaldi-asr.org/doc/voice-activity-detection_8h.html
"""
import kaldi.matrix
import kaldi.ivector
import numpy as np
from shennong.features import Features
from shennong.features.postprocessor.base import FeaturesPostProcessor
class VadPostProcessor(FeaturesPostProcessor):
    """Computes VAD on speech features

    Thin wrapper around the Kaldi energy-based voice activity
    detection: the parameters are stored in a
    ``kaldi.ivector.VadEnergyOptions`` instance and the detection itself
    is delegated to ``kaldi.ivector.compute_vad_energy``.

    Parameters
    ----------
    energy_threshold : float, optional
        Constant term in the energy threshold, see
        :func:`energy_threshold`. Default is 5.0.
    energy_mean_scale : float, optional
        Scale factor applied to the mean log-energy, see
        :func:`energy_mean_scale`. Must be >= 0. Default is 0.5.
    frames_context : int, optional
        Number of context frames on each side of the central frame,
        see :func:`frames_context`. Must be >= 0. Default is 0.
    proportion_threshold : float, optional
        Proportion of frames in the window that must be above the
        energy threshold, see :func:`proportion_threshold`. Must be in
        ]0, 1[. Default is 0.6.

    Raises
    ------
    ValueError
        If one of the parameters is out of its valid range.

    """
    def __init__(self, energy_threshold=5.0, energy_mean_scale=0.5,
                 frames_context=0, proportion_threshold=0.6):
        # the options object must exist before the assignments below,
        # because every property setter writes into it
        self._options = kaldi.ivector.VadEnergyOptions()

        # go through the property setters so the values are validated
        self.energy_threshold = energy_threshold
        self.energy_mean_scale = energy_mean_scale
        self.frames_context = frames_context
        self.proportion_threshold = proportion_threshold

    @property
    def name(self):
        """Name of the processor, always ``'vad'``"""
        return 'vad'

    @property
    def energy_threshold(self):
        """Constant term in energy threshold for MFCC0 for VAD

        See also :func:`energy_mean_scale`

        """
        # NOTE(review): the cast to float32 presumably mirrors the
        # single precision used by Kaldi to store the option -- confirm
        return np.float32(self._options.vad_energy_threshold)

    @energy_threshold.setter
    def energy_threshold(self, value):
        self._options.vad_energy_threshold = value

    @property
    def energy_mean_scale(self):
        """Scale factor of the mean log-energy

        If this is set to `s`, to get the actual threshold we let `m`
        be the mean log-energy of the file, and use `s*m +`
        :func:`energy_threshold`. Must be greater or equal to 0.

        """
        return np.float32(self._options.vad_energy_mean_scale)

    @energy_mean_scale.setter
    def energy_mean_scale(self, value):
        if value < 0:
            raise ValueError(
                'Energy mean scale must be >= 0, it is {}'.format(value))
        self._options.vad_energy_mean_scale = value

    @property
    def frames_context(self):
        """Number of frames of context on each side of central frame

        The size of the window for which energy is monitored is
        `2 * frames_context + 1`. Must be greater or equal to 0.

        """
        return self._options.vad_frames_context

    @frames_context.setter
    def frames_context(self, value):
        if value < 0:
            raise ValueError(
                'frames_context must be >= 0, it is {}'.format(value))
        self._options.vad_frames_context = value

    @property
    def proportion_threshold(self):
        """Proportion of frames beyond the energy threshold

        Parameter controlling the proportion of frames within the
        window that need to have more energy than the threshold. Must
        be in ]0, 1[.

        """
        return np.float32(self._options.vad_proportion_threshold)

    @proportion_threshold.setter
    def proportion_threshold(self, value):
        # both bounds are excluded
        if value <= 0 or value >= 1:
            raise ValueError(
                'proportion_threshold must be in ]0, 1[, it is {}'
                .format(value))
        self._options.vad_proportion_threshold = value

    @property
    def ndims(self):
        """Always 1 (the VAD output has a single value per frame)"""
        return 1

    def process(self, features):
        """Computes voice activity detection (VAD) on the input `features`

        Parameters
        ----------
        features : :class:`~shennong.features.features.Features`, shape = [n,m]
            The speech features on which to look for voiced
            frames. The first coefficient must be a log-energy (or
            equivalent). Works well with
            :class:`~shennong.features.processor.mfcc.MfccProcessor` and
            :class:`~shennong.features.processor.plp.PlpProcessor`.

        Returns
        -------
        vad : :class:`~shennong.features.features.Features`, shape = [n,1]
            The output vad features are of dtype uint8 and contain 1
            for voiced frames or 0 for unvoiced frames.

        """
        # run the Kaldi VAD on the features matrix and bring the
        # resulting per-frame decisions back as a numpy array
        data = kaldi.matrix.SubVector(
            kaldi.ivector.compute_vad_energy(
                self._options, kaldi.matrix.SubMatrix(features.data))).numpy()

        # the decisions are turned into a single uint8 column, the
        # timestamps of the input features are reused as is
        return Features(
            np.atleast_2d(data.astype(np.uint8)).T,
            features.times, properties=self.get_properties(features))