Source code for shennong.features.postprocessor.vad

"""Compute Voice Activity Detection (VAD) on features log-energy

    :class:`~shennong.features.features.Features` -->
    VadPostProcessor -->
    :class:`~shennong.features.features.Features`

Compute voice-activity detection for speech features using the Kaldi
implementation (see [kaldi-vad]_). The output is, for each input frame,
1 if we judge the frame as voiced, 0 otherwise. There are no
continuity constraints.

This method is a very simple energy-based method which only looks at
the first coefficient of the input features, which is assumed to be
**a log-energy or something similar**. If working from the raw signal,
extract the energy using
:class:`~shennong.features.processor.energy.EnergyProcessor`.

A cutoff is set, we use a formula of the general type:

.. math::

   \\textrm{cutoff} = 5.0 + 0.5 * (\\textrm{average log-energy}),

and for each frame the decision is based on the proportion of frames
in a context window around the current frame, which are above this
cutoff.

.. note::

   This code is geared toward speaker-id applications and is not
   suitable for automatic speech recognition (ASR) because it makes
   independent decisions for each frame without imposing any notion
   of continuity.

Examples
--------

>>> import numpy as np
>>> from shennong.audio import Audio
>>> from shennong.features.processor.mfcc import MfccProcessor
>>> from shennong.features.postprocessor.vad import VadPostProcessor
>>> audio = Audio.load('./test/data/test.wav')
>>> mfcc = MfccProcessor().process(audio)

Computes the voice activity detection on the extracted MFCCs:

>>> processor = VadPostProcessor()
>>> vad = processor.process(mfcc)

For each frame of the MFCCs, vad is 1 if detected as a voiced frame,
0 otherwise:

>>> nframes = mfcc.shape[0]
>>> vad.shape == (nframes, 1)
True
>>> nvoiced = sum(vad.data[vad.data == 1])
>>> print('{} voiced frames out of {}'.format(nvoiced, nframes))
119 voiced frames out of 140


References
----------

.. [kaldi-vad] https://kaldi-asr.org/doc/voice-activity-detection_8h.html

"""

import kaldi.matrix
import kaldi.ivector
import numpy as np

from shennong.features import Features
from shennong.features.postprocessor.base import FeaturesPostProcessor


class VadPostProcessor(FeaturesPostProcessor):
    """Energy-based voice activity detection on speech features

    Every parameter is stored in a Kaldi ``VadEnergyOptions`` instance
    and exposed through a property; :func:`process` hands those options
    to ``kaldi.ivector.compute_vad_energy``.

    """
    def __init__(self, energy_threshold=5.0, energy_mean_scale=0.5,
                 frames_context=0, proportion_threshold=0.6):
        # The Kaldi options object is the only place where parameters
        # are stored. Assignments below go through the property
        # setters, so the validated ones reject bad values right away.
        self._options = kaldi.ivector.VadEnergyOptions()

        self.energy_threshold = energy_threshold
        self.energy_mean_scale = energy_mean_scale
        self.frames_context = frames_context
        self.proportion_threshold = proportion_threshold

    @property
    def name(self):
        return 'vad'

    @property
    def energy_threshold(self):
        """Constant term of the energy threshold applied to MFCC0

        See also :func:`energy_mean_scale`.

        """
        threshold = self._options.vad_energy_threshold
        return np.float32(threshold)

    @energy_threshold.setter
    def energy_threshold(self, value):
        self._options.vad_energy_threshold = value

    @property
    def energy_mean_scale(self):
        """Scale factor applied to the mean log-energy

        With a scale `s` and `m` the mean log-energy of the file, the
        actual threshold is `s*m +` :func:`energy_threshold`. Must be
        greater or equal to 0.

        """
        scale = self._options.vad_energy_mean_scale
        return np.float32(scale)

    @energy_mean_scale.setter
    def energy_mean_scale(self, value):
        if value < 0:
            message = 'Energy mean scale must be >= 0, it is {}'.format(value)
            raise ValueError(message)

        self._options.vad_energy_mean_scale = value

    @property
    def frames_context(self):
        """Number of context frames on each side of the central frame

        Energy is monitored on a window of `2 * frames_context + 1`
        frames. Must be greater or equal to 0.

        """
        return self._options.vad_frames_context

    @frames_context.setter
    def frames_context(self, value):
        if value < 0:
            message = 'frames_context must be >= 0, it is {}'.format(value)
            raise ValueError(message)

        self._options.vad_frames_context = value

    @property
    def proportion_threshold(self):
        """Proportion of frames beyond the energy threshold

        Controls which proportion of the frames within the window must
        have more energy than the threshold. Must be in ]0, 1[.

        """
        proportion = self._options.vad_proportion_threshold
        return np.float32(proportion)

    @proportion_threshold.setter
    def proportion_threshold(self, value):
        # both bounds are excluded from the valid range
        if value <= 0 or value >= 1:
            message = (
                'proportion_threshold must be in ]0, 1[, it is {}'
                .format(value))
            raise ValueError(message)

        self._options.vad_proportion_threshold = value

    @property
    def ndims(self):
        # NOTE(review): presumably the output dimension, process()
        # yields a single column -- confirm against the base class
        return 1

    def process(self, features):
        """Detects the voiced frames of the input `features`

        Parameters
        ----------
        features : :class:`~shennong.features.features.Features`, shape = [n,m]
            The speech features in which voiced frames are searched.
            Their first coefficient must be a log-energy (or
            equivalent). Works well with
            :class:`~shennong.features.processor.mfcc.MfccProcessor` and
            :class:`~shennong.features.processor.plp.PlpProcessor`.

        Returns
        -------
        vad : :class:`~shennong.features.features.Features`, shape = [n,1]
            Features of dtype uint8 holding 1 for voiced frames and 0
            for unvoiced frames.

        """
        options = self._options
        kaldi_input = kaldi.matrix.SubMatrix(features.data)

        # Kaldi yields one decision per input frame
        decisions = kaldi.ivector.compute_vad_energy(options, kaldi_input)
        voiced = kaldi.matrix.SubVector(decisions).numpy()

        # expose the decisions as a single uint8 column, shape [n,1]
        column = np.atleast_2d(voiced.astype(np.uint8)).T

        return Features(
            column,
            features.times,
            properties=self.get_properties(features))