Source code for shennong.processor.base

"""This module implements the speech features extraction models (processors)

A speech features processor takes an audio signal as input and output features:

    :class:`~shennong.audio.Audio` --> FeaturesProcessor -->
    :class:`~shennong.features.Features`

"""

import abc
import kaldi.feat.window
import kaldi.feat.mel
import joblib
import numpy as np

from shennong import Features, FeaturesCollection
from shennong.base import BaseProcessor
from shennong.utils import get_njobs


[docs]class FeaturesProcessor(BaseProcessor, metaclass=abc.ABCMeta):
    """Base class of all the features extraction models"""
    @abc.abstractproperty
    def name(self):  # pragma: nocover
        """Name of the processor"""

    @abc.abstractproperty
    def ndims(self):  # pragma: nocover
        """Dimension of the output features frames"""

[docs]    def get_properties(self, **kwargs):
        """Return the processors properties as a dictionary"""
        params = self.get_params()
        params.update(kwargs)
        return {
            'pipeline': [
                {'name': self.name, 'columns': [0, self.ndims-1]}],
            self.name: params}

[docs]    @abc.abstractmethod
    def process(self, signal):
        """Returns features processed from an input `signal`

        Parameters
        ----------
        signal: :class`~shennong.audio.Audio`
            The input audio signal to process features on

        Returns
        -------
        features: :class:`~shennong.features.Features`
            The computed features

        """

[docs]    def process_all(self, utterances, njobs=None, **kwargs):
        """Returns features processed from several input `utterances`

        This function processes the features in parallel jobs.

        Parameters
        ----------
        utterances: :class`~shennong.uttterances.Utterances`
            The utterances on which to process features on.
        njobs: int, optional
            The number of parallel jobs to run in background. Default
            to the number of CPU cores available on the machine.
        **kwargs: dict, optional
            Extra arguments to be forwarded to the `process` method. Keys must
            be the same as for `utterances`.

        Returns
        -------
        features: :class:`~shennong.features_collection.FeaturesCollection`
            The computed features on each input signal. The keys of
            output `features` are the keys of the input `utterances`.

        Raises
        ------
        ValueError
            If the `njobs` parameter is <= 0 or if an entry is missing in
            optioanl kwargs.

        """
        # checks the number of background jobs
        njobs = get_njobs(njobs, log=self.log)

        # check the extra arguments
        for name, value in kwargs.items():
            if not isinstance(value, dict):
                raise ValueError(
                    f'argument "{name}" is not a dict')
            if value.keys() != utterances.by_name().keys():
                raise ValueError(
                    f'utterances and "{name}" have different names')

        def _process_one(utterance, **kwargs):
            return utterance.name, self.process(
                utterance.load_audio(),
                **{k: v[utterance.name] for k, v in kwargs.items()})

        verbose = 8 if self.log.getEffectiveLevel() > 10 else 0

        return FeaturesCollection(joblib.Parallel(
            n_jobs=njobs, verbose=verbose, prefer='threads')(
                joblib.delayed(_process_one)(utt, **kwargs)
                for utt in utterances))


[docs]class FramesProcessor(FeaturesProcessor, metaclass=abc.ABCMeta):
    """A base class for frame based features processors.

    Wrap the kaldi frames implementation. See [kaldi-frame]_.

    References
    ----------

    .. [kaldi-frame]
       http://kaldi-asr.org/doc/structkaldi_1_1FrameExtractionOptions.html

    """
    def __init__(self, sample_rate=16000, frame_shift=0.01,
                 frame_length=0.025, dither=1.0, preemph_coeff=0.97,
                 remove_dc_offset=True, window_type='povey',
                 round_to_power_of_two=True, blackman_coeff=0.42,
                 snip_edges=True):
        super().__init__()

        # frame extraction options
        self._frame_options = kaldi.feat.window.FrameExtractionOptions()
        self.sample_rate = sample_rate
        self.frame_shift = frame_shift
        self.frame_length = frame_length
        self.dither = dither
        self.preemph_coeff = preemph_coeff
        self.remove_dc_offset = remove_dc_offset
        self.window_type = window_type
        self.round_to_power_of_two = round_to_power_of_two
        self.blackman_coeff = blackman_coeff
        self.snip_edges = snip_edges

    @property
    def sample_rate(self):
        """Waveform sample frequency in Hertz

        Must match the sample rate of the signal specified in
        `process`

        """
        return np.float32(self._frame_options.samp_freq)

    @sample_rate.setter
    def sample_rate(self, value):
        self._frame_options.samp_freq = value

    @property
    def frame_shift(self):
        """Frame shift in seconds"""
        return np.float32(self._frame_options.frame_shift_ms / 1000.0)

    @frame_shift.setter
    def frame_shift(self, value):
        self._frame_options.frame_shift_ms = value * 1000.0

    @property
    def frame_length(self):
        """Frame length in seconds"""
        return np.float32(self._frame_options.frame_length_ms / 1000.0)

    @frame_length.setter
    def frame_length(self, value):
        self._frame_options.frame_length_ms = value * 1000.0

    @property
    def dither(self):
        """Amount of dithering

        0.0 means no dither

        """
        return np.float32(self._frame_options.dither)

    @dither.setter
    def dither(self, value):
        self._frame_options.dither = value

    @property
    def preemph_coeff(self):
        """Coefficient for use in signal preemphasis"""
        return np.float32(self._frame_options.preemph_coeff)

    @preemph_coeff.setter
    def preemph_coeff(self, value):
        self._frame_options.preemph_coeff = value

    @property
    def remove_dc_offset(self):
        """If True, subtract mean from waveform on each frame"""
        return self._frame_options.remove_dc_offset

    @remove_dc_offset.setter
    def remove_dc_offset(self, value):
        self._frame_options.remove_dc_offset = value

    @property
    def window_type(self):
        """Type of window

        Must be 'hamming', 'hanning', 'povey', 'rectangular' or
        'blackman'

        """
        return self._frame_options.window_type

    @window_type.setter
    def window_type(self, value):
        windows = ['hamming', 'hanning', 'povey', 'rectangular', 'blackman']
        if value not in windows:
            raise ValueError(
                'window type must be in {}, it is {}'.format(windows, value))
        self._frame_options.window_type = value

    @property
    def round_to_power_of_two(self):
        """If true, round window size to power of two

        This is done by zero-padding input to FFT

        """
        return self._frame_options.round_to_power_of_two

    @round_to_power_of_two.setter
    def round_to_power_of_two(self, value):
        self._frame_options.round_to_power_of_two = value

    @property
    def blackman_coeff(self):
        """Constant coefficient for generalized Blackman window

        Used only if `window_type` is 'blackman'

        """
        return np.float32(self._frame_options.blackman_coeff)

    @blackman_coeff.setter
    def blackman_coeff(self, value):
        self._frame_options.blackman_coeff = value

    @property
    def snip_edges(self):
        """If true, output only frames that completely fit in the file

        When True the number of frames depends on the `frame_length`.
        If False, the number of frames depends only on the
        `frame_shift`, and we reflect the data at the ends.

        """
        return self._frame_options.snip_edges

    @snip_edges.setter
    def snip_edges(self, value):
        self._frame_options.snip_edges = value

[docs]    def times(self, nframes):
        """Returns the times label for the rows given by :func:`process`"""
        return np.vstack((
            np.arange(nframes) * self.frame_shift,
            np.arange(nframes) * self.frame_shift + self.frame_length)).T


[docs]class MelFeaturesProcessor(FramesProcessor):
    """A base class for mel-based features processors

    The mel-based features are MFCC, PLP and filterbanks. The class
    implement common options for processing those features. See
    [kaldi-mel]_ and [kaldi-frame-2]_.

    References
    ----------

    .. [kaldi-frame-2]
       http://kaldi-asr.org/doc/structkaldi_1_1FrameExtractionOptions.html

    .. [kaldi-mel]
       http://kaldi-asr.org/doc/structkaldi_1_1MelBanksOptions.html

    """
    def __init__(self, sample_rate=16000, frame_shift=0.01,
                 frame_length=0.025, dither=1.0, preemph_coeff=0.97,
                 remove_dc_offset=True, window_type='povey',
                 round_to_power_of_two=True, blackman_coeff=0.42,
                 snip_edges=True, num_bins=23, low_freq=20,
                 high_freq=0, vtln_low=100, vtln_high=-500):
        # init of FramesProcessor parent
        super().__init__(
            sample_rate=sample_rate,
            frame_shift=frame_shift,
            frame_length=frame_length,
            dither=dither,
            preemph_coeff=preemph_coeff,
            remove_dc_offset=remove_dc_offset,
            window_type=window_type,
            round_to_power_of_two=round_to_power_of_two,
            blackman_coeff=blackman_coeff,
            snip_edges=snip_edges)

        # mel banks options
        self._mel_options = kaldi.feat.mel.MelBanksOptions()
        self.num_bins = num_bins
        self.low_freq = low_freq
        self.high_freq = high_freq
        self.vtln_low = vtln_low
        self.vtln_high = vtln_high

    @property
    def num_bins(self):
        """Number of triangular mel-frequency bins

        The minimal number of bins is 3

        """
        return self._mel_options.num_bins

    @num_bins.setter
    def num_bins(self, value):
        self._mel_options.num_bins = value

    @property
    def low_freq(self):
        """Low cutoff frequency for mel bins in Hertz"""
        return np.float32(self._mel_options.low_freq)

    @low_freq.setter
    def low_freq(self, value):
        self._mel_options.low_freq = value

    @property
    def high_freq(self):
        """High cutoff frequency for mel bins in Hertz

        If `high_freq` < 0, offset from the Nyquist frequency

        """
        return np.float32(self._mel_options.high_freq)

    @high_freq.setter
    def high_freq(self, value):
        self._mel_options.high_freq = value

    @property
    def vtln_low(self):
        """Low inflection point in piecewise linear VTLN warping function

        In Hertz

        """
        return np.float32(self._mel_options.vtln_low)

    @vtln_low.setter
    def vtln_low(self, value):
        self._mel_options.vtln_low = value

    @property
    def vtln_high(self):
        """High inflection point in piecewise linear VTLN warping function

        In Hertz. If `vtln_high` < 0, offset from `high_freq`

        """
        return np.float32(self._mel_options.vtln_high)

    @vtln_high.setter
    def vtln_high(self, value):
        self._mel_options.vtln_high = value

[docs]    def process(self, signal, vtln_warp=1.0):
        """Compute features with the specified options

        Do an optional feature-level vocal tract length normalization
        (VTLN) when `vtln_warp` != 1.0.

        Parameters
        ----------
        signal : Audio, shape = [nsamples, 1]
            The input audio signal to compute the features on, must be
            mono
        vtln_warp : float, optional
            The VTLN warping factor to be applied when computing
            features. Be 1.0 by default, meaning no warping is to be
            done.

        Returns
        -------
        features : `Features`, shape = [nframes, `ndims`]
            The computed features, output will have as many rows as there
            are frames (depends on the specified options `frame_shift`
            and `frame_length`).

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono). If `sample_rate` != `signal.sample_rate`.

        """
        return self._process(self._kaldi_processor, signal, vtln_warp)

    def _process(self, cls, signal, vtln_warp):
        """Inner process method common to all Kaldi Mel processors"""
        # ensure the signal is correct
        if signal.nchannels != 1:
            raise ValueError(
                'signal must have one dimension, but it has {}'
                .format(signal.nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError(
                'processor and signal mismatch in sample rates: '
                '{} != {}'.format(self.sample_rate, signal.sample_rate))

        # we need to forward options (because the assignation here is
        # done by copy, not by reference. If the user do 'p =
        # Processor(); p.dither = 0', this is forwarded to Kaldi here)
        self._options.frame_opts = self._frame_options
        self._options.mel_opts = self._mel_options

        # force 16 bits integers
        signal = signal.astype(np.int16).data
        data = kaldi.matrix.SubMatrix(
            cls(self._options).compute(
                kaldi.matrix.SubVector(signal), vtln_warp)).numpy()

        return Features(
            data,
            self.times(data.shape[0]),
            properties=self.get_properties(vtln_warp=vtln_warp))