Source code for shennong.processor.pitch_kaldi

"""Provides classes to extract pitch from an audio (speech) signal

This module provides the classes :class:`KaldiPitchProcessor` and
:class:`KaldiPitchPostProcessor`, which respectively compute the pitch from
raw speech and turn it into suitable features: they produce pitch and
probability-of-voicing estimates for use as features in automatic speech
recognition systems.

Uses the Kaldi implementation of pitch extraction and postprocessing
(see [Ghahremani2014]_ and [kaldi-pitch]_).

    :class:`~shennong.audio.Audio` ---> KaldiPitchProcessor \
    ---> KaldiPitchPostProcessor ---> :class:`~shennong.features.Features`

Examples
--------

>>> from shennong.audio import Audio
>>> from shennong.processor import (
...     KaldiPitchProcessor, KaldiPitchPostProcessor)
>>> audio = Audio.load('./test/data/test.wav')

Initialize a pitch processor with some options. Options can be
specified at construction, or after:

>>> processor = KaldiPitchProcessor(frame_shift=0.01, frame_length=0.025)
>>> processor.sample_rate = audio.sample_rate
>>> processor.min_f0 = 20
>>> processor.max_f0 = 500

Options can also be passed as a dictionary:

>>> options = {
...     'sample_rate': audio.sample_rate,
...     'frame_shift': 0.01, 'frame_length': 0.025,
...     'min_f0': 20, 'max_f0': 500}
>>> processor = KaldiPitchProcessor(**options)

Compute the pitch with the specified options, the output is an
instance of :class:`~shennong.features.Features`:

>>> pitch = processor.process(audio)
>>> type(pitch)
<class 'shennong.features.Features'>
>>> pitch.shape
(140, 2)

The pitch post-processor works in the same way, input is the pitch,
output are features usable by speech processing tools:

>>> postprocessor = KaldiPitchPostProcessor()  # use default options
>>> postpitch = postprocessor.process(pitch)
>>> postpitch.shape
(140, 3)

References
----------

.. [Ghahremani2014] `A Pitch Extraction Algorithm Tuned for Automatic
     Speech Recognition, Pegah Ghahremani, Bagher BabaAli, Daniel
     Povey, Korbinian Riedhammer, Jan Trmal and Sanjeev Khudanpur,
     ICASSP 2014`

.. [kaldi-pitch] http://kaldi-asr.org/doc/pitch-functions_8h.html

"""

import copy
import kaldi.feat.pitch
import kaldi.matrix
import numpy as np

from shennong import Features
from shennong.processor.base import FeaturesProcessor
from shennong.postprocessor.base import FeaturesPostProcessor


class KaldiPitchProcessor(FeaturesProcessor):
    """Extracts the (NCCF, pitch) per frame from a speech signal

    The output has one row per frame and two columns, (NCCF, pitch),
    where NCCF stands for Normalized Cross Correlation Function.

    Every constructor argument is exposed as a read/write property
    backed by a Kaldi ``PitchExtractionOptions`` instance.

    """
    def __init__(self, sample_rate=16000, frame_shift=0.01,
                 frame_length=0.025, min_f0=50, max_f0=400,
                 soft_min_f0=10, penalty_factor=0.1,
                 lowpass_cutoff=1000, resample_freq=4000,
                 delta_pitch=0.005, nccf_ballast=7000,
                 lowpass_filter_width=1, upsample_filter_width=5):
        super().__init__()

        # all the options below are forwarded to this Kaldi structure
        # through the property setters
        self._options = kaldi.feat.pitch.PitchExtractionOptions()

        self.sample_rate = sample_rate
        self.frame_shift = frame_shift
        self.frame_length = frame_length
        self.min_f0 = min_f0
        self.max_f0 = max_f0
        self.soft_min_f0 = soft_min_f0
        self.penalty_factor = penalty_factor
        self.lowpass_cutoff = lowpass_cutoff
        self.resample_freq = resample_freq
        self.delta_pitch = delta_pitch
        self.nccf_ballast = nccf_ballast
        self.lowpass_filter_width = lowpass_filter_width
        self.upsample_filter_width = upsample_filter_width

    @property
    def name(self):
        return 'pitch'

    @property
    def sample_rate(self):
        """Waveform sample frequency in Hertz

        Must match the sample rate of the signal specified in `process`

        """
        return self._options.samp_freq

    @sample_rate.setter
    def sample_rate(self, value):
        self._options.samp_freq = value

    @property
    def frame_shift(self):
        """Frame shift in seconds"""
        # Kaldi stores the shift in milliseconds
        return self._options.frame_shift_ms / 1000.0

    @frame_shift.setter
    def frame_shift(self, value):
        self._options.frame_shift_ms = value * 1000.0

    @property
    def frame_length(self):
        """Frame length in seconds"""
        # Kaldi stores the length in milliseconds
        return self._options.frame_length_ms / 1000.0

    @frame_length.setter
    def frame_length(self, value):
        self._options.frame_length_ms = value * 1000.0

    @property
    def min_f0(self):
        """Minimum F0 to search for in Hertz"""
        return self._options.min_f0

    @min_f0.setter
    def min_f0(self, value):
        self._options.min_f0 = value

    @property
    def max_f0(self):
        """Maximum F0 to search for in Hertz"""
        return self._options.max_f0

    @max_f0.setter
    def max_f0(self, value):
        self._options.max_f0 = value

    @property
    def soft_min_f0(self):
        """Minimum F0 to search, applied in soft way, in Hertz

        Must not exceed `min_f0`

        """
        return self._options.soft_min_f0

    @soft_min_f0.setter
    def soft_min_f0(self, value):
        self._options.soft_min_f0 = value

    @property
    def penalty_factor(self):
        """Cost factor for F0 change"""
        return np.float32(self._options.penalty_factor)

    @penalty_factor.setter
    def penalty_factor(self, value):
        self._options.penalty_factor = value

    @property
    def lowpass_cutoff(self):
        """Cutoff frequency for low-pass filter, in Hertz"""
        return self._options.lowpass_cutoff

    @lowpass_cutoff.setter
    def lowpass_cutoff(self, value):
        self._options.lowpass_cutoff = value

    @property
    def resample_freq(self):
        """Frequency that we down-sample the signal to, in Hertz

        Must be more than twice `lowpass_cutoff`

        """
        return self._options.resample_freq

    @resample_freq.setter
    def resample_freq(self, value):
        self._options.resample_freq = value

    @property
    def delta_pitch(self):
        """Smallest relative change in pitch that the algorithm measures"""
        return np.float32(self._options.delta_pitch)

    @delta_pitch.setter
    def delta_pitch(self, value):
        self._options.delta_pitch = value

    @property
    def nccf_ballast(self):
        """Increasing this factor reduces NCCF for quiet frames

        This helps ensuring pitch continuity in unvoiced regions

        """
        return self._options.nccf_ballast

    @nccf_ballast.setter
    def nccf_ballast(self, value):
        self._options.nccf_ballast = value

    @property
    def lowpass_filter_width(self):
        """Integer that determines filter width of lowpass filter

        More gives sharper filter

        """
        return self._options.lowpass_filter_width

    @lowpass_filter_width.setter
    def lowpass_filter_width(self, value):
        self._options.lowpass_filter_width = value

    @property
    def upsample_filter_width(self):
        """Integer that determines filter width when upsampling NCCF"""
        return self._options.upsample_filter_width

    @upsample_filter_width.setter
    def upsample_filter_width(self, value):
        self._options.upsample_filter_width = value

    @property
    def ndims(self):
        return 2

    def times(self, nframes):
        """Returns the time label for the rows given by the `process` method

        The result has shape (nframes, 2): column 0 is ``i *
        frame_shift`` and column 1 is ``i * frame_shift +
        frame_length``, for each frame index ``i``.

        """
        starts = np.arange(nframes) * self.frame_shift
        stops = starts + self.frame_length
        return np.vstack((starts, stops)).T

    def process(self, signal):
        """Extracts the (NCCF, pitch) from a given speech `signal`

        Parameters
        ----------
        signal : Audio
            The speech signal on which to estimate the pitch. Its
            sample rate must be equal to the `sample_rate` option of
            this processor.

        Returns
        -------
        raw_pitch_features : Features, shape = [nframes, 2]
            One row per frame (the number of frames depends on the
            `frame_shift` and `frame_length` options) and two columns
            corresponding to (NCCF, pitch).

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono). If `sample_rate` != `signal.sample_rate`.

        """
        nchannels = signal.nchannels
        if nchannels != 1:
            raise ValueError(
                'audio signal must have one channel, but it has {}'
                .format(nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError(
                'processor and signal mismatch in sample rates: '
                '{} != {}'.format(self.sample_rate, signal.sample_rate))

        # force 16 bits integers
        samples = signal.astype(np.int16).data

        # compute the pitch with Kaldi and expose it as a numpy array
        raw_pitch = kaldi.feat.pitch.compute_kaldi_pitch(
            self._options, kaldi.matrix.SubVector(samples))
        data = kaldi.matrix.SubMatrix(raw_pitch).numpy()

        return Features(
            data,
            self.times(data.shape[0]),
            properties=self.get_properties())


class KaldiPitchPostProcessor(FeaturesPostProcessor):
    """Processes the raw (NCCF, pitch) computed by the KaldiPitchProcessor

    Turns the raw pitch quantities into usable features. By default it
    will output three-dimensional features, (POV-feature,
    mean-subtracted-log-pitch, delta-of-raw-pitch), but this is
    configurable in the options.

    The output has as many rows as the input (one per frame). The
    number of columns is the number of feature types requested (by
    default, 3; 4 is the max). The four parameters `add_pov_feature`,
    `add_normalized_log_pitch`, `add_delta_pitch`, `add_raw_log_pitch`
    determine which features we create; by default we create the
    first three.

    POV stands for Probability of Voicing.

    """
    def __init__(self, pitch_scale=2.0, pov_scale=2.0, pov_offset=0.0,
                 delta_pitch_scale=10.0, delta_pitch_noise_stddev=0.005,
                 normalization_left_context=75,
                 normalization_right_context=75,
                 delta_window=2, delay=0,
                 add_pov_feature=True, add_normalized_log_pitch=True,
                 add_delta_pitch=True, add_raw_log_pitch=False):
        super().__init__()

        # all the options below are forwarded to this Kaldi structure
        # through the property setters
        self._options = kaldi.feat.pitch.ProcessPitchOptions()

        self.pitch_scale = pitch_scale
        self.pov_scale = pov_scale
        self.pov_offset = pov_offset
        self.delta_pitch_scale = delta_pitch_scale
        self.delta_pitch_noise_stddev = delta_pitch_noise_stddev
        self.normalization_left_context = normalization_left_context
        self.normalization_right_context = normalization_right_context
        self.delta_window = delta_window
        self.delay = delay
        self.add_pov_feature = add_pov_feature
        self.add_normalized_log_pitch = add_normalized_log_pitch
        self.add_delta_pitch = add_delta_pitch
        self.add_raw_log_pitch = add_raw_log_pitch

    @property
    def name(self):
        return 'pitch postprocessing'

    @property
    def pitch_scale(self):
        """Scaling factor for the final normalized log-pitch value"""
        return self._options.pitch_scale

    @pitch_scale.setter
    def pitch_scale(self, value):
        self._options.pitch_scale = value

    @property
    def pov_scale(self):
        """Scaling factor for final probability of voicing feature"""
        return self._options.pov_scale

    @pov_scale.setter
    def pov_scale(self, value):
        self._options.pov_scale = value

    @property
    def pov_offset(self):
        """This can be used to add an offset to the POV feature

        Intended for use in Kaldi's online decoding as a substitute
        for CMN (cepstral mean normalization)

        """
        return self._options.pov_offset

    @pov_offset.setter
    def pov_offset(self, value):
        self._options.pov_offset = value

    @property
    def delta_pitch_scale(self):
        """Term to scale the final delta log-pitch feature"""
        return self._options.delta_pitch_scale

    @delta_pitch_scale.setter
    def delta_pitch_scale(self, value):
        self._options.delta_pitch_scale = value

    @property
    def delta_pitch_noise_stddev(self):
        """Standard deviation for noise we add to the delta log-pitch

        The stddev is added before scaling. Should be about the same
        as delta-pitch option to pitch creation. The purpose is to get
        rid of peaks in the delta-pitch caused by discretization of
        pitch values.

        """
        return np.float32(self._options.delta_pitch_noise_stddev)

    @delta_pitch_noise_stddev.setter
    def delta_pitch_noise_stddev(self, value):
        self._options.delta_pitch_noise_stddev = value

    @property
    def normalization_left_context(self):
        """Left-context (in frames) for moving window normalization"""
        return self._options.normalization_left_context

    @normalization_left_context.setter
    def normalization_left_context(self, value):
        self._options.normalization_left_context = value

    @property
    def normalization_right_context(self):
        """Right-context (in frames) for moving window normalization"""
        return self._options.normalization_right_context

    @normalization_right_context.setter
    def normalization_right_context(self, value):
        self._options.normalization_right_context = value

    @property
    def delta_window(self):
        """Number of frames on each side of central frame"""
        return self._options.delta_window

    @delta_window.setter
    def delta_window(self, value):
        self._options.delta_window = value

    @property
    def delay(self):
        """Number of frames by which the pitch information is delayed"""
        return self._options.delay

    @delay.setter
    def delay(self, value):
        self._options.delay = value

    @property
    def add_pov_feature(self):
        """If true, the warped NCCF is added to output features"""
        return self._options.add_pov_feature

    @add_pov_feature.setter
    def add_pov_feature(self, value):
        self._options.add_pov_feature = value

    @property
    def add_normalized_log_pitch(self):
        """If true, the normalized log-pitch is added to output features

        Normalization is done with POV-weighted mean subtraction over
        1.5 second window.

        """
        return self._options.add_normalized_log_pitch

    @add_normalized_log_pitch.setter
    def add_normalized_log_pitch(self, value):
        self._options.add_normalized_log_pitch = value

    @property
    def add_delta_pitch(self):
        """If true, time derivative of log-pitch is added to output features"""
        return self._options.add_delta_pitch

    @add_delta_pitch.setter
    def add_delta_pitch(self, value):
        self._options.add_delta_pitch = value

    @property
    def add_raw_log_pitch(self):
        """If true, log(pitch) is added to output features"""
        return self._options.add_raw_log_pitch

    @add_raw_log_pitch.setter
    def add_raw_log_pitch(self, value):
        self._options.add_raw_log_pitch = value

    @property
    def ndims(self):
        """Number of output columns, one per enabled `add_*` option"""
        return (
            self.add_pov_feature +
            self.add_normalized_log_pitch +
            self.add_delta_pitch +
            self.add_raw_log_pitch)

    def get_properties(self, features):
        """Returns the properties of the post-processed `features`

        Works on a deep copy of ``features.properties`` so the input
        is left untouched. The parameters of this post-processor (as
        returned by ``get_params()``) are stored under
        ``properties['pitch'][self.name]`` and the ``'columns'`` entry
        of the first pipeline step is set to ``[0, ndims - 1]``.

        The input properties must already contain the ``'pitch'`` and
        ``'pipeline'`` entries (presumably set by the
        `KaldiPitchProcessor` -- verify against the base class).

        """
        properties = copy.deepcopy(features.properties)
        properties['pitch'][self.name] = self.get_params()
        properties['pipeline'][0]['columns'] = [0, self.ndims - 1]
        return properties

    def process(self, raw_pitch):
        """Post process a raw pitch data as specified by the options

        Parameters
        ----------
        raw_pitch : Features, shape = [n, 2]
            The pitch as extracted by the
            `KaldiPitchProcessor.process` method

        Returns
        -------
        pitch : Features, shape = [n, 1 2 3 or 4]
            The post-processed pitch usable as speech features. The
            output columns are 'pov_feature', 'normalized_log_pitch',
            'delta_pitch' and 'raw_log_pitch', in that order, if their
            respective options are set to True.

        Raises
        ------
        ValueError
            If `raw_pitch` has not exactly two columns. If all the
            following options are False: 'add_pov_feature',
            'add_normalized_log_pitch', 'add_delta_pitch' and
            'add_raw_log_pitch' (at least one of them must be True).

        """
        # check at least one required option is True
        if not (self.add_pov_feature or self.add_normalized_log_pitch
                or self.add_delta_pitch or self.add_raw_log_pitch):
            raise ValueError(
                'at least one of the following options must be True: '
                'add_pov_feature, add_normalized_log_pitch, '
                'add_delta_pitch, add_raw_log_pitch')

        if raw_pitch.shape[1] != 2:
            raise ValueError(
                'data shape must be (_, 2), but it is (_, {})'
                .format(raw_pitch.shape[1]))

        # post-process with Kaldi and expose the result as a numpy array
        data = kaldi.matrix.SubMatrix(
            kaldi.feat.pitch.process_pitch(
                self._options,
                kaldi.matrix.SubMatrix(raw_pitch.data))).numpy()

        # the frames are unchanged so the input timestamps are reused
        return Features(
            data, raw_pitch.times,
            properties=self.get_properties(raw_pitch))