"""Extraction of spectrogram from audio signals
Extract spectrogram (log of the power spectrum) from an audio
signal. Uses the Kaldi implementation (see [kaldi-spec]_):

    :class:`~shennong.audio.Audio` ---> SpectrogramProcessor \
    ---> :class:`~shennong.features.features.Features`

Examples
--------

>>> from shennong.audio import Audio
>>> from shennong.features.processor.spectrogram import SpectrogramProcessor
>>> audio = Audio.load('./test/data/test.wav')

Initialize the spectrogram processor with some options and compute the
features:

>>> processor = SpectrogramProcessor(sample_rate=audio.sample_rate)
>>> processor.window_type = 'hanning'
>>> spect = processor.process(audio)
>>> spect.shape
(140, 257)

References
----------

.. [kaldi-spec] http://kaldi-asr.org/doc/classkaldi_1_1SpectrogramComputer.html

"""
import kaldi.feat.spectrogram
import numpy as np
from shennong.features import Features
from shennong.features.processor.base import FramesProcessor
class SpectrogramProcessor(FramesProcessor):
    """Spectrogram

    Computes the spectrogram (log of the power spectrum) of an audio
    signal with the Kaldi implementation.

    The framing parameters (`sample_rate`, `frame_shift`,
    `frame_length`, `dither`, `preemph_coeff`, `remove_dc_offset`,
    `window_type`, `round_to_power_of_two`, `blackman_coeff` and
    `snip_edges`) are forwarded unchanged to the parent
    :class:`FramesProcessor`. The two remaining parameters,
    `energy_floor` and `raw_energy`, are stored in the underlying Kaldi
    ``SpectrogramOptions`` instance and exposed as properties.

    """
    def __init__(self, sample_rate=16000, frame_shift=0.01,
                 frame_length=0.025, dither=1.0,
                 preemph_coeff=0.97, remove_dc_offset=True,
                 window_type='povey', round_to_power_of_two=True,
                 blackman_coeff=0.42, snip_edges=True,
                 energy_floor=0.0, raw_energy=True):
        # the parent class owns the frame related options
        super().__init__(
            sample_rate=sample_rate,
            frame_shift=frame_shift,
            frame_length=frame_length,
            dither=dither,
            preemph_coeff=preemph_coeff,
            remove_dc_offset=remove_dc_offset,
            window_type=window_type,
            round_to_power_of_two=round_to_power_of_two,
            blackman_coeff=blackman_coeff,
            snip_edges=snip_edges)

        # Kaldi options specific to the spectrogram, initialized with
        # the frame options built by the parent class
        self._options = kaldi.feat.spectrogram.SpectrogramOptions()
        self._options.frame_opts = self._frame_options

        # go through the property setters so the values land in
        # self._options
        self.energy_floor = energy_floor
        self.raw_energy = raw_energy

    @property
    def name(self):
        """Name of the processor, always 'spectrogram'"""
        return 'spectrogram'

    @property
    def ndims(self):
        """Dimension of the output features

        Computed as half the padded window size plus one, truncated
        to an integer.

        """
        return int(self._frame_options.padded_window_size() / 2 + 1)

    @property
    def energy_floor(self):
        """Value of the `energy_floor` Kaldi spectrogram option"""
        return self._options.energy_floor

    @energy_floor.setter
    def energy_floor(self, value):
        self._options.energy_floor = value

    @property
    def raw_energy(self):
        """Value of the `raw_energy` Kaldi spectrogram option

        The value is converted to a boolean when set.

        """
        return self._options.raw_energy

    @raw_energy.setter
    def raw_energy(self, value):
        self._options.raw_energy = bool(value)

    def process(self, signal, vtln_warp=1.0):
        """Compute spectrogram with the specified options

        Do an optional feature-level vocal tract length normalization
        (VTLN) when `vtln_warp` != 1.0.

        Parameters
        ----------
        signal : Audio, shape = [nsamples, 1]
            The input audio signal to compute the features on, must be
            mono
        vtln_warp : float, optional
            The VTLN warping factor to be applied when computing
            features. Be 1.0 by default, meaning no warping is to be
            done.

        Returns
        -------
        features : `Features`, shape = [nframes, `ndims`]
            The computed features, output will have as many rows as there
            are frames (depends on the specified options `frame_shift`
            and `frame_length`).

        Raises
        ------
        ValueError
            If the input `signal` has more than one channel (i.e. is
            not mono). If `sample_rate` != `signal.sample_rate`.

        """
        # ensure the signal is correct
        if signal.nchannels != 1:
            raise ValueError(
                'signal must have one dimension, but it has {}'
                .format(signal.nchannels))

        if self.sample_rate != signal.sample_rate:
            raise ValueError(
                'processor and signal mismatch in sample rates: '
                '{} != {}'.format(self.sample_rate, signal.sample_rate))

        # we need to forward options (because the assignation here is
        # done by copy, not by reference. If the user do 'p =
        # Processor(); p.dither = 0', this is forwarded to Kaldi here)
        self._options.frame_opts = self._frame_options

        # force 16 bits integers
        signal = signal.astype(np.int16).data

        # run the Kaldi computation on the samples and expose the
        # resulting matrix as a numpy array
        data = kaldi.matrix.SubMatrix(
            kaldi.feat.spectrogram.Spectrogram(self._options).compute(
                kaldi.matrix.SubVector(signal), vtln_warp)).numpy()

        # one timestamp per output frame, plus the processor properties
        return Features(
            data, self.times(data.shape[0]), properties=self.get_properties())