# coding: utf-8
###############################################################################
# #
# copyright (C) 2017 by Anna Silnova, Pavel Matejka, Oldrich Plchot, #
# Frantisek Grezl #
# #
# Brno Universioty of Technology #
# Faculty of information technology #
# Department of Computer Graphics and Multimedia #
# email: {isilnova,matejkap,iplchot,grezl}@vut.cz #
# #
###############################################################################
# #
# This software and provided models can be used freely for research #
# and educational purposes. For any other use, please contact BUT #
# and / or LDC representatives. #
# #
###############################################################################
# #
# Licensed under the Apache License, Version 2.0 (the "License"); #
# you may not use this file except in compliance with the License. #
# You may obtain a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
# #
###############################################################################
# #
# Adaptation for shennong made under GPL3 licence by Mathieu Bernard #
# <mathieu.a.bernard@inria.fr>. Original code available at #
# speech.fit.vutbr.cz/software/but-phonexia-bottleneck-feature-extractor. #
# Updated code (git repo, python3 compatibility, improvements) available at #
# https://gitlab.coml.lscp.ens.fr/mbernard/bottleneckfeatureextractor #
# #
###############################################################################
"""Extraction of bottleneck features from a speech signal
:class:`~shennong.audio.Audio` ---> BottleneckProcessor \
---> :class:`~shennong.features.features.Features`
This module provides the class
:class:`~shennong.features.processor.bottleneck.BottleneckProcessor`
which computes stacked bottleneck features from audio signals (see
[Silnova2018]_ and [Fer2017]_). This is an adpatation of the original
code released on [bottleneck-site]_. Features are extracted from one
of the three provided pre-trained neural networks:
* *FisherMono*: Trained on Fisher English (parts 1 and 2 datasets,
about 2000 hours of clean telephone speech) with 120 phoneme states
as output classes (40 phonemes, 3 state for each phoneme).
* *FisherTri*: Trained on the same datasets as *FisherMono*, with 2423
triphones as output classes.
* *BabelMulti*: Trained on 17 languages from the IARPA
[BABEL-project]_, with 3096 output classes (3 phoneme states per
each language stacked together).
Examples
--------
Compute bottleneck features on some speech using the multilingual
network (*BabelMulti*):
>>> from shennong.audio import Audio
>>> from shennong.features.processor.bottleneck import BottleneckProcessor
>>> audio = Audio.load('./test/data/test.wav')
>>> processor = BottleneckProcessor(weights='BabelMulti')
>>> features = processor.process(audio)
>>> features.shape
(140, 80)
References
----------
.. [bottleneck-site]
https://speech.fit.vutbr.cz/software/but-phonexia-bottleneck-feature-extractor
.. [BABEL-project]
https://www.iarpa.gov/index.php/research-programs/babel
.. [Silnova2018] Anna Silnova, Pavel Matejka, Ondrej Glembek, Oldrich
Plchot, Ondrej Novotny, Frantisek Grezl, Petr Schwarz, Lukas
Burget, Jan “Honza” Cernocky, "BUT/Phonexia Bottleneck Feature
Extractor", Submitted to Odyssey: The Speaker and Language
Recognition Workshop 2018
.. [Fer2017] Fér Radek, Matějka Pavel, Grézl František, Plchot
Oldřich, Veselý Karel and Černocký Jan. Multilingually Trained
Bottleneck Features in Spoken Language Recognition. Computer
Speech and Language. Amsterdam: Elsevier Science, 2017,
vol. 2017, no. 46, pp. 252-267.
"""
import os
import pkg_resources
import warnings
import numpy as np
import scipy.linalg as spl
import scipy.fftpack
from shennong.features import Features
from shennong.features.processor.base import FeaturesProcessor
def _add_dither(signal, level):
return signal + level * (np.random.rand(*signal.shape) * 2 - 1)
def _mel_inv(x):
return (np.exp(x/1127.)-1.)*700.
def _mel(x):
return 1127.*np.log(1. + x/700.)
def _framing(a, window, shift=1):
shape = (int((a.shape[0] - window) / shift + 1), window) + a.shape[1:]
strides = (a.strides[0] * shift, a.strides[0]) + a.strides[1:]
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
def _mel_fbank_mx(winlen_nfft, fs, numchans=20, lofreq=0.0, hifreq=None):
"""Returns mel filterbank as an array shaped (nfft/2+1, numchans)
Parameters
----------
winlen_nfft : int
Typically the window length as used in mfcc_htk() call. It is
used to determine number of samples for FFT computation
(NFFT). If positive, the value (window lenght) is rounded up
to the next higher power of two to obtain HTK-compatible NFFT.
If negative, NFFT is set to -winlen_nfft. In such case, the
parameter nfft in mfcc_htk() call should be set likewise.
fs : int
sampling frequency (in Hz)
numchans : int, optional
number of filter bank bands (default to 20)
lofreq : float, optional
frequency (Hz) where the first filter strats (default to 0.0)
hifreq : float, optional
frequency (Hz) where the last filter ends (default fs/2)
"""
if not hifreq:
hifreq = 0.5 * fs
nfft = (2**int(np.ceil(np.log2(winlen_nfft)))
if winlen_nfft > 0 else -int(winlen_nfft))
fbin_mel = _mel(np.arange(nfft / 2 + 1, dtype=float) * fs / nfft)
cbin_mel = np.linspace(_mel(lofreq), _mel(hifreq), numchans + 2)
cind = np.floor(_mel_inv(cbin_mel) / fs * nfft).astype(int) + 1
mfb = np.zeros((len(fbin_mel), numchans))
for i in range(numchans):
mfb[cind[i]:cind[i+1], i] = (
cbin_mel[i] - fbin_mel[cind[i]:cind[i+1]]) / (
cbin_mel[i] - cbin_mel[i+1])
mfb[cind[i+1]:cind[i+2], i] = (
cbin_mel[i+2] - fbin_mel[cind[i+1]:cind[i+2]]) / (
cbin_mel[i+2] - cbin_mel[i+1])
if lofreq > 0.0 and float(lofreq) / fs*nfft+0.5 > cind[0]:
mfb[cind[0], :] = 0.0 # Just to be HTK compatible
return mfb
def _fbank_htk(x, window, noverlap, fbank_mx):
"""Mel log Mel-filter bank channel outputs
Returns numchans-by-M matrix of log Mel-filter bank outputs extracted from
signal x, where M is the number of extracted frames, which can be computed
as floor((length(x)-noverlap)/(window-noverlap)).
Parameters
----------
x : array
input signal
window : int
frame window lentgth (in samples,
i.e. window_size/source_rate) or vector of widow weights
noverlap : int
overlapping between frames (in samples, i.e window -
target_rate/source_rate)
fbank_mx : array
array with (Mel) filter bank (as returned by function
:func:`mel_fbank_mx`)
"""
if np.isscalar(window):
window = np.hamming(window)
nfft = 2 ** int(np.ceil(np.log2(window.size)))
x = _framing(x.astype("float"), window.size, window.size-noverlap).copy()
x *= window
# inhibit a FutureWarning
with warnings.catch_warnings():
warnings.simplefilter('ignore', category=FutureWarning)
x = np.fft.rfft(x, nfft)
x = x.real**2 + x.imag**2
x = np.log(np.maximum(1.0, np.dot(x, fbank_mx)))
return x
def _uppertri_indices(dim, isdiag=False):
""" [utr utc]=uppertri_indices(D, isdiag) returns row and column indices
into upper triangular part of DxD matrices. Indices go in zigzag feshinon
starting by diagonal. For convenient encoding of diagonal matrices, 1:D
ranges are returned for both outputs utr and utc when ISDIAG is true.
"""
if isdiag:
utr = np.arange(dim)
utc = np.arange(dim)
else:
utr = np.hstack([np.arange(ii) for ii in range(dim, 0, -1)])
utc = np.hstack([np.arange(ii, dim) for ii in range(dim)])
return utr, utc
def _uppertri_to_sym(covs_ut2d, utr, utc):
""" covs = uppertri_to_sym(covs_ut2d) reformat vectorized upper triangual
matrices efficiently stored in columns of 2D matrix into full symmetric
matrices stored in 3rd dimension of 3D matrix
"""
(ut_dim, n_mix) = covs_ut2d.shape
dim = (np.sqrt(1 + 8 * ut_dim) - 1) / 2
covs_full = np.zeros((dim, dim, n_mix), dtype=covs_ut2d.dtype)
for ii in range(n_mix):
covs_full[:, :, ii][(utr, utc)] = covs_ut2d[:, ii]
covs_full[:, :, ii][(utc, utr)] = covs_ut2d[:, ii]
return covs_full
def _uppertri1d_from_sym(cov_full, utr, utc):
return cov_full[(utr, utc)]
def _uppertri1d_to_sym(covs_ut1d, utr, utc):
return _uppertri_to_sym(np.array(covs_ut1d)[:, None], utr, utc)[:, :, 0]
def _inv_posdef_and_logdet(M):
U = np.linalg.cholesky(M)
logdet = 2*np.sum(np.log(np.diagonal(U)))
invM = spl.solve(M, np.identity(M.shape[0], M.dtype), sym_pos=True)
return invM, logdet
def _gmm_eval_prep(weights, means, covs):
n_mix, dim = means.shape
GMM = dict()
is_full_cov = covs.shape[1] != dim
GMM['utr'], GMM['utc'] = _uppertri_indices(dim, not is_full_cov)
if is_full_cov:
GMM['gconsts'] = np.zeros(n_mix)
GMM['gconsts2'] = np.zeros(n_mix)
GMM['invCovs'] = np.zeros_like(covs)
GMM['invCovMeans'] = np.zeros_like(means)
for ii in range(n_mix):
_uppertri1d_to_sym(covs[ii], GMM['utr'], GMM['utc'])
invC, logdetC = _inv_posdef_and_logdet(
_uppertri1d_to_sym(covs[ii], GMM['utr'], GMM['utc']))
# log of Gauss. dist. normalizer + log weight + mu' invCovs mu
invCovMean = invC.dot(means[ii])
GMM['gconsts'][ii] = np.log(weights[ii]) - 0.5 * (
logdetC + means[ii].dot(invCovMean) + dim * np.log(2.0*np.pi))
GMM['gconsts2'][ii] = - 0.5 * (
logdetC + means[ii].dot(invCovMean) + dim * np.log(2.0*np.pi))
GMM['invCovMeans'][ii] = invCovMean
# Inverse covariance matrices are stored in columns of 2D
# matrix as vectorized upper triangual parts ...
GMM['invCovs'][ii] = _uppertri1d_from_sym(
invC, GMM['utr'], GMM['utc'])
# ... with elements above the diagonal multiply by 2
GMM['invCovs'][:, dim:] *= 2.0
else: # for diagonal
GMM['invCovs'] = 1 / covs
GMM['gconsts'] = np.log(weights) - 0.5 * (
np.sum(np.log(covs) + means**2 * GMM['invCovs'],
axis=1) + dim * np.log(2 * np.pi))
GMM['gconsts2'] = -0.5 * (
np.sum(np.log(covs) + means**2 * GMM['invCovs'],
axis=1) + dim * np.log(2 * np.pi))
GMM['invCovMeans'] = GMM['invCovs'] * means
# for weight = 0, prepare GMM for uninitialized model with single
# gaussian
if len(weights) == 1 and weights[0] == 0:
GMM['invCovs'] = np.zeros_like(GMM['invCovs'])
GMM['invCovMeans'] = np.zeros_like(GMM['invCovMeans'])
GMM['gconsts'] = np.ones(1)
return GMM
def _gmm_llhs(data, GMM):
"""llh = GMM_EVAL(d,GMM) returns vector of log-likelihoods evaluated
for each frame of dimXn_samples data matrix using GMM object. GMM
object must be initialized with GMM_EVAL_PREP function.
[llh N F] = GMM_EVAL(d,GMM,1) also returns accumulators with zero,
first order statistic.
[llh N F S] = GMM_EVAL(d,GMM,2) also returns accumulators with
second order statistic. For full covariance model second order
statiscics, only the vectorized upper triangual parts are stored
in columns of 2D matrix (similarly to GMM.invCovs).
"""
# quadratic expansion of data
data_sqr = data[:, GMM['utr']] * data[:, GMM['utc']]
# computate of log-likelihoods for each frame and all Gaussian
# components
gamma = -0.5 * data_sqr.dot(GMM['invCovs'].T) + data.dot(
GMM['invCovMeans'].T) + GMM['gconsts']
return gamma
def _gmm_eval(data, GMM, return_accums=0):
"""llh = GMM_EVAL(d,GMM) returns vector of log-likelihoods evaluated
for each frame of dimXn_samples data matrix using GMM object. GMM
object must be initialized with GMM_EVAL_PREP function.
[llh N F] = GMM_EVAL(d,GMM,1) also returns accumulators with zero,
first order statistic.
[llh N F S] = GMM_EVAL(d,GMM,2) also returns accumulators with
second order statistic. For full covariance model second order
statiscics, only the vectorized upper triangual parts are stored
in columns of 2D matrix (similarly to GMM.invCovs).
"""
# quadratic expansion of data
data_sqr = data[:, GMM['utr']] * data[:, GMM['utc']]
# computate of log-likelihoods for each frame and all Gaussian components
gamma = -0.5 * data_sqr.dot(GMM['invCovs'].T) + data.dot(
GMM['invCovMeans'].T) + GMM['gconsts']
llh = _logsumexp(gamma, axis=1)
if return_accums == 0:
return llh
gamma = np.exp(gamma.T - llh)
N = gamma.sum(axis=1)
F = gamma.dot(data)
if return_accums == 1:
return llh, N, F
S = gamma.dot(data_sqr)
return llh, N, F, S
def _logsumexp(x, axis=0):
xmax = x.max(axis)
ex = np.exp(x - np.expand_dims(xmax, axis))
x = xmax + np.lib.scimath.log(np.sum(ex, axis))
not_finite = ~np.isfinite(xmax)
x[not_finite] = xmax[not_finite]
return x
def _gmm_update(N, F, S):
"""weights means covs = gmm_update(N,F,S) return GMM parameters,
which are updated from accumulators
"""
dim = F.shape[1]
is_diag_cov = S.shape[1] == dim
utr, utc = _uppertri_indices(dim, is_diag_cov)
sumN = N.sum()
weights = N / sumN
means = F / N[:, np.newaxis]
covs = S / N[:, np.newaxis] - means[:, utr] * means[:, utc]
return weights, means, covs
def _compute_vad(s, log, win_length=200, win_overlap=120,
n_realignment=5, threshold=0.3, bugfix=False):
warnings.filterwarnings('error')
# power signal for energy computation
if bugfix is False:
s = s ** 2 # yields to negative squares because s are int16
else:
s = s.astype(np.float64) ** 2
# frame signal with overlap
F = _framing(s, win_length, win_length - win_overlap)
# sum frames to get energy
E = F.sum(axis=1).astype(np.float64)
# E = np.sqrt(E)
# E = np.log(E)
# normalize the energy
try:
E -= E.mean()
E /= E.std()
# initialization
mm = np.array((-1.00, 0.00, 1.00))[:, np.newaxis]
ee = np.array((1.00, 1.00, 1.00))[:, np.newaxis]
ww = np.array((0.33, 0.33, 0.33))
GMM = _gmm_eval_prep(ww, mm, ee)
E = E[:, np.newaxis]
for i in range(n_realignment):
# collect GMM statistics
llh, N, F, S = _gmm_eval(E, GMM, return_accums=2)
# update model
ww, mm, ee = _gmm_update(N, F, S)
# wrap model
GMM = _gmm_eval_prep(ww, mm, ee)
# evaluate the gmm llhs
llhs = _gmm_llhs(E, GMM)
llh = _logsumexp(llhs, axis=1)[:, np.newaxis]
llhs = np.exp(llhs - llh)
out = np.zeros(llhs.shape[0], dtype=np.bool)
out[llhs[:, 0] < threshold] = True
except RuntimeWarning:
log.warning("signal contains only silence")
out = np.zeros(E.shape[0], dtype=np.bool)
return out
def _dct_basis(nbasis, length):
# the same DCT as in matlab
return scipy.fftpack.idct(np.eye(nbasis, length), norm='ortho')
def _sigmoid_fun(x):
return 1 / (1 + np.exp(-x))
def _preprocess_nn_input(X, left_ctx=5, right_ctx=5):
X = _framing(X, left_ctx+1+right_ctx).transpose(0, 2, 1)
dct_basis = 6
dct_xform = _dct_basis(dct_basis, left_ctx+right_ctx+1)
dct_xform[0] = np.sqrt(2./(left_ctx+right_ctx+1))
hamming_dct = (dct_xform*np.hamming(left_ctx+right_ctx+1)).T
return np.dot(
X.reshape(-1, hamming_dct.shape[0]),
hamming_dct).reshape(X.shape[0], -1)
def _create_nn_extract_st_BN(X, param_dict, bn_position):
mean = param_dict['input_mean']
std = param_dict['input_std']
Y = (X + mean) * std
num_of_layers = int((len(param_dict.keys()) - 5) / 2)
# n_hidden_before_BN --> sigmoid
# BN activation --> linear
for ii, f in enumerate(
[lambda x: _sigmoid_fun(x)]*bn_position+[lambda x:x]):
W = param_dict['W'+str(ii+1)]
b = param_dict['b'+str(ii+1)]
Y = f(Y.dot(W) + b)
Y1 = np.hstack([Y[0:-20], Y[5:-15], Y[10:-10], Y[15:-5], Y[20:]])
bn_mean = param_dict['bn_mean']
bn_std = param_dict['bn_std']
Y1 = (Y1+bn_mean) * bn_std
for ii, f in enumerate(
[lambda x: _sigmoid_fun(x)]*(
num_of_layers - bn_position-2) + [lambda x:x]):
W = param_dict['W'+str(ii+bn_position+3)]
b = param_dict['b'+str(ii+bn_position+3)]
Y1 = f(Y1.dot(W) + b)
return Y1, Y
[docs]class BottleneckProcessor(FeaturesProcessor):
"""Bottleneck features from a pre-trained neural network
Parameters
----------
weights : 'BabelMulti', 'FisherMono' or 'FisherMulti'
The pretrained weights to use for features extraction
Raises
------
ValueError
If the `weights` are invalid
RuntimeError
If the weights file cannot be found (meaning shennong is not
correctly installed on your system)
"""
# load of the weights (do it statically to not load the weights
# several times when running multiple instances of the
# BottleneckProcessor)
_loaded_weights = {}
def __init__(self, weights='BabelMulti', dither=0.1):
self.weights = weights
self.dither = dither
self._get_weights()
@property
def name(self):
return 'bottleneck'
@property
def dither(self):
"""Amount of dithering
0.0 means no dither
"""
return self._dither
@dither.setter
def dither(self, value):
self._dither = float(value)
@property
def weights(self):
"""The name of the pretrained weights used to extract the features
Must be 'BabelMulti', 'FisherMono' or 'FisherTri'.
"""
return self._weights
@weights.setter
def weights(self, value):
available_weights = self.available_weights()
if value not in available_weights:
raise ValueError(
'invalid weights "{}", choose in "{}"'.format(
value, ', '.join(sorted(available_weights.keys()))))
self._weights = value
@property
def ndims(self):
"""The dimension of extracted frames
Cannot be tuned because the underlying neural networks are
trained with this parameter.
"""
return 80
@property
def sample_rate(self):
"""Processing sample frequency in Hertz
Cannot be tuned because the underlying neural networks are
trained with this parameter.
"""
return 8000
@property
def frame_length(self):
"""The length of extracted frames (in seconds)
Cannot be tuned because the underlying neural networks are
trained with this parameter.
"""
return 0.025
@property
def frame_shift(self):
"""The time shift between two consecutive frames (in seconds)
Cannot be tuned because the underlying neural networks are
trained with this parameter.
"""
return 0.01
def _get_weights(self):
if self.weights not in self._loaded_weights:
# load the weights if not already loaded
weights_file = self.available_weights()[self.weights]
self._log.info('loading %s', os.path.basename(weights_file))
# explicitely load all the data once, instead of having a file
# descriptor
with np.load(weights_file) as w:
self._loaded_weights[self.weights] = {
k: v for k, v in w.items()}
return self._loaded_weights[self.weights]
[docs] @classmethod
def available_weights(cls):
"""Return the pretrained weights files as a dict (name -> file)
Returns
-------
weight_files : dict
A mapping 'weights name' -> 'weights files', where the
files are absolutes paths to compressed numpy array (.npz
format). The 'weights name' is either *BabelMulti*,
*FisherMono* or *FisherTri*.
Raises
------
RuntimeError
If the directory `shennong/share/bottleneck` is not found,
or if all the weights files are missing in it.
"""
# locate the directory shennong/share/bottleneck, raise if it
# cannot be found
directory = pkg_resources.resource_filename(
pkg_resources.Requirement.parse('shennong'),
'shennong/share/bottleneck')
if not os.path.isdir(directory): # pragma: nocover
raise RuntimeError('directory not found: {}'.format(directory))
# retrieve the weights files
expected_files = {
f[0]: os.path.join(directory, f[1] + '.npz') for f in
[('BabelMulti', 'Babel-ML17_FBANK_HL1500_SBN80_PhnStates3096'),
('FisherMono', 'FisherEnglish_FBANK_HL500_SBN80_PhnStates120'),
('FisherTri', 'FisherEnglish_FBANK_HL500_SBN80_triphones2423')]}
# make sure all the files are here, raise a RuntimeError if
# all files are missing, log a warning is only one or two
# files are missing
files = {k: v for k, v in expected_files.items() if os.path.isfile(v)}
if not files: # pragma: nocover
raise RuntimeError('no weights file found in {}'.format(directory))
for k in expected_files.keys():
if k not in files: # pragma: nocover
cls.log.warning('weights file for "%s" is unavailable', k)
return files
[docs] def process(self, signal):
"""Computes bottleneck features on an audio `signal`
Use a pre-trained neural network to extract bottleneck
features. Features have a frame shift of 10 ms and frame
length of 25 ms.
Parameters
----------
signal : Audio, shape = [nsamples, 1]
The input audio signal to compute the features on, must be
mono. The signal is up/down-sampled at 8 kHz during
processing.
Returns
-------
features : Features, shape = [nframes, 80]
The computes bottleneck features will have as many rows as
there are frames (depends on the `signal` duration, expect
about 100 frames per second), each frame with 80
dimensions.
Raises
------
RuntimeError
If no speech is detected on the `signal` during the voice
activity detection preprocessing step.
"""
# force resampling to 8 kHz and 16 bits integers
need_resample = (
signal.sample_rate != 8000 or
signal.dtype is not np.dtype(np.int16))
if need_resample:
self._log.debug(
'resampling audio from %dHz@%db to %dHz@%db',
signal.sample_rate, signal.dtype.itemsize * 8, 8000, 16)
signal = signal.resample(8000).astype(np.int16)
signal = signal.data
# define parameters to extract mel filterbanks. Those
# parameters cannot be tuned because the networks are trained
# with them... frame_noverlap is the number of samples to
# overlap in each frame, so the frame_shift is 200 - 120 = 80
frame_length = 200
frame_noverlap = 120
frame_shift = frame_length - frame_noverlap
# voice activity detection TODO implement user-provided VAD
# (vad input format could be an instance of Alignment, or
# simply an array of bool).
vad = _compute_vad(
signal, self._log,
win_length=frame_length, win_overlap=frame_noverlap)
# ensure we have some voiced frames in the signal
voiced_frames = sum(vad)
if not voiced_frames:
raise RuntimeError(
'no voice detected in signal, failed to extract features')
self._log.debug('%d frames of speech detected (on %d total frames)',
voiced_frames, len(vad))
# from audio signal to mel filterbank
signal = _add_dither(signal, self.dither)
window = np.hamming(frame_length)
fbank_mx = _mel_fbank_mx(
window.size, 8000, numchans=24, lofreq=64.0, hifreq=3800.0)
fea = _fbank_htk(signal, window, frame_noverlap, fbank_mx)
# center the mel features from voiced frames mean
fea -= np.mean(fea[vad], axis=0)
# add a global context to the mel features
left_ctx = right_ctx = 15
fea = np.r_[np.repeat(fea[[0]], left_ctx, axis=0),
fea,
np.repeat(fea[[-1]], right_ctx, axis=0)]
# compute the network output from mel features
left_ctx_bn1 = right_ctx_bn1 = self._get_weights()['context']
nn_input = _preprocess_nn_input(fea, left_ctx_bn1, right_ctx_bn1)
nn_output = np.vstack(_create_nn_extract_st_BN(
nn_input, self._get_weights(), 2)[0])
# compute the timestamps for each output frame
times = (1.0 / 8000) * np.vstack((
np.arange(nn_output.shape[0]) * frame_shift,
np.arange(nn_output.shape[0]) * frame_shift + frame_length)).T
# return the final bottleneck features
return Features(nn_output, times, self.get_properties())