"""Saves and loads features collections to/from various file formats
The following table shows the obtained file size, writing and reading
times on MFCC features computed on the `Zero Resource Speech
Challenge 2019 <https://zerospeech.com/2019>`_ train database
(English, about 26 hours of speech and 10k files):
=========== ========= ========= ============ ============
File format Extension File size Writing time Reading time
=========== ========= ========= ============ ============
h5features .h5f 562.9 MB 0:00:20 0:00:08
pickle .pkl 609.8 MB 0:00:08 0:00:06
numpy .npz 582.8 MB 0:02:07 0:00:19
matlab .mat 481.8 MB 0:00:58 0:00:13
kaldi .ark 927.8 MB 0:00:10 0:00:15
JSON .json 6.3 GB 0:11:34 1:04:25
=========== ========= ========= ============ ============
"""
import abc
import copy
import os
import pickle

import h5features
import json_tricks
import kaldi.matrix
import kaldi.util.table
import numpy as np
import scipy
import scipy.io

from shennong.utils import get_logger, array2list
def supported_extensions():
    """Returns the list of file extensions to save/load features

    Returns
    -------
    serializers : dict
        File extensions mapped to their related serializer class

    """
    # keep the extension order in sync with supported_serializers()
    extensions = (
        ('.npz', NumpySerializer),
        ('.mat', MatlabSerializer),
        ('.json', JsonSerializer),
        ('.pkl', PickleSerializer),
        ('.h5f', H5featuresSerializer),
        ('.ark', KaldiSerializer))
    return {ext: serializer for ext, serializer in extensions}
def supported_serializers():
    """Returns the list of file format serializers to save/load features

    Returns
    -------
    serializers : dict
        Serializers names mapped to their related class

    """
    serializers = {
        'numpy': NumpySerializer,
        'matlab': MatlabSerializer,
        'json': JsonSerializer,
        'pickle': PickleSerializer,
        'h5features': H5featuresSerializer,
        'kaldi': KaldiSerializer}
    return serializers
def get_serializer(cls, filename, serializer=None):
    """Returns the file serializer from filename extension or serializer name

    Parameters
    ----------
    cls : class
        Must be :class:`shennong.features.FeaturesCollection`, this is
        a tweak to avoid circular imports
    filename : str
        The file to be handled (load or save)
    serializer : str, optional
        If not None must be one of the :func:`supported_serializers`, if
        not specified, guess the serializer from the `filename`
        extension using :func:`supported_extensions`.

    Returns
    -------
    serializer : instance of :class:`FeaturesSerializer`
        The guessed serializer class, a child class of
        :class:`FeaturesSerializer`.

    Raises
    ------
    ValueError
        If the serializer class cannot be guessed, or if `cls` is not
        :class:`~shennong.features.FeaturesCollection`

    """
    if cls.__name__ != 'FeaturesCollection':
        raise ValueError(
            'The `cls` parameter must be shennong.features.FeaturesCollection')

    if serializer is None:
        # guess serializer from file extension
        ext = os.path.splitext(filename)[1]
        if not ext:
            raise ValueError('no extension nor serializer name specified')
        try:
            serializer = supported_extensions()[ext]
        except KeyError:
            # suppress the internal KeyError from the traceback, the
            # ValueError message carries all the relevant information
            raise ValueError(
                'invalid extension {}, must be in {}'.format(
                    ext, list(supported_extensions().keys()))) from None
    else:
        try:
            serializer = supported_serializers()[serializer]
        except KeyError:
            raise ValueError(
                'invalid serializer {}, must be in {}'.format(
                    serializer, list(supported_serializers().keys()))) from None

    return serializer(cls, filename)
class FeaturesSerializer(metaclass=abc.ABCMeta):
    """Base class of a features file serializer

    This class must be specialized to handle a given file type.

    Parameters
    ----------
    cls : class
        Must be :class:`shennong.features.FeaturesCollection`, this is
        a tweak to avoid circular imports
    filename : str
        The file to save/load features to/from

    """
    # shared logger for all serializer subclasses
    _log = get_logger(__name__)

    def __init__(self, cls, filename):
        # the FeaturesCollection class, injected by the caller to avoid
        # a circular import with shennong.features
        self._features_collection = cls
        # the Features class, i.e. the value type stored in the collection
        self._features = self._features_collection._value_type
        self._filename = filename

    @property
    def filename(self):
        # the file this serializer reads from / writes to
        return self._filename

    @abc.abstractmethod
    def _save(self, features):  # pragma: nocover
        # implemented by subclasses: write `features` to self.filename
        pass

    @abc.abstractmethod
    def _load(self):  # pragma: nocover
        # implemented by subclasses: read a collection from self.filename
        pass

    def load(self, **kwargs):
        """Returns a collection of features from the `filename`

        Parameters
        ----------
        kwargs : optional
            Optional supplementary arguments, specific to each
            serializer.

        Returns
        -------
        features : :class:`~shennong.features.FeaturesCollection`
            The features stored in the file.

        Raises
        ------
        IOError
            If the input file does not exist or cannot be read.
        ValueError
            If the features cannot be loaded from the file or are not
            in a valid state.

        """
        if not os.path.isfile(self.filename):
            raise IOError('file not found: {}'.format(self.filename))
        if not os.access(self.filename, os.R_OK):
            raise IOError('file not readable: {}'.format(self.filename))

        # delegate the actual reading to the subclass
        features = self._load(**kwargs)

        if not features.is_valid():
            raise ValueError(
                'features not valid in file: {}'.format(self.filename))

        return features

    def save(self, features, **kwargs):
        """Saves a collection of `features` to a file

        Parameters
        ----------
        features : :class:`~shennong.features.FeaturesCollection`
            The features to store in the file.
        kwargs : optional
            Optional supplementary arguments, specific to each
            serializer.

        Raises
        ------
        IOError
            If the output file already exists.
        ValueError
            If the features cannot be saved to the file, are not in a
            valid state or are not an instance of
            :class:`~shennong.features.FeaturesCollection`.

        """
        # refuse to overwrite an existing file
        if os.path.isfile(self.filename):
            raise IOError('file already exists: {}'.format(self.filename))

        if not isinstance(features, self._features_collection):
            raise ValueError(
                'features must be {} but are {}'
                .format(
                    self._features_collection.__name__,
                    features.__class__.__name__))

        if not features.is_valid():
            raise ValueError('features are not valid')

        # delegate the actual writing to the subclass
        self._save(features, **kwargs)
class NumpySerializer(FeaturesSerializer):
    """Saves and loads features to/from the numpy '.npz' format"""

    def _save(self, features, compress=True):
        """Writes the `features` collection to a '.npz' archive

        Parameters
        ----------
        features : :class:`~shennong.features.FeaturesCollection`
            The features to write to `filename`
        compress : bool, optional
            When True (default) save with :func:`numpy.savez_compressed`,
            otherwise with :func:`numpy.savez`

        """
        self._log.info('writing %s', self.filename)

        # represent the features as dictionaries
        data = {k: v._to_dict() for k, v in features.items()}

        # save (and optionally compress) the features. Use a context
        # manager so the file handle is closed even on failure.
        save = np.savez_compressed if compress is True else np.savez
        with open(self.filename, 'wb') as fh:
            # NOTE np.savez has no `allow_pickle` parameter: forwarding
            # it as a keyword would store a spurious 'allow_pickle'
            # array in the archive (object arrays are pickled by
            # default on save), so it is deliberately not passed here
            save(fh, features=data)

    def _load(self):
        """Returns a features collection read from the '.npz' file"""
        self._log.info('loading %s', self.filename)

        with open(self.filename, 'rb') as fh:
            # allow_pickle is required to restore the dict of features
            data = np.load(fh, allow_pickle=True)['features'].tolist()

        features = self._features_collection()
        for name, feats in data.items():
            features[name] = self._features._from_dict(feats, validate=False)
        return features
class MatlabSerializer(FeaturesSerializer):
    """Saves and loads features to/from the matlab '.mat' format"""

    def _save(self, features, compress=True):
        """Writes the `features` collection to a '.mat' file

        Parameters
        ----------
        features : :class:`~shennong.features.FeaturesCollection`
            The features to write to `filename`
        compress : bool, optional
            When True (default) the file is written with compression
            enabled

        """
        self._log.info('writing %s', self.filename)

        # represent the features as dictionaries
        data = {k: v._to_dict() for k, v in features.items()}

        # save (and optionally compress) the features
        scipy.io.savemat(
            self.filename, data,
            long_field_names=True,
            appendmat=False, do_compression=compress)

    def _load(self):
        """Returns a features collection read from the '.mat' file"""
        self._log.info('loading %s', self.filename)

        # load the raw matlab data, converting matlab structs into
        # nested Python dictionaries
        data = self._check_keys(
            scipy.io.loadmat(
                self.filename, appendmat=False, squeeze_me=True,
                mat_dtype=True, struct_as_record=False))

        features = self._features_collection()
        for k, v in data.items():
            # skip the metadata entries added by scipy.io.savemat
            if k not in ('__header__', '__version__', '__globals__'):
                features[k] = self._features(
                    v['data'],
                    v['times'],
                    self._make_list(self._check_keys(v['properties'])),
                    validate=False)
        return features

    @staticmethod
    def _check_keys(d):
        """Checks if entries in dictionary are mat-objects.

        If yes todict is called to change them to nested dictionaries.
        From https://stackoverflow.com/a/8832212

        """
        for key in d:
            if isinstance(d[key], scipy.io.matlab.mio5_params.mat_struct):
                d[key] = MatlabSerializer._todict(d[key])
            elif isinstance(d[key], (list, np.ndarray)):
                # a sequence of mat structs: convert each element
                # NOTE(review): assumes every element is a mat_struct —
                # a plain ndarray value here would break; confirm against
                # what _to_dict actually stores
                d[key] = [MatlabSerializer._todict(dd) for dd in d[key]]
        return d

    @staticmethod
    def _todict(matobj):
        """Constructs from matobjects nested dictionaries

        From https://stackoverflow.com/a/8832212

        """
        d = {}
        for strg in matobj._fieldnames:
            elem = matobj.__dict__[strg]
            if isinstance(elem, scipy.io.matlab.mio5_params.mat_struct):
                # recurse into nested structs
                d[strg] = MatlabSerializer._todict(elem)
            else:
                d[strg] = elem
        return d

    @staticmethod
    def _make_list(properties):
        # only the 'pipeline' property needs postprocessing here
        if 'pipeline' in properties:
            # matlab format collapse a list of a single element into
            # that element, we need to rebuild that list here
            if isinstance(properties['pipeline'], list):
                properties['pipeline'] = [
                    array2list(p) for p in properties['pipeline']]
            else:
                properties['pipeline'] = [
                    array2list(properties['pipeline'])]
        return properties
class JsonSerializer(FeaturesSerializer):
    """Saves and loads features to/from the JSON format"""

    def _save(self, features):
        """Writes the `features` collection to a JSON file"""
        self._log.info('writing %s', self.filename)

        # use a context manager so the file handle is closed even if
        # the serialization fails (the original relied on the GC)
        with open(self.filename, 'wt') as fh:
            fh.write(json_tricks.dumps(features, indent=4))

    def _load(self):
        """Returns a features collection read from the JSON file"""
        self._log.info('loading %s', self.filename)

        with open(self.filename, 'r') as fh:
            return self._features_collection(json_tricks.loads(fh.read()))
class PickleSerializer(FeaturesSerializer):
    """Saves and loads features to/from the Python pickle format"""

    def _save(self, features):
        """Writes the `features` collection to a pickle file"""
        self._log.info('writing %s', self.filename)
        with open(self.filename, 'wb') as fh:
            pickle.dump(features, fh)

    def _load(self):
        """Returns a features collection read from the pickle file"""
        # log on load for consistency with the other serializers (the
        # original logged only on save)
        self._log.info('loading %s', self.filename)
        with open(self.filename, 'rb') as fh:
            return pickle.load(fh)
class H5featuresSerializer(FeaturesSerializer):
    """Saves and loads features to/from the h5features format"""

    def _save(self, features, groupname='features',
              compression='lzf', chunk_size='auto'):
        """Writes the `features` collection to a h5features file

        Parameters
        ----------
        features : :class:`~shennong.features.FeaturesCollection`
            The features to write to `filename`
        groupname : str, optional
            The name of the HDF5 group storing the features, default
            to 'features'
        compression : str, optional
            Compression filter forwarded to :class:`h5features.Writer`,
            default to 'lzf'
        chunk_size : str, optional
            Chunk size forwarded to :class:`h5features.Writer`, default
            to 'auto'

        """
        self._log.info('writing %s', self.filename)

        # we safely use append mode as we are sure at this point the
        # file does not exist (from FeaturesSerializer.save)
        with h5features.Writer(
                self.filename, mode='a', chunk_size=chunk_size,
                compression=compression) as writer:
            # append the feature in the file one by one (this avoid to
            # duplicate the whole collection in memory, which can
            # cause MemoryError on big datasets).
            for k, v in features.items():
                data = h5features.Data(
                    [k], [v.times], [v.data], properties=[v.properties])
                writer.write(data, groupname=groupname, append=True)

    def _load(self, groupname='features'):
        """Returns a features collection read from the h5features file

        Parameters
        ----------
        groupname : str, optional
            The HDF5 group to read the features from, default to
            'features'

        """
        self._log.info('loading %s', self.filename)

        data = h5features.Reader(self.filename, groupname=groupname).read()

        # rebuild the collection item by item; items/features/labels/
        # properties are parallel sequences indexed by item position
        features = self._features_collection()
        for n in range(len(data.items())):
            features[data.items()[n]] = self._features(
                data.features()[n],
                # h5features stores times as "labels"
                data.labels()[n],
                properties=data.properties()[n],
                validate=False)
        return features
class KaldiSerializer(FeaturesSerializer):
    """Saves and loads features to/from the Kaldi ark format

    Features data and times are stored in two separate ark files (with
    optional scp indexes), the properties in an auxiliary JSON file,
    all sharing the same file root.

    """
    def __init__(self, cls, filename):
        super().__init__(cls, filename)

        # make sure the filename extension is '.ark'
        filename_split = os.path.splitext(self.filename)
        if filename_split[1] != '.ark':
            raise ValueError(
                'when saving to Kaldi ark format, the file extension must be '
                '".ark", it is "{}"'.format(filename_split[1]))
        self._fileroot = filename_split[0]

    def _wspecifier(self, suffix, scp):
        """Returns a Kaldi write specifier for '<fileroot><suffix>.ark'

        When `scp` is True, the specifier also indexes the ark file in
        '<fileroot><suffix>.scp'. Logs the file(s) about to be written.

        """
        ark = self._fileroot + suffix + '.ark'
        if scp:
            scp_file = self._fileroot + suffix + '.scp'
            self._log.info('writing %s and %s', ark, scp_file)
            return 'ark,scp:' + ark + ',' + scp_file
        self._log.info('writing %s', ark)
        return 'ark:' + ark

    def _save(self, features, scp=False):
        """Writes the `features` collection as Kaldi ark files

        Parameters
        ----------
        features : :class:`~shennong.features.FeaturesCollection`
            The features to write
        scp : bool, optional
            When True also write scp index files, default to False

        """
        # writing features data
        with kaldi.util.table.DoubleMatrixWriter(
                self._wspecifier('', scp)) as writer:
            for name, feats in features.items():
                writer[name] = kaldi.matrix.DoubleSubMatrix(feats.data)

        # writing times
        with kaldi.util.table.DoubleMatrixWriter(
                self._wspecifier('.times', scp)) as writer:
            for name, feats in features.items():
                # in case times are 1d, we force them to 2d so they
                # can be wrote as kaldi matrices (we do the reverse
                # 2d->1d on loading). We are copying the array to
                # avoid a bug on macos.
                writer[name] = kaldi.matrix.DoubleSubMatrix(
                    np.atleast_2d(feats.times).copy())

        # writing properties. As we are writing double arrays, we need
        # to track the original dtype of features in the properties,
        # to ensure equality on load
        filename = self._fileroot + '.properties.json'
        self._log.info('writing %s', filename)
        data = {k: copy.deepcopy(v.properties) for k, v in features.items()}
        for name in data:
            data[name]['__dtype_data__'] = str(features[name].dtype)
            data[name]['__dtype_times__'] = str(features[name].times.dtype)
        # use a context manager so the file handle is always closed
        with open(filename, 'wt') as fh:
            fh.write(json_tricks.dumps(data, indent=4))

    def _load(self):
        """Returns a features collection read from the Kaldi ark files

        Raises
        ------
        IOError
            If the auxiliary properties or times files are missing
        ValueError
            If the data, times and properties items do not match

        """
        # loading properties
        filename = self._fileroot + '.properties.json'
        self._log.info('loading %s', filename)
        if not os.path.isfile(filename):
            raise IOError('file not found: {}'.format(filename))
        with open(filename, 'r') as fh:
            properties = json_tricks.loads(fh.read())

        # loading times
        ark = self._fileroot + '.times.ark'
        self._log.info('loading %s', ark)
        if not os.path.isfile(ark):
            raise IOError('file not found: {}'.format(ark))

        with kaldi.util.table.SequentialDoubleMatrixReader(
                'ark:' + ark) as reader:
            times = {k: v.numpy() for k, v in reader}

        # postprocess times: do 2d->1d if they are 1d vectors (they
        # were forced to 2d on save)
        for k, v in times.items():
            if v.shape[0] == 1:
                times[k] = v.reshape((v.shape[1]))

        # loading features data
        ark = self._fileroot + '.ark'
        self._log.info('loading %s', ark)

        with kaldi.util.table.SequentialDoubleMatrixReader(
                'ark:' + ark) as reader:
            data = {k: v.numpy() for k, v in reader}

        if properties.keys() != data.keys():
            raise ValueError(
                'invalid features: items differ in data and properties')
        if times.keys() != data.keys():
            raise ValueError(
                'invalid features: items differ in data and times')

        # rebuild each item, restoring the original dtypes and
        # stripping the __dtype_*__ entries added at save time
        return self._features_collection(
            **{name: self._features(
                data[name].astype(properties[name]['__dtype_data__']),
                times[name].astype(properties[name]['__dtype_times__']),
                properties={
                    key: value for key, value in properties[name].items()
                    if '__dtype_' not in key},
                validate=False)
               for name in data.keys()})