Source code for shennong.features.serializers

"""Saves and loads features collections to/from various file formats

The following table shows the file size, writing time and reading
time obtained for MFCC features computed on the `Zero Resource Speech
Challenge 2019 <https://zerospeech.com/2019>`_ train database
(English, about 26 hours of speech and 10k files):

===========  =========  =========  ============  ============
File format  Extension  File size  Writing time  Reading time
===========  =========  =========  ============  ============
h5features   .h5f       562.9 MB   0:00:20       0:00:08
pickle       .pkl       609.8 MB   0:00:08       0:00:06
numpy        .npz       582.8 MB   0:02:07       0:00:19
matlab       .mat       481.8 MB   0:00:58       0:00:13
kaldi        .ark       927.8 MB   0:00:10       0:00:15
JSON         .json      6.3 GB     0:11:34       1:04:25
===========  =========  =========  ============  ============


"""

import abc
import copy
import os
import pickle

import h5features
import json_tricks
import kaldi.matrix
import kaldi.util.table
import numpy as np
import scipy

from shennong.utils import get_logger, array2list


def supported_extensions():
    """Returns the file extensions supported to save/load features

    Returns
    -------
    serializers : dict
        File extensions mapped to their related serializer class

    """
    return {
        '.npz': NumpySerializer,
        '.mat': MatlabSerializer,
        '.json': JsonSerializer,
        '.pkl': PickleSerializer,
        '.h5f': H5featuresSerializer,
        '.ark': KaldiSerializer}


def supported_serializers():
    """Returns the file format serializers available to save/load features

    Returns
    -------
    serializers : dict
        Serializer names mapped to their related class

    """
    return {
        'numpy': NumpySerializer,
        'matlab': MatlabSerializer,
        'json': JsonSerializer,
        'pickle': PickleSerializer,
        'h5features': H5featuresSerializer,
        'kaldi': KaldiSerializer}


def get_serializer(cls, filename, serializer=None):
    """Returns the file serializer from filename extension or serializer name

    Parameters
    ----------
    cls : class
        Must be :class:`shennong.features.FeaturesCollection`, this
        is a tweak to avoid circular imports
    filename : str
        The file to be handled (load or save)
    serializer : str, optional
        If not None, must be one of the :func:`supported_serializers`.
        If not specified, guess the serializer from the `filename`
        extension using :func:`supported_extensions`.

    Returns
    -------
    serializer : instance of :class:`FeaturesSerializer`
        The guessed serializer class, a child class of
        :class:`FeaturesSerializer`.

    Raises
    ------
    ValueError
        If the serializer class cannot be guessed, or if `cls` is not
        :class:`~shennong.features.FeaturesCollection`

    """
    if cls.__name__ != 'FeaturesCollection':
        raise ValueError(
            'the `cls` parameter must be '
            'shennong.features.FeaturesCollection')

    if serializer is None:
        # guess serializer from file extension
        ext = os.path.splitext(filename)[1]
        if not ext:
            raise ValueError('no extension nor serializer name specified')

        try:
            serializer = supported_extensions()[ext]
        except KeyError:
            raise ValueError(
                'invalid extension {}, must be in {}'.format(
                    ext, list(supported_extensions().keys())))
    else:
        try:
            serializer = supported_serializers()[serializer]
        except KeyError:
            raise ValueError(
                'invalid serializer {}, must be in {}'.format(
                    serializer, list(supported_serializers().keys())))

    return serializer(cls, filename)


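# A minimal usage sketch of the dispatch above, assuming a
# FeaturesCollection instance `feats` is available (the file name
# 'mfcc.pkl' is an arbitrary example):
#
#     from shennong.features import FeaturesCollection
#
#     serializer = get_serializer(FeaturesCollection, 'mfcc.pkl')
#     serializer.save(feats)  # dispatched to PickleSerializer by extension
#     feats = get_serializer(
#         FeaturesCollection, 'mfcc.pkl', serializer='pickle').load()

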
class FeaturesSerializer(metaclass=abc.ABCMeta):
    """Base class of a features file serializer

    This class must be specialized to handle a given file type.

    Parameters
    ----------
    cls : class
        Must be :class:`shennong.features.FeaturesCollection`, this
        is a tweak to avoid circular imports
    filename : str
        The file to save/load features to/from

    """
    _log = get_logger(__name__)

    def __init__(self, cls, filename):
        self._features_collection = cls
        self._features = self._features_collection._value_type
        self._filename = filename

    @property
    def filename(self):
        return self._filename

    @abc.abstractmethod
    def _save(self, features):  # pragma: nocover
        pass

    @abc.abstractmethod
    def _load(self):  # pragma: nocover
        pass

    def load(self, **kwargs):
        """Returns a collection of features from the `filename`

        Parameters
        ----------
        kwargs : optional
            Optional supplementary arguments, specific to each
            serializer.

        Returns
        -------
        features : :class:`~shennong.features.FeaturesCollection`
            The features stored in the file.

        Raises
        ------
        IOError
            If the input file does not exist or cannot be read.
        ValueError
            If the features cannot be loaded from the file or are not
            in a valid state.

        """
        if not os.path.isfile(self.filename):
            raise IOError('file not found: {}'.format(self.filename))

        if not os.access(self.filename, os.R_OK):
            raise IOError('file not readable: {}'.format(self.filename))

        features = self._load(**kwargs)

        if not features.is_valid():
            raise ValueError(
                'features not valid in file: {}'.format(self.filename))

        return features

    def save(self, features, **kwargs):
        """Saves a collection of `features` to a file

        Parameters
        ----------
        features : :class:`~shennong.features.FeaturesCollection`
            The features to store in the file.
        kwargs : optional
            Optional supplementary arguments, specific to each
            serializer.

        Raises
        ------
        IOError
            If the output file already exists.
        ValueError
            If the features cannot be saved to the file, are not in a
            valid state or are not an instance of
            :class:`~shennong.features.FeaturesCollection`.

        """
        if os.path.isfile(self.filename):
            raise IOError('file already exists: {}'.format(self.filename))

        if not isinstance(features, self._features_collection):
            raise ValueError(
                'features must be {} but are {}'.format(
                    self._features_collection.__name__,
                    features.__class__.__name__))

        if not features.is_valid():
            raise ValueError('features are not valid')

        self._save(features, **kwargs)


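# The base class above only requires `_save` and `_load`; the public
# `save`/`load` wrappers handle existence checks and validation. A
# hypothetical sketch of a new serializer (not part of shennong):
#
#     class YamlSerializer(FeaturesSerializer):
#         def _save(self, features):
#             ...  # write {name: feats._to_dict()} to self.filename
#
#         def _load(self):
#             ...  # rebuild each item with self._features._from_dict()
#
# Registering such a class in supported_extensions() and
# supported_serializers() would make it available via get_serializer().

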
class NumpySerializer(FeaturesSerializer):
    """Saves and loads features to/from the numpy '.npz' format"""

    def _save(self, features, compress=True):
        self._log.info('writing %s', self.filename)

        # represent the features as dictionaries
        data = {k: v._to_dict() for k, v in features.items()}

        # save (and optionally compress) the features
        save = np.savez_compressed if compress is True else np.savez
        save(open(self.filename, 'wb'), features=data, allow_pickle=True)

    def _load(self):
        self._log.info('loading %s', self.filename)

        data = np.load(
            open(self.filename, 'rb'),
            allow_pickle=True)['features'].tolist()

        features = self._features_collection()
        for k, v in data.items():
            features[k] = self._features._from_dict(v, validate=False)
        return features


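# The numpy round-trip above relies on storing a plain dict inside a
# 0-d object array, recovered with `.tolist()`. A standalone sketch of
# that mechanism (the file name 'demo.npz' is arbitrary):
#
#     import numpy as np
#
#     data = {'utt1': {'data': np.zeros((10, 13)), 'times': np.zeros(10)}}
#     np.savez_compressed('demo.npz', features=data)
#     loaded = np.load('demo.npz', allow_pickle=True)['features'].tolist()
#     assert loaded.keys() == data.keys()

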
class MatlabSerializer(FeaturesSerializer):
    """Saves and loads features to/from the matlab '.mat' format"""

    def _save(self, features, compress=True):
        self._log.info('writing %s', self.filename)

        # represent the features as dictionaries
        data = {k: v._to_dict() for k, v in features.items()}

        # save (and optionally compress) the features
        scipy.io.savemat(
            self.filename, data,
            long_field_names=True,
            appendmat=False, do_compression=compress)

    def _load(self):
        self._log.info('loading %s', self.filename)

        data = self._check_keys(
            scipy.io.loadmat(
                self.filename, appendmat=False,
                squeeze_me=True, mat_dtype=True,
                struct_as_record=False))

        features = self._features_collection()
        for k, v in data.items():
            if k not in ('__header__', '__version__', '__globals__'):
                features[k] = self._features(
                    v['data'], v['times'],
                    self._make_list(self._check_keys(v['properties'])),
                    validate=False)
        return features

    @staticmethod
    def _check_keys(d):
        """Checks if entries in the dictionary are mat-objects

        If yes, _todict is called to change them to nested
        dictionaries. From https://stackoverflow.com/a/8832212

        """
        for key in d:
            if isinstance(d[key], scipy.io.matlab.mio5_params.mat_struct):
                d[key] = MatlabSerializer._todict(d[key])
            elif isinstance(d[key], (list, np.ndarray)):
                d[key] = [MatlabSerializer._todict(dd) for dd in d[key]]
        return d

    @staticmethod
    def _todict(matobj):
        """Constructs nested dictionaries from mat-objects

        From https://stackoverflow.com/a/8832212

        """
        d = {}
        for strg in matobj._fieldnames:
            elem = matobj.__dict__[strg]
            if isinstance(elem, scipy.io.matlab.mio5_params.mat_struct):
                d[strg] = MatlabSerializer._todict(elem)
            else:
                d[strg] = elem
        return d

    @staticmethod
    def _make_list(properties):
        if 'pipeline' in properties:
            # the matlab format collapses a list of a single element
            # into that element, we need to rebuild that list here
            if isinstance(properties['pipeline'], list):
                properties['pipeline'] = [
                    array2list(p) for p in properties['pipeline']]
            else:
                properties['pipeline'] = [
                    array2list(properties['pipeline'])]
        return properties


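# The `_make_list` fix-up above is needed because `squeeze_me=True`
# collapses length-1 dimensions, so a one-element 'pipeline' list
# comes back as a bare struct. A standalone sketch of that behavior
# (the file name 'demo.mat' is arbitrary):
#
#     import scipy.io
#
#     scipy.io.savemat('demo.mat', {'pipeline': [{'name': 'mfcc'}]})
#     loaded = scipy.io.loadmat(
#         'demo.mat', squeeze_me=True, struct_as_record=False)
#     # loaded['pipeline'] is a single mat_struct, not a 1-element list

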
class JsonSerializer(FeaturesSerializer):
    """Saves and loads features to/from the JSON format"""

    def _save(self, features):
        self._log.info('writing %s', self.filename)
        open(self.filename, 'wt').write(json_tricks.dumps(features, indent=4))

    def _load(self):
        self._log.info('loading %s', self.filename)
        return self._features_collection(
            json_tricks.loads(open(self.filename, 'r').read()))


class PickleSerializer(FeaturesSerializer):
    """Saves and loads features to/from the Python pickle format"""

    def _save(self, features):
        self._log.info('writing %s', self.filename)
        with open(self.filename, 'wb') as fh:
            pickle.dump(features, fh)

    def _load(self):
        with open(self.filename, 'rb') as fh:
            return pickle.load(fh)


class H5featuresSerializer(FeaturesSerializer):
    """Saves and loads features to/from the h5features format"""

    def _save(self, features, groupname='features',
              compression='lzf', chunk_size='auto'):
        self._log.info('writing %s', self.filename)

        # we safely use append mode as we are sure at this point the
        # file does not exist (from FeaturesSerializer.save)
        with h5features.Writer(
                self.filename, mode='a',
                chunk_size=chunk_size,
                compression=compression) as writer:
            # append the features to the file one by one (this avoids
            # duplicating the whole collection in memory, which can
            # cause MemoryError on big datasets)
            for k, v in features.items():
                data = h5features.Data(
                    [k], [v.times], [v.data], properties=[v.properties])
                writer.write(data, groupname=groupname, append=True)

    def _load(self, groupname='features'):
        self._log.info('loading %s', self.filename)

        data = h5features.Reader(self.filename, groupname=groupname).read()

        features = self._features_collection()
        for n in range(len(data.items())):
            features[data.items()[n]] = self._features(
                data.features()[n],
                data.labels()[n],
                properties=data.properties()[n],
                validate=False)
        return features


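# A sketch of storing and reading back a collection under a
# non-default h5features group, using the serializer directly
# (`feats` and the group name 'my_group' are arbitrary examples):
#
#     from shennong.features import FeaturesCollection
#
#     serializer = get_serializer(FeaturesCollection, 'feats.h5f')
#     serializer.save(feats, groupname='my_group')
#     feats = serializer.load(groupname='my_group')

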
class KaldiSerializer(FeaturesSerializer):
    def __init__(self, cls, filename):
        super().__init__(cls, filename)

        # make sure the filename extension is '.ark'
        filename_split = os.path.splitext(self.filename)
        if filename_split[1] != '.ark':
            raise ValueError(
                'when saving to Kaldi ark format, the file extension must be '
                '".ark", it is "{}"'.format(filename_split[1]))

        self._fileroot = filename_split[0]

    def _save(self, features, scp=False):
        # writing features
        ark = self._fileroot + '.ark'
        if scp:
            scp = self._fileroot + '.scp'
            self._log.info('writing %s and %s', ark, scp)
            wspecifier = 'ark,scp:' + ark + ',' + scp
        else:
            self._log.info('writing %s', ark)
            wspecifier = 'ark:' + ark

        with kaldi.util.table.DoubleMatrixWriter(wspecifier) as writer:
            for k, v in features.items():
                writer[k] = kaldi.matrix.DoubleSubMatrix(v.data)

        # writing times
        ark = self._fileroot + '.times.ark'
        if scp:
            scp = self._fileroot + '.times.scp'
            self._log.info('writing %s and %s', ark, scp)
            wspecifier = 'ark,scp:' + ark + ',' + scp
        else:
            self._log.info('writing %s', ark)
            wspecifier = 'ark:' + ark

        with kaldi.util.table.DoubleMatrixWriter(wspecifier) as writer:
            for k, v in features.items():
                # in case times are 1d, we force them to 2d so they
                # can be written as kaldi matrices (we do the reverse
                # 2d->1d on loading). We are copying the array to
                # avoid a bug on macos.
                writer[k] = kaldi.matrix.DoubleSubMatrix(
                    np.atleast_2d(v.times).copy())

        # writing properties. As we are writing double arrays, we
        # need to track the original dtype of features in the
        # properties, to ensure equality on load
        filename = self._fileroot + '.properties.json'
        self._log.info('writing %s', filename)
        data = {k: copy.deepcopy(v.properties) for k, v in features.items()}
        for k, v in data.items():
            data[k]['__dtype_data__'] = str(features[k].dtype)
            data[k]['__dtype_times__'] = str(features[k].times.dtype)
        open(filename, 'wt').write(json_tricks.dumps(data, indent=4))

    def _load(self):
        # loading properties
        filename = self._fileroot + '.properties.json'
        self._log.info('loading %s', filename)
        if not os.path.isfile(filename):
            raise IOError('file not found: {}'.format(filename))
        properties = json_tricks.loads(open(filename, 'r').read())

        # loading times
        ark = self._fileroot + '.times.ark'
        self._log.info('loading %s', ark)
        if not os.path.isfile(ark):
            raise IOError('file not found: {}'.format(ark))

        rspecifier = 'ark:' + ark
        with kaldi.util.table.SequentialDoubleMatrixReader(
                rspecifier) as reader:
            times = {k: v.numpy() for k, v in reader}

        # postprocess times: do 2d->1d if they are 1d vectors
        for k, v in times.items():
            if v.shape[0] == 1:
                times[k] = v.reshape((v.shape[1]))

        # loading features
        ark = self._fileroot + '.ark'
        self._log.info('loading %s', ark)

        rspecifier = 'ark:' + ark
        with kaldi.util.table.SequentialDoubleMatrixReader(
                rspecifier) as reader:
            data = {k: v.numpy() for k, v in reader}

        if properties.keys() != data.keys():
            raise ValueError(
                'invalid features: items differ in data and properties')
        if times.keys() != data.keys():
            raise ValueError(
                'invalid features: items differ in data and times')

        return self._features_collection(**{
            k: self._features(
                data[k].astype(properties[k]['__dtype_data__']),
                times[k].astype(properties[k]['__dtype_times__']),
                properties={
                    name: prop for name, prop in properties[k].items()
                    if '__dtype_' not in name},
                validate=False)
            for k in data.keys()})


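# Saving to the Kaldi format therefore produces up to five files
# sharing the same root. A sketch, assuming a FeaturesCollection
# `feats` (the file name 'corpus.ark' is arbitrary):
#
#     from shennong.features import FeaturesCollection
#
#     serializer = get_serializer(FeaturesCollection, 'corpus.ark')
#     serializer.save(feats, scp=True)
#     # -> corpus.ark, corpus.scp, corpus.times.ark, corpus.times.scp
#     #    and corpus.properties.json
#     feats = serializer.load()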