Source code for shennong.features_collection

"""Provides the `FeaturesCollection` class to manipulate speech features

- A `FeaturesCollection` is basically a dictionary of
  :class:`~shennong.features.Features` indexed by names.

- A collection can be saved to and loaded from a file with the :meth:`save` and
  :meth:`load` methods.

Supported file formats
----------------------

The following table details the supported file formats and compares the
obtained file size, writing and reading times on MFCC features computed on the
`Buckeye Corpus <https://buckeyecorpus.osu.edu>`_ (English, 40 speakers, about
38 hours of speech and 254 files):

===========  =========  =========  ============  ============
File format  Extension  File size  Writing time  Reading time
===========  =========  =========  ============  ============
pickle       .pkl       883.7 MB   0:00:07       0:00:05
h5features   .h5f       873.0 MB   0:00:21       0:00:07
numpy        .npz       869.1 MB   0:02:30       0:00:22
matlab       .mat       721.1 MB   0:00:59       0:00:11
kaldi        .ark       1.3 GB     0:00:06       0:00:07
CSV          folder     4.8 GB     0:03:02       0:03:11
===========  =========  =========  ============  ============

- **pickle**: standard Python format, fast and efficient for small to medium
  datasets.

- **h5features**: based on HDF5 and specialized for very big datasets. Supports
  partial read/write of datasets bigger than RAM. The documentation is available
  at https://docs.cognitive-ml.fr/h5features.

- **numpy**: standard numpy format.

- **matlab** and **kaldi**: for compatibility.

- **csv**: each features in the collection is written as plain text to a
  dedicated file, with an optional JSON file storing the features properties.

Examples
--------

>>> import os
>>> import numpy as np
>>> from shennong import Features, FeaturesCollection

Create a collection of two random features

>>> fc = FeaturesCollection()
>>> fc['feat1'] = Features(np.random.random((5, 2)), np.linspace(0, 4, num=5))
>>> fc['feat2'] = Features(np.random.random((3, 2)), np.linspace(0, 2, num=3))
>>> fc.keys()
dict_keys(['feat1', 'feat2'])
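
Split the collection into sub-collections with the ``partition`` method, for
instance per speaker (a minimal sketch, the speaker labels here are arbitrary)

>>> groups = fc.partition({'feat1': 'spk1', 'feat2': 'spk2'})
>>> sorted(groups.keys())
['spk1', 'spk2']
>>> groups['spk1'].keys()
dict_keys(['feat1'])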

Save the collection to a npz file

>>> fc.save('features.npz')

Load it back to a new collection

>>> fc2 = FeaturesCollection.load('features.npz')
>>> fc2.keys()
dict_keys(['feat1', 'feat2'])
>>> fc == fc2
True
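
Collections can also be compared up to a numerical tolerance with ``is_close``
(here the two collections are identical, so the default tolerances suffice)

>>> fc.is_close(fc2, rtol=1e-5, atol=1e-8)
True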

>>> os.remove('features.npz')
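
The serializer is guessed from the file extension, so any format from the table
above works the same way, for instance pickle (a minimal sketch, the file is
removed afterwards)

>>> fc.save('features.pkl')
>>> fc == FeaturesCollection.load('features.pkl')
True
>>> os.remove('features.pkl')

Serializer-specific options, such as ``compress`` for the numpy, matlab and
h5features formats or ``scp`` for kaldi, can be passed as keyword arguments to
``save``.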

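Trim the features frame-wise with a voice activity detection mask using the
``trim`` method (a minimal sketch, the boolean masks below are arbitrary)

>>> vad = {'feat1': np.array([True, True, True, False, False]),
...        'feat2': np.array([True, True, False])}
>>> trimmed = fc.trim(vad)
>>> trimmed['feat1'].nframes
3
>>> trimmed['feat2'].nframes
2
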
"""

import collections
import numpy as np

from shennong import Features
from shennong.logger import get_logger
from shennong.serializers import get_serializer


class FeaturesCollection(dict):
    """Handles a collection of :class:`~shennong.Features` as a dictionary"""

    @classmethod
    def load(cls, filename, serializer=None,
             log=get_logger('serializer', 'warning')):
        """Loads a FeaturesCollection from a `filename`

        Parameters
        ----------
        filename : str
            The file to load
        serializer : str, optional
            The file serializer to use for loading, if not specified
            guess the serializer from the `filename` extension
        log : logging.Logger, optional
            Where to send log messages. Default to a logger named
            'serializer' with a 'warning' level.

        Returns
        -------
        features : :class:`~shennong.features.FeaturesCollection`
            The features loaded from the `filename`

        Raises
        ------
        IOError
            If the `filename` cannot be read
        ValueError
            If the `serializer` or the file extension is not
            supported, if the features loading fails.

        """
        return get_serializer(cls, filename, log, serializer).load()

    def save(self, filename, serializer=None, with_properties=True,
             log=get_logger('serializer', 'warning'), **kwargs):
        """Saves a FeaturesCollection to a `filename`

        Parameters
        ----------
        filename : str
            The file to write
        serializer : str, optional
            The file serializer to use for saving, if not specified
            guess the serializer from the `filename` extension
        with_properties : bool, optional
            When False do not save the features properties, default
            to True.
        log : logging.Logger, optional
            Where to send log messages. Default to a logger named
            'serializer' with a 'warning' level.
        compress : bool or str or int, optional
            Only valid for the numpy (.npz), matlab (.mat) and
            h5features (.h5f) serializers. When True compress the
            file. Default to True.
        scp : bool, optional
            Only valid for the kaldi (.ark) serializer. When True
            writes a .scp file along with the .ark file. Default to
            False.

        Raises
        ------
        IOError
            If the file `filename` already exists
        ValueError
            If the `serializer` or the file extension is not
            supported, if the features saving fails.

        """
        get_serializer(self.__class__, filename, log, serializer).save(
            self, with_properties=with_properties, **kwargs)

    def is_valid(self):
        """Returns True if all the features in the collection are valid"""
        for features in self.values():
            if not features.is_valid():
                return False
        return True

    def is_close(self, other, rtol=1e-5, atol=1e-8):
        """Returns True if `self` is approximately equal to `other`

        Parameters
        ----------
        other : FeaturesCollection
            The collection of features to compare to the current one
        rtol : float, optional
            Relative tolerance
        atol : float, optional
            Absolute tolerance

        Returns
        -------
        equal : bool
            True if this collection is almost equal to the `other`

        See Also
        --------
        Features.is_close, numpy.allclose

        """
        if not self.keys() == other.keys():
            return False

        for k in self.keys():
            if not self[k].is_close(other[k], rtol=rtol, atol=atol):
                return False

        return True

    def partition(self, index):
        """Returns a partition of the collection as a dict of FeaturesCollection

        This method is useful to create sub-collections from an
        existing one, for instance to make one sub-collection per
        speaker, or per gender, etc.

        Parameters
        ----------
        index : dict
            A mapping with, for each item in this collection, the
            sub-collection it belongs to in the partition. We must
            have ``index.keys() == self.keys()``.

        Returns
        -------
        features : dict of FeaturesCollection
            A dictionary of FeaturesCollection instances, one per
            sub-collection defined in `index`.

        Raises
        ------
        ValueError
            If one item in the collection is not mapped in `index`.

        """
        undefined_utts = set(self.keys()).difference(index.keys())
        if undefined_utts:
            raise ValueError(
                'following items are not defined in the partition index: {}'
                .format(', '.join(sorted(undefined_utts))))

        reverse_index = collections.defaultdict(list)
        for key, value in index.items():
            reverse_index[value].append(key)

        return {k: FeaturesCollection({item: self[item] for item in items})
                for k, items in reverse_index.items()}

    def trim(self, vad):
        """Returns a new FeaturesCollection where each features has been
        trimmed with the corresponding VAD.

        Parameters
        ----------
        vad : dict of boolean ndarrays
            A dictionary of arrays indicating which frames to keep.

        Returns
        -------
        features : FeaturesCollection
            A new FeaturesCollection trimmed with the input VAD

        Raises
        ------
        ValueError
            If the VAD keys are not the same as the collection keys,
            if the VAD arrays are not boolean arrays, or if their
            length does not match the number of frames.

        """
        if vad.keys() != self.keys():
            raise ValueError('Vad keys are different from the collection keys.')

        for key in vad.keys():
            if vad[key].dtype != np.dtype('bool'):
                raise ValueError('Vad arrays must be arrays of bool.')
            if vad[key].shape[0] != self[key].nframes:
                raise ValueError(
                    'Vad arrays length must be equal to the number of frames.')

        return FeaturesCollection({
            k: Features(
                self[k].data[vad[k]],
                self[k].times[vad[k]],
                properties=self[k].properties)
            for k in self.keys()})