Source code for shennong.features.features

"""Builds, saves, loads and manipulate speech features"""


import collections
import copy
import logging
import numpy as np

from shennong.features.serializers import get_serializer
from shennong.utils import dict_equal


class Features:
    """Handles speech features data with attached timestamps and properties

    A ``Features`` instance stores a matrix of features frames, the
    timestamps of each frame and a dictionary of properties describing
    how the features were extracted.

    Parameters
    ----------
    data : numpy.ndarray, shape = [nframes, ndims]
        The underlying features matrix, one row per frame
    times : numpy.ndarray, shape = [nframes] or [nframes, 2]
        The timestamps of each frame, either a single time per frame or
        a pair of times per frame
    properties : dict, optional
        Metadata on the features extraction pipeline, default to an
        empty dictionary
    validate : bool, optional
        When True (default), check the features consistency at
        construction time and raise a ValueError on failure

    Raises
    ------
    ValueError
        If ``validate`` is True and the features are not in a valid
        state

    """
    _log = logging.getLogger()

    def __init__(self, data, times, properties=None, validate=True):
        self._data = data
        self._times = times

        # bugfix: the original signature used the mutable default
        # ``properties={}``, a single dict silently shared by every
        # instance built without an explicit ``properties`` argument.
        # Use a None sentinel and allocate a fresh dict per instance.
        self._properties = {} if properties is None else properties

        # make sure the features are in a valid state
        if validate is True:
            self.validate()

    @property
    def data(self):
        """The underlying features data as a numpy matrix"""
        return self._data

    @property
    def times(self):
        """The frames timestamps on the vertical axis"""
        return self._times

    @property
    def dtype(self):
        """The type of the features data samples"""
        return self.data.dtype

    @property
    def shape(self):
        """The shape of the features data, as (nframes, ndims)"""
        return self.data.shape

    @property
    def ndims(self):
        """The number of dimensions of a features frame (feat.shape[1])"""
        return self.shape[1]

    @property
    def nframes(self):
        """The number of features frames (feat.shape[0])"""
        return self.shape[0]

    @property
    def properties(self):
        """A dictionnary of properties used to build the features

        Properties are references to the features extraction pipeline,
        parameters and source audio file used to generate the features.

        """
        return self._properties

    def _to_dict(self):
        """Returns the features as a dictionary

        Returns
        -------
        features : dict
            A dictionary with the following keys: 'data', 'times' and
            'properties'.

        """
        return {
            'data': self.data,
            'times': self.times,
            'properties': self.properties}

    @staticmethod
    def _from_dict(features, validate=True):
        """Return an instance of Features loaded from a dictionary

        Parameters
        ----------
        features : dict
            The dictionary to load the features from. Must have the
            following keys: 'data', 'times' and 'properties'.
        validate : bool, optional
            When True, validate the features before returning. Default
            to True

        Returns
        -------
        An instance of ``Features``

        Raises
        ------
        ValueError
            If the ``features`` don't have the requested keys or if the
            underlying features data is not valid.

        """
        requested_keys = {'data', 'times', 'properties'}
        missing_keys = requested_keys - set(features.keys())
        if missing_keys:
            raise ValueError(
                'cannot read features from dict, missing keys: {}'
                .format(', '.join(missing_keys)))

        return Features(
            features['data'], features['times'],
            properties=features['properties'], validate=validate)

    def __eq__(self, other):
        """Returns True if `self` is equal `other`, False otherwise"""
        # object identity
        if self is other:
            return True

        # bugfix: comparing to a non-Features object used to raise
        # AttributeError; delegate to the other operand instead so the
        # comparison falls back to False
        if not isinstance(other, Features):
            return NotImplemented

        # quick tests on attributes
        if self.shape != other.shape or self.dtype != other.dtype:
            return False

        # properties equality
        if not dict_equal(self.properties, other.properties):
            return False

        # timestamps equality
        if not np.array_equal(self.times, other.times):
            return False

        # features matrices equality
        if not np.array_equal(self.data, other.data):
            return False

        return True

    def is_close(self, other, rtol=1e-5, atol=1e-8):
        """Returns True if `self` is approximately equal to `other`

        Parameters
        ----------
        other : Features
            The Features instance to be compared to this one
        rtol : float, optional
            Relative tolerance
        atol : float, optional
            Absolute tolerance

        Returns
        -------
        equal : bool
            True if these features are almost equal to the `other`

        See Also
        --------
        FeaturesCollection.is_close, numpy.allclose

        """
        if self is other:
            return True

        if self.shape != other.shape:
            return False

        if not dict_equal(self.properties, other.properties):
            return False

        if not np.array_equal(self.times, other.times):
            return False

        # unlike __eq__, the data matrices are compared within a
        # tolerance
        if not np.allclose(self.data, other.data, atol=atol, rtol=rtol):
            return False

        return True

    def copy(self, dtype=None, subsample=None):
        """Returns a copy of the features

        Allocates new arrays for data, times and properties

        Parameters
        ----------
        dtype : type, optional
            When specified converts the data and times arrays to the
            requested `dtype`
        subsample : int, optional
            When specified subsample the features every `subsample`
            frames. When not specified do not do subsampling.

        Raises
        ------
        ValueError
            If `subsample` is defined but is not a strictly positive
            integer.

        Returns
        -------
        features : Features
            A new instance of Features copied from this one.

        """
        # by default we do not subsample (keep every frame)
        if subsample is None:
            subsample = 1
        elif not isinstance(subsample, int) or subsample <= 0:
            raise ValueError(
                f'subsample must be a strictly positive integer, '
                f'it is: {subsample}')

        if dtype:
            return Features(
                self.data[0:self.nframes:subsample].astype(dtype),
                self.times[0:self.nframes:subsample].astype(dtype),
                properties=copy.deepcopy(self.properties),
                validate=False)

        return Features(
            self.data[0:self.nframes:subsample].copy(),
            self.times[0:self.nframes:subsample].copy(),
            properties=copy.deepcopy(self.properties),
            validate=False)

    def is_valid(self):
        """Returns True if the features are in a valid state

        Returns False otherwise. Consistency is checked for features's
        data, times and properties.

        See Also
        --------
        Features.validate

        """
        try:
            self.validate()
        except ValueError:
            return False
        return True

    def validate(self):
        """Raises a ValueError if the features are not in a valid state"""
        # accumulate detected errors and display them at the end
        errors = []

        # basic checks on types
        if not isinstance(self.data, np.ndarray):
            errors.append('data must be a numpy array')
        if not isinstance(self.times, np.ndarray):
            errors.append('times must be a numpy array')
        if not isinstance(self.properties, dict):
            errors.append('properties must be a dictionnary')
        if errors:
            raise ValueError(
                'invalid features data types: {}'.format(', '.join(errors)))

        # check arrays dimensions
        if not self.data.ndim == 2:
            errors.append(
                'data dimension must be 2 but is {}'.format(self.data.ndim))
        if self.times.ndim > 2:
            errors.append(
                'times dimension must be 1 or 2 but is {}'.format(
                    self.times.ndim))
        if self.times.ndim == 2 and self.times.shape[1] != 2:
            errors.append('times shape[1] must be 2, it is {}'.format(
                self.times.shape[1]))

        nframes1 = self.data.shape[0]
        nframes2 = self.times.shape[0]
        if not nframes1 == nframes2:
            errors.append(
                'mismatch in number of frames: {} for data but {} '
                'for times'.format(nframes1, nframes2))
        if errors:
            raise ValueError(
                'invalid features dimensions: {}'.format(', '.join(errors)))

        # check if time is increasing. This check comes from
        # h5features/labels.py
        index = (np.argsort(self.times) if self.times.ndim == 1
                 else np.lexsort(self.times.T))
        if not all(n == index[n] for n in range(self.nframes)):
            raise ValueError('times is not sorted in increasing order')

        # check all values in array are finite (not infinity nor nan)
        if not np.all(np.isfinite(self.data)):
            raise ValueError(
                'data contains non-finite numbers (nan or infinity)')

    def concatenate(self, other, tolerance=0):
        """Returns the concatenation of this features with `other`

        Build a new Features instance made of the concatenation of this
        instance with the other instance. Their `times` must be the
        equal.

        Parameters
        ----------
        other : Features, shape = [nframes +/- tolerance, ndim2]
            The other features to concatenate at the end of this one
        tolerance : int, optional
            If the number of frames of the two features is different,
            trim the longest one up to a frame difference of
            `tolerance`, otherwise raise a ValueError. This option is
            usefull when concatenating pitch with other 'standard'
            features because pitch processing includes a downsampling
            which can alter the resulting number of frames (the same
            tolerance is applied in Kaldi, e.g. in paste-feats).
            Default to 0.

        Returns
        -------
        features : Features, shape = [nframes +/- tolerance, ndim1 + ndim2]

        Raises
        ------
        ValueError
            If `other` cannot be concatenated because of
            inconsistencies: number of frames difference greater than
            tolerance, inequal times values.

        """
        # check the number of frames is within the tolerance
        need_trim = False
        diff = abs(self.nframes - other.nframes)
        if diff:
            if not tolerance:
                raise ValueError(
                    'features have a different number of frames')
            # tolerance is known non-zero here, only the magnitude of
            # the difference matters
            if diff > tolerance:
                raise ValueError(
                    'features differs number of frames, and '
                    'greater than tolerance: |{} - {}| > {}'.format(
                        self.nframes, other.nframes, tolerance))

            self._log.warning(
                'features differs in number of frames, but '
                'within tolerance (|%s - %s| <= %s), trim the longest one',
                self.nframes, other.nframes, tolerance)
            need_trim = True

        # trim the longest features to the size of the shortest one
        data1, times1 = self.data, self.times
        data2, times2 = other.data, other.times
        if need_trim:
            if self.nframes > other.nframes:
                data1 = data1[:-diff]
                times1 = times1[:-diff]
            else:
                data2 = data2[:-diff]
                times2 = times2[:-diff]

        # ensures time axis is shared accross the two features
        if not np.allclose(times1, times2):
            raise ValueError('times are not equal')

        # merge properties of the two features, pipeline entries are
        # concatenated instead of overwritten
        properties = copy.deepcopy(self.properties)
        other_properties = copy.deepcopy(other.properties)
        properties.update(
            {k: v for k, v in other_properties.items() if k != 'pipeline'})
        if 'pipeline' not in properties:
            properties['pipeline'] = []
        if 'pipeline' in other_properties:
            for entry in other_properties['pipeline']:
                properties['pipeline'].append(entry)
                # the other features' columns come after this features'
                # ndims columns in the concatenated matrix, so shift
                # the appended entry's column range accordingly
                columns = properties['pipeline'][-1]['columns']
                properties['pipeline'][-1]['columns'] = [
                    columns[0] + self.ndims,
                    columns[1] + self.ndims]

        return Features(
            np.hstack((data1, data2)), times1, properties=properties)
class FeaturesCollection(dict):
    """A dictionary of :class:`Features` indexed by items name"""

    # a tweak inspired by C++ metaprogramming to avoid import loops
    # with shennong.features.serializers
    _value_type = Features

    @classmethod
    def load(cls, filename, serializer=None):
        """Loads a FeaturesCollection from a `filename`

        Parameters
        ----------
        filename : str
            The file to load
        serializer : str, optional
            The file serializer to use for loading, if not specified
            guess the serializer from the `filename` extension

        Returns
        -------
        features : :class:`~shennong.features.FeaturesCollection`
            The features loaded from the `filename`

        Raises
        ------
        IOError
            If the `filename` cannot be read
        ValueError
            If the `serializer` or the file extension is not supported,
            if the features loading fails.

        """
        return get_serializer(cls, filename, serializer).load()

    def save(self, filename, serializer=None, **kwargs):
        """Saves the collection of features to `filename`

        The serializer is guessed from the `filename` extension when
        not explicitly given; extra keyword arguments are forwarded to
        the serializer.

        """
        handler = get_serializer(self.__class__, filename, serializer)
        handler.save(self, **kwargs)

    def is_valid(self):
        """Returns True if all the features in the collection are valid"""
        return all(feats.is_valid() for feats in self.values())

    def is_close(self, other, rtol=1e-5, atol=1e-8):
        """Returns True `self` is approximately equal to `other`

        Parameters
        ----------
        other : FeaturesCollection
            The collection of features to compare to the current one
        rtol : float, optional
            Relative tolerance
        atol : float, optional
            Absolute tolerance

        Returns
        -------
        equal : bool
            True if this collection is almost equal to the `other`

        See Also
        --------
        Features.is_close, numpy.allclose

        """
        if self.keys() != other.keys():
            return False
        return all(
            self[name].is_close(other[name], rtol=rtol, atol=atol)
            for name in self)

    def partition(self, index):
        """Returns a partition of the collection as a dict of FeaturesCollection

        This method is usefull to create sub-collections from an
        existing one, for instance to make one sub-collection per
        speaker, or per gender, etc...

        Parameters
        ----------
        index : dict
            A mapping with, for each item in this collection, the
            sub-collection they belong to in the partition. We must
            have ``index.keys() == self.keys()``.

        Returns
        -------
        features : dict of FeaturesCollection
            A dictionnary of FeaturesCollection instances, one per
            sub-collection defined in `index`.

        Raises
        ------
        ValueError
            If one utterance in the collection is not mapped in
            `index`.

        """
        missing = set(self.keys()) - set(index.keys())
        if missing:
            raise ValueError(
                'following items are not defined in the partition index: {}'
                .format(', '.join(sorted(missing))))

        # invert the index: map each sub-collection name to the list of
        # items it contains
        groups = collections.defaultdict(list)
        for item, group in index.items():
            groups[group].append(item)

        return {
            group: FeaturesCollection({item: self[item] for item in items})
            for group, items in groups.items()}

    def trim(self, vad):
        """Returns a new instance of FeaturesCollection where each
        features has been trimmed with the corresponding VAD.

        Parameters
        ----------
        vad : dict of boolean ndarrays
            A dictionnary of arrays indicating which frame to keep.

        Returns
        -------
        features : FeaturesCollection
            A new FeaturesCollection trimmed with the input VAD

        Raises
        ------
        ValueError
            If the utterances are not the same. If the VAD arrays are
            not boolean arrays.

        """
        if vad.keys() != self.keys():
            raise ValueError('Vad keys are different from this keys.')

        # every mask must be boolean and aligned with its features
        for name, mask in vad.items():
            if mask.dtype != np.dtype('bool'):
                raise ValueError('Vad arrays must be arrays of bool.')
            if mask.shape[0] != self[name].nframes:
                raise ValueError(
                    'Vad arrays length must be equal to the number of frames.')

        return FeaturesCollection({
            name: Features(
                self[name].data[vad[name]],
                self[name].times[vad[name]],
                properties=self[name].properties)
            for name in self.keys()})