#!/usr/bin/env python
"""This module is used for creating a new task and preprocessing.

This module contains the functions to specify and initialise a new ABX
task, compute and display the statistics, and generate the ABX
triplets and pairs.

It can also be used in a command line. See task --help for the
documentation

Usage
-----

From the command line:

.. code-block:: bash

    python task.py my_data.item \
        -o column1 -a column2 column3 -b column4 column5 \
        -f "[attr == 0 for attr in column3_X]"

``my_data.item`` is a special file containing an index of the database and
a set of labels or attributes. See the input format specification at
http://abxpy.readthedocs.io/en/latest/FilesFormat.html#dataset

In python:

.. code-block:: python

    import ABXpy.task

    # create a new task and compute the statistics
    myTask = ABXpy.task.Task(
        'data.item', 'on_label', 'across_feature', 'by_label',
        filters=my_filters, regressors=my_regressors)

    # display statistics
    print(myTask.stats)

    # generate an h5db file 'data.abx' containing all the triplets and pairs
    myTask.generate_triplets()

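The generated file can be inspected directly with ``h5py``. This is a
minimal sketch, assuming the default output name ``data.abx`` and the
``triplets/data`` dataset layout written by ``generate_triplets``:

.. code-block:: python

    import h5py

    # each row of 'triplets/data' is an (A, B, X) triple of item indices
    with h5py.File('data.abx', 'r') as fh:
        triplets = fh['triplets/data'][...]
        print(triplets.shape)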

Example
-------

An example of ABX triplet:

+------+------+------+
|  A   |  B   |  X   |
+======+======+======+
| on_1 | on_2 | on_1 |
+------+------+------+
| ac_1 | ac_1 | ac_2 |
+------+------+------+
| by   | by   | by   |
+------+------+------+

A and X share the same 'on' attribute, while B has a different one; A and B
share the same 'across' attribute, while X has a different one; A, B and X
all share the same 'by' attribute.

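Expressed as a check on item attributes, these constraints amount to the
following (a minimal sketch with hypothetical attribute values, not part of
the API):

.. code-block:: python

    # hypothetical items, represented as dicts of attribute values
    a = {'on': 'on_1', 'across': 'ac_1', 'by': 'by'}
    b = {'on': 'on_2', 'across': 'ac_1', 'by': 'by'}
    x = {'on': 'on_1', 'across': 'ac_2', 'by': 'by'}

    valid = (a['on'] == x['on'] and b['on'] != a['on']
             and a['across'] == b['across'] and x['across'] != a['across']
             and a['by'] == b['by'] == x['by'])
    assert valid
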
"""

import argparse
import os
import sys
import tempfile

from six import iteritems, itervalues

import h5py
import numpy as np
import pandas as pd
import tables
import warnings

import ABXpy.database.database as database
import ABXpy.h5tools.np2h5 as np2h5
import ABXpy.h5tools.h52np as h52np
import ABXpy.h5tools.h5_handler as h5_handler
import ABXpy.h5tools.h5io as h5io
import ABXpy.sampling.sampler as sampler
import ABXpy.sideop.filter_manager as filter_manager
import ABXpy.sideop.regressor_manager as regressor_manager
import ABXpy.misc.progress_display as progress_display
from ABXpy.misc.type_fitting import fit_integer_type

# FIXME many of the FIXMEs below should be filed as feature requests on
# GitHub instead of being kept here as FIXMEs
#
# FIXME get a memory and speed efficient mechanism for storing a task
# on disk and loading it back (pickling doesn't work well)
#
# FIXME filter out empty 'on-across-by' blocks and empty 'by' blocks
# as soon as possible (i.e. when computing stats)
#
# FIXME generate unique_pairs in separate file
#
# FIXME find a better scheme for naming 'by' datasets in HDF5 files
# (to remove the current warning)
#
# FIXME efficiently dealing with case where there is no across
#
# FIXME syntax to specify names for side-ops when computing them on
# the fly, or at the very least the number of outputs (default is one)
#
# FIXME implement file locking, md5 hash and path for integrity
# checks; log warnings using the standard Python logging library
# + a verbose option
#
# FIXME putting metadata in h5files + pretty print it
#
# FIXME dataset size for task file seems too big when filtering so as
# to get only 3 different talkers?
#
# FIXME allow specifying regressors and filters from within python
# using something like (which should be integrated with the existing
# dbfun stuff):
#
#   class ABX_context(object):
#       def __init__(self, db):
#           # init fields with None
#           ...
#
#   context = ABX_context(db_file)
#
#   def new_filter(context):
#       return [True for e in context.talker_A]
#
# FIXME allow other ways of providing the hierarchical db (directly in
# pandas format, etc.)
#
# More complicated FIXMES
# -----------------------
#
# FIXME taking by datasets as the basic unit was a mistake, because
# cases where there are many small 'by' datasets happen. Find a way to
# group them when needed, both in the computations and in the h5 files
#
# FIXME allow 'by' sampling customization depending on the analyses to
# be carried out


[docs]class Task(object): """Define an ABX task for a given database. Attributes ---------- `stats` : dict. Contain several statistics about the task. The main 3 attributes are: - nb_blocks the number of blocks of ABX triplets sharing the same 'on', 'across' and 'by' features. - nb_triplets the number of triplets considered. - nb_by_levels the number of blocks of ABX triplets sharing the same 'by' attribute. Parameters ---------- db_name : str the filename of database on which the ABX task is applied. on : str the 'on' attribute of the ABX task. A and X share the same 'on' attribute and B has a different one. across : list, optional a list of strings containing the 'across' attributes of the ABX task. A and B share the same 'across' attributes and X has a different one. by : list, optional a list of strings containing the 'by' attributes of the ABX task. A,B and X share the same 'by' attributes. filters : list, optional a list of string specifying a filter on A, B or X. regressors : list, optional a list of string specifying a filter on A, B or X. verbose : bool, optional display additionnal information is set to True. """ def __init__(self, db_name, on, across=None, by=None, filters=None, regressors=None, verbose=False): # check the item file is here if not os.path.isfile(db_name): raise AssertionError('item file {} not found'.format(db_name)) # check 'on' is a string if not isinstance(on, str): raise AssertionError('ON attribute must be specified by a string') # parse input arguments self.database = db_name self.verbose = verbose self.on = [on] self.across = self._init_as_list(across) self.by = self._init_as_list(by) # load the item database and check it self.db, self.db_hierarchy, feat_db = database.load( self.database, features_info=True) self._init_check_database() # if 'by' or 'across' are empty create appropriate dummy # columns. '#' is forbidden in user names for columns. Note # that this additional columns are not in the db_hierarchy, # but Thomas don't think this is problematic. if not self.by: self.db['#by'] = 0 self.by = ['#by'] if not self.across: self.db['#across'] = range(len(self.db)) self.across = ['#across'] # setup filters self.filters = filter_manager.FilterManager( self.db_hierarchy, self.on, self.across, self.by, [] if filters is None else filters) # setup regressors self.regressors = regressor_manager.RegressorManager( self.db, self.db_hierarchy, self.on, self.across, self.by, [] if regressors is None else regressors) # some other attributes that are populated during the database # preparation below self.by_dbs = {} self.types = {} self.feat_dbs = {} self.on_blocks = {} self.across_blocks = {} self.on_across_blocks = {} self.antiacross_blocks = {} # prepare the database for generating the triplets self._init_prepare_database(feat_db) self._init_prepare_types() # compute some statistics about the task self.compute_statistics() @staticmethod def _init_as_list(arg): """Helper method to load an argument as a list""" if arg is None: return [] elif isinstance(arg, str): return [arg] else: return arg def _init_check_database(self): """Check the ABX database (item file)""" # FIXME add additional checks, for example that columns in BY, # ACROSS, ON are not the same ? 
(see task structure notes) # also that location columns are not used if self.verbose: print('checking input database {}'.format(self.database)) # check that required columns are present cols = set(self.db.columns) message = ( ' argument is invalid, check that all the provided attributes ' 'are defined in the database {}'.format(self.database)) # the argument of issuperset needs to be a list ... assert cols.issuperset(self.on), 'ON' + message assert cols.issuperset(self.across), 'ACROSS' + message assert cols.issuperset(self.by), 'BY' + message for col in cols: assert '_' not in col, \ col + ': you cannot use underscore in column names' assert '#' not in col, \ col + ': you cannot use \'#\' in column names' if self.verbose: print("input database verified") def _init_prepare_database(self, feat_db): """Prepare the database for triplet generation""" by_groups = self.db.groupby(self.by) if self.verbose: display = progress_display.ProgressDisplay() display.add('block', 'Preprocessing by block', len(by_groups)) for by_key, by_frame in by_groups: if self.verbose: display.update('block', 1) display.display() # allow to get by values as well as values of other variables # that are determined by these by_values = dict(by_frame.iloc[0]) # apply 'by' filters if self.filters.by_filter(by_values): # get analogous feat_db by_feat_db = feat_db.iloc[by_frame.index] # drop indexes by_frame = by_frame.reset_index(drop=True) # reset_index to get an index relative to the 'by' db, # the original index could be conserved in an additional # 'index' column if necessary by removing the drop=True, but # this would add another constraint on the possible column name by_feat_db = by_feat_db.reset_index(drop=True) # apply generic filters by_frame = self.filters.generic_filter(by_values, by_frame) self.by_dbs[by_key] = by_frame self.feat_dbs[by_key] = by_feat_db def _by_dbs(l): return self.by_dbs[by_key].groupby(l) self.on_blocks[by_key] = _by_dbs(self.on) self.across_blocks[by_key] = _by_dbs(self.across) self.on_across_blocks[by_key] = _by_dbs(self.on + self.across) if len(self.across) > 1: self.antiacross_blocks[by_key] = dict() for across_key in self.across_blocks[by_key].groups: b = True for i, col in enumerate(self.across): b = b * (by_frame[col] != across_key[i]) self.antiacross_blocks[by_key][across_key] = ( by_frame[b].index) def _init_prepare_types(self): """Determining appropriate numeric type to represent index Currently used only for numpy arrays and h5 storage, might also be used for panda frames. """ # len(db)-1 wouldn't work here because there could be missing # index due to generic filtering self.types = { key: fit_integer_type(np.max(db.index.values), is_signed=False) for key, db in iteritems(self.by_dbs)}
    def compute_statistics(self, approximate=False):
        """Compute the statistics of the task

        The number of ABX triplets is exact in most cases if approximate is
        set to false. The other statistics can only be approximate in the
        case where there are A, B, X or ABX filters.

        Parameters
        ----------
        approximate : bool
            approximate the number of triplets

        """
        self.stats = {}
        self.stats['approximate'] = bool(self.filters.A or self.filters.B or
                                         self.filters.X or self.filters.ABX)
        self.stats['approximate_nb_triplets'] = approximate and self.stats[
            'approximate']
        self.stats['nb_by_levels'] = len(self.by_dbs)
        self.by_stats = {}

        if self.verbose:
            display = progress_display.ProgressDisplay()
            display.add('block', 'Computing statistics for by block',
                        self.stats['nb_by_levels'])

        for by in self.by_dbs:
            if self.verbose:
                display.update('block', 1)
                display.display()

            stats = {}
            stats['nb_items'] = len(self.by_dbs[by])
            stats['on_levels'] = self.on_blocks[by].size()
            stats['nb_on_levels'] = len(stats['on_levels'])
            stats['across_levels'] = self.across_blocks[by].size()
            stats['nb_across_levels'] = len(stats['across_levels'])
            stats['on_across_levels'] = self.on_across_blocks[by].size()
            stats['nb_on_across_levels'] = len(stats['on_across_levels'])
            self.by_stats[by] = stats

        self.stats['nb_blocks'] = sum(
            [bystats['nb_on_across_levels']
             for bystats in self.by_stats.values()])

        if self.verbose:
            display = progress_display.ProgressDisplay()
            display.add(
                'block', 'Computing statistics for by/on/across block',
                self.stats['nb_blocks'])

        for by, db in iteritems(self.by_dbs):
            stats = self.by_stats[by]
            stats['block_sizes'] = {}
            stats['nb_triplets'] = 0
            stats['nb_across_pairs'] = 0
            stats['nb_on_pairs'] = 0

            # iterate over on/across blocks
            for block_key, count in iteritems(stats['on_across_levels']):
                if self.verbose:
                    display.update('block', 1)
                    display.display()

                block = self.on_across_blocks[by].groups[block_key]
                on_across_by_values = dict(db.iloc[block[0]])

                # retrieve the on and across keys (as they are stored
                # in the pandas object)
                on, across = on_across_from_key(block_key)

                # apply the filter and check if the block is empty
                if self.filters.on_across_by_filter(on_across_by_values):
                    n_A = count
                    n_X = stats['on_levels'][on]

                    # FIXME quick fix to process case with no across, but
                    # better done in a separate loop ...
                    if self.across == ['#across']:
                        n_B = stats['nb_items'] - n_X
                    else:
                        n_B = stats['across_levels'][across] - n_A
                    n_X = n_X - n_A

                    stats['nb_across_pairs'] += n_A * n_B
                    stats['nb_on_pairs'] += n_A * n_X

                    need_approx = approximate or not(
                        self.filters.A or self.filters.B or
                        self.filters.X or self.filters.ABX)
                    if need_approx and not isinstance(across, tuple):
                        stats['nb_triplets'] += n_A * n_B * n_X
                        stats['block_sizes'][block_key] = n_A * n_B * n_X
                    else:
                        # count the exact number of triplets, could be
                        # further optimized because it isn't necessary to do
                        # the whole triplet generation, in particular in the
                        # case where there are no ABX filters
                        nb_triplets = self.on_across_triplets(
                            by, on, across, block, on_across_by_values,
                            with_regressors=False).shape[0]
                        stats['nb_triplets'] += nb_triplets
                        stats['block_sizes'][block_key] = nb_triplets
                else:
                    stats['block_sizes'][block_key] = 0

        self.stats['nb_triplets'] = sum(
            [bystats['nb_triplets'] for bystats in self.by_stats.values()])

        # FIXME remove empty by blocks then remove empty on_across_by
        # blocks here, also reset self.n_blocks in consequence
        self.n_blocks = self.stats['nb_blocks']

[docs] def on_across_triplets(self, by, on, across, on_across_block, on_across_by_values, with_regressors=True): """Generate all possible triplets for a given by block. Given an on_across_block of the database and the parameters of the task, this function will generate the complete set of triplets and the regressors. Parameters ---------- by : int The block index on, across : int The task attributes on_across_block : list the block on_across_by_values : dict the actual values with_regressors : bool, optional By default, true Returns ------- triplets : numpy.Array the set of triplets generated regressors : numpy.Array the regressors generated """ # find all possible A, B, X where A and X have the 'on' # feature of the block and A and B have the 'across' feature # of the block A = np.array(on_across_block, dtype=self.types[by]) on_set = set(self.on_blocks[by].groups[on]) # FIXME quick fix to process case with no across, but better # done in a separate loop... if self.across == ['#across']: # in this case A is a singleton and B can be anything in # the by block that doesn't have the same 'on' as A B = np.array( list(set(self.by_dbs[by].index).difference(on_set)), dtype=self.types[by]) else: # remove B with the same 'on' than A B = np.array( list(set(self.across_blocks[by].groups[across]).difference(A)), dtype=self.types[by]) # remove X with the same 'across' than A if type(across) is tuple: antiacross_set = set(self.antiacross_blocks[by][across]) X = np.array(list(antiacross_set & on_set), dtype=self.types[by]) else: X = np.array(list(on_set.difference(A)), dtype=self.types[by]) # apply singleton filters db = self.by_dbs[by] if self.filters.A: iA = self.filters.A_filter(on_across_by_values, db, A) A = A[iA] if self.filters.B: iB = self.filters.B_filter(on_across_by_values, db, B) B = B[iB] if self.filters.X: iX = self.filters.X_filter(on_across_by_values, db, X) X = X[iX] # instantiate A, B, X regressors here if with_regressors: self.regressors.set_A_regressors(on_across_by_values, db, A) self.regressors.set_B_regressors(on_across_by_values, db, B) self.regressors.set_X_regressors(on_across_by_values, db, X) # A, B, X can then be combined efficiently in a full (or # randomly sampled) factorial design size = len(A) * len(B) * len(X) on_across_block_index = [0] if size > 0: ind_type = fit_integer_type(size, is_signed=False) indices = np.arange(size, dtype=ind_type) # generate triplets from indices iX = np.mod(indices, len(X)) iB = np.mod(np.floor_divide(indices, len(X)), len(B)) iA = np.floor_divide(indices, len(B) * len(X)) triplets = np.column_stack((A[iA], B[iB], X[iX])) # apply triplets filters if self.filters.ABX: ABX_filter_ind = self.filters.ABX_filter( on_across_by_values, db, triplets) triplets = triplets[ABX_filter_ind] size = triplets.shape[0] if with_regressors: # If I understand correctly, this is supposed to first # give a unique id to each combination of regressors # and then sort the triplets so as to have triplets # with the same combination of regressors next to each # other. Is the on_across_block_index created ever # reused for allowing indexed and thus more efficient # access in later procesing (e.g. in analyze.py) or is # it only for thresholding ? 
# reindexing by regressors Breg = [reg[iB] for regs in self.regressors.B_regressors for reg in regs] Xreg = [reg[iX] for regs in self.regressors.X_regressors for reg in regs] if self.filters.ABX: Breg = [reg[ABX_filter_ind] for reg in Breg] Xreg = [reg[ABX_filter_ind] for reg in Xreg] regs = np.array(Breg + Xreg).T if len(regs) != 0: n_regs = np.max(regs, 0) + 1 # FIXME how do we know that all regressors are in # integer format here ? This is only guaranteed # for indexed regressors... assert np.prod(n_regs) < 18446744073709551615, \ "type not big enough" reg_ind_type = fit_integer_type( np.prod(n_regs), is_signed=False) new_index = regs[:, 0].astype(reg_ind_type) for i in range(1, len(n_regs)): new_index = regs[:, i] + n_regs[i] * new_index permut = np.argsort(new_index) # the organization should be revamped: the real # sorting is done by the line just above, while # the 'sort_and_threshold' function is only really # doing something when thresholding, otherwise it # is just generating the on_across_block_index # which would be better done in another # function... thr_sort_permut, on_across_block_index = ( sort_and_threshold( permut, new_index, reg_ind_type, threshold=self.threshold)) triplets = triplets[thr_sort_permut] else: # FIXME was a bug breaking tests -> variable need # to be defined thr_sort_permut = np.empty(shape=0, dtype=np.uint8) else: # empty block... triplets = np.empty(shape=(0, 3), dtype=self.types[by]) # the following lines assign empty values to all the # variables used to set regressors. Would be nicer to let # the regressor code detect empty triplets and handle it # by itself... (it is useless if not(with_regressors) iA = np.empty(shape=0, dtype=np.uint8) iB = np.empty(shape=0, dtype=np.uint8) iX = np.empty(shape=0, dtype=np.uint8) if self.filters.ABX: ABX_filter_ind = np.empty(shape=0, dtype=np.uint8) thr_sort_permut = np.empty(shape=0, dtype=np.uint8) if not with_regressors: return triplets else: if self.regressors.ABX: # instantiate ABX regressors here self.regressors.set_ABX_regressors( on_across_by_values, db, triplets) # make sure we have those variables defined try: ABX_filter_ind except NameError: ABX_filter_ind = None # compute the task regressors regressors = self._compute_regressors( triplets, iA, iB, iX, ABX_filter_ind, thr_sort_permut) return (triplets, regressors, np.array(on_across_block_index)[:, None])
    # FIXME add a mechanism to allow the specification of a random seed in a
    # way that would produce reliably the same triplets on different machines
    # (means cross-platform random number generator + having its state so as
    # to be sure that no other random number generation calls to it are
    # altering the sequence).
    #
    # FIXME use an object that guarantees that the stream will not be
    # perturbed by external code's calls to np.random.
[docs] def generate_triplets(self, output=None, threshold=None, tmpdir=None, seed=None): """Generate all possible triplets for the whole task Generate the triplets and the pairs for an ABXpy.Task and store it in a h5db file. Parameters ---------- output : filename, optional The output file. If not specified, it will automatically create a new file with the same name as the input file. threshold : TODO tmpdir : directory, optional where to write temporary files seed : int, optional seed for initializing the random number generator """ # reinitialize the random generator with the provided seed # (TODO this is only used for sampling, so it should be moved # out during code refactoring -> put that in main(), or in # sampling module?) np.random.seed(seed) # check we have triplets in the database if self.stats['nb_triplets'] == 0: warnings.warn('There are no possible ABX triplets' ' in the specified task', UserWarning) return self.total_n_triplets = self.stats['nb_triplets'] # setup threshold self.threshold = threshold if threshold is not None else False # setup output file, raise an error if the file already exists if output is None: output = os.path.splitext(self.database)[0] + '.abx' if os.path.exists(output): raise ValueError( 'The output file already exists: {}'.format(output)) if self.verbose: print('writing output to {}'.format(output)) self.n_triplets = self.total_n_triplets display = None if self.verbose: display = progress_display.ProgressDisplay() display.add( 'block', 'Computing triplets for by/on/across block', self.n_blocks) display.add( 'triplets', 'Triplets considered:', self.total_n_triplets) by_block_indices = [0] self.current_index = 0 # fill output file with list of needed ABX triplets, it is done # independently for each 'by' value with np2h5.NP2H5(h5file=output) as fh: # FIXME test if not fixed size impacts performance a lot out = fh.add_dataset( group='triplets', dataset='data', n_rows=self.n_triplets, n_columns=3, item_type=fit_integer_type(self.total_n_triplets), fixed_size=False) out_block_index = fh.add_dataset( group='triplets', dataset='on_across_block_index', n_rows=self.stats['nb_blocks'], n_columns=1, item_type=fit_integer_type(self.stats['nb_blocks']), fixed_size=False) empty_by_blocks = [] bys = [] for by, db in iteritems(self.by_dbs): # class for efficiently writing to datasets of the output file # (using a buffer under the hood) if self.verbose: print("Writing ABX triplets to task file...") # allow to get by values as well as values of other # variables that are determined by these by_values = dict(db.iloc[0]) datasets, indexes = self.regressors.get_regressor_info() with h5io.H5IO( filename=output, datasets=datasets, indexes=indexes, group='/regressors/{}/'.format(str(by))) as out_regs: self._compute_triplets( by, out, out_block_index, out_regs, db, fh, by_values, display=display) # if no triplets found: delete by block if self.current_index == by_block_indices[-1]: empty_by_blocks.append(by) else: by_block_indices.append(self.current_index) bys.append(by) # saving by index, deleting empty by blocks: aux = np.array( by_block_indices, dtype=fit_integer_type(by_block_indices[-1])) by_block_indices = np.hstack((aux[:-1, None], aux[1:, None])) for by in empty_by_blocks: del self.by_dbs[by] fh.file.create_dataset( 'bys', (aux.shape[0] - 1,), dtype=h5py.special_dtype(vlen=str)) fh.file['bys'][:] = [str(by) for by in bys] fh.file['triplets'].create_dataset( 'by_index', data=by_block_indices) fh.file['triplets/data'].resize(aux[-1], axis=0) if self.verbose: 
print('done.') with warnings.catch_warnings(): warnings.simplefilter('ignore', tables.NaturalNameWarning) self._generate_pairs(output, tmpdir=tmpdir)
def _compute_triplets(self, by, out, out_block_index, out_regs, db, fh, by_values, display=None): # instantiate by regressors here self.regressors.set_by_regressors(by_values) # iterate over on/across blocks on_across_blocks = iteritems(self.on_across_blocks[by].groups) for block_key, block in on_across_blocks: if self.verbose: display.update('block', 1) # allow to get on, across, by values as well as values of # other variables that are determined by these on_across_by_values = dict(db.iloc[block[0]]) if self.filters.on_across_by_filter(on_across_by_values): # instantiate on_across_by regressors here self.regressors.set_on_across_by_regressors( on_across_by_values) on, across = on_across_from_key(block_key) triplets, regressors, on_across_block_index = ( self.on_across_triplets( by, on, across, block, on_across_by_values)) out.write(triplets) out_regs.write(regressors, indexed=True) out_block_index.write(on_across_block_index) self.current_index += triplets.shape[0] if self.verbose: display.update( 'triplets', self.by_stats[by]['block_sizes'][block_key]) if self.verbose: display.display() # FIXME clean this function (maybe do a few well-separated sub-functions # for getting the pairs and unique them) def _generate_pairs(self, output=None, tmpdir=None): """Generate the pairs associated to the triplet list""" # FIXME change this to a random file name to avoid overwriting problems # default name for output file if output is None: (basename, _) = os.path.splitext(self.database) output = basename + '.abx' # list all pairs n_pairs_dict = {} max_ind_dict = {} try: _, output_tmp = tempfile.mkstemp(dir=tmpdir) for n_by, (by, db) in enumerate(iteritems(self.by_dbs)): if self.verbose > 0: print("Writing AX/BX pairs to task file...") with h5py.File(output, 'a') as fh: triplets_attrs = fh['/triplets']['by_index'][n_by][...] max_ind = np.max(db.index.values) max_ind_dict[by] = max_ind pair_key_type = fit_integer_type( (max_ind + 1) ** 2 - 1, is_signed=False) with h52np.H52NP(output) as f_in: with np2h5.NP2H5(output_tmp) as f_out: inp = f_in.add_subdataset('triplets', 'data', indexes=triplets_attrs) out = f_out.add_dataset( 'pairs', str(by), n_columns=1, item_type=pair_key_type, fixed_size=False) for data in inp: triplets = pair_key_type(data) n = triplets.shape[0] ind = np.arange(n) i1 = 2 * ind i2 = 2 * ind + 1 pairs = np.empty( shape=(2 * n, 1), dtype=pair_key_type) # FIXME change the encoding (and type_fitting) # so that A,B and B,A have the same code ... # (take a=min(a,b), b=max(a,b)) # FIXME but allow a flag to control the # behavior to be able to enforce A,X and B,X # order when using assymetrical distance # functions pairs[i1, 0] = triplets[:, 0] + ( max_ind + 1) * triplets[:, 2] # AX pairs[i2, 0] = triplets[:, 1] + ( max_ind + 1) * triplets[:, 2] # BX # FIXME do a unique here already? Do not store # the inverse mapping ? (could sort triplets on # pair1, complete pair1, sort on pair2, # complete pair 2 and shuffle ?) out.write(pairs) sort_pairs(output_tmp, by, tmpdir=tmpdir) # counting unique with np2h5.NP2H5(output_tmp) as f_out: with h52np.H52NP(output_tmp) as f_in: inp = f_in.add_dataset('pairs', str(by)) n_pairs = 0 last = -1 for pairs in inp: # unique alters the shape pairs = np.reshape(pairs, (pairs.shape[0], 1)) n_pairs += np.unique(pairs).size if pairs[0, 0] == last: n_pairs -= 1 if pairs.size > 0: last = pairs[-1, 0] n_pairs_dict[by] = n_pairs # FIXME should have a unique function directly instead of # sorting + unique ? 
with np2h5.NP2H5(output_tmp) as f_out: with h52np.H52NP(output_tmp) as f_in: inp = f_in.add_dataset('pairs', str(by)) out = f_out.add_dataset( 'unique_pairs', str(by), n_rows=n_pairs, n_columns=1, item_type=pair_key_type, fixed_size=False) # out = out_unique_pairs last = -1 for pairs in inp: pairs = np.unique(pairs) # unique alters the shape pairs = np.reshape(pairs, (pairs.shape[0], 1)) if pairs[0, 0] == last: pairs = pairs[1:] if pairs.size > 0: last = pairs[-1, 0] out.write(pairs) # store for ulterior decoding store = pd.HDFStore(output) # use append to make use of table format, # which is better at handling strings without # much space (fixed-size format) store.append('/feat_dbs/' + str(by), self.feat_dbs[by], expectedrows=len(self.feat_dbs[by])) store.close() # FIXME generate inverse mapping to triplets # (1 and 2) ? # Now merge all datasets by_index = 0 with np2h5.NP2H5(output) as f_out: n_rows = sum(itervalues(n_pairs_dict)) out_unique_pairs = f_out.add_dataset( 'unique_pairs', 'data', n_rows=n_rows, n_columns=1, item_type=np.int64, fixed_size=False) for n_by, (by, db) in enumerate(iteritems(self.by_dbs)): triplets_attrs = f_out.file['/triplets']['by_index'][n_by] if triplets_attrs[0] == triplets_attrs[1]: # subdataset is empty continue with h52np.H52NP(output_tmp) as f_in: inp = f_in.add_dataset('unique_pairs', str(by)) for pairs in inp: out_unique_pairs.write(pairs) with h5py.File(output, 'a') as fh: fh['/unique_pairs'].attrs[str(by)] = ( max_ind_dict[by] + 1, by_index, by_index + n_pairs_dict[by]) by_index += n_pairs_dict[by] finally: os.remove(output_tmp) if self.verbose: print("done.") # number of triplets when triplets with same on, across, by are # counted as one # # FIXME current implementation won't work with A, B, X or ABX filters # # FIXME lots of code in this function is repicated from # on_across_triplets, generate_triplets and/or compute_stats: the # maximum possible should be factored out, including the loop over # by, loop over on_across iteration structure
    def compute_nb_levels(self):
        if self.filters.A or self.filters.B or self.filters.X or \
           self.filters.ABX:
            raise ValueError(
                'Current implementation does not support computing '
                'nb_levels in the presence of A, B, X, or ABX filters')

        if self.verbose:
            display = progress_display.ProgressDisplay()
            display.add(
                'block', 'Computing nb_levels for by block',
                self.stats['nb_by_levels'])

        for by, db in iteritems(self.by_dbs):
            if self.verbose:
                display.update('block', 1)
                display.display()

            n = 0
            # iterate over on/across blocks
            for block_key, n_block in (
                    iteritems(self.by_stats[by]['on_across_levels'])):
                block = self.on_across_blocks[by].groups[block_key]
                on_across_by_values = dict(db.iloc[block[0]])
                on, across = on_across_from_key(block_key)

                if self.filters.on_across_by_filter(on_across_by_values):
                    # find all possible A, B, X where A and X have the 'on'
                    # feature of the block and A and B have the 'across'
                    # feature of the block
                    A = np.array(
                        self.on_across_blocks[by].groups[block_key],
                        dtype=self.types[by])
                    X = self.on_blocks[by].groups[on]

                    # FIXME quick fix to process case with no across, but
                    # better done in a separate loop ...
                    if self.across == ['#across']:
                        # in this case A is a singleton and B can be anything
                        # in the by block that doesn't have the same 'on' as A
                        B = np.array(
                            list(set(self.by_dbs[by].index).difference(X)),
                            dtype=self.types[by])
                    else:
                        B = self.across_blocks[by].groups[across]
                        # remove B with the same 'on' as A
                        B = np.array(
                            list(set(B).difference(A)), dtype=self.types[by])

                    # remove X with the same 'across' as A
                    X = np.array(
                        list(set(X).difference(A)), dtype=self.types[by])

                    if B.size > 0 and X.size > 0:
                        # the case where no across was specified is different
                        if self.across == ["#across"]:
                            grouping = self.on
                        else:
                            grouping = self.on + self.across
                        n_level_B = len(db.iloc[B].groupby(grouping).groups)
                        n_level_X = len(db.iloc[X].groupby(grouping).groups)
                        n = n + n_level_B * n_level_X

            self.by_stats[by]['nb_levels'] = n

        self.stats['nb_levels'] = sum(
            stats['nb_levels'] for stats in self.by_stats.values())

    def print_stats(self, filename=None, summarized=True):
        if filename is None:
            self.print_stats_to_stream(sys.stdout, summarized)
        else:
            with open(filename, 'w') as h:
                self.print_stats_to_stream(h, summarized)

    def print_stats_to_stream(self, stream, summarized):
        import pprint

        stream.write('\n\n###### Global stats ######\n\n')
        pprint.pprint(self.stats, stream)
        stream.write('\n\n###### by blocks stats ######\n\n')

        if not summarized:
            for by, stats in iteritems(self.by_stats):
                stream.write('### by level: %s ###\n' % str(by))
                pprint.pprint(stats, stream)
        else:
            try:
                self.compute_nb_levels()
            except ValueError:
                warnings.warn(
                    "Filters not fully supported, nb_levels per "
                    "by block won't be calculated", RuntimeWarning)

            for by, stats in iteritems(self.by_stats):
                stream.write('### by level: %s ###\n' % str(by))
                stream.write('nb_triplets: %d\n' % stats['nb_triplets'])
                if 'nb_levels' in stats:
                    stream.write('nb_levels: %d\n' % stats['nb_levels'])
                stream.write(
                    'nb_across_pairs: %d\n' % stats['nb_across_pairs'])
                stream.write(
                    'nb_on_pairs: %d\n' % stats['nb_on_pairs'])
                stream.write(
                    'nb_on_levels: %d\n' % stats['nb_on_levels'])
                stream.write(
                    'nb_across_levels: %d\n' % stats['nb_across_levels'])
                stream.write(
                    'nb_on_across_levels: %d\n'
                    % stats['nb_on_across_levels'])

    def _compute_regressors(self, triplets, iA, iB, iX,
                            ABX_filter_ind, thr_sort_permut):
        """Helper method for Task.on_across_triplets"""
        # self.regressors.XXX contains either (for by and on_across_by)
        # [[scalar_output_1_dbfun_1, scalar_output_2_dbfun_1,...],
        #  [scalar_output_1_dbfun_2, ...], ...]
        # or:
        # [[np_array_output_1_dbfun_1, np_array_output_2_dbfun_1,...],
        #  [np_array_output_1_dbfun_2, ...], ...]
        #
        # FIXME change manager API so that self.regressors.A contains the
        # data and not the list of dbfun_s ?
        regressors = {}

        scalar_names = (
            self.regressors.by_names + self.regressors.on_across_by_names)
        scalar_regressors = (
            self.regressors.by_regressors
            + self.regressors.on_across_by_regressors)
        for names, regs in zip(scalar_names, scalar_regressors):
            for name, reg in zip(names, regs):
                regressors[name] = np.tile(
                    np.array(reg), (np.size(triplets, 0), 1))

        # lots of code duplication below...
        for names, regs in zip(self.regressors.A_names,
                               self.regressors.A_regressors):
            for name, reg in zip(names, regs):
                regressors[name] = reg[iA]
                if self.filters.ABX:
                    regressors[name] = regressors[name][ABX_filter_ind]
                regressors[name] = regressors[name][thr_sort_permut]

        for names, regs in zip(self.regressors.B_names,
                               self.regressors.B_regressors):
            for name, reg in zip(names, regs):
                regressors[name] = reg[iB]
                if self.filters.ABX:
                    regressors[name] = regressors[name][ABX_filter_ind]
                regressors[name] = regressors[name][thr_sort_permut]

        for names, regs in zip(self.regressors.X_names,
                               self.regressors.X_regressors):
            for name, reg in zip(names, regs):
                regressors[name] = reg[iX]
                if self.filters.ABX:
                    regressors[name] = regressors[name][ABX_filter_ind]
                regressors[name] = regressors[name][thr_sort_permut]

        # FIXME implement this
        # for names, regs in zip(self.regressors.ABX_names,
        #                        self.regressors.ABX_regressors):
        #     for name, reg in zip(names, regs):
        #         regressors[name] = reg[indices, :]

        return regressors

# utility function necessary because of current inconsistencies in pandas:
# you can't seem to index a dataframe with a tuple with only one element,
# even though tuples with more than one element are fine
def on_across_from_key(key):
    # if pandas was more consistent we could use key[:1] instead ...
    on = key[0]
    across = key[1:]
    if len(across) == 1:
        # this is the problematic case
        across = across[0]
    return on, across
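
# Illustrative examples (hypothetical keys): with a single 'across' column,
# on_across_from_key(('phone_1', 'spk_2')) returns ('phone_1', 'spk_2');
# with two 'across' columns, on_across_from_key(('phone_1', 'spk_2', 'ctx_a'))
# returns ('phone_1', ('spk_2', 'ctx_a')).
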
def sort_and_threshold(permut, new_index, ind_type,
                       threshold=None, count_only=False):
    sorted_index = new_index[permut]
    flag = np.concatenate(
        ([True], sorted_index[1:] != sorted_index[:-1], [True]))
    unique_idx = np.nonzero(flag)[0]
    counts = unique_idx[1:] - unique_idx[:-1]

    if count_only:
        sampled = counts > threshold
        return new_index.shape[0] - np.sum(counts[sampled]) + \
            threshold * np.sum(sampled)

    new_permut = []
    i = 0
    for c in counts:
        if threshold and c > threshold:
            sampled_idx = sampler.sample_without_replacement(
                threshold, c, dtype=ind_type)
            sampled_idx += i
            new_permut.append(permut[sampled_idx])
        else:
            new_permut.append(permut[i:i+c])
        i += c
    return np.concatenate(new_permut), unique_idx

def sort_pairs(abx_file, by, memory=1000, tmpdir=None):
    """Sort the pairs of an ABX task file, in place

    abx_file: the hdf5 file generated by an ABX task
    by: the 'by' block whose pairs are to be sorted
    memory: available RAM in MB
    tmpdir: directory where to write temporary files
    """
    # estimate of the amount of data to be sorted
    with h5py.File(abx_file, 'a') as fh:
        n = fh['/pairs/' + str(by)].shape[0]
        i = fh['/pairs/' + str(by)].dtype.itemsize

    # harmonize units in KB
    memory = 1000 * memory
    amount = n * i / 1000.

    # be conservative: aim at using no more than 3/4 of the available
    # memory. If there is enough memory, take one chunk (this will do an
    # unnecessary full write and read of the file... could be optimized
    # easily, would it be beneficial to have a large o_buffer_size as well?)
    if amount <= 0.75 * memory:
        buffer_size = amount
    # else take around 30 chunks if possible (this seems efficient
    # given the current implem, using a larger number of chunks
    # efficiently might be possible if the reading chunks part of the
    # sort was cythonized ?)
    elif amount / 30. <= 0.75 * memory:
        buffer_size = amount / 30.
    # else take the minimum number of chunks possible given the
    # available RAM
    else:
        buffer_size = 0.75 * memory

    # finally sort the pairs in place
    handler = h5_handler.H5Handler(abx_file, '/pairs/', str(by))
    handler.sort(buffer_size=buffer_size, tmpdir=tmpdir)

def parse_arguments():
    """Defines and parses input arguments for the command-line API"""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='''ABX task specification

'database' is a text file with the '.item' extension containing an index
of the database and a set of labels or attributes. See input format
specifications at
http://abxpy.readthedocs.io/en/latest/FilesFormat.html#dataset

The --on, --across, --by, --filters and --regressors arguments must be
defined as columns in the database (e.g. speaker or phonemes, if your
database contains columns defining these attributes)

Example call:

  task.py ./my_data.item -o column1 -a column2 column3 -b column4 column5 \\
      -f "[attr == 0 for attr in column3_X]"

Because optional arguments can have various numbers of inputs, the
positional arguments ('database' and 'output') must either come before the
optional ones or be separated from them by '--'. Thus, those two calls are
equivalent:

  task.py ./my_data.item -o column1 -a column2 column3
  task.py -o column1 -a column2 column3 -- ./my_data.item''')

    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='output messages to the standard output')

    parser.add_argument(
        '--stats-only', action='store_true',
        help='add this flag if you only want some statistics '
        'about the specified task')

    parser.add_argument(
        '--tempdir', default=None,
        help='directory where temporary files will be stored')

    parser.add_argument(
        '--seed', default=None, type=int,
        help='seed used to initialize the pseudo-random number generator')

    # I/O files
    g1 = parser.add_argument_group('I/O files')

    g1.add_argument(
        'database', help='database item file used to form ABX triplets')

    g1.add_argument(
        'output', nargs='?', default=None,
        help='file to write the generated ABX task, '
        'defaults to the database basename with the \'.abx\' extension')

    # Task specification
    g2 = parser.add_argument_group('Task specification')

    g2.add_argument(
        '-o', '--on', required=True, help='ON attribute')

    g2.add_argument(
        '-a', '--across', nargs='+', default=[], help='ACROSS attributes')

    g2.add_argument(
        '-b', '--by', nargs='+', default=[], help='BY attributes')

    g2.add_argument(
        '-f', '--filters', nargs='+', default=[],
        help='filters specification')

    g2.add_argument(
        '-r', '--regressors', nargs='+', default=[],
        help='regressors specification')

    g2.add_argument(
        '-t', '--threshold', default=None, type=int,
        help='threshold on the maximal size of a block of'
        ' triplets sharing the same regressors')

    return parser.parse_args()

def main():
    """Command-line API for generating ABX tasks"""
    # get arguments from command line
    args = parse_arguments()

    # checks on the output file
    # if args.stats_only:
    #     assert args.output, "The output file was not provided"
    if args.output and os.path.exists(args.output):
        warnings.warn("Overwriting task file " + args.output, UserWarning)
        os.remove(args.output)

    # initialize the task
    task = Task(
        args.database, args.on, across=args.across, by=args.by,
        filters=args.filters, regressors=args.regressors,
        verbose=args.verbose)

    if args.stats_only:
        task.print_stats()
    else:
        if args.tempdir and not os.path.exists(args.tempdir):
            os.makedirs(args.tempdir)

        # generate triplets and unique pairs
        task.generate_triplets(
            output=args.output, threshold=args.threshold,
            tmpdir=args.tempdir, seed=args.seed)

if __name__ == '__main__':
    main()