# Source code for WorkflowWebTools.paramsregression

#pylint: disable=too-many-locals, too-complex

"""
The :py:mod:`paramsregression` module uses neural net classifiers to predict parameters
for error handling workflows.

.. Note::

   Regression part is a work in progress. Need more data.

:author: Daniel Abercrombie <dabercro@mit.edu>
"""

import sys
import json

from sklearn.neural_network import MLPClassifier

def convert_to_dense(errors, keys=None, allerrors=None, allsites=None):
    """
    Take a dictionary of sparse matrices,
    where the sparse matrices have keys of error code and then site names,
    and return the equivalent dense matrices.

    :param dict errors: The container for sparse matrices.
    :param list keys: Keys for each of the matrices.
                      Defaults to `'good_sites'` and `'bad_sites'`
    :param list allerrors: An ordered list of all the errors to consider.
                           If both this and `allsites` are blank,
                           this function will just pull lists from the sparse matrices.
                           If only this is ``None``, the errors alone are
                           pulled from the sparse matrices.
    :param list allsites: An ordered list of all the sites to consider.
                          If only this is ``None``, the sites alone are
                          pulled from the sparse matrices.
    :returns: Container for the dense matrices, one per entry in ``keys``.
              Each matrix has one row per error and one column per site.
    :rtype: dict of lists of lists
    """
    keys = keys or ['good_sites', 'bad_sites']

    # Both blank means "derive everything"; a single ``None`` means
    # "derive just that axis" instead of crashing on ``len(None)`` below.
    derive_both = not allerrors and not allsites
    derive_errors = derive_both or allerrors is None
    derive_sites = derive_both or allsites is None

    if derive_errors or derive_sites:
        found_errors = set()
        found_sites = set()

        for status in keys:
            # ``items`` instead of ``iteritems`` works on Python 2 and 3
            for error, sites in errors[status].items():
                found_errors.add(int(error))
                found_sites.update(sites)

        if derive_errors:
            allerrors = sorted(found_errors)
        if derive_sites:
            allsites = sorted(found_sites)

    # Build the dense output
    output = {}
    for status in keys:
        sparse = errors[status]
        rows = []
        for error in allerrors:
            # Sparse matrices are keyed by the error code as a string;
            # look the row up once per error rather than once per cell.
            site_counts = sparse.get(str(error), {})
            rows.append([site_counts.get(site, 0) for site in allsites])
        output[status] = rows

    return output


def get_classifier(raw_data, parameter, **kwargs):
    """
    Fit a classifier.
    If the module is run as a script,
    just print the training and test data output.
    Otherwise, return the classifier for further use.

    Each key of ``raw_data`` is split on ``'/'`` and its second component
    is used to group samples. When run as a script, every other group
    (in sorted order) is used for training and the rest for testing;
    otherwise all samples are used for training.

    :param dict raw_data: Raw data in the form of output from
                          :py:func:`actionshistorylink.dump_json`.
    :param str parameter: The parameter to classify.
    :param kwargs: These are kwargs for the ``sklearn.neural_network.MLPClassifier``
                   that is running underneath.
    :returns: Trained classifier model
    :rtype: sklearn.neural_network.MLPClassifier
    """

    interactive = __name__ == '__main__'

    sorted_keys = sorted(raw_data)
    primary_ids = sorted(set(key.split('/')[1] for key in sorted_keys))

    # Only split samples when running interactive tests.
    # Stored as a set so the per-sample membership check below is cheap.
    training_ids = set(primary_ids[0::2] if interactive else primary_ids)

    training_data = []
    training_target = []
    testing_data = []
    testing_target = []

    # Position in this list is the integer class fed to the classifier.
    # Kept as a list (not a dict) since labels loaded from JSON may be unhashable.
    class_labels = []

    # Collect every error code and site seen in any sparse matrix,
    # so that all samples share the same dense layout.
    allerrors = set()
    allsites = set()

    for key in sorted_keys:
        for status in ['good_sites', 'bad_sites']:
            matrix = raw_data[key]['errors'][status]
            # Lists are already dense; only sparse matrices carry labels
            if isinstance(matrix, list):
                continue
            # ``items`` instead of ``iteritems`` works on Python 2 and 3
            for error, sites in matrix.items():
                allerrors.add(int(error))
                allsites.update(sites)

    allerrors = sorted(allerrors)
    allsites = sorted(allsites)

    # Prepare the data
    for key in sorted_keys:
        if key.split('/')[1] in training_ids:
            data, target = training_data, training_target
        else:
            data, target = testing_data, testing_target

        errors = raw_data[key]['errors']
        if not isinstance(errors['good_sites'], list):
            errors = convert_to_dense(errors, allerrors=allerrors, allsites=allsites)

        # Flatten both matrices into one feature vector in a single pass
        # (``sum(rows, [])`` re-copies the accumulated list for every row).
        data.append([count
                     for row in errors['good_sites'] + errors['bad_sites']
                     for count in row])

        param = raw_data[key]['parameters'].get(parameter, '')
        if param not in class_labels:
            class_labels.append(param)
        target.append(class_labels.index(param))

    classifier = MLPClassifier(**kwargs)
    classifier.fit(training_data, training_target)

    if interactive:
        # Only does the following if running an interactive test
        def print_results(data, target):
            """Print the results of predictions.

            :param list data: Errors in the format of a matrix
            :param list target: The values that the data should correspond to
            """

            if not target:
                # Nothing to predict; also avoids dividing by zero below
                print('No samples')
                return

            output = classifier.predict(data)

            right = 0

            for want, result in zip(target, output):
                if want == result:
                    status = 'RIGHT'
                    right += 1
                else:
                    status = 'WRONG'
                # Single-argument, parenthesized print runs on Python 2 and 3
                print('[%s] %i : %i -- %s : %s' %
                      (status, want, result, class_labels[want], class_labels[result]))

            print('%f (%i/%i)' % (100.0 * right/len(target), right, len(target)))

        print('\nTraining:\n')
        print_results(training_data, training_target)

        print('\nTesting:\n')
        print_results(testing_data, testing_target)

    return classifier


def main():
    """
    This is for testing.

    Reads the JSON file named by the first command line argument
    and fits a classifier for the parameter named by the optional
    second argument (``'action'`` when omitted).
    Exits with a usage message if no input file is given.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError
        sys.exit('Usage: paramsregression.py <input.json> [parameter]')

    parameter = sys.argv[2] if len(sys.argv) > 2 else 'action'

    with open(sys.argv[1], 'r') as input_file:
        raw_data = json.load(input_file)

    get_classifier(raw_data, parameter,
                   solver='lbfgs', hidden_layer_sizes=(100, 10))

# Run the interactive test only when this file is executed as a script;
# importing the module just makes the functions above available.
if __name__ == '__main__':
    main()