# Source code for WorkflowWebTools.paramsregression

#pylint: disable=too-many-locals, too-complex

"""
The :py:mod:`paramsregression` module uses neural net classifiers to predict parameters
for error handling workflows.

.. Note::

   Regression part is a work in progress. Need more data.

:author: Daniel Abercrombie <dabercro@mit.edu>
"""

import sys
import json

from sklearn.neural_network import MLPClassifier

def convert_to_dense(errors, keys=None, allerrors=None, allsites=None):
    """
    Take a dictionary of sparse matrices, where the sparse matrices
    have keys of error code and then site names,
    and return the equivalent dense matrices.

    :param dict errors: The container for sparse matrices.
    :param list keys: Keys for each of the matrices.
                      Defaults to `'good_sites'` and `'bad_sites'`
    :param list allerrors: An ordered list of all the errors to consider.
                           If both this and `allsites` are blank,
                           this function will just pull lists from the sparse matrices.
                           If only this one is ``None``, it alone is pulled
                           from the sparse matrices.
    :param list allsites: An ordered list of all the sites to consider.
                          If only this one is ``None``, it alone is pulled
                          from the sparse matrices.
    :returns: Container for one dense matrix per key, with one row per error
              and one column per site
    :rtype: dict of lists of lists
    """

    keys = keys or ['good_sites', 'bad_sites']

    both_blank = not allerrors and not allsites

    if both_blank or allerrors is None or allsites is None:
        # Scan the sparse matrices once for every error code and site they mention
        found_errors = set()
        found_sites = set()

        for status in keys:
            # .items() rather than .iteritems() so this runs on Python 2 and 3
            for error, sites in errors[status].items():
                found_errors.add(int(error))
                found_sites.update(sites)

        # Only replace an axis the caller left unspecified; an explicitly
        # passed list (even an empty one, next to a non-empty partner) is kept.
        if both_blank or allerrors is None:
            allerrors = sorted(found_errors)
        if both_blank or allsites is None:
            allsites = sorted(found_sites)

    # Build the dense output
    output = {}

    for status in keys:
        sparse = errors[status]
        # The sparse matrices are looked up by the string form of the error code.
        # Fetch each error's row once instead of once per site.
        sparse_rows = [sparse.get(str(error), {}) for error in allerrors]
        output[status] = [[row.get(site, 0) for site in allsites]
                          for row in sparse_rows]

    return output
def get_classifier(raw_data, parameter, **kwargs):
    """
    Fit a classifier. If the module is run as a script,
    just print the training and test data output.
    Otherwise, return the classifier for further use.

    Each entry of ``raw_data`` is read through ``entry['errors']``
    (with ``'good_sites'`` and ``'bad_sites'`` matrices, sparse dicts or dense lists)
    and ``entry['parameters']``. The second ``/``-separated component of each key
    is used to group entries (presumably the primary dataset name; verify against
    the producer of the data).

    :param dict raw_data: Raw data in the form of output from
                          :py:func:`actionshistorylink.dump_json`.
    :param str parameter: The parameter to classify.
                          Entries without it are given the label ``''``.
    :param kwargs: These are kwargs for the ``sklearn.neural_network.MLPClassifier``
                   that is running underneath.
    :returns: Trained classifier model
    :rtype: sklearn.neural_network.MLPClassifier
    """

    interactive = __name__ == '__main__'

    primary_ids = sorted({key.split('/')[1] for key in raw_data})

    # Only split samples when running interactive tests:
    # every other primary id is held back for testing.
    training_ids = set(primary_ids[0::2] if interactive else primary_ids)

    training_data = []
    training_target = []
    testing_data = []
    testing_target = []

    # Position in this list is the integer class handed to the classifier
    class_labels = []

    # Prepare the data: every sparse matrix must be expanded over the same
    # errors and sites so that all feature vectors line up.
    allerrors = set()
    allsites = set()

    for entry in raw_data.values():
        for status in ['good_sites', 'bad_sites']:
            matrix = entry['errors'][status]
            # Only do this for sparse matrices
            if isinstance(matrix, list):
                continue
            # .items() rather than .iteritems() so this runs on Python 2 and 3
            for error, sites in matrix.items():
                allerrors.add(int(error))
                allsites.update(sites)

    allerrors = sorted(allerrors)
    allsites = sorted(allsites)

    for key in sorted(raw_data):
        if key.split('/')[1] in training_ids:
            data, target = training_data, training_target
        else:
            data, target = testing_data, testing_target

        errors = raw_data[key]['errors']

        if not isinstance(errors['good_sites'], list):
            errors = convert_to_dense(errors, allerrors=allerrors, allsites=allsites)

        # Flatten good rows followed by bad rows into one feature vector
        data.append([count
                     for row in errors['good_sites'] + errors['bad_sites']
                     for count in row])

        param = raw_data[key]['parameters'].get(parameter, '')

        # Labels are kept in a list (not a dict) since values are not
        # guaranteed to be hashable.
        if param not in class_labels:
            class_labels.append(param)
        target.append(class_labels.index(param))

    classifier = MLPClassifier(**kwargs)
    classifier.fit(training_data, training_target)

    if interactive:
        # Only does the following if running an interactive test

        def print_results(data, target):
            """Print the results of predictions.

            :param list data: Errors in the format of a matrix
            :param list target: The values that the data should correspond to
            """

            if not target:
                # Nothing to predict on (e.g. a single primary id leaves the
                # test split empty); avoid dividing by zero below.
                print('No samples to evaluate.')
                return

            output = classifier.predict(data)

            right = 0

            for want, result in zip(target, output):
                if want == result:
                    status = 'RIGHT'
                    right += 1
                else:
                    status = 'WRONG'

                # Single parenthesised argument: prints the same on Python 2 and 3
                print('[%s] %i : %i -- %s : %s' %
                      (status, want, result, class_labels[want], class_labels[result]))

            print('%f (%i/%i)' % (100.0 * right/len(target), right, len(target)))

        print('\nTraining:\n')
        print_results(training_data, training_target)
        print('\nTesting:\n')
        print_results(testing_data, testing_target)

    return classifier
def main():
    """
    This is for testing.

    Reads the JSON file named by the first command line argument and fits a
    classifier for the parameter named by the second argument
    (``'action'`` when no second argument is given).
    """

    arguments = sys.argv
    parameter = arguments[2] if len(arguments) > 2 else 'action'

    with open(arguments[1], 'r') as input_file:
        raw_data = json.load(input_file)

    get_classifier(raw_data, parameter,
                   solver='lbfgs', hidden_layer_sizes=(100, 10))
# Run the interactive test only when this file is executed as a script
if __name__ == '__main__':
    main()