Source code for dynamo_consistency.checkphedex

# pylint: disable=import-error

"""
A module that provides functions to check the comparison results against
the list of files and deletions in PhEDEx.

:author: Daniel Abercrombie <dabercro@mit.edu>
"""

import time
import logging

from common.interface.mysql import MySQL

from CMSToolBox.webtools import get_json
from . import config
from . import datatypes
from . import cache_tree

LOG = logging.getLogger(__name__)

def set_of_deletions(site):
    """
    Get the set of datasets with approved deletion requests at a given site
    that were created within the number of days matching the
    **IgnoreAge** configuration parameter.
    This request is done via the PhEDEx ``deleterequests`` API.

    :param str site: The site that we want the list of deletion requests for.
    :returns: Datasets that are in deletion requests.
              The set is empty if PhEDEx gives back an empty response.
    :rtype: set
    """

    # IgnoreAge is given in days (default 0); convert it to the earliest
    # creation time, in epoch seconds, of the requests we ask for
    created_since = int(
        time.time() -
        float(config.config_dict().get('IgnoreAge', 0)) * 24 * 3600)

    # Get deletion requests in PhEDEx
    deletion_request = get_json(
        'cmsweb.cern.ch', '/phedex/datasvc/json/prod/deleterequests',
        {'node': site, 'approval': 'approved', 'create_since': created_since},
        use_https=True)

    datasets_for_deletion = set()

    # An empty response is treated as "no deletion requests"
    if not deletion_request:
        return datasets_for_deletion

    # Each request lists blocks and whole datasets separately.
    # Collect both in one pass directly into the set, instead of flattening
    # with sum(..., []), which re-copies the accumulated list at every step.
    for request in deletion_request['phedex']['request']:
        dbs = request['data']['dbs']

        for block in dbs['block']:
            # Only the part of the block name before '#' is kept
            datasets_for_deletion.add(block['name'].split('#')[0])

        for dataset in dbs['dataset']:
            datasets_for_deletion.add(dataset['name'])

    return datasets_for_deletion
@cache_tree('InventoryAge', 'phedexlisting')
def get_phedex_tree(site):
    """
    Build the tree of files that PhEDEx lists for a site.
    Uses the InventoryAge configuration to determine when to refresh cache.

    :param str site: The site to get information from PhEDEx for.
    :returns: A tree containing file replicas that are supposed to be at the site
    :rtype: dynamo_consistency.datatypes.DirectoryInfo
    """

    tree = datatypes.DirectoryInfo('/store')
    valid_list = config.config_dict().get('DirectoryList', [])

    # The names of the datasets with replicas at the site come from the dynamo database
    sql = MySQL(config_file='/etc/my.cnf', db='dynamo', config_group='mysql-dynamo')
    datasets = sql.query(
        'SELECT datasets.name '
        'FROM sites INNER JOIN dataset_replicas INNER JOIN datasets '
        'WHERE dataset_replicas.dataset_id=datasets.id AND '
        'dataset_replicas.site_id=sites.id and sites.name=%s',
        site)

    def prefix_of(name):
        """
        :param str name: A dataset name
        :returns: The first three characters of the first part of the name
        :rtype: str
        """
        return name.split('/')[1][:3]

    def add_files(dataset, retries):
        """
        Adds the files PhEDEx lists for a dataset (pattern) to the tree.

        :param str dataset: Dataset to get from PhEDEx
        :param int retries: The number of times to retry PhEDEx call
        :returns: Whether or not the addition was successful
        :rtype: bool
        """

        LOG.info('Getting PhEDEx contents for %s', dataset)

        response = get_json(
            'cmsweb.cern.ch', '/phedex/datasvc/json/prod/filereplicas',
            {'node': site, 'dataset': dataset},
            retries=retries, use_https=True)

        if not response:
            LOG.warning('Bad response from PhEDEx for %s', dataset)
            return False

        n_files = 0

        for block in response['phedex']['block']:
            LOG.debug('%s', block)

            block_files = []
            for replica in block['file']:
                # Only keep files under one of the configured directories
                if replica['name'].split('/')[2] not in valid_list:
                    continue

                # Fall back on the current time when no creation time is given
                block_files.append(
                    (replica['name'], replica['bytes'],
                     int(replica['replica'][0]['time_create'] or time.time()),
                     block['name']))

            n_files += len(block_files)
            tree.add_file_list(block_files)

        LOG.info('%i files', n_files)
        return True

    # First try one wildcard request per name prefix, without retries
    failed = []
    for primary in set(prefix_of(name) for name in datasets):
        if not add_files('/%s*/*/*' % primary, 0):
            failed.append(primary)

    # Datasets under a prefix that failed are then requested individually, with retries
    for dataset in datasets:
        if prefix_of(dataset) not in failed:
            continue

        if not add_files(dataset, 5):
            LOG.critical('Cannot get %s from PhEDEx. Do not trust results...', dataset)

    return tree
def check_for_datasets(site, orphan_list_file):
    """
    Checks PhEDEx exhaustively to see if a dataset should exist at a site,
    according to PhEDEx, but has files marked as orphans according to our check.
    This is done via the PhEDEx ``filereplicas`` API.
    The number of filereplicas for each dataset is printed to the terminal.
    Datasets that contain any filereplicas are returned by this function.

    Blank lines in the orphan file are ignored.
    Lines with too few path components to build a dataset name, and datasets
    for which PhEDEx gives a bad response, are logged and skipped.

    :param str site: The name of the site to check
    :param str orphan_list_file: Path of a file listing, one per line,
                                 the LFNs that are listed as orphans at the site
    :returns: The list of number of files and datasets
              for each dataset that is supposed to have at least 1 file at the site.
    :rtype: list of tuples
    """

    datasets = set()    # Dataset names that were already looked up
    output = []

    with open(orphan_list_file) as orphans:
        for line in orphans:
            # Drop the newline so that it cannot end up inside the dataset name
            lfn = line.strip()
            if not lfn:
                continue

            split_name = lfn.split('/')
            # Elements 3 through 6 are needed to build the dataset name
            if len(split_name) < 7:
                LOG.warning('Cannot determine dataset of %s', lfn)
                continue

            dataset = '/%s/%s-%s/%s' % (split_name[4], split_name[3],
                                        split_name[6], split_name[5])

            if dataset in datasets:
                continue

            # Marked as checked before the call, so that a dataset with a bad
            # response is not requested again for each of its orphan files
            datasets.add(dataset)

            phedex_response = get_json(
                'cmsweb.cern.ch', '/phedex/datasvc/json/prod/filereplicas',
                {'node': site, 'dataset': dataset},
                use_https=True)

            if not phedex_response:
                LOG.warning('Bad response from PhEDEx for %s', dataset)
                continue

            num_files = sum(len(block['file'])
                            for block in phedex_response['phedex']['block'])

            # Formatted first so the output is the same under Python 2 and 3
            print('%s %s' % (num_files, dataset))

            if num_files:
                output.append((num_files, dataset))

    return output