From 18523a85c1a5a8dd676125056532c9b83e1dbe43 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Thu, 31 Jul 2025 12:15:44 -0500 Subject: [PATCH 1/2] add cache option as default --- .pylintrc | 2 +- get_files_on_disk.py | 46 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/.pylintrc b/.pylintrc index 524fae9..2a9f7a0 100644 --- a/.pylintrc +++ b/.pylintrc @@ -123,7 +123,7 @@ no-docstring-rgx=__.*__ [FORMAT] # Maximum number of characters on a single line. -max-line-length=130 +max-line-length=150 # Maximum number of lines in a module max-module-lines=1000 diff --git a/get_files_on_disk.py b/get_files_on_disk.py index 5b6c25d..41d5dca 100755 --- a/get_files_on_disk.py +++ b/get_files_on_disk.py @@ -62,11 +62,48 @@ def sitecond(site): sys.path.pop(0) return filelist, sitelist -def main(dataset, user, outfile=None, verbose=False, allow=None, block=None): +def getCache(dataset, verbose=False): + """Gets cached file lists from cvmfs for pileup samples""" + filelist = None + cache_dir = "/cvmfs/cms.cern.ch/offcomp-prod/premixPUlist/" + cache_map_file = "pileup_mapping.txt" + cache_map_path = os.path.join(cache_dir, cache_map_file) + if os.path.isfile(cache_map_path): + cache_map = {} + with open(cache_map_path, 'r') as mapfile: # pylint: disable=unspecified-encoding + for line in mapfile: + line = line.rstrip() + linesplit = line.split() + if len(linesplit)==2: + cache_map[linesplit[0]] = linesplit[1] + + if dataset in cache_map: + cache_file = cache_map[dataset] + cache_file_path = os.path.join(cache_dir, cache_file) + if verbose: + print(f"Loading from cache: {cache_file_path}") + with open(cache_file_path, 'r') as cfile: # pylint: disable=unspecified-encoding + filelist = [line.rstrip() for line in cfile] + + return filelist + +def main(dataset, user, outfile=None, verbose=False, allow=None, block=None, cache=True): """Prints file list and site list""" - filelist, sitelist = getHosted(dataset, user, allow=allow, block=block) + filelist = None + sitelist = None - if verbose: + if cache: + if not allow and not block: + filelist = getCache(dataset, verbose) + # cache does not consider allow or block lists, so disable if they are requested + else: + if verbose: + print("Disabling cache because allow and/or block lists are specified") + + if not filelist: + filelist, sitelist = getHosted(dataset, user, allow=allow, block=block) + + if verbose and sitelist: print("Site list:") print("\n".join(f'{k}: {v}' for k,v in sitelist.items())) @@ -86,7 +123,8 @@ def main(dataset, user, outfile=None, verbose=False, allow=None, block=None): parser.add_argument("-o","--outfile",type=str,default=None,help="write to this file instead of stdout") parser.add_argument("-u","--user",type=str,default=default_user,help="username for rucio") parser.add_argument("-v","--verbose",default=False,action="store_true",help="print extra information (site list)") + parser.add_argument("--no-cache",default=False,action="store_true",help="do not use cached file lists from cvmfs") parser.add_argument("dataset",type=str,help="dataset to query") args = parser.parse_args() - main(args.dataset, args.user, outfile=args.outfile, verbose=args.verbose, allow=args.allow, block=args.block) + main(args.dataset, args.user, outfile=args.outfile, verbose=args.verbose, allow=args.allow, block=args.block, cache=not args.no_cache) From 356fc1d8d2c55b7619ebf3980e31a927a6c232fc Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Fri, 15 Aug 2025 16:24:30 -0500 Subject: [PATCH 2/2] add get_files_on_disk to readme --- README.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/README.md b/README.md index 053643d..50a7b34 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Table of Contents * [bind_condor.sh](#bind_condorsh) * [Usage](#usage-1) * [Setting up bindings](#setting-up-bindings) +* [get_files_on_disk.py](#get_files_on_diskpy) * [tunn](#tunn) * [Detailed usage](#detailed-usage) * [Web browser usage](#web-browser-usage) @@ -214,6 +215,45 @@ In this particular case, it is necessary to upgrade `pip` because the Python ver **NOTE**: These recipes only install the bindings for Python3. (Python2 was still the default in `CMSSW_10_6_X`.) You will need to make sure any scripts using the bindings are compatible with Python3. +## `get_files_on_disk.py` + +This script automates the process of querying Rucio to find only the files in a CMS data or MC sample that are currently hosted on disk. +(The most general form of this functionality is not currently available from other CMS database tools such as `dasgoclient`.) + +There are two major use cases for this tool: +1. Finding AOD (or earlier formats such as RECO or RAW) files for testing or development. (AOD samples are not hosted on disk by default, so typically only small subsets of a sample will be transferred to disk for temporary usage.) +2. Obtaining file lists for premixed pileup samples for private MC production. (Premixed pileup input samples are no longer fully hosted on disk because of resource limitations.) + +A fraction of each premixed pileup sample is subscribed to disk by the central production team, and the corresponding list of files is synced to cvmfs. +By default, this script will just copy this cached information. +This is the most stable and preferred approach, so only deviate from it if absolutely necessary. + +This script should *not* be run in batch jobs, as that can lead to an inadvertent distributed denial of service disruption of the CMS data management system. +The script will actively try to prevent you from running it in batch jobs. +Please run the script locally, before submitting your jobs, and send the resulting information as part of the job input files. + +The available options for this script are: +``` +usage: get_files_on_disk.py [-h] [-a [ALLOW ...] | -b [BLOCK ...]] [-o OUTFILE] [-u USER] [-v] [--no-cache] dataset + +Find all available files (those hosted on disk) for a given dataset + +positional arguments: + dataset dataset to query + +optional arguments: + -h, --help show this help message and exit + -a [ALLOW ...], --allow [ALLOW ...] + allow only these sites (default: None) + -b [BLOCK ...], --block [BLOCK ...] + block these sites (default: None) + -o OUTFILE, --outfile OUTFILE + write to this file instead of stdout (default: None) + -u USER, --user USER username for rucio (default: [user]) + -v, --verbose print extra information (site list) (default: False) + --no-cache do not use cached file lists from cvmfs (default: False) +``` + ## `tunn` A simple utility to create and manage SSH tunnels.