Classes | |
class | CMSHarvester |
CMSHarvester class. More... | |
class | CMSHarvesterHelpFormatter |
Helper class: CMSHarvesterHelpFormatter. More... | |
class | DBSXMLHandler |
Helper class: DBSXMLHandler. More... | |
class | Error |
Helper class: Error exception. More... | |
class | Usage |
Helper class: Usage exception. More... | |
Functions | |
def | build_dataset_ignore_list (self) |
def | build_dataset_list (self, input_method, input_name) |
class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": site_name = str(attrs["STORAGEELEMENT_SENAME"]) TODO TODO TODO Ugly hack to get around cases like this: $ dbs search --query="find dataset, site, file.count where dataset=/RelValQCD_Pt_3000_3500/CMSSW_3_3_0_pre1-STARTUP31X_V4-v1/GEN-SIM-RECO" Using DBS instance at: http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet Processing ... More... | |
def | build_dataset_use_list (self) |
def | build_datasets_information (self) |
def | build_runs_ignore_list (self) |
def | build_runs_list (self, input_method, input_name) |
def | build_runs_use_list (self) |
def | check_cmssw (self) |
def | check_dataset_list (self) |
def | check_dbs (self) |
def | check_globaltag (self, globaltag=None) |
CRAB More... | |
def | check_globaltag_contains_ref_hist_key (self, globaltag, connect_name) |
def | check_globaltag_exists (self, globaltag, connect_name) |
def | check_input_status (self) |
def | check_ref_hist_mappings (self) |
def | check_ref_hist_tag (self, tag_name) |
def | create_and_check_castor_dir (self, castor_dir) |
def | create_and_check_castor_dirs (self) |
def | create_castor_path_name_common (self, dataset_name) |
def | create_castor_path_name_special (self, dataset_name, run_number, castor_path_common) |
def | create_config_file_name (self, dataset_name, run_number) |
def | create_crab_config (self) |
def | create_es_prefer_snippet (self, dataset_name) |
def | create_harvesting_config (self, dataset_name) |
def | create_harvesting_config_file_name (self, dataset_name) |
Only add the alarming piece to the file name if this is a spread-out dataset. More... | |
def | create_harvesting_output_file_name (self, dataset_name, run_number) |
def | create_me_extraction_config (self, dataset_name) |
In case this file is the second step (the real harvesting step) of the two-step harvesting we have to tell it to use our local files. More... | |
def | create_me_summary_config_file_name (self, dataset_name) |
def | create_me_summary_output_file_name (self, dataset_name) |
def | create_multicrab_block_name (self, dataset_name, run_number, index) |
def | create_multicrab_config (self) |
CRAB More... | |
def | create_output_file_name (self, dataset_name, run_number=None) |
def | dbs_check_dataset_spread (self, dataset_name) |
def dbs_resolve_dataset_number_of_sites(self, dataset_name): """Ask DBS across how many sites this dataset has been spread out. More... | |
def | dbs_resolve_cmssw_version (self, dataset_name) |
def | dbs_resolve_dataset_name (self, dataset_name) |
def | dbs_resolve_datatype (self, dataset_name) |
def | dbs_resolve_globaltag (self, dataset_name) |
def | dbs_resolve_number_of_events (self, dataset_name, run_number=None) |
def | dbs_resolve_runs (self, dataset_name) |
def dbs_resolve_dataset_number_of_events(self, dataset_name): """Ask DBS across how many events this dataset has been spread out. More... | |
def | escape_dataset_name (self, dataset_name) |
if self.datasets_information[dataset_name]["num_events"][run_number] != 0: pdb.set_trace() DEBUG DEBUG DEBUG end More... | |
def | load_ref_hist_mappings (self) |
def | option_handler_caf_access (self, option, opt_str, value, parser) |
def | option_handler_castor_dir (self, option, opt_str, value, parser) |
def option_handler_dataset_name(self, option, opt_str, value, parser): """Specify the name(s) of the dataset(s) to be processed. More... | |
def | option_handler_crab_submission (self, option, opt_str, value, parser) |
def | option_handler_list_types (self, option, opt_str, value, parser) |
def | option_handler_no_t1access (self, option, opt_str, value, parser) |
def | option_handler_preferred_site (self, option, opt_str, value, parser) |
def | option_handler_saveByLumiSection (self, option, opt_str, value, parser) |
def | option_handler_sites (self, option, opt_str, value, parser) |
def | parse_cmd_line_options (self) |
def | pick_a_site (self, sites, cmssw_version) |
self.logger.debug("Checking CASTOR path piece `%s'" % \ piece) More... | |
def | process_dataset_ignore_list (self) |
def | process_runs_use_and_ignore_lists (self) |
def | ref_hist_mappings_needed (self, dataset_name=None) |
def | run (self) |
def | setup_dbs (self) |
Now we try to do a very simple DBS search. More... | |
def | setup_harvesting_info (self) |
def | show_exit_message (self) |
DEBUG DEBUG DEBUG This is probably only useful to make sure we don't muck things up, right? Figure out across how many sites this sample has been spread. More... | |
def | singlify_datasets (self) |
def | write_crab_config (self) |
def create_harvesting_config(self, dataset_name): """Create the Python harvesting configuration for a given job. More... | |
def | write_harvesting_config (self, dataset_name) |
def | write_me_extraction_config (self, dataset_name) |
def | write_multicrab_config (self) |
def cmsHarvester.build_dataset_ignore_list | ( | self | ) |
Build a list of datasets to ignore. NOTE: We should always have a list of datasets to process, but it may be that we don't have a list of datasets to ignore.
Definition at line 3442 of file cmsHarvester.py.
def cmsHarvester.build_dataset_list | ( | self, | |
input_method, | |||
input_name | |||
) |
class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": site_name = str(attrs["STORAGEELEMENT_SENAME"])
\
if len(site_name) < 1: return
run_number = int(attrs["RUNS_RUNNUMBER"]) file_name = str(attrs["FILES_LOGICALFILENAME"]) nevents = int(attrs["FILES_NUMBEROFEVENTS"])
if not files_info.has_key(run_number):
files_info[run_number] = {} files_info[run_number][file_name] = (nevents, [site_name]) elif not files_info[run_number].has_key(file_name):
files_info[run_number][file_name] = (nevents, [site_name]) else:
assert nevents == files_info[run_number][file_name][0]
files_info[run_number][file_name][1].append(site_name) OBSOLETE OBSOLETE OBSOLETE end site_names_ref = set(files_info[run_number].values()[0][1]) for site_names_tmp in files_info[run_number].values()[1:]: if set(site_names_tmp[1]) != site_names_ref: mirrored = False break def dbs_check_dataset_num_events(self, dataset_name): """Figure out the number of events in each run of this dataset. This is a more efficient way of doing this than calling dbs_resolve_number_of_events for each run. # BUG BUG BUG
""" # DEBUG DEBUG DEBUG
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find run.number, file.name, file.numevents where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: msg = "ERROR: Could not execute DBS query" self.logger.fatal(msg) raise Error(msg) try: files_info = {} class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": run_number = int(attrs["RUNS_RUNNUMBER"]) file_name = str(attrs["FILES_LOGICALFILENAME"]) nevents = int(attrs["FILES_NUMBEROFEVENTS"]) try: files_info[run_number][file_name] = nevents except KeyError: files_info[run_number] = {file_name: nevents} xml.sax.parseString(api_result, Handler()) except SAXParseException: msg = "ERROR: Could not parse DBS server output" self.logger.fatal(msg) raise Error(msg) num_events_catalog = {} for run_number in files_info.keys(): num_events_catalog[run_number] = sum(files_info[run_number].values()) # End of dbs_check_dataset_num_events. return num_events_catalog End of old version.
Build a list of all datasets to be processed.
Definition at line 3356 of file cmsHarvester.py.
References dbs_resolve_dataset_name().
def cmsHarvester.build_dataset_use_list | ( | self | ) |
Build a list of datasets to process.
Definition at line 3419 of file cmsHarvester.py.
def cmsHarvester.build_datasets_information | ( | self | ) |
Obtain all information on the datasets that we need to run. Use DBS to figure out all required information on our datasets, like the run numbers and the GlobalTag. All information is stored in the datasets_information member variable.
Definition at line 5319 of file cmsHarvester.py.
def cmsHarvester.build_runs_ignore_list | ( | self | ) |
Build a list of runs to ignore. NOTE: We should always have a list of runs to process, but it may be that we don't have a list of runs to ignore.
Definition at line 3540 of file cmsHarvester.py.
def cmsHarvester.build_runs_list | ( | self, | |
input_method, | |||
input_name | |||
) |
Definition at line 3468 of file cmsHarvester.py.
References createfilelist.int, and list().
def cmsHarvester.build_runs_use_list | ( | self | ) |
Build a list of runs to process.
Definition at line 3519 of file cmsHarvester.py.
def cmsHarvester.check_cmssw | ( | self | ) |
Check if CMSSW is setup.
Definition at line 2332 of file cmsHarvester.py.
def cmsHarvester.check_dataset_list | ( | self | ) |
Check list of dataset names for impossible ones. Two kinds of checks are done: - Checks for things that do not make sense. These lead to errors and skipped datasets. - Sanity checks. For these warnings are issued but the user is considered to be the authoritative expert. Checks performed: - The CMSSW version encoded in the dataset name should match self.cmssw_version. This is critical. - There should be some events in the dataset/run. This is critical in the sense that CRAB refuses to create jobs for zero events. And yes, this does happen in practice. E.g. the reprocessed CRAFT08 datasets contain runs with zero events. - A cursory check is performed to see if the harvesting type makes sense for the data type. This should prevent the user from inadvertently running RelVal for data. - It is not possible to run single-step harvesting jobs on samples that are not fully contained at a single site. - Each dataset/run has to be available at at least one site.
Definition at line 3793 of file cmsHarvester.py.
def cmsHarvester.check_dbs | ( | self | ) |
Check if globaltag exists. Check if globaltag exists as GlobalTag in the database given by self.frontier_connection_name['globaltag']. If globaltag is None, self.globaltag is used instead. If we're going to use reference histograms this method also checks for the existence of the required key in the GlobalTag.
Definition at line 4499 of file cmsHarvester.py.
def cmsHarvester.check_globaltag_contains_ref_hist_key | ( | self, | |
globaltag, | |||
connect_name | |||
) |
Check if globaltag contains the required RefHistos key.
Definition at line 4596 of file cmsHarvester.py.
def cmsHarvester.check_globaltag_exists | ( | self, | |
globaltag, | |||
connect_name | |||
) |
Check if globaltag exists.
Definition at line 4554 of file cmsHarvester.py.
References split.
def cmsHarvester.check_input_status | ( | self | ) |
Check completeness and correctness of input information. Check that all required information has been specified and that, at least as far as can be easily checked, it makes sense. NOTE: This is also where any default values are applied.
Definition at line 2191 of file cmsHarvester.py.
References join().
def cmsHarvester.check_ref_hist_mappings | ( | self | ) |
Make sure all necessary reference histograms exist. Check that for each of the datasets to be processed a reference histogram is specified and that that histogram exists in the database. NOTE: There's a little complication here. Since this whole thing was designed to allow (in principle) harvesting of both data and MC datasets in one go, we need to be careful to check the availability of reference mappings only for those datasets that need it.
Definition at line 5279 of file cmsHarvester.py.
def cmsHarvester.check_ref_hist_tag | ( | self, | |
tag_name | |||
) |
Check the existence of tag_name in database connect_name. Check if tag_name exists as a reference histogram tag in the database given by self.frontier_connection_name['refhists'].
Definition at line 4641 of file cmsHarvester.py.
References join().
def cmsHarvester.create_and_check_castor_dir | ( | self, | |
castor_dir | |||
) |
Check existence of the given CASTOR dir; if necessary, create it. Some special care has to be taken with several things like setting the correct permissions such that CRAB can store the output results. Of course this means that things like /castor/cern.ch/ and user/j/ have to be recognised and treated properly. NOTE: Only CERN CASTOR area (/castor/cern.ch/) supported for the moment. NOTE: This method uses some slightly tricky caching to make sure we don't keep checking the same base paths over and over.
Definition at line 1489 of file cmsHarvester.py.
References spr.find(), createfilelist.int, join(), SiStripPI.max, str, and ComparisonHelper.zip().
def cmsHarvester.create_and_check_castor_dirs | ( | self | ) |
Make sure all required CASTOR output dirs exist. This checks the CASTOR base dir specified by the user as well as all the subdirs required by the current set of jobs.
Definition at line 1430 of file cmsHarvester.py.
References SiStripPI.max.
def cmsHarvester.create_castor_path_name_common | ( | self, | |
dataset_name | |||
) |
Build the common part of the output path to be used on CASTOR. This consists of the CASTOR area base path specified by the user and a piece depending on the data type (data vs. MC), the harvesting type and the dataset name followed by a piece containing the run number and event count. (See comments in create_castor_path_name_special for details.) This method creates the common part, without run number and event count.
Definition at line 1326 of file cmsHarvester.py.
References create_castor_path_name_special(), python.rootplot.root2matplotlib.replace(), and digitizers_cfi.strip.
def cmsHarvester.create_castor_path_name_special | ( | self, | |
dataset_name, | |||
run_number, | |||
castor_path_common | |||
) |
Create the specialised part of the CASTOR output dir name. NOTE: To avoid clashes with `incremental harvesting' (re-harvesting when a dataset grows) we have to include the event count in the path name. The underlying `problem' is that CRAB does not overwrite existing output files so if the output file already exists CRAB will fail to copy back the output. NOTE: It's not possible to create different kinds of harvesting jobs in a single call to this tool. However, in principle it could be possible to create both data and MC jobs in a single go. NOTE: The number of events used in the path name is the _total_ number of events in the dataset/run at the time of harvesting. If we're doing partial harvesting the final results will reflect lower statistics. This is a) the easiest to code and b) the least likely to lead to confusion if someone ever decides to swap/copy around file blocks between sites.
Definition at line 1382 of file cmsHarvester.py.
Referenced by create_castor_path_name_common().
def cmsHarvester.create_config_file_name | ( | self, | |
dataset_name, | |||
run_number | |||
) |
Generate the name of the configuration file to be run by CRAB. Depending on the harvesting mode (single-step or two-step) this is the name of the real harvesting configuration or the name of the first-step ME summary extraction configuration.
Definition at line 4063 of file cmsHarvester.py.
def cmsHarvester.create_crab_config | ( | self | ) |
Create a CRAB configuration for a given job. NOTE: This is _not_ a complete (as in: submittable) CRAB configuration. It is used to store the common settings for the multicrab configuration. NOTE: Only CERN CASTOR area (/castor/cern.ch/) is supported. NOTE: According to CRAB, you `Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.'. For single-step harvesting we force one job, for the rest we don't really care. # BUG BUG BUG # With the current version of CRAB (2.6.1), in which Daniele # fixed the behaviour of no_block_boundary for me, one _has to # specify_ the total_number_of_events and one single site in # the se_white_list. # BUG BUG BUG end
Definition at line 4231 of file cmsHarvester.py.
References join().
def cmsHarvester.create_es_prefer_snippet | ( | self, | |
dataset_name | |||
) |
Build the es_prefer snippet for the reference histograms. The building of the snippet is wrapped in some care-taking code that figures out the name of the reference histogram set and makes sure the corresponding tag exists.
Definition at line 4687 of file cmsHarvester.py.
References join().
def cmsHarvester.create_harvesting_config | ( | self, | |
dataset_name | |||
) |
Create the Python harvesting configuration for harvesting. The basic configuration is created by Configuration.PyReleaseValidation.ConfigBuilder. (This mimics what cmsDriver.py does.) After that we add some specials ourselves. NOTE: On one hand it may not be nice to circumvent cmsDriver.py, on the other hand cmsDriver.py does not really do anything itself. All the real work is done by the ConfigBuilder so there is not much risk that we miss out on essential developments of cmsDriver in the future.
Definition at line 4722 of file cmsHarvester.py.
References join().
def cmsHarvester.create_harvesting_config_file_name | ( | self, | |
dataset_name | |||
) |
pdb.set_trace() if self.datasets_information[dataset_name] \ ["mirrored"][run_number] == False: config_file_name = config_file_name.replace(".py", "_partial.py")
Definition at line 4095 of file cmsHarvester.py.
Referenced by write_harvesting_config().
def cmsHarvester.create_harvesting_output_file_name | ( | self, | |
dataset_name, | |||
run_number | |||
) |
Generate the name to be used for the harvesting output file. This harvesting output file is the _final_ ROOT output file containing the harvesting results. In case of two-step harvesting there is an intermediate ME output file as well.
Definition at line 4167 of file cmsHarvester.py.
def cmsHarvester.create_me_extraction_config | ( | self, | |
dataset_name | |||
) |
if self.harvesting_mode == "two-step": castor_dir = self.datasets_information[dataset_name] \ ["castor_path"][run] customisations.append("") customisations.append("# This is the second step (the real") customisations.append("# harvesting step) of a two-step") customisations.append("# harvesting procedure.")
customisations.append("import pdb")
customisations.append("import commands") customisations.append("import os") customisations.append("castor_dir = \"s"" % castor_dir) customisations.append("cmd = "rfdir s" % castor_dir") customisations.append("(status, output) = commands.getstatusoutput(cmd)") customisations.append("if status != 0:") customisations.append(" print "ERROR"") customisations.append(" raise Exception, "ERROR"") customisations.append("file_names = [os.path.join("rfio:s" % path, i) for i in output.split() if i.startswith("EDM_summary") and i.endswith(".root")]") #customisations.append("pdb.set_trace()") customisations.append("process.source.fileNames = cms.untracked.vstring(*file_names)") customisations.append("") ########## def create_harvesting_config_two_step(self, dataset_name): """Create the Python harvesting configuration for two-step harvesting. """ # BUG BUG BUG config_contents = self.create_harvesting_config_single_step(dataset_name)
return config_contents
Definition at line 4948 of file cmsHarvester.py.
References create_output_file_name(), and join().
def cmsHarvester.create_me_summary_config_file_name | ( | self, | |
dataset_name | |||
) |
Definition at line 4109 of file cmsHarvester.py.
Referenced by write_me_extraction_config().
def cmsHarvester.create_me_summary_output_file_name | ( | self, | |
dataset_name | |||
) |
Generate the name of the intermediate ME file name to be used in two-step harvesting.
Definition at line 4199 of file cmsHarvester.py.
def cmsHarvester.create_multicrab_block_name | ( | self, | |
dataset_name, | |||
run_number, | |||
index | |||
) |
Create the block name to use for this dataset/run number. This is what appears in the brackets `[]' in multicrab.cfg. It is used as the name of the job and to create output directories.
Definition at line 4214 of file cmsHarvester.py.
def cmsHarvester.create_multicrab_config | ( | self | ) |
Create a multicrab.cfg file for all samples. This creates the contents for a multicrab.cfg file that uses the crab.cfg file (generated elsewhere) for the basic settings and contains blocks for each run of each dataset. # BUG BUG BUG # The fact that it's necessary to specify the se_white_list # and the total_number_of_events is due to our use of CRAB # version 2.6.1. This should no longer be necessary in the # future. # BUG BUG BUG end
Definition at line 4311 of file cmsHarvester.py.
Create the name of the output file name to be used. This is the name of the output file of the `first step'. In the case of single-step harvesting this is already the final harvesting output ROOT file. In the case of two-step harvesting it is the name of the intermediary ME summary file.
Definition at line 4123 of file cmsHarvester.py.
Referenced by create_me_extraction_config().
def cmsHarvester.dbs_check_dataset_spread | ( | self, | |
dataset_name | |||
) |
def dbs_resolve_dataset_number_of_sites(self, dataset_name): """Ask DBS across how many sites this dataset has been spread out.
This is especially useful to check that we do not submit a job supposed to run on a complete sample that is not contained at a single site. """ # DEBUG DEBUG DEBUG
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find count(site) where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: raise Error("ERROR: Could not execute DBS query") try: num_sites = [] class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": num_sites.append(str(attrs["COUNT_STORAGEELEMENT"])) xml.sax.parseString(api_result, Handler()) except SAXParseException: raise Error("ERROR: Could not parse DBS server output") # DEBUG DEBUG DEBUG assert len(num_sites) == 1
num_sites = int(num_sites[0]) # End of dbs_resolve_dataset_number_of_sites. return num_sites def dbs_check_dataset_spread(self, dataset_name): """Figure out across how many sites this dataset is spread. NOTE: This is something we need to figure out per run, since we want to submit harvesting jobs per run. Basically three things can happen with a given dataset:
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find run, run.numevents, site, file.count " \ "where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: msg = "ERROR: Could not execute DBS query" self.logger.fatal(msg) raise Error(msg) # Index things by run number. No cross-check is done to make
sample_info = {} try: class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": run_number = int(attrs["RUNS_RUNNUMBER"]) site_name = str(attrs["STORAGEELEMENT_SENAME"]) file_count = int(attrs["COUNT_FILES"])
event_count = int(attrs["RUNS_NUMBEROFEVENTS"])
info = (site_name, file_count, event_count) try: sample_info[run_number].append(info) except KeyError: sample_info[run_number] = [info] xml.sax.parseString(api_result, Handler()) except SAXParseException: msg = "ERROR: Could not parse DBS server output" self.logger.fatal(msg) raise Error(msg) # Now translate this into a slightly more usable mapping. sites = {} for (run_number, site_info) in six.iteritems(sample_info):
unique_file_counts = set([i[1] for i in site_info]) if len(unique_file_counts) == 1:
site_names = [self.pick_a_site([i[0] for i in site_info])] nevents = [site_info[0][2]] else:
site_names = [i[0] for i in site_info] nevents = [i[2] for i in site_info] sites[run_number] = zip(site_names, nevents) self.logger.debug("Sample `%s' spread is:" % dataset_name) run_numbers = sites.keys() run_numbers.sort() for run_number in run_numbers: self.logger.debug(" run # %6d: %d sites (%s)" % \ (run_number, len(sites[run_number]), ", ".join([i[0] for i in sites[run_number]]))) # End of dbs_check_dataset_spread. return sites # DEBUG DEBUG DEBUG
def dbs_check_dataset_spread_old(self, dataset_name): """Figure out across how many sites this dataset is spread. NOTE: This is something we need to figure out per run, since we want to submit harvesting jobs per run. Basically three things can happen with a given dataset:
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find run, run.numevents, site, file.count " \ "where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: msg = "ERROR: Could not execute DBS query" self.logger.fatal(msg) raise Error(msg) # Index things by run number. No cross-check is done to make
sample_info = {} try: class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": run_number = int(attrs["RUNS_RUNNUMBER"]) site_name = str(attrs["STORAGEELEMENT_SENAME"]) file_count = int(attrs["COUNT_FILES"])
event_count = int(attrs["RUNS_NUMBEROFEVENTS"])
info = (site_name, file_count, event_count) try: sample_info[run_number].append(info) except KeyError: sample_info[run_number] = [info] xml.sax.parseString(api_result, Handler()) except SAXParseException: msg = "ERROR: Could not parse DBS server output" self.logger.fatal(msg) raise Error(msg) # Now translate this into a slightly more usable mapping. sites = {} for (run_number, site_info) in six.iteritems(sample_info):
unique_file_counts = set([i[1] for i in site_info]) if len(unique_file_counts) == 1:
site_names = [self.pick_a_site([i[0] for i in site_info])] nevents = [site_info[0][2]] else:
site_names = [i[0] for i in site_info] nevents = [i[2] for i in site_info] sites[run_number] = zip(site_names, nevents) self.logger.debug("Sample `%s' spread is:" % dataset_name) run_numbers = sites.keys() run_numbers.sort() for run_number in run_numbers: self.logger.debug(" run # %6d: %d site(s) (%s)" % \ (run_number, len(sites[run_number]), ", ".join([i[0] for i in sites[run_number]]))) # End of dbs_check_dataset_spread_old. return sites
Figure out the number of events in each run of this dataset. This is a more efficient way of doing this than calling dbs_resolve_number_of_events for each run.
Definition at line 3075 of file cmsHarvester.py.
References mps_setup.append, createfilelist.int, mps_monitormerge.items, relativeConstraints.keys, list(), and MuonErrorMatrixValues_cff.values.
def cmsHarvester.dbs_resolve_cmssw_version | ( | self, | |
dataset_name | |||
) |
Ask DBS for the CMSSW version used to create this dataset.
Definition at line 2474 of file cmsHarvester.py.
def cmsHarvester.dbs_resolve_dataset_name | ( | self, | |
dataset_name | |||
) |
Use DBS to resolve a wildcarded dataset name.
Definition at line 2418 of file cmsHarvester.py.
Referenced by build_dataset_list().
def cmsHarvester.dbs_resolve_datatype | ( | self, | |
dataset_name | |||
) |
Ask DBS for the data type (data or mc) of a given dataset.
Definition at line 2681 of file cmsHarvester.py.
def cmsHarvester.dbs_resolve_globaltag | ( | self, | |
dataset_name | |||
) |
Ask DBS for the globaltag corresponding to a given dataset. # BUG BUG BUG # This does not seem to work for data datasets? E.g. for # /Cosmics/Commissioning08_CRAFT0831X_V1_311_ReReco_FromSuperPointing_v1/RAW-RECO # Probably due to the fact that the GlobalTag changed during # datataking... BUG BUG BUG end
Definition at line 2625 of file cmsHarvester.py.
Determine the number of events in a given dataset (and run). Ask DBS for the number of events in a dataset. If a run number is specified the number of events returned is that in that run of that dataset. If problems occur we throw an exception. # BUG BUG BUG # Since DBS does not return the number of events correctly, # neither for runs nor for whole datasets, we have to work # around that a bit... # BUG BUG BUG end
Definition at line 2734 of file cmsHarvester.py.
def cmsHarvester.dbs_resolve_runs | ( | self, | |
dataset_name | |||
) |
def dbs_resolve_dataset_number_of_events(self, dataset_name): """Ask DBS across how many events this dataset has been spread out.
This is especially useful to check that we do not submit a job supposed to run on a complete sample that is not contained at a single site. """ # DEBUG DEBUG DEBUG
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find count(site) where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: raise Error("ERROR: Could not execute DBS query") try: num_events = [] class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": num_events.append(str(attrs["COUNT_STORAGEELEMENT"])) xml.sax.parseString(api_result, Handler()) except SAXParseException: raise Error("ERROR: Could not parse DBS server output") # DEBUG DEBUG DEBUG assert len(num_events) == 1
num_events = int(num_events[0]) # End of dbs_resolve_dataset_number_of_events. return num_events
Ask DBS for the list of runs in a given dataset. # NOTE: This does not (yet?) skip/remove empty runs. There is # a bug in the DBS entry run.numevents (i.e. it always returns # zero) which should be fixed in the `next DBS release'. # See also: # https://savannah.cern.ch/bugs/?53452 # https://savannah.cern.ch/bugs/?53711
Definition at line 2568 of file cmsHarvester.py.
References createfilelist.int.
def cmsHarvester.escape_dataset_name | ( | self, | |
dataset_name | |||
) |
if self.datasets_information[dataset_name]["num_events"][run_number] != 0: pdb.set_trace() DEBUG DEBUG DEBUG end
Escape a DBS dataset name. Escape a DBS dataset name such that it does not cause trouble with the file system. This means turning each `/' into `__', except for the first one which is just removed.
Definition at line 4044 of file cmsHarvester.py.
def cmsHarvester.load_ref_hist_mappings | ( | self | ) |
Load the reference histogram mappings from file. The dataset name to reference histogram name mappings are read from a text file specified in self.ref_hist_mappings_file_name.
Definition at line 5203 of file cmsHarvester.py.
References FrontierConditions_GlobalTag_cff.file, SiStripPI.max, and digitizers_cfi.strip.
def cmsHarvester.option_handler_caf_access | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Set the self.caf_access flag to try and create jobs that run on the CAF.
Definition at line 1102 of file cmsHarvester.py.
def cmsHarvester.option_handler_castor_dir | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
def option_handler_dataset_name(self, option, opt_str, value, parser):
    """Specify the name(s) of the dataset(s) to be processed.

    It is checked to make sure that no dataset name or listfile
    names are given yet. If all is well (i.e. we still have a
    clean slate) the dataset name is stored for later use,
    otherwise a Usage exception is raised.

    Only `value' is used: it is stored in self.input_name and
    self.input_method is set to "dataset". The other arguments
    (option, opt_str, parser) are ignored here; the signature
    presumably follows the option-parser callback convention.

    Raises Usage if an input method was already chosen, and
    AssertionError if self.input_method holds an unknown value.

    """

    previous_method = self.input_method
    if previous_method is not None:
        if previous_method == "dataset":
            raise Usage("Please only feed me one dataset specification")
        if previous_method == "listfile":
            raise Usage("Cannot specify both dataset and input list file")
        # Raise explicitly instead of using `assert False': assert
        # statements are stripped under `python -O', which would let
        # a corrupted input method be overwritten silently.
        raise AssertionError("Unknown input method `%s'" % previous_method)

    self.input_method = "dataset"
    self.input_name = value
    self.logger.info("Input method used: %s" % self.input_method)

    # End of option_handler_dataset_name.

##########

def option_handler_listfile_name(self, option, opt_str, value, parser):
    """Specify the input list file containing datasets to be processed.

    It is checked to make sure that no dataset name or listfile
    names are given yet. If all is well (i.e. we still have a
    clean slate) the listfile name is stored for later use,
    otherwise a Usage exception is raised.

    Only `value' is used: it is stored in self.input_name and
    self.input_method is set to "listfile". The other arguments
    (option, opt_str, parser) are ignored here; the signature
    presumably follows the option-parser callback convention.

    Raises Usage if an input method was already chosen, and
    AssertionError if self.input_method holds an unknown value.

    """

    previous_method = self.input_method
    if previous_method is not None:
        if previous_method == "listfile":
            raise Usage("Please only feed me one list file")
        if previous_method == "dataset":
            raise Usage("Cannot specify both dataset and input list file")
        # Raise explicitly instead of using `assert False': assert
        # statements are stripped under `python -O', which would let
        # a corrupted input method be overwritten silently.
        raise AssertionError("Unknown input method `%s'" % previous_method)

    self.input_method = "listfile"
    self.input_name = value
    self.logger.info("Input method used: %s" % self.input_method)

    # End of option_handler_listfile_name.
Specify where on CASTOR the output should go. At the moment only output to CERN CASTOR is supported. Eventually the harvested results should go into the central place for DQM on CASTOR anyway.
Definition at line 1060 of file cmsHarvester.py.
def cmsHarvester.option_handler_crab_submission | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Crab jobs are not created and "submitted automatically",
Definition at line 1130 of file cmsHarvester.py.
def cmsHarvester.option_handler_list_types | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
List all harvesting types and their mappings. This lists all implemented harvesting types with their corresponding mappings to sequence names. This had to be separated out from the help since it depends on the CMSSW version and was making things a bit of a mess. NOTE: There is no way (at least not that I could come up with) to code this in a neat generic way that can be read both by this method and by setup_harvesting_info(). Please try hard to keep these two methods in sync!
Definition at line 1152 of file cmsHarvester.py.
def cmsHarvester.option_handler_no_t1access | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Set the self.non_t1access flag to try and create jobs that run without the special `t1access' role.
Definition at line 1085 of file cmsHarvester.py.
def cmsHarvester.option_handler_preferred_site | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Definition at line 1146 of file cmsHarvester.py.
def cmsHarvester.option_handler_saveByLumiSection | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Set process.dqmSaver.saveByLumiSection=1 in the harvesting cfg file.
Definition at line 1118 of file cmsHarvester.py.
def cmsHarvester.option_handler_sites | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Definition at line 1140 of file cmsHarvester.py.
def cmsHarvester.parse_cmd_line_options | ( | self | ) |
Definition at line 1869 of file cmsHarvester.py.
def cmsHarvester.pick_a_site | ( | self, | |
sites, | |||
cmssw_version | |||
) |
self.logger.debug("Checking CASTOR path piece `%s'" % \ piece)
self.logger.debug("Checking `%s' against `%s'" % \ (castor_path_pieces[piece_index + check_size], castor_paths_dont_touch[check_size])) self.logger.debug(" skipping") else:
self.logger.debug(" accepting") Add piece to the path we're building. self.logger.debug("!!! Skip path piece `%s'? %s" % \ (piece, str(skip_this_path_piece))) self.logger.debug("Adding piece to path...") self.logger.debug("Path is now `%s'" % \ path)
Definition at line 1705 of file cmsHarvester.py.
def cmsHarvester.process_dataset_ignore_list | ( | self | ) |
Update the list of datasets taking into account the ones to ignore. Both lists have been generated before from DBS and both are assumed to be unique. NOTE: The advantage of creating the ignore list from DBS (in case a regexp is given) and matching that instead of directly matching the ignore criterion against the list of datasets (to consider) built from DBS is that in the former case we're sure that all regexps are treated exactly as DBS would have done without the cmsHarvester. NOTE: This only removes complete samples. Exclusion of single runs is done by the book keeping. So the assumption is that a user never wants to harvest just part (i.e. n out of N runs) of a sample.
Definition at line 3564 of file cmsHarvester.py.
def cmsHarvester.process_runs_use_and_ignore_lists | ( | self | ) |
Definition at line 3611 of file cmsHarvester.py.
Check if we need to load and check the reference mappings. For data the reference histograms should be taken automatically from the GlobalTag, so we don't need any mappings. For RelVals we need to know a mapping to be used in the es_prefer code snippet (different references for each of the datasets.) WARNING: This implementation is a bit convoluted.
Definition at line 5169 of file cmsHarvester.py.
def cmsHarvester.run | ( | self | ) |
Definition at line 5520 of file cmsHarvester.py.
def cmsHarvester.setup_dbs | ( | self | ) |
If that works
cmd = "dbs search --query=\"find dataset where dataset = impossible"" (status, output) = commands.getstatusoutput(cmd) pdb.set_trace() if status != 0 or \ output.lower().find("unsupported api call") > -1: self.logger.fatal("It seems DBS is not setup...") self.logger.fatal(" %s returns crap:" % cmd) for line in output.split("\n"): self.logger.fatal(" %s" % line) raise Error("ERROR: DBS needs to be setup first!")
Setup the Python side of DBS. For more information see the DBS Python API documentation: https://twiki.cern.ch/twiki/bin/view/CMS/DBSApiDocumentation
Definition at line 2392 of file cmsHarvester.py.
def cmsHarvester.setup_harvesting_info | ( | self | ) |
Fill our dictionary with all info needed to understand harvesting. This depends on the CMSSW version since at some point the names and sequences were modified. NOTE: There is no way (at least not that I could come up with) to code this in a neat generic way that can be read both by this method and by option_handler_list_types(). Please try hard to keep these two methods in sync!
Definition at line 1207 of file cmsHarvester.py.
def cmsHarvester.show_exit_message | ( | self | ) |
if num_sites == 1: self.logger.info(" sample is contained at a single site") else: self.logger.info(" sample is spread across %d sites" % \ num_sites) if num_sites < 1:
self.logger.warning(" --> skipping dataset which is not " \ "hosted anywhere")
Tell the user what to do now, after this part is done. This should provide the user with some (preferably copy-pasteable) instructions on what to do now with the setups and files that have been created.
Definition at line 5467 of file cmsHarvester.py.
def cmsHarvester.singlify_datasets | ( | self | ) |
Remove all but the largest part of all datasets. This allows us to harvest at least part of these datasets using single-step harvesting until the two-step approach works.
Definition at line 3740 of file cmsHarvester.py.
References mps_monitormerge.items, SiStripPI.max, and MuonErrorMatrixValues_cff.values.
def cmsHarvester.write_crab_config | ( | self | ) |
def create_harvesting_config(self, dataset_name):
    """Create the Python harvesting configuration for a given job.

    NOTE: The reason to have a single harvesting configuration per
    sample is to be able to specify the GlobalTag corresponding to
    each sample. Since it has been decided that (apart from the
    prompt reco) datasets cannot contain runs with different
    GlobalTags, we don't need a harvesting config per run.

    NOTE: This is the place where we distinguish between
    single-step and two-step harvesting modes (at least for the
    Python job configuration).

    The dataset name is passed on unchanged to either
    create_harvesting_config_single_step() or
    create_harvesting_config_two_step(), depending on
    self.harvesting_mode, and whatever that method returns is
    returned from here.

    Raises AssertionError if self.harvesting_mode is neither
    "single-step" nor "two-step".

    """

    ###

    mode = self.harvesting_mode
    if mode == "single-step":
        config_contents = self.create_harvesting_config_single_step(dataset_name)
    elif mode == "two-step":
        config_contents = self.create_harvesting_config_two_step(dataset_name)
    else:
        # Raise explicitly instead of using `assert False': under
        # `python -O' the assert would be stripped and the return
        # below would fail with an unrelated UnboundLocalError.
        raise AssertionError("ERROR: unknown harvesting mode `%s'" % mode)

    ###

    # End of create_harvesting_config.
    return config_contents
Write a CRAB job configuration Python file.
Definition at line 5045 of file cmsHarvester.py.
References FrontierConditions_GlobalTag_cff.file.
def cmsHarvester.write_harvesting_config | ( | self, | |
dataset_name | |||
) |
Write a harvesting job configuration Python file. NOTE: This knows nothing about single-step or two-step harvesting. That's all taken care of by create_harvesting_config.
Definition at line 5103 of file cmsHarvester.py.
References create_harvesting_config_file_name(), and FrontierConditions_GlobalTag_cff.file.
def cmsHarvester.write_me_extraction_config | ( | self, | |
dataset_name | |||
) |
Write an ME-extraction configuration Python file. This `ME-extraction' (ME = Monitoring Element) is the first step of the two-step harvesting.
Definition at line 5136 of file cmsHarvester.py.
References create_me_summary_config_file_name(), and FrontierConditions_GlobalTag_cff.file.
def cmsHarvester.write_multicrab_config | ( | self | ) |
Write a multi-CRAB job configuration Python file.
Definition at line 5074 of file cmsHarvester.py.
References FrontierConditions_GlobalTag_cff.file.
cmsHarvester.caf_access |
Definition at line 1107 of file cmsHarvester.py.
cmsHarvester.castor_base_dir |
Definition at line 1076 of file cmsHarvester.py.
cmsHarvester.cmssw_version |
Definition at line 2346 of file cmsHarvester.py.
cmsHarvester.crab_submission |
Definition at line 1134 of file cmsHarvester.py.
cmsHarvester.datasets_information |
Definition at line 5339 of file cmsHarvester.py.
cmsHarvester.datasets_to_ignore |
Definition at line 3456 of file cmsHarvester.py.
cmsHarvester.datasets_to_use |
Definition at line 3430 of file cmsHarvester.py.
cmsHarvester.dbs_api |
Definition at line 2405 of file cmsHarvester.py.
cmsHarvester.globaltag |
Definition at line 2306 of file cmsHarvester.py.
cmsHarvester.harvesting_info |
Definition at line 1313 of file cmsHarvester.py.
cmsHarvester.harvesting_mode |
Definition at line 2215 of file cmsHarvester.py.
cmsHarvester.harvesting_type |
Definition at line 3857 of file cmsHarvester.py.
cmsHarvester.Jsonfilename |
Definition at line 3706 of file cmsHarvester.py.
cmsHarvester.Jsonlumi |
Definition at line 3680 of file cmsHarvester.py.
cmsHarvester.non_t1access |
Definition at line 1091 of file cmsHarvester.py.
cmsHarvester.nr_max_sites |
Definition at line 1142 of file cmsHarvester.py.
cmsHarvester.option_parser |
Definition at line 1878 of file cmsHarvester.py.
cmsHarvester.preferred_site |
Definition at line 1148 of file cmsHarvester.py.
cmsHarvester.ref_hist_mappings_file_name |
Definition at line 2257 of file cmsHarvester.py.
cmsHarvester.runs_to_ignore |
Definition at line 3553 of file cmsHarvester.py.
cmsHarvester.runs_to_use |
Definition at line 3529 of file cmsHarvester.py.
cmsHarvester.saveByLumiSection |
Definition at line 1121 of file cmsHarvester.py.