Classes | |
class | CMSHarvester |
CMSHarvester class. More... | |
class | CMSHarvesterHelpFormatter |
Helper class: CMSHarvesterHelpFormatter. More... | |
class | DBSXMLHandler |
Helper class: DBSXMLHandler. More... | |
class | Error |
Helper class: Error exception. More... | |
class | Usage |
Helper class: Usage exception. More... | |
Functions | |
def | build_dataset_ignore_list (self) |
def | build_dataset_list (self, input_method, input_name) |
def dbs_check_dataset_num_events(self, dataset_name): """Figure out the number of events in each run of this dataset. More... | |
def | build_dataset_use_list (self) |
def | build_datasets_information (self) |
def | build_runs_ignore_list (self) |
def | build_runs_list (self, input_method, input_name) |
def | build_runs_use_list (self) |
def | check_cmssw (self) |
def | check_dataset_list (self) |
def | check_dbs (self) |
def | check_globaltag (self, globaltag=None) |
def | check_globaltag_contains_ref_hist_key (self, globaltag, connect_name) |
def | check_globaltag_exists (self, globaltag, connect_name) |
def | check_input_status (self) |
def | check_ref_hist_mappings (self) |
def | check_ref_hist_tag (self, tag_name) |
def | create_and_check_castor_dir (self, castor_dir) |
def | create_and_check_castor_dirs (self) |
def | create_castor_path_name_common (self, dataset_name) |
def | create_castor_path_name_special (self, dataset_name, run_number, castor_path_common) |
def | create_config_file_name (self, dataset_name, run_number) |
def | create_crab_config (self) |
def | create_es_prefer_snippet (self, dataset_name) |
def | create_harvesting_config (self, dataset_name) |
def | create_harvesting_config_file_name (self, dataset_name) |
def | create_harvesting_output_file_name (self, dataset_name, run_number) |
def | create_me_extraction_config (self, dataset_name) |
def create_harvesting_config_two_step(self, dataset_name): """Create the Python harvesting configuration for two-step harvesting. More... | |
def | create_me_summary_config_file_name (self, dataset_name) |
def | create_me_summary_output_file_name (self, dataset_name) |
def | create_multicrab_block_name (self, dataset_name, run_number, index) |
def | create_multicrab_config (self) |
def | create_output_file_name (self, dataset_name, run_number=None) |
def | dbs_check_dataset_spread (self, dataset_name) |
def dbs_resolve_dataset_number_of_sites(self, dataset_name): """Ask DBS across how many sites this dataset has been spread out. More... | |
def | dbs_resolve_cmssw_version (self, dataset_name) |
def | dbs_resolve_dataset_name (self, dataset_name) |
def | dbs_resolve_datatype (self, dataset_name) |
def | dbs_resolve_globaltag (self, dataset_name) |
def | dbs_resolve_number_of_events (self, dataset_name, run_number=None) |
def | dbs_resolve_runs (self, dataset_name) |
def dbs_resolve_dataset_number_of_events(self, dataset_name): """Ask DBS across how many events this dataset has been spread out. More... | |
def | escape_dataset_name (self, dataset_name) |
def | load_ref_hist_mappings (self) |
def | option_handler_caf_access (self, option, opt_str, value, parser) |
def | option_handler_castor_dir (self, option, opt_str, value, parser) |
def option_handler_dataset_name(self, option, opt_str, value, parser): """Specify the name(s) of the dataset(s) to be processed. More... | |
def | option_handler_crab_submission (self, option, opt_str, value, parser) |
def | option_handler_list_types (self, option, opt_str, value, parser) |
def | option_handler_no_t1access (self, option, opt_str, value, parser) |
def | option_handler_preferred_site (self, option, opt_str, value, parser) |
def | option_handler_saveByLumiSection (self, option, opt_str, value, parser) |
def | option_handler_sites (self, option, opt_str, value, parser) |
def | parse_cmd_line_options (self) |
def | pick_a_site (self, sites, cmssw_version) |
def | process_dataset_ignore_list (self) |
def | process_runs_use_and_ignore_lists (self) |
def | ref_hist_mappings_needed (self, dataset_name=None) |
def | run (self) |
def | setup_dbs (self) |
def | setup_harvesting_info (self) |
def | show_exit_message (self) |
def | singlify_datasets (self) |
def | write_crab_config (self) |
def create_harvesting_config(self, dataset_name): """Create the Python harvesting configuration for a given job. More... | |
def | write_harvesting_config (self, dataset_name) |
def | write_me_extraction_config (self, dataset_name) |
def | write_multicrab_config (self) |
def cmsHarvester.build_dataset_ignore_list | ( | self | ) |
Build a list of datasets to ignore. NOTE: We should always have a list of datasets to process, but it may be that we don't have a list of datasets to ignore.
Definition at line 3444 of file cmsHarvester.py.
References info().
def cmsHarvester.build_dataset_list | ( | self, | |
input_method, | |||
input_name | |||
) |
def dbs_check_dataset_num_events(self, dataset_name): """Figure out the number of events in each run of this dataset.
This is a more efficient way of doing this than calling dbs_resolve_number_of_events for each run.
"""
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find run.number, file.name, file.numevents where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: msg = "ERROR: Could not execute DBS query" self.logger.fatal(msg) raise Error(msg) try: files_info = {} class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": run_number = int(attrs["RUNS_RUNNUMBER"]) file_name = str(attrs["FILES_LOGICALFILENAME"]) nevents = int(attrs["FILES_NUMBEROFEVENTS"]) try: files_info[run_number][file_name] = nevents except KeyError: files_info[run_number] = {file_name: nevents} xml.sax.parseString(api_result, Handler()) except SAXParseException: msg = "ERROR: Could not parse DBS server output" self.logger.fatal(msg) raise Error(msg) num_events_catalog = {} for run_number in files_info.keys(): num_events_catalog[run_number] = sum(files_info[run_number].values())
return num_events_catalog End of old version.
Build a list of all datasets to be processed.
Definition at line 3358 of file cmsHarvester.py.
References dbs_resolve_dataset_name(), info(), and edm.print().
def cmsHarvester.build_dataset_use_list | ( | self | ) |
Build a list of datasets to process.
Definition at line 3421 of file cmsHarvester.py.
References info(), list(), and ComparisonHelper.zip().
def cmsHarvester.build_datasets_information | ( | self | ) |
Obtain all information on the datasets that we need to run. Use DBS to figure out all required information on our datasets, like the run numbers and the GlobalTag. All information is stored in the datasets_information member variable.
Definition at line 5321 of file cmsHarvester.py.
def cmsHarvester.build_runs_ignore_list | ( | self | ) |
Build a list of runs to ignore. NOTE: We should always have a list of runs to process, but it may be that we don't have a list of runs to ignore.
Definition at line 3542 of file cmsHarvester.py.
References info().
def cmsHarvester.build_runs_list | ( | self, | |
input_method, | |||
input_name | |||
) |
def cmsHarvester.build_runs_use_list | ( | self | ) |
Build a list of runs to process.
Definition at line 3521 of file cmsHarvester.py.
References info().
def cmsHarvester.check_cmssw | ( | self | ) |
def cmsHarvester.check_dataset_list | ( | self | ) |
Check list of dataset names for impossible ones. Two kinds of checks are done: - Checks for things that do not make sense. These lead to errors and skipped datasets. - Sanity checks. For these warnings are issued but the user is considered to be the authoritative expert. Checks performed: - The CMSSW version encoded in the dataset name should match self.cmssw_version. This is critical. - There should be some events in the dataset/run. This is critical in the sense that CRAB refuses to create jobs for zero events. And yes, this does happen in practice. E.g. the reprocessed CRAFT08 datasets contain runs with zero events. - A cursory check is performed to see if the harvesting type makes sense for the data type. This should prevent the user from inadvertently running RelVal for data. - It is not possible to run single-step harvesting jobs on samples that are not fully contained at a single site. - Each dataset/run has to be available at at least one site.
Definition at line 3795 of file cmsHarvester.py.
References info(), relativeConstraints.keys, and MessageLogger_cfi.warning.
def cmsHarvester.check_dbs | ( | self | ) |
def cmsHarvester.check_globaltag | ( | self, | |
globaltag = None |
|||
) |
Check if globaltag exists. Check if globaltag exists as GlobalTag in the database given by self.frontier_connection_name['globaltag']. If globaltag is None, self.globaltag is used instead. If we're going to use reference histograms this method also checks for the existence of the required key in the GlobalTag.
Definition at line 4501 of file cmsHarvester.py.
def cmsHarvester.check_globaltag_contains_ref_hist_key | ( | self, | |
globaltag, | |||
connect_name | |||
) |
Check if globaltag contains the required RefHistos key.
Definition at line 4598 of file cmsHarvester.py.
def cmsHarvester.check_globaltag_exists | ( | self, | |
globaltag, | |||
connect_name | |||
) |
Check if globaltag exists.
Definition at line 4556 of file cmsHarvester.py.
References debug, info(), and cms::dd.split().
def cmsHarvester.check_input_status | ( | self | ) |
Check completeness and correctness of input information. Check that all required information has been specified and that, at least as far as can be easily checked, it makes sense. NOTE: This is also where any default values are applied.
Definition at line 2193 of file cmsHarvester.py.
def cmsHarvester.check_ref_hist_mappings | ( | self | ) |
Make sure all necessary reference histograms exist. Check that for each of the datasets to be processed a reference histogram is specified and that that histogram exists in the database. NOTE: There's a little complication here. Since this whole thing was designed to allow (in principle) harvesting of both data and MC datasets in one go, we need to be careful to check the availability of reference mappings only for those datasets that need it.
Definition at line 5281 of file cmsHarvester.py.
References info().
def cmsHarvester.check_ref_hist_tag | ( | self, | |
tag_name | |||
) |
Check the existence of tag_name in database connect_name. Check if tag_name exists as a reference histogram tag in the database given by self.frontier_connection_name['refhists'].
Definition at line 4643 of file cmsHarvester.py.
def cmsHarvester.create_and_check_castor_dir | ( | self, | |
castor_dir | |||
) |
Check existence of the given CASTOR dir, if necessary create it. Some special care has to be taken with several things like setting the correct permissions such that CRAB can store the output results. Of course this means that things like /castor/cern.ch/ and user/j/ have to be recognised and treated properly. NOTE: Only CERN CASTOR area (/castor/cern.ch/) supported for the moment. NOTE: This method uses some slightly tricky caching to make sure we don't keep checking the same base paths over and over.
Definition at line 1491 of file cmsHarvester.py.
References debug, and spr.find().
def cmsHarvester.create_and_check_castor_dirs | ( | self | ) |
Make sure all required CASTOR output dirs exist. This checks the CASTOR base dir specified by the user as well as all the subdirs required by the current set of jobs.
Definition at line 1432 of file cmsHarvester.py.
References debug, info(), SiStripPI.max, and MessageLogger_cfi.warning.
def cmsHarvester.create_castor_path_name_common | ( | self, | |
dataset_name | |||
) |
Build the common part of the output path to be used on CASTOR. This consists of the CASTOR area base path specified by the user and a piece depending on the data type (data vs. MC), the harvesting type and the dataset name followed by a piece containing the run number and event count. (See comments in create_castor_path_name_special for details.) This method creates the common part, without run number and event count.
Definition at line 1328 of file cmsHarvester.py.
References python.rootplot.root2matplotlib.replace(), and digitizers_cfi.strip.
def cmsHarvester.create_castor_path_name_special | ( | self, | |
dataset_name, | |||
run_number, | |||
castor_path_common | |||
) |
Create the specialised part of the CASTOR output dir name. NOTE: To avoid clashes with `incremental harvesting' (re-harvesting when a dataset grows) we have to include the event count in the path name. The underlying `problem' is that CRAB does not overwrite existing output files so if the output file already exists CRAB will fail to copy back the output. NOTE: It's not possible to create different kinds of harvesting jobs in a single call to this tool. However, in principle it could be possible to create both data and MC jobs in a single go. NOTE: The number of events used in the path name is the _total_ number of events in the dataset/run at the time of harvesting. If we're doing partial harvesting the final results will reflect lower statistics. This is a) the easiest to code and b) the least likely to lead to confusion if someone ever decides to swap/copy around file blocks between sites.
Definition at line 1382 of file cmsHarvester.py.
def cmsHarvester.create_config_file_name | ( | self, | |
dataset_name, | |||
run_number | |||
) |
Generate the name of the configuration file to be run by CRAB. Depending on the harvesting mode (single-step or two-step) this is the name of the real harvesting configuration or the name of the first-step ME summary extraction configuration.
Definition at line 4065 of file cmsHarvester.py.
Referenced by create_multicrab_config().
def cmsHarvester.create_crab_config | ( | self | ) |
Create a CRAB configuration for a given job. NOTE: This is _not_ a complete (as in: submittable) CRAB configuration. It is used to store the common settings for the multicrab configuration. NOTE: Only CERN CASTOR area (/castor/cern.ch/) is supported. NOTE: According to CRAB, you `Must define exactly two of total_number_of_events, events_per_job, or number_of_jobs.'. For single-step harvesting we force one job, for the rest we don't really care. # BUG BUG BUG # With the current version of CRAB (2.6.1), in which Daniele # fixed the behaviour of no_block_boundary for me, one _has to # specify_ the total_number_of_events and one single site in # the se_white_list. # BUG BUG BUG end
Definition at line 4233 of file cmsHarvester.py.
References spr.find(), and join().
def cmsHarvester.create_es_prefer_snippet | ( | self, | |
dataset_name | |||
) |
Build the es_prefer snippet for the reference histograms. The building of the snippet is wrapped in some care-taking code that figures out the name of the reference histogram set and makes sure the corresponding tag exists.
Definition at line 4689 of file cmsHarvester.py.
References join().
def cmsHarvester.create_harvesting_config | ( | self, | |
dataset_name | |||
) |
Create the Python harvesting configuration for harvesting. The basic configuration is created by Configuration.PyReleaseValidation.ConfigBuilder. (This mimics what cmsDriver.py does.) After that we add some specials ourselves. NOTE: On one hand it may not be nice to circumvent cmsDriver.py, on the other hand cmsDriver.py does not really do anything itself. All the real work is done by the ConfigBuilder so there is not much risk that we miss out on essential developments of cmsDriver in the future.
Definition at line 4724 of file cmsHarvester.py.
References join().
def cmsHarvester.create_harvesting_config_file_name | ( | self, | |
dataset_name | |||
) |
def cmsHarvester.create_harvesting_output_file_name | ( | self, | |
dataset_name, | |||
run_number | |||
) |
Generate the name to be used for the harvesting output file. This harvesting output file is the _final_ ROOT output file containing the harvesting results. In case of two-step harvesting there is an intermediate ME output file as well.
Definition at line 4169 of file cmsHarvester.py.
References spr.find().
def cmsHarvester.create_me_extraction_config | ( | self, | |
dataset_name | |||
) |
def create_harvesting_config_two_step(self, dataset_name): """Create the Python harvesting configuration for two-step harvesting.
"""
config_contents = self.create_harvesting_config_single_step(dataset_name)
return config_contents
Definition at line 4950 of file cmsHarvester.py.
References create_output_file_name(), and join().
def cmsHarvester.create_me_summary_config_file_name | ( | self, | |
dataset_name | |||
) |
def cmsHarvester.create_me_summary_output_file_name | ( | self, | |
dataset_name | |||
) |
Generate the name of the intermediate ME file to be used in two-step harvesting.
Definition at line 4201 of file cmsHarvester.py.
def cmsHarvester.create_multicrab_block_name | ( | self, | |
dataset_name, | |||
run_number, | |||
index | |||
) |
Create the block name to use for this dataset/run number. This is what appears in the brackets `[]' in multicrab.cfg. It is used as the name of the job and to create output directories.
Definition at line 4216 of file cmsHarvester.py.
def cmsHarvester.create_multicrab_config | ( | self | ) |
Create a multicrab.cfg file for all samples. This creates the contents for a multicrab.cfg file that uses the crab.cfg file (generated elsewhere) for the basic settings and contains blocks for each run of each dataset. # BUG BUG BUG # The fact that it's necessary to specify the se_white_list # and the total_number_of_events is due to our use of CRAB # version 2.6.1. This should no longer be necessary in the # future. # BUG BUG BUG end
Definition at line 4313 of file cmsHarvester.py.
References create_config_file_name(), create_output_file_name(), info(), join(), relativeConstraints.keys, edm.print(), and FastTimerService_cff.range.
def cmsHarvester.create_output_file_name | ( | self, | |
dataset_name, | |||
run_number = None |
|||
) |
Create the name of the output file to be used. This is the name of the output file of the `first step'. In the case of single-step harvesting this is already the final harvesting output ROOT file. In the case of two-step harvesting it is the name of the intermediary ME summary file.
Definition at line 4125 of file cmsHarvester.py.
Referenced by create_me_extraction_config(), and create_multicrab_config().
def cmsHarvester.dbs_check_dataset_spread | ( | self, | |
dataset_name | |||
) |
def dbs_resolve_dataset_number_of_sites(self, dataset_name): """Ask DBS across how many sites this dataset has been spread out.
This is especially useful to check that we do not submit a job supposed to run on a complete sample that is not contained at a single site. """
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find count(site) where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: raise Error("ERROR: Could not execute DBS query") try: num_sites = [] class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": num_sites.append(str(attrs["COUNT_STORAGEELEMENT"])) xml.sax.parseString(api_result, Handler()) except SAXParseException: raise Error("ERROR: Could not parse DBS server output")
assert len(num_sites) == 1
num_sites = int(num_sites[0])
return num_sites def dbs_check_dataset_spread(self, dataset_name): """Figure out across how many sites this dataset is spread. NOTE: This is something we need to figure out per run, since we want to submit harvesting jobs per run. Basically three things can happen with a given dataset:
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find run, run.numevents, site, file.count " \ "where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: msg = "ERROR: Could not execute DBS query" self.logger.fatal(msg) raise Error(msg)
sample_info = {} try: class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": run_number = int(attrs["RUNS_RUNNUMBER"]) site_name = str(attrs["STORAGEELEMENT_SENAME"]) file_count = int(attrs["COUNT_FILES"])
event_count = int(attrs["RUNS_NUMBEROFEVENTS"])
info = (site_name, file_count, event_count) try: sample_info[run_number].append(info) except KeyError: sample_info[run_number] = [info] xml.sax.parseString(api_result, Handler()) except SAXParseException: msg = "ERROR: Could not parse DBS server output" self.logger.fatal(msg) raise Error(msg)
sites = {} for (run_number, site_info) in six.iteritems(sample_info):
unique_file_counts = set([i[1] for i in site_info]) if len(unique_file_counts) == 1:
site_names = [self.pick_a_site([i[0] for i in site_info])] nevents = [site_info[0][2]] else:
site_names = [i[0] for i in site_info] nevents = [i[2] for i in site_info] sites[run_number] = zip(site_names, nevents) self.logger.debug("Sample `%s' spread is:" % dataset_name) run_numbers = sites.keys() run_numbers.sort() for run_number in run_numbers: self.logger.debug(" run # %6d: %d sites (%s)" % \ (run_number, len(sites[run_number]), ", ".join([i[0] for i in sites[run_number]])))
return sites
def dbs_check_dataset_spread_old(self, dataset_name): """Figure out across how many sites this dataset is spread. NOTE: This is something we need to figure out per run, since we want to submit harvesting jobs per run. Basically three things can happen with a given dataset:
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find run, run.numevents, site, file.count " \ "where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: msg = "ERROR: Could not execute DBS query" self.logger.fatal(msg) raise Error(msg)
sample_info = {} try: class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": run_number = int(attrs["RUNS_RUNNUMBER"]) site_name = str(attrs["STORAGEELEMENT_SENAME"]) file_count = int(attrs["COUNT_FILES"])
event_count = int(attrs["RUNS_NUMBEROFEVENTS"])
info = (site_name, file_count, event_count) try: sample_info[run_number].append(info) except KeyError: sample_info[run_number] = [info] xml.sax.parseString(api_result, Handler()) except SAXParseException: msg = "ERROR: Could not parse DBS server output" self.logger.fatal(msg) raise Error(msg)
sites = {} for (run_number, site_info) in six.iteritems(sample_info):
unique_file_counts = set([i[1] for i in site_info]) if len(unique_file_counts) == 1:
site_names = [self.pick_a_site([i[0] for i in site_info])] nevents = [site_info[0][2]] else:
site_names = [i[0] for i in site_info] nevents = [i[2] for i in site_info] sites[run_number] = zip(site_names, nevents) self.logger.debug("Sample `%s' spread is:" % dataset_name) run_numbers = sites.keys() run_numbers.sort() for run_number in run_numbers: self.logger.debug(" run # %6d: %d site(s) (%s)" % \ (run_number, len(sites[run_number]), ", ".join([i[0] for i in sites[run_number]])))
return sites
Figure out the number of events in each run of this dataset. This is a more efficient way of doing this than calling dbs_resolve_number_of_events for each run.
Definition at line 3077 of file cmsHarvester.py.
References cms::cuda.assert(), and debug.
def cmsHarvester.dbs_resolve_cmssw_version | ( | self, | |
dataset_name | |||
) |
Ask DBS for the CMSSW version used to create this dataset.
Definition at line 2476 of file cmsHarvester.py.
References cms::cuda.assert().
def cmsHarvester.dbs_resolve_dataset_name | ( | self, | |
dataset_name | |||
) |
Use DBS to resolve a wildcarded dataset name.
Definition at line 2420 of file cmsHarvester.py.
References cms::cuda.assert(), and MessageLogger_cfi.warning.
Referenced by build_dataset_list().
def cmsHarvester.dbs_resolve_datatype | ( | self, | |
dataset_name | |||
) |
Ask DBS for the data type (data or mc) of a given dataset.
Definition at line 2683 of file cmsHarvester.py.
References cms::cuda.assert().
def cmsHarvester.dbs_resolve_globaltag | ( | self, | |
dataset_name | |||
) |
Ask DBS for the globaltag corresponding to a given dataset. # BUG BUG BUG # This does not seem to work for data datasets? E.g. for # /Cosmics/Commissioning08_CRAFT0831X_V1_311_ReReco_FromSuperPointing_v1/RAW-RECO # Probably due to the fact that the GlobalTag changed during # data taking... BUG BUG BUG end
Definition at line 2627 of file cmsHarvester.py.
References cms::cuda.assert().
def cmsHarvester.dbs_resolve_number_of_events | ( | self, | |
dataset_name, | |||
run_number = None |
|||
) |
Determine the number of events in a given dataset (and run). Ask DBS for the number of events in a dataset. If a run number is specified the number of events returned is that in that run of that dataset. If problems occur we throw an exception. # BUG BUG BUG # Since DBS does not return the number of events correctly, # neither for runs nor for whole datasets, we have to work # around that a bit... # BUG BUG BUG end
Definition at line 2736 of file cmsHarvester.py.
References cms::cuda.assert().
def cmsHarvester.dbs_resolve_runs | ( | self, | |
dataset_name | |||
) |
def dbs_resolve_dataset_number_of_events(self, dataset_name): """Ask DBS across how many events this dataset has been spread out.
This is especially useful to check that we do not submit a job supposed to run on a complete sample that is not contained at a single site. """
assert not self.dbs_api is None
api = self.dbs_api dbs_query = "find count(site) where dataset = %s " \ "and dataset.status = VALID" % \ dataset_name try: api_result = api.executeQuery(dbs_query) except DbsApiException: raise Error("ERROR: Could not execute DBS query") try: num_events = [] class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": num_events.append(str(attrs["COUNT_STORAGEELEMENT"])) xml.sax.parseString(api_result, Handler()) except SAXParseException: raise Error("ERROR: Could not parse DBS server output")
assert len(num_events) == 1
num_events = int(num_events[0])
return num_events
Ask DBS for the list of runs in a given dataset. # NOTE: This does not (yet?) skip/remove empty runs. There is # a bug in the DBS entry run.numevents (i.e. it always returns # zero) which should be fixed in the `next DBS release'. # See also: # https://savannah.cern.ch/bugs/?53452 # https://savannah.cern.ch/bugs/?53711
Definition at line 2570 of file cmsHarvester.py.
References cms::cuda.assert(), and createfilelist.int.
def cmsHarvester.escape_dataset_name | ( | self, | |
dataset_name | |||
) |
Escape a DBS dataset name. Escape a DBS dataset name such that it does not cause trouble with the file system. This means turning each `/' into `__', except for the first one which is just removed.
Definition at line 4046 of file cmsHarvester.py.
def cmsHarvester.load_ref_hist_mappings | ( | self | ) |
Load the reference histogram mappings from file. The dataset name to reference histogram name mappings are read from a text file specified in self.ref_hist_mappings_file_name.
Definition at line 5205 of file cmsHarvester.py.
References FrontierConditions_GlobalTag_cff.file, info(), relativeConstraints.keys, SiStripPI.max, and digitizers_cfi.strip.
def cmsHarvester.option_handler_caf_access | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Set the self.caf_access flag to try and create jobs that run on the CAF.
Definition at line 1104 of file cmsHarvester.py.
def cmsHarvester.option_handler_castor_dir | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
def option_handler_dataset_name(self, option, opt_str, value, parser): """Specify the name(s) of the dataset(s) to be processed.
It is checked to make sure that no dataset name or listfile names are given yet. If all is well (i.e. we still have a clean slate) the dataset name is stored for later use, otherwise a Usage exception is raised. """ if not self.input_method is None: if self.input_method == "dataset": raise Usage("Please only feed me one dataset specification") elif self.input_method == "listfile": raise Usage("Cannot specify both dataset and input list file") else: assert False, "Unknown input method `%s'" % self.input_method self.input_method = "dataset" self.input_name = value self.logger.info("Input method used: %s" % self.input_method)
def option_handler_listfile_name(self, option, opt_str, value, parser): """Specify the input list file containing datasets to be processed. It is checked to make sure that no dataset name or listfile names are given yet. If all is well (i.e. we still have a clean slate) the listfile name is stored for later use, otherwise a Usage exception is raised. """ if not self.input_method is None: if self.input_method == "listfile": raise Usage("Please only feed me one list file") elif self.input_method == "dataset": raise Usage("Cannot specify both dataset and input list file") else: assert False, "Unknown input method `%s'" % self.input_method self.input_method = "listfile" self.input_name = value self.logger.info("Input method used: %s" % self.input_method)
Specify where on CASTOR the output should go. At the moment only output to CERN CASTOR is supported. Eventually the harvested results should go into the central place for DQM on CASTOR anyway.
Definition at line 1062 of file cmsHarvester.py.
def cmsHarvester.option_handler_crab_submission | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Crab jobs are not created and "submitted automatically",
Definition at line 1132 of file cmsHarvester.py.
def cmsHarvester.option_handler_list_types | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
List all harvesting types and their mappings. This lists all implemented harvesting types with their corresponding mappings to sequence names. This had to be separated out from the help since it depends on the CMSSW version and was making things a bit of a mess. NOTE: There is no way (at least not that I could come up with) to code this in a neat generic way that can be read both by this method and by setup_harvesting_info(). Please try hard to keep these two methods in sync!
Definition at line 1154 of file cmsHarvester.py.
References edm.print().
def cmsHarvester.option_handler_no_t1access | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Set the self.no_t1access flag to try and create jobs that run without special `t1access' role.
Definition at line 1087 of file cmsHarvester.py.
def cmsHarvester.option_handler_preferred_site | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Definition at line 1148 of file cmsHarvester.py.
def cmsHarvester.option_handler_saveByLumiSection | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Set process.dqmSaver.saveByLumiSection=1 in cfg harvesting file
Definition at line 1120 of file cmsHarvester.py.
def cmsHarvester.option_handler_sites | ( | self, | |
option, | |||
opt_str, | |||
value, | |||
parser | |||
) |
Definition at line 1142 of file cmsHarvester.py.
def cmsHarvester.parse_cmd_line_options | ( | self | ) |
Definition at line 1871 of file cmsHarvester.py.
def cmsHarvester.pick_a_site | ( | self, | |
sites, | |||
cmssw_version | |||
) |
Definition at line 1707 of file cmsHarvester.py.
References debug, relativeConstraints.error, and info().
def cmsHarvester.process_dataset_ignore_list | ( | self | ) |
Update the list of datasets taking into account the ones to ignore. Both lists have been generated before from DBS and both are assumed to be unique. NOTE: The advantage of creating the ignore list from DBS (in case a regexp is given) and matching that instead of directly matching the ignore criterion against the list of datasets (to consider) built from DBS is that in the former case we're sure that all regexps are treated exactly as DBS would have done without the cmsHarvester. NOTE: This only removes complete samples. Exclusion of single runs is done by the bookkeeping. So the assumption is that a user never wants to harvest just part (i.e. n out of N runs) of a sample.
Definition at line 3566 of file cmsHarvester.py.
References debug, info(), and relativeConstraints.keys.
def cmsHarvester.process_runs_use_and_ignore_lists | ( | self | ) |
Definition at line 3613 of file cmsHarvester.py.
References info(), edm.print(), and MessageLogger_cfi.warning.
def cmsHarvester.ref_hist_mappings_needed | ( | self, | |
dataset_name = None |
|||
) |
Check if we need to load and check the reference mappings. For data the reference histograms should be taken automatically from the GlobalTag, so we don't need any mappings. For RelVals we need to know a mapping to be used in the es_prefer code snippet (different references for each of the datasets.) WARNING: This implementation is a bit convoluted.
Definition at line 5171 of file cmsHarvester.py.
References relativeConstraints.keys.
def cmsHarvester.run | ( | self | ) |
Definition at line 5522 of file cmsHarvester.py.
References info(), relativeConstraints.keys, update, and contentValuesCheck.values.
def cmsHarvester.setup_dbs | ( | self | ) |
cmd = "dbs search --query=\"find dataset where dataset = impossible"" (status, output) = commands.getstatusoutput(cmd) pdb.set_trace() if status != 0 or \ output.lower().find("unsupported api call") > -1: self.logger.fatal("It seems DBS is not setup...") self.logger.fatal(" %s returns crap:" % cmd) for line in output.split("\n"): self.logger.fatal(" %s" % line) raise Error("ERROR: DBS needs to be setup first!")
Set up the Python side of DBS. For more information see the DBS Python API documentation: https://twiki.cern.ch/twiki/bin/view/CMS/DBSApiDocumentation
Definition at line 2394 of file cmsHarvester.py.
def cmsHarvester.setup_harvesting_info | ( | self | ) |
Fill our dictionary with all info needed to understand harvesting. This depends on the CMSSW version since at some point the names and sequences were modified. NOTE: There is no way (at least not that I could come up with) to code this in a neat generic way that can be read both by this method and by option_handler_list_types(). Please try hard to keep these two methods in sync!
Definition at line 1209 of file cmsHarvester.py.
def cmsHarvester.show_exit_message | ( | self | ) |
Tell the user what to do now, after this part is done. This should provide the user with some (preferably copy-pasteable) instructions on what to do now with the setups and files that have been created.
Definition at line 5469 of file cmsHarvester.py.
References info(), and MessageLogger_cfi.warning.
def cmsHarvester.singlify_datasets | ( | self | ) |
Remove all but the largest part of all datasets. This allows us to harvest at least part of these datasets using single-step harvesting until the two-step approach works.
Definition at line 3742 of file cmsHarvester.py.
References mps_monitormerge.items, SiStripPI.max, contentValuesCheck.values, and MessageLogger_cfi.warning.
def cmsHarvester.write_crab_config | ( | self | ) |
def create_harvesting_config(self, dataset_name): """Create the Python harvesting configuration for a given job.
NOTE: The reason to have a single harvesting configuration per sample is to be able to specify the GlobalTag corresponding to each sample. Since it has been decided that (apart from the prompt reco) datasets cannot contain runs with different GlobalTags, we don't need a harvesting config per run. NOTE: This is the place where we distinguish between single-step and two-step harvesting modes (at least for the Python job configuration). """
if self.harvesting_mode == "single-step": config_contents = self.create_harvesting_config_single_step(dataset_name) elif self.harvesting_mode == "two-step": config_contents = self.create_harvesting_config_two_step(dataset_name) else:
assert False, "ERROR: unknown harvesting mode `%s'" % \ self.harvesting_mode
return config_contents
Write a CRAB job configuration Python file.
Definition at line 5047 of file cmsHarvester.py.
References FrontierConditions_GlobalTag_cff.file, and info().
def cmsHarvester.write_harvesting_config | ( | self, | |
dataset_name | |||
) |
Write a harvesting job configuration Python file. NOTE: This knows nothing about single-step or two-step harvesting. That's all taken care of by create_harvesting_config.
Definition at line 5105 of file cmsHarvester.py.
References create_harvesting_config_file_name(), debug, and FrontierConditions_GlobalTag_cff.file.
def cmsHarvester.write_me_extraction_config | ( | self, | |
dataset_name | |||
) |
Write an ME-extraction configuration Python file. This `ME-extraction' (ME = Monitoring Element) is the first step of the two-step harvesting.
Definition at line 5138 of file cmsHarvester.py.
References create_me_summary_config_file_name(), debug, and FrontierConditions_GlobalTag_cff.file.
def cmsHarvester.write_multicrab_config | ( | self | ) |
Write a multi-CRAB job configuration Python file.
Definition at line 5076 of file cmsHarvester.py.
References FrontierConditions_GlobalTag_cff.file, and info().
|
private |
Definition at line 40 of file cmsHarvester.py.
|
private |
Definition at line 39 of file cmsHarvester.py.
cmsHarvester.all_file_names |
Definition at line 3232 of file cmsHarvester.py.
cmsHarvester.all_sites_found |
Definition at line 1864 of file cmsHarvester.py.
cmsHarvester.caf_access |
Definition at line 1109 of file cmsHarvester.py.
cmsHarvester.castor_base_dir |
Definition at line 1078 of file cmsHarvester.py.
cmsHarvester.castor_path_checks_cache |
self.logger.debug("Path is now `%s'" % \ path)
Definition at line 1604 of file cmsHarvester.py.
cmsHarvester.castor_path_common |
if num_sites == 1: self.logger.info(" sample is contained at a single site") else: self.logger.info(" sample is spread across %d sites" % \ num_sites) if num_sites < 1:
self.logger.warning(" --> skipping dataset which is not " \ "hosted anywhere")
Definition at line 5453 of file cmsHarvester.py.
cmsHarvester.castor_paths |
Definition at line 5457 of file cmsHarvester.py.
cmsHarvester.cmd |
Definition at line 1633 of file cmsHarvester.py.
cmsHarvester.cmd_line_opts |
Definition at line 2170 of file cmsHarvester.py.
cmsHarvester.cmssw_version |
Definition at line 2348 of file cmsHarvester.py.
cmsHarvester.complete_sites |
site_names_ref = set(files_info[run_number].values()[0][1]) for site_names_tmp in files_info[run_number].values()[1:]: if set(site_names_tmp[1]) != site_names_ref: mirrored = False break
Definition at line 3277 of file cmsHarvester.py.
cmsHarvester.config_contents |
if self.harvesting_mode == "two-step": castor_dir = self.datasets_information[dataset_name] \ ["castor_path"][run] customisations.append("") customisations.append("# This is the second step (the real") customisations.append("# harvesting step) of a two-step") customisations.append("# harvesting procedure.")
customisations.append("import pdb")
customisations.append("import commands") customisations.append("import os") customisations.append("castor_dir = \"s"" % castor_dir) customisations.append("cmd = "rfdir s" % castor_dir") customisations.append("(status, output) = commands.getstatusoutput(cmd)") customisations.append("if status != 0:") customisations.append(" print "ERROR"") customisations.append(" raise Exception, "ERROR"") customisations.append("file_names = [os.path.join("rfio:s" % path, i) for i in output.split() if i.startswith("EDM_summary") and i.endswith(".root")]") #customisations.append("pdb.set_trace()") customisations.append("process.source.fileNames = cms.untracked.vstring(*file_names)") customisations.append("")
Definition at line 4926 of file cmsHarvester.py.
cmsHarvester.config_file_name |
pdb.set_trace() if self.datasets_information[dataset_name] \ ["mirrored"][run_number] == False: config_file_name = config_file_name.replace(".py", "_partial.py")
Definition at line 4086 of file cmsHarvester.py.
cmsHarvester.crab_submission |
Definition at line 1136 of file cmsHarvester.py.
cmsHarvester.dataset_names_after_checks |
Definition at line 4031 of file cmsHarvester.py.
cmsHarvester.dataset_names_after_checks_tmp |
Definition at line 4024 of file cmsHarvester.py.
cmsHarvester.datasets_information |
Definition at line 5341 of file cmsHarvester.py.
cmsHarvester.datasets_to_ignore |
Definition at line 3458 of file cmsHarvester.py.
cmsHarvester.datasets_to_use |
Definition at line 4040 of file cmsHarvester.py.
cmsHarvester.dbs_api |
Definition at line 2407 of file cmsHarvester.py.
cmsHarvester.empty_runs |
Definition at line 4008 of file cmsHarvester.py.
cmsHarvester.exit_code |
Definition at line 5690 of file cmsHarvester.py.
cmsHarvester.file_name |
Definition at line 3176 of file cmsHarvester.py.
cmsHarvester.files_at_site |
Definition at line 3236 of file cmsHarvester.py.
cmsHarvester.files_info |
Definition at line 3162 of file cmsHarvester.py.
cmsHarvester.files_without_sites |
Definition at line 3202 of file cmsHarvester.py.
cmsHarvester.globaltag |
Definition at line 2308 of file cmsHarvester.py.
cmsHarvester.harvesting_info |
Definition at line 1315 of file cmsHarvester.py.
cmsHarvester.harvesting_mode |
Definition at line 2217 of file cmsHarvester.py.
cmsHarvester.harvesting_type |
Definition at line 3859 of file cmsHarvester.py.
cmsHarvester.Jsonfilename |
Definition at line 3708 of file cmsHarvester.py.
cmsHarvester.Jsonlumi |
cmsHarvester.mirrored |
Definition at line 3223 of file cmsHarvester.py.
cmsHarvester.msg |
class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": site_name = str(attrs["STORAGEELEMENT_SENAME"])
Definition at line 1641 of file cmsHarvester.py.
cmsHarvester.nevents |
Definition at line 3177 of file cmsHarvester.py.
Referenced by SiStripHitEffFromCalibTree.algoAnalyze(), DTT0CalibrationRMS.analyze(), DTDigiTask.analyze(), DTResolutionAnalysisTest.beginRun(), DTLocalTriggerBaseTest.beginRun(), DTEfficiencyTest.beginRun(), DTDigiTask.dqmBeginRun(), DTNoiseAnalysisTest.dqmEndLuminosityBlock(), ZDCDigiStudy.dqmEndRun(), DTChamberEfficiencyTest.DTChamberEfficiencyTest(), DTNoiseAnalysisTest.DTNoiseAnalysisTest(), DTOccupancyTest.DTOccupancyTest(), DTOccupancyTestML.DTOccupancyTestML(), DTResolutionTest.DTResolutionTest(), DTRunConditionVarClient.DTRunConditionVarClient(), DTT0CalibrationRMS.DTT0CalibrationRMS(), gen::BaseHadronizer.generateLHE(), popcon::EcalPedestalsHandler.readPedestalTimestamp(), popcon::EcalPedestalsHandler.readPedestalTree(), DTLocalTriggerBaseTest.setConfig(), edm::IndexIntoFile.setNumberOfEvents(), TagProbeFitter.setSplitMode(), StatisticsPlots(), SummaryHisto(), DTChamberEfficiencyTest.~DTChamberEfficiencyTest(), DTDigiTask.~DTDigiTask(), DTEfficiencyTest.~DTEfficiencyTest(), DTLocalTriggerBaseTest.~DTLocalTriggerBaseTest(), DTNoiseAnalysisTest.~DTNoiseAnalysisTest(), DTResolutionAnalysisTest.~DTResolutionAnalysisTest(), and DTResolutionTest.~DTResolutionTest().
cmsHarvester.non_t1access |
Definition at line 1093 of file cmsHarvester.py.
cmsHarvester.nr_max_sites |
Definition at line 1144 of file cmsHarvester.py.
cmsHarvester.num_events_catalog |
Definition at line 3216 of file cmsHarvester.py.
cmsHarvester.num_events_dataset |
Definition at line 3986 of file cmsHarvester.py.
cmsHarvester.num_sites |
if self.datasets_information[dataset_name]["num_events"][run_number] != 0: pdb.set_trace() DEBUG DEBUG DEBUG end
Definition at line 3956 of file cmsHarvester.py.
cmsHarvester.option_parser |
Definition at line 1880 of file cmsHarvester.py.
cmsHarvester.output |
Definition at line 1634 of file cmsHarvester.py.
cmsHarvester.path |
else:
self.logger.debug(" accepting") Add piece to the path we're building. self.logger.debug("!!! Skip path piece `%s'? %s" % \ (piece, str(skip_this_path_piece))) self.logger.debug("Adding piece to path...")
Definition at line 1593 of file cmsHarvester.py.
cmsHarvester.permissions |
Definition at line 1650 of file cmsHarvester.py.
Referenced by cond::CredentialStore.updatePrincipal().
cmsHarvester.permissions_new |
Definition at line 1680 of file cmsHarvester.py.
cmsHarvester.permissions_target |
Definition at line 1674 of file cmsHarvester.py.
cmsHarvester.preferred_site |
Definition at line 1150 of file cmsHarvester.py.
cmsHarvester.ref_hist_mappings_file_name |
Definition at line 2259 of file cmsHarvester.py.
cmsHarvester.run_number |
Definition at line 3175 of file cmsHarvester.py.
cmsHarvester.runs_to_ignore |
Definition at line 3555 of file cmsHarvester.py.
cmsHarvester.runs_to_use |
Definition at line 3531 of file cmsHarvester.py.
cmsHarvester.saveByLumiSection |
Definition at line 1123 of file cmsHarvester.py.
cmsHarvester.site_names |
Definition at line 3218 of file cmsHarvester.py.
cmsHarvester.sites_with_complete_copies |
Definition at line 3234 of file cmsHarvester.py.
cmsHarvester.skip_this_path_piece |
self.logger.debug("Checking CASTOR path piece `%s'" % \ piece)
self.logger.debug("Checking `%s' against `%s'" % \ (castor_path_pieces[piece_index + check_size], castor_paths_dont_touch[check_size])) self.logger.debug(" skipping")
Definition at line 1585 of file cmsHarvester.py.
cmsHarvester.status |
Definition at line 1634 of file cmsHarvester.py.
cmsHarvester.tmp |
tmp = self.datasets_information[dataset_name] \ ["num_events"]
for dataset_name in self.datasets_to_use.keys(): self.datasets_to_use[dataset_name] = self.datasets_information[dataset_name]["runs"]
OBSOLETE OBSOLETE OBSOLETE end
Definition at line 3983 of file cmsHarvester.py.
cmsHarvester.traceback_string |
Definition at line 5715 of file cmsHarvester.py.
cmsHarvester.twiki_url |
Definition at line 43 of file cmsHarvester.py.