Classes | Functions | Variables
cmsHarvester Namespace Reference

Classes

class  CMSHarvester
 CMSHarvester class. More...
 
class  CMSHarvesterHelpFormatter
 Helper class: CMSHarvesterHelpFormatter. More...
 
class  DBSXMLHandler
 Helper class: DBSXMLHandler. More...
 
class  Error
 Helper class: Error exception. More...
 
class  Usage
 Helper class: Usage exception. More...
 

Functions

def build_dataset_ignore_list
 
def build_dataset_list
 Build a list of all datasets to be processed. More...
 
def build_dataset_use_list
 
def build_datasets_information
 
def build_runs_ignore_list
 
def build_runs_list
 
def build_runs_use_list
 
def check_cmssw
 
def check_dataset_list
 
def check_dbs
 
def check_globaltag
 
def check_globaltag_contains_ref_hist_key
 
def check_globaltag_exists
 
def check_input_status
 
def check_ref_hist_mappings
 
def check_ref_hist_tag
 
def create_and_check_castor_dir
 
def create_and_check_castor_dirs
 
def create_castor_path_name_common
 
def create_castor_path_name_special
 
def create_config_file_name
 
def create_crab_config
 
def create_es_prefer_snippet
 
def create_harvesting_config
 
def create_harvesting_config_file_name
 
def create_harvesting_output_file_name
 
def create_me_extraction_config
 
More...
 
def create_me_summary_config_file_name
 
def create_me_summary_output_file_name
 
def create_multicrab_block_name
 
def create_multicrab_config
 
def create_output_file_name
 
def dbs_check_dataset_spread
 Figure out the number of events in each run of this dataset. More...
 
def dbs_resolve_cmssw_version
 
def dbs_resolve_dataset_name
 
def dbs_resolve_datatype
 
def dbs_resolve_globaltag
 
def dbs_resolve_number_of_events
 
def dbs_resolve_runs
 Ask DBS for the list of runs in a given dataset. More...
 
def escape_dataset_name
 
def load_ref_hist_mappings
 
def option_handler_caf_access
 
def option_handler_castor_dir
 Specify where on CASTOR the output should go. More...
 
def option_handler_crab_submission
 
def option_handler_list_types
 
def option_handler_no_t1access
 
def option_handler_preferred_site
 
def option_handler_saveByLumiSection
 
def option_handler_sites
 
def parse_cmd_line_options
 
def pick_a_site
 
def process_dataset_ignore_list
 
def process_runs_use_and_ignore_lists
 
def ref_hist_mappings_needed
 
def run
 
def setup_dbs
 

Setup the Python side of DBS.

More...
 
def setup_harvesting_info
 
def show_exit_message
 
def singlify_datasets
 
def write_crab_config
 Write a CRAB job configuration Python file. More...
 
def write_harvesting_config
 
def write_me_extraction_config
 
def write_multicrab_config
 

Variables

string __author__ = "Jeroen Hegeman (jeroen.hegeman@cern.ch),"
 
string __version__ = "3.8.2p1"
 File : cmsHarvest.py Authors : Jeroen Hegeman (jeroen.hegeman@cern.ch) Niklas Pietsch (niklas.pietsch@desy.de) Franseco Costanza (francesco.costanza@desy.de) Last change: 20100308. More...
 
string action = "callback"
 
list all_file_names = files_info[run_number]
 
list all_t1
 
 caf_access
 
 callback = self.option_handler_input_Jsonrunfile,
 
 castor_base_dir
 
list castor_dir = self.datasets_information[dataset_name]
 

CRAB

More...
 
tuple castor_path_common = self.create_castor_path_name_common(dataset_name)
 

DEBUG DEBUG DEBUG

This is probably only useful to make sure we don't muck

things up, right?

Figure out across how many sites this sample has been spread.

More...
 
tuple castor_paths
 
 castor_prefix = self.castor_prefix
 
string cmd = "rfstat %s"
 self.logger.debug("Path is now `%s'" % \ path) More...
 
list cmssw_version = self.datasets_information[dataset_name]
 
list complete_sites
 site_names_ref = set(files_info[run_number].values()[0][1]) for site_names_tmp in files_info[run_number].values()[1:]: if set(site_names_tmp[1]) != site_names_ref: mirrored = False break More...
 
tuple config_builder = ConfigBuilder(config_options, with_input=True)
 
 config_contents = config_builder.pythonCfgCode
 

In case this file is the second step (the real harvesting

step) of the two-step harvesting we have to tell it to use

our local files.

More...
 
tuple config_file_name = self.create_me_summary_config_file_name(dataset_name)
 

Only add the alarming piece to the file name if this is

a spread-out dataset.

More...
 
list connect_name = self.frontier_connection_name["globaltag"]
 
dictionary country_codes
 
string crab_config = "\n"
 

CRAB

More...
 
 crab_submission
 
list customisations = [""]
 
tuple dataset_name_escaped = self.escape_dataset_name(dataset_name)
 
tuple dataset_names = self.datasets_to_use.keys()
 
 dataset_names_after_checks = dataset_names_after_checks_tmp
 
tuple dataset_names_after_checks_tmp = copy.deepcopy(dataset_names_after_checks)
 
 datasets_information
 
 datasets_to_ignore
 
 datasets_to_use
 
list datatype = self.datasets_information[dataset_name]
 
 dbs_api
 
tuple empty_runs = dict(tmp)
 
tuple es_prefer_snippet = self.create_es_prefer_snippet(dataset_name)
 
int exit_code = 1
 
list file_name = handler.results["file.name"]
 
list files_at_site
 
dictionary files_info = {}
 
list files_without_sites
 
list globaltag = self.datasets_information[dataset_name]
 
 harvesting_info
 
 harvesting_mode
 
 harvesting_type
 
string help = "Jsonfile containing dictionary of run/lumisections pairs. "
 
string index = "site_%02d"
 
 Jsonfilename
 
 Jsonlumi
 
int loop = 0
 

CMSSW

More...
 
string marker = "\n"
 
list marker_lines = []
 
string metavar = "JSONRUNFILE"
 
 mirrored = None
 
string msg = "Could not create directory `%s'"
 class Handler(xml.sax.handler.ContentHandler): def startElement(self, name, attrs): if name == "result": site_name = str(attrs["STORAGEELEMENT_SENAME"])

TODO TODO TODO

Ugly hack to get around cases like this:

$ dbs search --query="find dataset, site, file.count where dataset=/RelValQCD_Pt_3000_3500/CMSSW_3_3_0_pre1-STARTUP31X_V4-v1/GEN-SIM-RECO"

Using DBS instance at: http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet

Processing ...

More...
 
tuple multicrab_block_name
 
string multicrab_config = "\n"
 
list multicrab_config_lines = []
 
tuple nevents = int(handler.results["file.numevents"][index])
 
 non_t1access
 
 nr_max_sites
 
dictionary num_events_catalog = {}
 
tuple num_events_dataset = sum(tmp)
 
tuple num_sites
 
               if self.datasets_information[dataset_name]["num_events"][run_number] != 0:
                   pdb.set_trace()

DEBUG DEBUG DEBUG end More...

 
int number_max_sites = self.nr_max_sites+1
 
 option_parser
 
 output_file_name = self.\
 
tuple path = os.path.join(path, piece)
 else:

Piece not in the list, fine.

More...
 
tuple permissions = extract_permissions(output)
 
list permissions_new = []
 
string permissions_target = "775"
 
 preferred_site
 
 ref_hist_mappings_file_name
 
tuple run_number = int(handler.results["run.number"][index])
 
list runs = self.datasets_to_use[dataset_name]
 
 runs_to_ignore
 
 runs_to_use
 
 saveByLumiSection
 
tuple se_name = choice(t1_sites)
 
string sep = "#"
 
 site_name = None
 
tuple site_names = list(set([j for i in files_info[run_number].values() for j in i[1]]))
 
list sites = [self.preferred_site]
 
list sites_forbidden = []
 
list sites_with_complete_copies = []
 
 skip_this_path_piece = True
 self.logger.debug("Checking CASTOR path piece `%s'" % \ piece) More...
 
list t1_sites = []
 
list tmp
 

TODO TODO TODO

Need to think about where this should go, but

somewhere we have to move over the fact that we want

to process all runs for each dataset that we're

considering.

More...
 
tuple traceback_string = traceback.format_exc()
 
string twiki_url = "https://twiki.cern.ch/twiki/bin/view/CMS/CmsHarvester"
 
string type = "string"
 
tuple use_es_prefer = (self.harvesting_type == "RelVal")
 
 use_refs = use_es_preferor\
 
 UserName = output
 
 workflow_name = dataset_name
 

Function Documentation

def cmsHarvester.build_dataset_ignore_list (   self)
Build a list of datasets to ignore.

NOTE: We should always have a list of datasets to process, but
it may be that we don't have a list of datasets to ignore.

Definition at line 3445 of file cmsHarvester.py.

def cmsHarvester.build_dataset_list (   self,
  input_method,
  input_name 
)

The docstring of this method still carries an old, now unused implementation of dbs_check_dataset_num_events:

    def dbs_check_dataset_num_events(self, dataset_name):
        """Figure out the number of events in each run of this dataset.

        This is a more efficient way of doing this than calling
        dbs_resolve_number_of_events for each run.
        """

        # BUG BUG BUG
        # This might very well not work at all for spread-out samples. (?)
        # BUG BUG BUG end

        # DEBUG DEBUG DEBUG
        # If we get here DBS should have been set up already.
        assert not self.dbs_api is None
        # DEBUG DEBUG DEBUG end

        api = self.dbs_api
        dbs_query = "find run.number, file.name, file.numevents where dataset = %s " \
                    "and dataset.status = VALID" % \
                    dataset_name
        try:
            api_result = api.executeQuery(dbs_query)
        except DbsApiException:
            msg = "ERROR: Could not execute DBS query"
            self.logger.fatal(msg)
            raise Error(msg)

        try:
            files_info = {}
            class Handler(xml.sax.handler.ContentHandler):
                def startElement(self, name, attrs):
                    if name == "result":
                        run_number = int(attrs["RUNS_RUNNUMBER"])
                        file_name = str(attrs["FILES_LOGICALFILENAME"])
                        nevents = int(attrs["FILES_NUMBEROFEVENTS"])
                        try:
                            files_info[run_number][file_name] = nevents
                        except KeyError:
                            files_info[run_number] = {file_name: nevents}
            xml.sax.parseString(api_result, Handler())
        except SAXParseException:
            msg = "ERROR: Could not parse DBS server output"
            self.logger.fatal(msg)
            raise Error(msg)

        num_events_catalog = {}
        for run_number in files_info.keys():
            num_events_catalog[run_number] = sum(files_info[run_number].values())

        # End of dbs_check_dataset_num_events.
        return num_events_catalog

End of old version.

Build a list of all datasets to be processed.

Definition at line 3358 of file cmsHarvester.py.

def cmsHarvester.build_dataset_use_list (   self)
Build a list of datasets to process.

Definition at line 3422 of file cmsHarvester.py.

def cmsHarvester.build_datasets_information (   self)
Obtain all information on the datasets that we need to run.

Use DBS to figure out all required information on our
datasets, like the run numbers and the GlobalTag. All
information is stored in the datasets_information member
variable.

Definition at line 5323 of file cmsHarvester.py.

def cmsHarvester.build_runs_ignore_list (   self)
Build a list of runs to ignore.

NOTE: We should always have a list of runs to process, but
it may be that we don't have a list of runs to ignore.

Definition at line 3543 of file cmsHarvester.py.

def cmsHarvester.build_runs_list (   self,
  input_method,
  input_name 
)

Definition at line 3471 of file cmsHarvester.py.

def cmsHarvester.build_runs_use_list (   self)
Build a list of runs to process.

Definition at line 3522 of file cmsHarvester.py.

def cmsHarvester.check_cmssw (   self)
Check if CMSSW is setup.

Definition at line 2333 of file cmsHarvester.py.

def cmsHarvester.check_dataset_list (   self)
Check list of dataset names for impossible ones.

Two kinds of checks are done:
- Checks for things that do not make sense. These lead to
  errors and skipped datasets.
- Sanity checks. For these warnings are issued but the user is
  considered to be the authoritative expert.

Checks performed:
- The CMSSW version encoded in the dataset name should match
  self.cmssw_version. This is critical.
- There should be some events in the dataset/run. This is
  critical in the sense that CRAB refuses to create jobs for
  zero events. And yes, this does happen in practice. E.g. the
  reprocessed CRAFT08 datasets contain runs with zero events.
- A cursory check is performed to see if the harvesting type
  makes sense for the data type. This should prevent the user
  from inadvertently running RelVal for data.
- It is not possible to run single-step harvesting jobs on
  samples that are not fully contained at a single site.
- Each dataset/run has to be available at at least one site.

Definition at line 3796 of file cmsHarvester.py.
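
To make the first (critical) check above concrete, here is a minimal illustrative sketch: it extracts the processed-dataset part of the name and tests whether it contains the CMSSW release string. The helper name and the exact comparison are assumptions, not the code at line 3796.

    def cmssw_version_matches(dataset_name, cmssw_version):
        """Illustrative check: dataset names look like
        /Primary/ProcessedDataset/Tier, with the CMSSW release usually
        embedded in the processed-dataset part."""
        processed = dataset_name.split("/")[2]
        return cmssw_version in processed

    # Example (dataset name taken from elsewhere on this page):
    # cmssw_version_matches(
    #     "/RelValQCD_Pt_3000_3500/CMSSW_3_3_0_pre1-STARTUP31X_V4-v1/GEN-SIM-RECO",
    #     "CMSSW_3_3_0_pre1")   --> True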

def cmsHarvester.check_dbs (   self)
Check if DBS is setup.

Definition at line 2356 of file cmsHarvester.py.

def cmsHarvester.check_globaltag (   self,
  globaltag = None 
)
Check if globaltag exists.

Check if globaltag exists as GlobalTag in the database given
by self.frontier_connection_name['globaltag']. If globaltag is
None, self.globaltag is used instead.

If we're going to use reference histograms this method also
checks for the existence of the required key in the GlobalTag.

Definition at line 4503 of file cmsHarvester.py.

def cmsHarvester.check_globaltag_contains_ref_hist_key (   self,
  globaltag,
  connect_name 
)
Check if globaltag contains the required RefHistos key.

Definition at line 4600 of file cmsHarvester.py.

def cmsHarvester.check_globaltag_exists (   self,
  globaltag,
  connect_name 
)
Check if globaltag exists.

Definition at line 4558 of file cmsHarvester.py.

def cmsHarvester.check_input_status (   self)
Check completeness and correctness of input information.

Check that all required information has been specified and
that, at least as far as can be easily checked, it makes
sense.

NOTE: This is also where any default values are applied.

Definition at line 2192 of file cmsHarvester.py.

def cmsHarvester.check_ref_hist_mappings (   self)
Make sure all necessary reference histograms exist.

Check that for each of the datasets to be processed a
reference histogram is specified and that that histogram
exists in the database.

NOTE: There's a little complication here. Since this whole
thing was designed to allow (in principle) harvesting of both
data and MC datasets in one go, we need to be careful to check
the availability of reference mappings only for those
datasets that need it.

Definition at line 5283 of file cmsHarvester.py.

def cmsHarvester.check_ref_hist_tag (   self,
  tag_name 
)
Check the existence of tag_name in database connect_name.

Check if tag_name exists as a reference histogram tag in the
database given by self.frontier_connection_name['refhists'].

Definition at line 4645 of file cmsHarvester.py.

def cmsHarvester.create_and_check_castor_dir (   self,
  castor_dir 
)
Check existence of the given CASTOR dir, if necessary create
it.

Some special care has to be taken with several things like
setting the correct permissions such that CRAB can store the
output results. Of course this means that things like
/castor/cern.ch/ and user/j/ have to be recognised and treated
properly.

NOTE: Only CERN CASTOR area (/castor/cern.ch/) supported for
the moment.

NOTE: This method uses some slightly tricky caching to make
sure we don't keep over and over checking the same base paths.

Definition at line 1489 of file cmsHarvester.py.

def cmsHarvester.create_and_check_castor_dirs (   self)
Make sure all required CASTOR output dirs exist.

This checks the CASTOR base dir specified by the user as well
as all the subdirs required by the current set of jobs.

Definition at line 1429 of file cmsHarvester.py.

def cmsHarvester.create_castor_path_name_common (   self,
  dataset_name 
)
Build the common part of the output path to be used on
CASTOR.

This consists of the CASTOR area base path specified by the
user and a piece depending on the data type (data vs. MC), the
harvesting type and the dataset name followed by a piece
containing the run number and event count. (See comments in
create_castor_path_name_special for details.) This method
creates the common part, without run number and event count.

Definition at line 1325 of file cmsHarvester.py.

def cmsHarvester.create_castor_path_name_special (   self,
  dataset_name,
  run_number,
  castor_path_common 
)
Create the specialised part of the CASTOR output dir name.

NOTE: To avoid clashes with `incremental harvesting'
(re-harvesting when a dataset grows) we have to include the
event count in the path name. The underlying `problem' is that
CRAB does not overwrite existing output files so if the output
file already exists CRAB will fail to copy back the output.

NOTE: It's not possible to create different kinds of
harvesting jobs in a single call to this tool. However, in
principle it could be possible to create both data and MC jobs
in a single go.

NOTE: The number of events used in the path name is the
_total_ number of events in the dataset/run at the time of
harvesting. If we're doing partial harvesting the final
results will reflect lower statistics. This is a) the easiest
to code and b) the least likely to lead to confusion if
someone ever decides to swap/copy around file blocks between
sites.

Definition at line 1381 of file cmsHarvester.py.
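
A rough sketch of how the two pieces described above could be glued together: the common part built from the CASTOR base area, data type, harvesting type and (escaped) dataset name, and the special part from the run number and total event count. The exact directory layout is an assumption, not a copy of the code at lines 1325 and 1381.

    import os

    def castor_path(castor_base_dir, datatype, harvesting_type,
                    dataset_name_escaped, run_number, nevents):
        # Common part: base area + data type + harvesting type + dataset name.
        common = os.path.join(castor_base_dir, datatype,
                              harvesting_type, dataset_name_escaped)
        # Special part: run number and the _total_ event count at harvesting
        # time, to avoid clashes with incremental (re-)harvesting.
        special = "run_%d_nevents_%d" % (run_number, nevents)
        return os.path.join(common, special)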

def cmsHarvester.create_config_file_name (   self,
  dataset_name,
  run_number 
)
Generate the name of the configuration file to be run by
CRAB.

Depending on the harvesting mode (single-step or two-step)
this is the name of the real harvesting configuration or the
name of the first-step ME summary extraction configuration.

Definition at line 4066 of file cmsHarvester.py.

def cmsHarvester.create_crab_config (   self)
Create a CRAB configuration for a given job.

NOTE: This is _not_ a complete (as in: submittable) CRAB
configuration. It is used to store the common settings for the
multicrab configuration.

NOTE: Only CERN CASTOR area (/castor/cern.ch/) is supported.

NOTE: According to CRAB, you `Must define exactly two of
total_number_of_events, events_per_job, or
number_of_jobs.'. For single-step harvesting we force one job,
for the rest we don't really care.

# BUG BUG BUG
# With the current version of CRAB (2.6.1), in which Daniele
# fixed the behaviour of no_block_boundary for me, one _has to
# specify_ the total_number_of_events and one single site in
# the se_white_list.
# BUG BUG BUG end

Definition at line 4234 of file cmsHarvester.py.
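
For orientation, a sketch of the kind of skeleton such a CRAB configuration amounts to. The section names follow the [CRAB], [GRID], [USER], [CMSSW] and [CAF] blocks listed under the crab_config variable further down; the individual keys are illustrative CRAB 2.x-style assumptions, not the exact output of this method.

    # Hypothetical skeleton of the common (not yet submittable) crab.cfg.
    crab_config_skeleton = "\n".join([
        "[CRAB]",
        "jobtype = cmssw",
        "scheduler = glite",
        "",
        "[GRID]",
        "virtual_organization = cms",
        "",
        "[USER]",
        "copy_data = 1",
        "",
        "[CMSSW]",
        "# total_number_of_events and the se_white_list are filled in per",
        "# dataset/run in the multicrab configuration (see the BUG note",
        "# above about CRAB 2.6.1).",
        "",
        "[CAF]",
        "queue = cmscaf",
        ])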

def cmsHarvester.create_es_prefer_snippet (   self,
  dataset_name 
)
Build the es_prefer snippet for the reference histograms.

The building of the snippet is wrapped in some care-taking
code that figures out the name of the reference histogram set
and makes sure the corresponding tag exists.

Definition at line 4691 of file cmsHarvester.py.

def cmsHarvester.create_harvesting_config (   self,
  dataset_name 
)
Create the Python harvesting configuration for harvesting.

The basic configuration is created by
Configuration.PyReleaseValidation.ConfigBuilder. (This mimics
what cmsDriver.py does.) After that we add some specials
ourselves.

NOTE: On one hand it may not be nice to circumvent
cmsDriver.py, on the other hand cmsDriver.py does not really
do anything itself. All the real work is done by the
ConfigBuilder so there is not much risk that we miss out on
essential developments of cmsDriver in the future.

Definition at line 4726 of file cmsHarvester.py.

def cmsHarvester.create_harvesting_config_file_name (   self,
  dataset_name 
)

Definition at line 4098 of file cmsHarvester.py.

def cmsHarvester.create_harvesting_output_file_name (   self,
  dataset_name,
  run_number 
)
Generate the name to be used for the harvesting output file.

This harvesting output file is the _final_ ROOT output file
containing the harvesting results. In case of two-step
harvesting there is an intermediate ME output file as well.

Definition at line 4170 of file cmsHarvester.py.

def cmsHarvester.create_me_extraction_config (   self,
  dataset_name 
)

The docstring of this method still carries an older helper, create_harvesting_config_two_step:

    def create_harvesting_config_two_step(self, dataset_name):
        """Create the Python harvesting configuration for two-step
        harvesting.
        """

        # BUG BUG BUG
        config_contents = self.create_harvesting_config_single_step(dataset_name)
        # BUG BUG BUG end

        # End of create_harvesting_config_two_step.
        return config_contents

 

Definition at line 4952 of file cmsHarvester.py.

def cmsHarvester.create_me_summary_config_file_name (   self,
  dataset_name 
)

Definition at line 4112 of file cmsHarvester.py.

def cmsHarvester.create_me_summary_output_file_name (   self,
  dataset_name 
)
Generate the name of the intermediate ME file name to be
used in two-step harvesting.

Definition at line 4202 of file cmsHarvester.py.

def cmsHarvester.create_multicrab_block_name (   self,
  dataset_name,
  run_number,
  index 
)
Create the block name to use for this dataset/run number.

This is what appears in the brackets `[]' in multicrab.cfg. It
is used as the name of the job and to create output
directories.

Definition at line 4217 of file cmsHarvester.py.
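
A minimal sketch, assuming the block name is simply the escaped dataset name, the run number and the site index joined by underscores (the real format is defined at line 4217):

    def multicrab_block_name(dataset_name_escaped, run_number, index):
        """Name used in the `[...]' block headers of multicrab.cfg
        (illustrative format only)."""
        return "%s_run%d_%s" % (dataset_name_escaped, run_number, index)

    # e.g. "Cosmics__Commissioning08-v1__RAW-RECO_run67838_site_01"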

def cmsHarvester.create_multicrab_config (   self)
Create a multicrab.cfg file for all samples.

This creates the contents for a multicrab.cfg file that uses
the crab.cfg file (generated elsewhere) for the basic settings
and contains blocks for each run of each dataset.

# BUG BUG BUG
# The fact that it's necessary to specify the se_white_list
# and the total_number_of_events is due to our use of CRAB
# version 2.6.1. This should no longer be necessary in the
# future.
# BUG BUG BUG end

Definition at line 4314 of file cmsHarvester.py.

def cmsHarvester.create_output_file_name (   self,
  dataset_name,
  run_number = None 
)
Create the name of the output file name to be used.

This is the name of the output file of the `first step'. In
the case of single-step harvesting this is already the final
harvesting output ROOT file. In the case of two-step
harvesting it is the name of the intermediary ME summary
file.

Definition at line 4126 of file cmsHarvester.py.
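
Conceptually this boils down to a dispatch on the harvesting mode. A sketch, using the two helper names documented on this page but with the dispatch itself written out as an assumption:

    def output_file_name(harvester, dataset_name, run_number=None):
        # `harvester' stands in for the cmsHarvester object (an assumption
        # about how one would call these helpers, not the code at line 4126).
        if harvester.harvesting_mode == "single-step":
            # Already the final harvesting ROOT file.
            return harvester.create_harvesting_output_file_name(dataset_name,
                                                                run_number)
        else:
            # Two-step: the intermediary ME summary file.
            return harvester.create_me_summary_output_file_name(dataset_name)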

def cmsHarvester.dbs_check_dataset_spread (   self,
  dataset_name 
)

The docstring of this method still carries leftover code: dbs_resolve_dataset_number_of_sites, an earlier dbs_check_dataset_spread, and dbs_check_dataset_spread_old (kept only for debugging).

    def dbs_resolve_dataset_number_of_sites(self, dataset_name):
        """Ask DBS across how many sites this dataset has been spread out.

        This is especially useful to check that we do not submit a job
        supposed to run on a complete sample that is not contained at
        a single site.
        """

        # DEBUG DEBUG DEBUG
        # If we get here DBS should have been set up already.
        assert not self.dbs_api is None
        # DEBUG DEBUG DEBUG end

        api = self.dbs_api
        dbs_query = "find count(site) where dataset = %s " \
                    "and dataset.status = VALID" % \
                    dataset_name
        try:
            api_result = api.executeQuery(dbs_query)
        except DbsApiException:
            raise Error("ERROR: Could not execute DBS query")
        try:
            num_sites = []
            class Handler(xml.sax.handler.ContentHandler):
                def startElement(self, name, attrs):
                    if name == "result":
                        num_sites.append(str(attrs["COUNT_STORAGEELEMENT"]))
            xml.sax.parseString(api_result, Handler())
        except SAXParseException:
            raise Error("ERROR: Could not parse DBS server output")

        # DEBUG DEBUG DEBUG
        assert len(num_sites) == 1
        # DEBUG DEBUG DEBUG end

        num_sites = int(num_sites[0])

        # End of dbs_resolve_dataset_number_of_sites.
        return num_sites

    def dbs_check_dataset_spread(self, dataset_name):
        """Figure out across how many sites this dataset is spread.

        NOTE: This is something we need to figure out per run, since we
        want to submit harvesting jobs per run. Basically three things
        can happen with a given dataset:
        - the whole dataset is available on a single site,
        - the whole dataset is available (mirrored) at multiple sites,
        - the dataset is spread across multiple sites and there is no
          single site containing the full dataset in one place.

        NOTE: If all goes well, it should not be possible that anything
        but a full dataset is mirrored. So we ignore the possibility in
        which for example one site contains the full dataset and two
        others mirror half of it.

        ANOTHER NOTE: According to some people this last case could
        actually happen. I will not design for it, but make sure it ends
        up as a false negative, in which case we just lose some
        efficiency and treat the dataset (unnecessarily) as spread-out.

        We don't really care about the first two possibilities, but in
        the third case we need to make sure to run the harvesting in
        two-step mode.

        This method checks with DBS which of the above cases is true for
        the dataset name given, and returns a 1 for the first two cases,
        and the number of sites across which the dataset is spread for
        the third case.

        The way in which this is done is by asking how many files each
        site has for the dataset. In the first case there is only one
        site, in the second case all sites should have the same number
        of files (i.e. the total number of files in the dataset) and in
        the third case the file counts from all sites should add up to
        the total file count for the dataset.
        """

        # DEBUG DEBUG DEBUG
        # If we get here DBS should have been set up already.
        assert not self.dbs_api is None
        # DEBUG DEBUG DEBUG end

        api = self.dbs_api
        dbs_query = "find run, run.numevents, site, file.count " \
                    "where dataset = %s " \
                    "and dataset.status = VALID" % \
                    dataset_name
        try:
            api_result = api.executeQuery(dbs_query)
        except DbsApiException:
            msg = "ERROR: Could not execute DBS query"
            self.logger.fatal(msg)
            raise Error(msg)

        # Index things by run number. No cross-check is done to make
        # sure we get results for each and every run in the dataset. I'm
        # not sure this would make sense since we'd be cross-checking
        # DBS info with DBS info anyway. Note that we use the file count
        # per site to see if we're dealing with an incomplete vs. a
        # mirrored dataset.
        sample_info = {}
        try:
            class Handler(xml.sax.handler.ContentHandler):
                def startElement(self, name, attrs):
                    if name == "result":
                        run_number = int(attrs["RUNS_RUNNUMBER"])
                        site_name = str(attrs["STORAGEELEMENT_SENAME"])
                        file_count = int(attrs["COUNT_FILES"])
                        # BUG BUG BUG
                        # Doh! For some reason DBS never returns any
                        # other event count than zero.
                        event_count = int(attrs["RUNS_NUMBEROFEVENTS"])
                        # BUG BUG BUG end
                        info = (site_name, file_count, event_count)
                        try:
                            sample_info[run_number].append(info)
                        except KeyError:
                            sample_info[run_number] = [info]
            xml.sax.parseString(api_result, Handler())
        except SAXParseException:
            msg = "ERROR: Could not parse DBS server output"
            self.logger.fatal(msg)
            raise Error(msg)

        # Now translate this into a slightly more usable mapping.
        sites = {}
        for (run_number, site_info) in sample_info.iteritems():
            # Quick-n-dirty trick to see if all file counts are the
            # same.
            unique_file_counts = set([i[1] for i in site_info])
            if len(unique_file_counts) == 1:
                # Okay, so this must be a mirrored dataset.
                # We have to pick one but we have to be careful. We
                # cannot submit to things like a T0, a T1, or CAF.
                site_names = [self.pick_a_site([i[0] for i in site_info])]
                nevents = [site_info[0][2]]
            else:
                # Looks like this is a spread-out sample.
                site_names = [i[0] for i in site_info]
                nevents = [i[2] for i in site_info]
            sites[run_number] = zip(site_names, nevents)

        self.logger.debug("Sample `%s' spread is:" % dataset_name)
        run_numbers = sites.keys()
        run_numbers.sort()
        for run_number in run_numbers:
            self.logger.debug("  run # %6d: %d sites (%s)" % \
                              (run_number,
                               len(sites[run_number]),
                               ", ".join([i[0] for i in sites[run_number]])))

        # End of dbs_check_dataset_spread.
        return sites

    # DEBUG DEBUG DEBUG
    # Just kept for debugging now.
    # The docstring also carries dbs_check_dataset_spread_old(self,
    # dataset_name), which is identical to dbs_check_dataset_spread above
    # (same docstring, same DBS query and parsing, same mirrored vs.
    # spread-out logic) except that its debug printout reads
    # "%d site(s)" instead of "%d sites" and its trailing comment reads
    # "# End of dbs_check_dataset_spread_old.".
    # DEBUG DEBUG DEBUG end

Figure out the number of events in each run of this dataset.

This is a more efficient way of doing this than calling
dbs_resolve_number_of_events for each run.

Definition at line 3077 of file cmsHarvester.py.

def cmsHarvester.dbs_resolve_cmssw_version (   self,
  dataset_name 
)
Ask DBS for the CMSSW version used to create this dataset.

Definition at line 2475 of file cmsHarvester.py.

def cmsHarvester.dbs_resolve_dataset_name (   self,
  dataset_name 
)
Use DBS to resolve a wildcarded dataset name.

Definition at line 2419 of file cmsHarvester.py.

def cmsHarvester.dbs_resolve_datatype (   self,
  dataset_name 
)
Ask DBS for the data type (data or mc) of a given
dataset.

Definition at line 2683 of file cmsHarvester.py.

def cmsHarvester.dbs_resolve_globaltag (   self,
  dataset_name 
)
Ask DBS for the globaltag corresponding to a given dataset.

# BUG BUG BUG
# This does not seem to work for data datasets? E.g. for
# /Cosmics/Commissioning08_CRAFT0831X_V1_311_ReReco_FromSuperPointing_v1/RAW-RECO
# Probably due to the fact that the GlobalTag changed during
# data taking...
BUG BUG BUG end

Definition at line 2627 of file cmsHarvester.py.

def cmsHarvester.dbs_resolve_number_of_events (   self,
  dataset_name,
  run_number = None 
)
Determine the number of events in a given dataset (and run).

Ask DBS for the number of events in a dataset. If a run number
is specified the number of events returned is that in that run
of that dataset. If problems occur we throw an exception.

# BUG BUG BUG
# Since DBS does not return the number of events correctly,
# neither for runs nor for whole datasets, we have to work
# around that a bit...
# BUG BUG BUG end

Definition at line 2736 of file cmsHarvester.py.

def cmsHarvester.dbs_resolve_runs (   self,
  dataset_name 
)

The docstring of this method still carries an older helper, dbs_resolve_dataset_number_of_events:

    def dbs_resolve_dataset_number_of_events(self, dataset_name):
        """Ask DBS across how many events this dataset has been spread out.

        This is especially useful to check that we do not submit a job
        supposed to run on a complete sample that is not contained at
        a single site.
        """

        # DEBUG DEBUG DEBUG
        # If we get here DBS should have been set up already.
        assert not self.dbs_api is None
        # DEBUG DEBUG DEBUG end

        api = self.dbs_api
        dbs_query = "find count(site) where dataset = %s " \
                    "and dataset.status = VALID" % \
                    dataset_name
        try:
            api_result = api.executeQuery(dbs_query)
        except DbsApiException:
            raise Error("ERROR: Could not execute DBS query")
        try:
            num_events = []
            class Handler(xml.sax.handler.ContentHandler):
                def startElement(self, name, attrs):
                    if name == "result":
                        num_events.append(str(attrs["COUNT_STORAGEELEMENT"]))
            xml.sax.parseString(api_result, Handler())
        except SAXParseException:
            raise Error("ERROR: Could not parse DBS server output")

        # DEBUG DEBUG DEBUG
        assert len(num_events) == 1
        # DEBUG DEBUG DEBUG end

        num_events = int(num_events[0])

        # End of dbs_resolve_dataset_number_of_events.
        return num_events

Ask DBS for the list of runs in a given dataset.

# NOTE: This does not (yet?) skip/remove empty runs. There is
# a bug in the DBS entry run.numevents (i.e. it always returns
# zero) which should be fixed in the `next DBS release'.
# See also:
#   https://savannah.cern.ch/bugs/?53452
#   https://savannah.cern.ch/bugs/?53711

Definition at line 2569 of file cmsHarvester.py.

def cmsHarvester.escape_dataset_name (   self,
  dataset_name 
)
Escape a DBS dataset name.

Escape a DBS dataset name such that it does not cause trouble
with the file system. This means turning each `/' into `__',
except for the first one which is just removed.

Definition at line 4047 of file cmsHarvester.py.
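
As an illustration of the escaping rule just described, a minimal standalone sketch:

    def escape_dataset_name(dataset_name):
        """Turn '/A/B/C' into 'A__B__C': drop the leading '/', map the
        remaining '/' to '__'."""
        return dataset_name.lstrip("/").replace("/", "__")

    # escape_dataset_name("/Cosmics/Commissioning08-v1/RAW-RECO")
    #   --> "Cosmics__Commissioning08-v1__RAW-RECO"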

def cmsHarvester.load_ref_hist_mappings (   self)
Load the reference histogram mappings from file.

The dataset name to reference histogram name mappings are read
from a text file specified in self.ref_hist_mappings_file_name.

Definition at line 5207 of file cmsHarvester.py.
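
A minimal sketch of how such a mapping file could be parsed, assuming a simple whitespace-separated two-column format (dataset name, reference histogram name); the actual file format may differ:

    def load_ref_hist_mappings(file_name):
        """Read 'dataset_name  ref_hist_name' pairs into a dictionary."""
        mappings = {}
        for line in open(file_name):
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith("#"):
                continue
            fields = line.split()
            if len(fields) < 2:
                continue
            mappings[fields[0]] = fields[1]
        return mappings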

def cmsHarvester.option_handler_caf_access (   self,
  option,
  opt_str,
  value,
  parser 
)
Set the self.caf_access flag to try and create jobs that
run on the CAF.

Definition at line 1101 of file cmsHarvester.py.

def cmsHarvester.option_handler_castor_dir (   self,
  option,
  opt_str,
  value,
  parser 
)

Specify where on CASTOR the output should go.

At the moment only output to CERN CASTOR is supported. Eventually the harvested results should go into the central place for DQM on CASTOR anyway.

The docstring of this method also still carries two older option handlers:

    def option_handler_dataset_name(self, option, opt_str, value, parser):
        """Specify the name(s) of the dataset(s) to be processed.

        It is checked to make sure that no dataset name or listfile
        names are given yet. If all is well (i.e. we still have a
        clean slate) the dataset name is stored for later use,
        otherwise a Usage exception is raised.
        """

        if not self.input_method is None:
            if self.input_method == "dataset":
                raise Usage("Please only feed me one dataset specification")
            elif self.input_method == "listfile":
                raise Usage("Cannot specify both dataset and input list file")
            else:
                assert False, "Unknown input method `%s'" % self.input_method
        self.input_method = "dataset"
        self.input_name = value
        self.logger.info("Input method used: %s" % self.input_method)

        # End of option_handler_dataset_name.

    ##########

    def option_handler_listfile_name(self, option, opt_str, value, parser):
        """Specify the input list file containing datasets to be processed.

        It is checked to make sure that no dataset name or listfile
        names are given yet. If all is well (i.e. we still have a
        clean slate) the listfile name is stored for later use,
        otherwise a Usage exception is raised.
        """

        if not self.input_method is None:
            if self.input_method == "listfile":
                raise Usage("Please only feed me one list file")
            elif self.input_method == "dataset":
                raise Usage("Cannot specify both dataset and input list file")
            else:
                assert False, "Unknown input method `%s'" % self.input_method
        self.input_method = "listfile"
        self.input_name = value
        self.logger.info("Input method used: %s" % self.input_method)

        # End of option_handler_listfile_name.

Definition at line 1059 of file cmsHarvester.py.

def cmsHarvester.option_handler_crab_submission (   self,
  option,
  opt_str,
  value,
  parser 
)
Crab jobs are not created and submitted automatically.

Definition at line 1129 of file cmsHarvester.py.

def cmsHarvester.option_handler_list_types (   self,
  option,
  opt_str,
  value,
  parser 
)
List all harvesting types and their mappings.

This lists all implemented harvesting types with their
corresponding mappings to sequence names. This had to be
separated out from the help since it depends on the CMSSW
version and was making things a bit of a mess.

NOTE: There is no way (at least not that I could come up with)
to code this in a neat generic way that can be read both by
this method and by setup_harvesting_info(). Please try hard to
keep these two methods in sync!

Definition at line 1151 of file cmsHarvester.py.

def cmsHarvester.option_handler_no_t1access (   self,
  option,
  opt_str,
  value,
  parser 
)
Set the self.no_t1access flag to try and create jobs that
run without special `t1access' role.

Definition at line 1084 of file cmsHarvester.py.

def cmsHarvester.option_handler_preferred_site (   self,
  option,
  opt_str,
  value,
  parser 
)

Definition at line 1145 of file cmsHarvester.py.

def cmsHarvester.option_handler_saveByLumiSection (   self,
  option,
  opt_str,
  value,
  parser 
)
Set process.dqmSaver.saveByLumiSection=1 in the harvesting cfg file.

Definition at line 1117 of file cmsHarvester.py.

def cmsHarvester.option_handler_sites (   self,
  option,
  opt_str,
  value,
  parser 
)

Definition at line 1139 of file cmsHarvester.py.

def cmsHarvester.parse_cmd_line_options (   self)

Definition at line 1870 of file cmsHarvester.py.
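
The option variables documented further down (action = "callback", type = "string", metavar = "JSONRUNFILE", help = "Jsonfile containing dictionary of run/lumisections pairs. ") are typical optparse callback registrations. A standalone sketch of how one such option could be wired up; the --Jsonrunfile flag name and the body of the handler are assumptions, not the code at line 1870:

    import optparse

    def option_handler_input_Jsonrunfile(option, opt_str, value, parser):
        # Standard optparse callback signature, matching the option
        # handlers documented above; here it simply stores the value.
        parser.values.Jsonrunfile = value

    parser = optparse.OptionParser()
    parser.add_option("--Jsonrunfile",
                      help="Jsonfile containing dictionary of "
                           "run/lumisections pairs. ",
                      action="callback",
                      callback=option_handler_input_Jsonrunfile,
                      type="string",
                      metavar="JSONRUNFILE",
                      dest="Jsonrunfile")
    (options, args) = parser.parse_args(["--Jsonrunfile", "runs.json"])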

def cmsHarvester.pick_a_site (   self,
  sites,
  cmssw_version 
)

Definition at line 1706 of file cmsHarvester.py.

def cmsHarvester.process_dataset_ignore_list (   self)
Update the list of datasets taking into account the ones to
ignore.

Both lists have been generated before from DBS and both are
assumed to be unique.

NOTE: The advantage of creating the ignore list from DBS (in
case a regexp is given) and matching that instead of directly
matching the ignore criterion against the list of datasets (to
consider) built from DBS is that in the former case we're sure
that all regexps are treated exactly as DBS would have done
without the cmsHarvester.

NOTE: This only removes complete samples. Exclusion of single
runs is done by the bookkeeping. So the assumption is that a
user never wants to harvest just part (i.e. n out of N runs)
of a sample.

Definition at line 3567 of file cmsHarvester.py.
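
The removal itself is conceptually simple. A sketch, assuming both lists are dictionaries keyed by dataset name (as datasets_to_use is elsewhere on this page); this is not the code at line 3567:

    def process_dataset_ignore_list(datasets_to_use, datasets_to_ignore):
        """Drop every dataset that also appears in the ignore list."""
        for dataset_name in datasets_to_ignore.keys():
            # Ignoring something we were not going to use anyway is fine.
            datasets_to_use.pop(dataset_name, None)
        return datasets_to_use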

def cmsHarvester.process_runs_use_and_ignore_lists (   self)

Definition at line 3614 of file cmsHarvester.py.

def cmsHarvester.ref_hist_mappings_needed (   self,
  dataset_name = None 
)
Check if we need to load and check the reference mappings.

For data the reference histograms should be taken
automatically from the GlobalTag, so we don't need any
mappings. For RelVals we need to know a mapping to be used in
the es_prefer code snippet (different references for each of
the datasets.)

WARNING: This implementation is a bit convoluted.

Definition at line 5173 of file cmsHarvester.py.

def cmsHarvester.run (   self)

Definition at line 5525 of file cmsHarvester.py.

def cmsHarvester.setup_dbs (   self)

Now we try to do a very simple DBS search. If that works, instead of giving us the `Unsupported API call' crap, we should be good to go.

NOTE: Not ideal, I know, but it reduces the amount of complaints I get...

    cmd = "dbs search --query=\"find dataset where dataset = impossible\""
    (status, output) = commands.getstatusoutput(cmd)
    pdb.set_trace()
    if status != 0 or \
       output.lower().find("unsupported api call") > -1:
        self.logger.fatal("It seems DBS is not setup...")
        self.logger.fatal("  %s returns crap:" % cmd)
        for line in output.split("\n"):
            self.logger.fatal("  %s" % line)
        raise Error("ERROR: DBS needs to be setup first!")

Setup the Python side of DBS.

For more information see the DBS Python API documentation:
https://twiki.cern.ch/twiki/bin/view/CMS/DBSApiDocumentation

Definition at line 2393 of file cmsHarvester.py.
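
A sketch of the `Python side' mentioned above, assuming the DBS 2 Python client that the query strings on this page belong to; the constructor arguments are assumptions, and the server URL is the one quoted elsewhere on this page:

    from DBSAPI.dbsApi import DbsApi   # DBS 2 Python client (assumption)

    def setup_dbs():
        args = {}
        args["url"] = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        return DbsApi(args)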

def cmsHarvester.setup_harvesting_info (   self)
Fill our dictionary with all info needed to understand
harvesting.

This depends on the CMSSW version since at some point the
names and sequences were modified.

NOTE: There is no way (at least not that I could come up with)
to code this in a neat generic way that can be read both by
this method and by option_handler_list_types(). Please try
hard to keep these two methods in sync!

Definition at line 1206 of file cmsHarvester.py.

def cmsHarvester.show_exit_message (   self)
Tell the user what to do now, after this part is done.

This should provide the user with some (preferably
copy-pasteable) instructions on what to do now with the setups
and files that have been created.

Definition at line 5472 of file cmsHarvester.py.

def cmsHarvester.singlify_datasets (   self)
Remove all but the largest part of all datasets.

This allows us to harvest at least part of these datasets
using single-step harvesting until the two-step approach
works.

Definition at line 3743 of file cmsHarvester.py.

def cmsHarvester.write_crab_config (   self)

The docstring of this method still carries an older version of create_harvesting_config:

    def create_harvesting_config(self, dataset_name):
        """Create the Python harvesting configuration for a given job.

        NOTE: The reason to have a single harvesting configuration per
        sample is to be able to specify the GlobalTag corresponding to
        each sample. Since it has been decided that (apart from the
        prompt reco) datasets cannot contain runs with different
        GlobalTags, we don't need a harvesting config per run.

        NOTE: This is the place where we distinguish between
        single-step and two-step harvesting modes (at least for the
        Python job configuration).
        """

        ###

        if self.harvesting_mode == "single-step":
            config_contents = self.create_harvesting_config_single_step(dataset_name)
        elif self.harvesting_mode == "two-step":
            config_contents = self.create_harvesting_config_two_step(dataset_name)
        else:
            # Impossible harvesting mode, we should never get here.
            assert False, "ERROR: unknown harvesting mode `%s'" % \
                   self.harvesting_mode

        ###

        # End of create_harvesting_config.
        return config_contents

Write a CRAB job configuration Python file.

Definition at line 5049 of file cmsHarvester.py.

def cmsHarvester.write_harvesting_config (   self,
  dataset_name 
)
Write a harvesting job configuration Python file.

NOTE: This knows nothing about single-step or two-step
harvesting. That's all taken care of by
create_harvesting_config.

Definition at line 5107 of file cmsHarvester.py.

def cmsHarvester.write_me_extraction_config (   self,
  dataset_name 
)
Write an ME-extraction configuration Python file.

This `ME-extraction' (ME = Monitoring Element) is the first
step of the two-step harvesting.

Definition at line 5140 of file cmsHarvester.py.

def cmsHarvester.write_multicrab_config (   self)
Write a multi-CRAB job configuration Python file.

Definition at line 5078 of file cmsHarvester.py.

Variable Documentation

string cmsHarvester.__author__ = "Jeroen Hegeman (jeroen.hegeman@cern.ch),"

Definition at line 38 of file cmsHarvester.py.

string cmsHarvester.__version__ = "3.8.2p1"

File : cmsHarvest.py Authors : Jeroen Hegeman (jeroen.hegeman@cern.ch) Niklas Pietsch (niklas.pietsch@desy.de) Franseco Costanza (francesco.costanza@desy.de) Last change: 20100308.

Purpose : Main program to run all kinds of harvesting. For more information please refer to the CMS Twiki url mentioned just below here.

Definition at line 37 of file cmsHarvester.py.

string cmsHarvester.action = "callback"

Definition at line 2056 of file cmsHarvester.py.

tuple cmsHarvester.all_file_names = files_info[run_number]

Definition at line 3232 of file cmsHarvester.py.

list cmsHarvester.all_t1
Initial value:
1 = [
2  "srm-cms.cern.ch",
3  "ccsrm.in2p3.fr",
4  "cmssrm-fzk.gridka.de",
5  "cmssrm.fnal.gov",
6  "gridka-dCache.fzk.de",
7  "srm-cms.gridpp.rl.ac.uk",
8  "srm.grid.sinica.edu.tw",
9  "srm2.grid.sinica.edu.tw",
10  "srmcms.pic.es",
11  "storm-fe-cms.cr.cnaf.infn.it"
12  ]

Definition at line 1723 of file cmsHarvester.py.

cmsHarvester.caf_access

Definition at line 1106 of file cmsHarvester.py.

cmsHarvester.callback = self.option_handler_input_Jsonrunfile,
cmsHarvester.castor_base_dir

Definition at line 1075 of file cmsHarvester.py.

tuple cmsHarvester.castor_dir = self.datasets_information[dataset_name]

CRAB

GRID

USER

Definition at line 4357 of file cmsHarvester.py.

tuple cmsHarvester.castor_path_common = self.create_castor_path_name_common(dataset_name)

DEBUG DEBUG DEBUG

This is probably only useful to make sure we don't muck

things up, right?

Figure out across how many sites this sample has been spread.

    if num_sites == 1:
        self.logger.info("  sample is contained at a single site")
    else:
        self.logger.info("  sample is spread across %d sites" % \
                         num_sites)
    if num_sites < 1:
        # NOTE: This should not happen with any valid dataset.
        self.logger.warning("  --> skipping dataset which is not " \
                            "hosted anywhere")

DEBUG DEBUG DEBUG end

Definition at line 5456 of file cmsHarvester.py.

tuple cmsHarvester.castor_paths
Initial value:
1 = dict(list(zip(runs,
2  [self.create_castor_path_name_special(dataset_name, i, castor_path_common) \
3  for i in runs])))

Definition at line 5460 of file cmsHarvester.py.

cmsHarvester.castor_prefix = self.castor_prefix

Definition at line 4352 of file cmsHarvester.py.

string cmsHarvester.cmd = "rfstat %s"

self.logger.debug("Path is now `%s'" % \ path)

Definition at line 1632 of file cmsHarvester.py.

cmsHarvester.cmssw_version = self.datasets_information[dataset_name]

Definition at line 4395 of file cmsHarvester.py.

list cmsHarvester.complete_sites
Initial value:
1 = [site for site in sites \
2  if site in sites_with_complete_copies]

    site_names_ref = set(files_info[run_number].values()[0][1])
    for site_names_tmp in files_info[run_number].values()[1:]:
        if set(site_names_tmp[1]) != site_names_ref:
            mirrored = False
            break

Definition at line 3277 of file cmsHarvester.py.

tuple cmsHarvester.config_builder = ConfigBuilder(config_options, with_input=True)

Definition at line 4795 of file cmsHarvester.py.

string cmsHarvester.config_contents = config_builder.pythonCfgCode

In case this file is the second step (the real harvesting step) of the two-step harvesting we have to tell it to use our local files:

    if self.harvesting_mode == "two-step":
        castor_dir = self.datasets_information[dataset_name] \
                     ["castor_path"][run]
        customisations.append("")
        customisations.append("# This is the second step (the real")
        customisations.append("# harvesting step) of a two-step")
        customisations.append("# harvesting procedure.")
        # BUG BUG BUG
        # To be removed in production version.
        customisations.append("import pdb")
        # BUG BUG BUG end
        customisations.append("import commands")
        customisations.append("import os")
        customisations.append("castor_dir = \"%s\"" % castor_dir)
        customisations.append("cmd = \"rfdir %s\" % castor_dir")
        customisations.append("(status, output) = commands.getstatusoutput(cmd)")
        customisations.append("if status != 0:")
        customisations.append("    print \"ERROR\"")
        customisations.append("    raise Exception, \"ERROR\"")
        customisations.append("file_names = [os.path.join(\"rfio:%s\" % path, i) for i in output.split() if i.startswith(\"EDM_summary\") and i.endswith(\".root\")]")
        #customisations.append("pdb.set_trace()")
        customisations.append("process.source.fileNames = cms.untracked.vstring(*file_names)")
        customisations.append("")

Definition at line 4800 of file cmsHarvester.py.

cmsHarvester.config_file_name = self.create_me_summary_config_file_name(dataset_name)

Only add the alarming piece to the file name if this is a spread-out dataset:

    pdb.set_trace()
    if self.datasets_information[dataset_name] \
           ["mirrored"][run_number] == False:
        config_file_name = config_file_name.replace(".py", "_partial.py")

Definition at line 4087 of file cmsHarvester.py.

list cmsHarvester.connect_name = self.frontier_connection_name["globaltag"]

Definition at line 4834 of file cmsHarvester.py.

dictionary cmsHarvester.country_codes
Initial value:
1 = {
2  "CAF" : "caf.cern.ch",
3  "CH" : "srm-cms.cern.ch",
4  "FR" : "ccsrm.in2p3.fr",
5  "DE" : "cmssrm-fzk.gridka.de",
6  "GOV" : "cmssrm.fnal.gov",
7  "DE2" : "gridka-dCache.fzk.de",
8  "UK" : "srm-cms.gridpp.rl.ac.uk",
9  "TW" : "srm.grid.sinica.edu.tw",
10  "TW2" : "srm2.grid.sinica.edu.tw",
11  "ES" : "srmcms.pic.es",
12  "IT" : "storm-fe-cms.cr.cnaf.infn.it"
13  }

Definition at line 1736 of file cmsHarvester.py.

string cmsHarvester.crab_config = "\n"

CRAB

GRID

USER

CMSSW

CAF

Definition at line 4307 of file cmsHarvester.py.

cmsHarvester.crab_submission

Definition at line 1133 of file cmsHarvester.py.

list cmsHarvester.customisations = [""]

Definition at line 4830 of file cmsHarvester.py.

tuple cmsHarvester.dataset_name_escaped = self.escape_dataset_name(dataset_name)

Definition at line 4351 of file cmsHarvester.py.

tuple cmsHarvester.dataset_names = self.datasets_to_use.keys()

Definition at line 4346 of file cmsHarvester.py.

cmsHarvester.dataset_names_after_checks = dataset_names_after_checks_tmp

Definition at line 4032 of file cmsHarvester.py.

tuple cmsHarvester.dataset_names_after_checks_tmp = copy.deepcopy(dataset_names_after_checks)

Definition at line 4025 of file cmsHarvester.py.

cmsHarvester.datasets_information

Definition at line 5343 of file cmsHarvester.py.

cmsHarvester.datasets_to_ignore

Definition at line 3459 of file cmsHarvester.py.

cmsHarvester.datasets_to_use

Definition at line 3433 of file cmsHarvester.py.

list cmsHarvester.datatype = self.datasets_information[dataset_name]

Definition at line 4784 of file cmsHarvester.py.

cmsHarvester.dbs_api

Definition at line 2406 of file cmsHarvester.py.

tuple cmsHarvester.empty_runs = dict(tmp)

Definition at line 4009 of file cmsHarvester.py.

tuple cmsHarvester.es_prefer_snippet = self.create_es_prefer_snippet(dataset_name)

Definition at line 4883 of file cmsHarvester.py.

int cmsHarvester.exit_code = 1

Definition at line 5693 of file cmsHarvester.py.

list cmsHarvester.file_name = handler.results["file.name"]

Definition at line 3176 of file cmsHarvester.py.

Referenced by DTTPGLutFile.open(), and L1TriggerLutFile.open().

tuple cmsHarvester.files_at_site
Initial value:
1 = [i for (i, (j, k)) \
2  in files_info[run_number].items() \
3  if site_name in k]

Definition at line 3236 of file cmsHarvester.py.

dictionary cmsHarvester.files_info = {}

Definition at line 3162 of file cmsHarvester.py.

list cmsHarvester.files_without_sites
Initial value:
1 = [i for (i, j) in \
2  files_info[run_number].items() \
3  if len(j[1]) < 1]

Definition at line 3202 of file cmsHarvester.py.

cmsHarvester.globaltag = self.datasets_information[dataset_name]

Definition at line 4787 of file cmsHarvester.py.

cmsHarvester.harvesting_info

Definition at line 1312 of file cmsHarvester.py.

cmsHarvester.harvesting_mode

Definition at line 2216 of file cmsHarvester.py.

cmsHarvester.harvesting_type

Definition at line 3860 of file cmsHarvester.py.

string cmsHarvester.help = "Jsonfile containing dictionary of run/lumisections pairs. "

Definition at line 2054 of file cmsHarvester.py.

string cmsHarvester.index = "site_%02d"

Definition at line 4379 of file cmsHarvester.py.

Referenced by many classes and functions throughout CMSSW (extensive auto-generated cross-reference list omitted).
sortNtupleByEvent(), edm.stripNamespace(), MultipleAlgoIterator.subtractPedestal(), ReflectedIterator.subtractPedestal(), reco::GsfTrackExtra.tangentDeltaP(), reco::GsfTrackExtra.tangentMomentum(), reco::GsfTrackExtra.tangentPosition(), GBRTree2D.TerminalIndex(), GBRTreeD.TerminalIndex(), GBRTree.TerminalIndex(), TEveElementIter.TEveElementIter(), EcalShapeBase.timeIndex(), HFTimingTrust.timerr_hf(), CSCCFEBTimeSlice.timeSample(), reco::PFTrack.trajectoryPoint(), edm::TriggerNames.TriggerNames(), uniqueElectronFinder(), TrackCategories.unknownTrack(), VertexCategories.unknownVertex(), CSCTFEvent.unpack(), TkPixelMeasurementDet.update(), CSCDBGasGainCorrection.value(), CSCDBChipSpeedCorrection.value(), and sistrip::EnsembleCalibrationLA.write_ensembles_text().

cmsHarvester.Jsonfilename

Definition at line 3709 of file cmsHarvester.py.

cmsHarvester.Jsonlumi

Definition at line 3683 of file cmsHarvester.py.

int cmsHarvester.loop = 0
string cmsHarvester.marker = "\n"
list cmsHarvester.marker_lines = []

Definition at line 4807 of file cmsHarvester.py.

string cmsHarvester.metavar = "JSONRUNFILE"

Definition at line 2059 of file cmsHarvester.py.

cmsHarvester.mirrored = None

Definition at line 3223 of file cmsHarvester.py.

string cmsHarvester.msg = "Could not create directory `%s'"

Excerpt of the obsolete (commented-out) SAX handler that used to parse the DBS query results:

    class Handler(xml.sax.handler.ContentHandler):
        def startElement(self, name, attrs):
            if name == "result":
                site_name = str(attrs["STORAGEELEMENT_SENAME"])

TODO TODO TODO: Ugly hack to get around cases where DBS returns rows without a storage element name, like this:

    $ dbs search --query="find dataset, site, file.count where dataset=/RelValQCD_Pt_3000_3500/CMSSW_3_3_0_pre1-STARTUP31X_V4-v1/GEN-SIM-RECO"
    Using DBS instance at: http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet
    Processing ...

    PATH                                                                    STORAGEELEMENT_SENAME  COUNT_FILES
    _________________________________________________________________________________
    /RelValQCD_Pt_3000_3500/CMSSW_3_3_0_pre1-STARTUP31X_V4-v1/GEN-SIM-RECO                          1
    /RelValQCD_Pt_3000_3500/CMSSW_3_3_0_pre1-STARTUP31X_V4-v1/GEN-SIM-RECO cmssrm.fnal.gov         12
    /RelValQCD_Pt_3000_3500/CMSSW_3_3_0_pre1-STARTUP31X_V4-v1/GEN-SIM-RECO srm-cms.cern.ch         12

Such rows are simply skipped:

                if len(site_name) < 1:
                    return

TODO TODO TODO end

                run_number = int(attrs["RUNS_RUNNUMBER"])
                file_name = str(attrs["FILES_LOGICALFILENAME"])
                nevents = int(attrs["FILES_NUMBEROFEVENTS"])

                # I know, this is a bit of a kludge.
                if not files_info.has_key(run_number):
                    # New run.
                    files_info[run_number] = {}
                    files_info[run_number][file_name] = (nevents, [site_name])
                elif not files_info[run_number].has_key(file_name):
                    # New file for a known run.
                    files_info[run_number][file_name] = (nevents, [site_name])
                else:
                    # New entry for a known file for a known run.
                    # DEBUG DEBUG DEBUG
                    # Each file should have the same number of
                    # events independent of the site it's at.
                    assert nevents == files_info[run_number][file_name][0]
                    # DEBUG DEBUG DEBUG end
                    files_info[run_number][file_name][1].append(site_name)

OBSOLETE OBSOLETE OBSOLETE end

Definition at line 1640 of file cmsHarvester.py.
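
Since the excerpt above is incomplete (files_info is defined elsewhere and the code is Python 2), the following is a self-contained sketch of the same per-run, per-file bookkeeping, runnable on its own. The element and attribute names follow the excerpt; the class name, the sample XML, and the driver code at the end are invented for illustration.

    # Self-contained illustration of the files_info bookkeeping described above;
    # the sample XML and the FilesInfoHandler name are not taken from DBS.
    import xml.sax

    class FilesInfoHandler(xml.sax.handler.ContentHandler):
        def __init__(self):
            xml.sax.handler.ContentHandler.__init__(self)
            self.files_info = {}

        def startElement(self, name, attrs):
            if name != "result":
                return
            site_name = str(attrs["STORAGEELEMENT_SENAME"])
            if len(site_name) < 1:
                # Skip rows without a storage element name (see the hack above).
                return
            run_number = int(attrs["RUNS_RUNNUMBER"])
            file_name = str(attrs["FILES_LOGICALFILENAME"])
            nevents = int(attrs["FILES_NUMBEROFEVENTS"])
            run = self.files_info.setdefault(run_number, {})
            if file_name not in run:
                run[file_name] = (nevents, [site_name])
            else:
                # The event count must not depend on the site hosting the file.
                assert nevents == run[file_name][0]
                run[file_name][1].append(site_name)

    example_xml = b"""<results>
      <result STORAGEELEMENT_SENAME="cmssrm.fnal.gov" RUNS_RUNNUMBER="1"
              FILES_LOGICALFILENAME="/store/file.root" FILES_NUMBEROFEVENTS="100"/>
      <result STORAGEELEMENT_SENAME="srm-cms.cern.ch" RUNS_RUNNUMBER="1"
              FILES_LOGICALFILENAME="/store/file.root" FILES_NUMBEROFEVENTS="100"/>
    </results>"""

    handler = FilesInfoHandler()
    xml.sax.parseString(example_xml, handler)
    print(handler.files_info)   # {1: {'/store/file.root': (100, [both sites])}}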

tuple cmsHarvester.multicrab_block_name
Initial value:
= self.create_multicrab_block_name( \
      dataset_name, run, index)

Definition at line 4415 of file cmsHarvester.py.

string cmsHarvester.multicrab_config = "\n"

Definition at line 4496 of file cmsHarvester.py.

list cmsHarvester.multicrab_config_lines = []

Definition at line 4339 of file cmsHarvester.py.
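
Taken together, multicrab_block_name, multicrab_config and multicrab_config_lines suggest how the multicrab configuration is assembled: one block per (dataset, run) pair is appended to a list of lines, which is then joined into a single string. A minimal illustration of that idiom follows; the block-name helper and the block contents here are stand-ins, not the ones produced by CMSHarvester.create_multicrab_config().

    # Illustrative only: shows the "collect lines, then join" idiom implied by
    # multicrab_config_lines and multicrab_config; names are hypothetical.
    def create_multicrab_block_name(dataset_name, run, index):
        # Hypothetical stand-in for CMSHarvester.create_multicrab_block_name().
        return "%s_%s_%s" % (dataset_name.strip("/").replace("/", "__"), run, index)

    def build_multicrab_config(datasets_to_use):
        multicrab_config_lines = []
        multicrab_config_lines.append("[MULTICRAB]")
        multicrab_config_lines.append("")
        for (dataset_name, runs) in datasets_to_use.items():
            for run in runs:
                # One configuration block per (dataset, run) pair.
                block_name = create_multicrab_block_name(dataset_name, run, 0)
                multicrab_config_lines.append("[%s]" % block_name)
                multicrab_config_lines.append("")
        return "\n".join(multicrab_config_lines)

    # print(build_multicrab_config({"/A/B/DQM": [123456]}))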

list cmsHarvester.nevents = int(handler.results["file.numevents"][index])

Definition at line 3177 of file cmsHarvester.py.

Referenced by gen::BaseHadronizer.generateLHE().

cmsHarvester.non_t1access

Definition at line 1090 of file cmsHarvester.py.

cmsHarvester.nr_max_sites

Definition at line 1141 of file cmsHarvester.py.

dictionary cmsHarvester.num_events_catalog = {}

Definition at line 3216 of file cmsHarvester.py.

tuple cmsHarvester.num_events_dataset = sum(tmp)

Definition at line 3987 of file cmsHarvester.py.

tuple cmsHarvester.num_sites
Initial value:
= len(self.datasets_information[dataset_name] \
      ["sites"][run_number])

DEBUG-only fragment that follows the assignment:

    if self.datasets_information[dataset_name]["num_events"][run_number] != 0:
        pdb.set_trace()

DEBUG DEBUG DEBUG end

Definition at line 3957 of file cmsHarvester.py.

int cmsHarvester.number_max_sites = self.nr_max_sites+1

Definition at line 4337 of file cmsHarvester.py.

cmsHarvester.option_parser

Definition at line 1879 of file cmsHarvester.py.

cmsHarvester.output_file_name = self.\

Definition at line 4383 of file cmsHarvester.py.

tuple cmsHarvester.path = os.path.join(path, piece)

Surrounding fragments from the path-building loop:

    else:
        # Piece not in the list, fine.
        self.logger.debug("  accepting")

    # Add piece to the path we're building.
    self.logger.debug("!!! Skip path piece `%s'? %s" % \
                      (piece, str(skip_this_path_piece)))
    self.logger.debug("Adding piece to path...")

Definition at line 1592 of file cmsHarvester.py.

tuple cmsHarvester.permissions = extract_permissions(output)

Definition at line 1649 of file cmsHarvester.py.

string cmsHarvester.permissions_new = []

Definition at line 1679 of file cmsHarvester.py.

string cmsHarvester.permissions_target = "775"

Definition at line 1673 of file cmsHarvester.py.
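
The msg, permissions, permissions_new and permissions_target entries all come from the CASTOR directory checks, where the existing permissions are compared against the "775" target. As an illustration of the idea only (not the cmsHarvester.py implementation), one way to perform such a check is to OR the octal digits bitwise:

    # Illustrative sketch: compare a directory's octal permission string against
    # the "775" target and build the string that would have to be applied.
    def check_permissions(permissions, permissions_target="775"):
        permissions_new = []
        for (cur, tgt) in zip(permissions, permissions_target):
            # OR the bits so the result is at least as permissive as the target.
            permissions_new.append(str(int(cur, 8) | int(tgt, 8)))
        permissions_new = "".join(permissions_new)
        return (permissions_new == permissions, permissions_new)

    # check_permissions("755")  -> (False, "775"): group-write would need adding.
    # check_permissions("775")  -> (True, "775")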

cmsHarvester.preferred_site

Definition at line 1147 of file cmsHarvester.py.

cmsHarvester.ref_hist_mappings_file_name

Definition at line 2258 of file cmsHarvester.py.

tuple cmsHarvester.run_number = int(handler.results["run.number"][index])

Definition at line 3175 of file cmsHarvester.py.

Referenced by HcalL1TriggerObjectsXml.HcalL1TriggerObjectsXml().

list cmsHarvester.runs = self.datasets_to_use[dataset_name]

Definition at line 4350 of file cmsHarvester.py.

cmsHarvester.runs_to_ignore

Definition at line 3556 of file cmsHarvester.py.

cmsHarvester.runs_to_use

Definition at line 3532 of file cmsHarvester.py.

cmsHarvester.saveByLumiSection

Definition at line 1120 of file cmsHarvester.py.

tuple cmsHarvester.se_name = choice(t1_sites)

Definition at line 1792 of file cmsHarvester.py.

string cmsHarvester.sep = "#"
list cmsHarvester.site_name = None

Definition at line 1773 of file cmsHarvester.py.

list cmsHarvester.site_names = list(set([j for i in files_info[run_number].values() for j in i[1]]))

Definition at line 3218 of file cmsHarvester.py.

list cmsHarvester.sites = [self.preferred_site]

Definition at line 1762 of file cmsHarvester.py.

list cmsHarvester.sites_forbidden = []

Definition at line 1709 of file cmsHarvester.py.

list cmsHarvester.sites_with_complete_copies = []

Definition at line 3234 of file cmsHarvester.py.

cmsHarvester.skip_this_path_piece = True

self.logger.debug("Checking CASTOR path piece `%s'" % \ piece)

self.logger.debug("Checking `%s' against `%s'" % \ (castor_path_pieces[piece_index + check_size], castor_paths_dont_touch[check_size])) self.logger.debug(" skipping")

Definition at line 1584 of file cmsHarvester.py.
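
Putting the path and skip_this_path_piece fragments together: each CASTOR path piece extends the path being built, and pieces that match an entry in castor_paths_dont_touch are marked so that no creation attempt is made for them. The following is a condensed, purely illustrative sketch of that flow; the layout of castor_paths_dont_touch (window size mapped to protected names) and the create_dir callback are assumptions inferred from the debug messages, not the actual code.

    # Condensed sketch of the piece-by-piece path walk suggested by the
    # fragments above; castor_paths_dont_touch layout is an assumption.
    import os

    def walk_castor_path(castor_path_pieces, castor_paths_dont_touch, create_dir):
        path = ""
        for (piece_index, piece) in enumerate(castor_path_pieces):
            skip_this_path_piece = False
            for (check_size, dont_touch) in castor_paths_dont_touch.items():
                index = piece_index + check_size
                if index < len(castor_path_pieces) and \
                       castor_path_pieces[index] in dont_touch:
                    skip_this_path_piece = True   # "  skipping"
            # Add piece to the path we're building.
            path = os.path.join(path, piece)
            if not skip_this_path_piece:
                create_dir(path)   # hypothetical callback, e.g. an rfmkdir wrapper
        return path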

list cmsHarvester.t1_sites = []

Definition at line 1779 of file cmsHarvester.py.

dictionary cmsHarvester.tmp
Initial value:
= [j for (i, j) in self.datasets_information \
       [dataset_name]["num_events"].items() \
       if i in self.datasets_to_use[dataset_name]]

TODO TODO TODO: Need to think about where this should go, but somewhere we have to move over the fact that we want to process all runs for each dataset that we're considering. This basically means copying over the information from self.datasets_information[]["runs"] to self.datasets_to_use[]:

    for dataset_name in self.datasets_to_use.keys():
        self.datasets_to_use[dataset_name] = \
            self.datasets_information[dataset_name]["runs"]

TODO TODO TODO end

OBSOLETE OBSOLETE OBSOLETE end

    tmp = self.datasets_information[dataset_name] \
          ["num_events"]

Definition at line 3984 of file cmsHarvester.py.
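
Together with the num_events_dataset entry above, this shows how the total number of events for a dataset is obtained from the per-run bookkeeping. A minimal sketch, assuming the nested-dictionary layout visible in the excerpts (datasets_information[dataset]["num_events"][run] giving the event count); the dataset name and numbers below are made up:

    # Minimal sketch of the event counting implied by tmp and num_events_dataset.
    datasets_information = {
        "/SomeDataset/SomeEra-v1/DQM": {"num_events": {100001: 5000, 100002: 7500}},
    }
    datasets_to_use = {"/SomeDataset/SomeEra-v1/DQM": [100001, 100002]}

    for dataset_name in datasets_to_use:
        # Keep only the runs selected for this dataset, then sum their events.
        tmp = [j for (i, j) in datasets_information[dataset_name]["num_events"].items()
               if i in datasets_to_use[dataset_name]]
        num_events_dataset = sum(tmp)
        print("%s: %d events" % (dataset_name, num_events_dataset))   # -> 12500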

tuple cmsHarvester.traceback_string = traceback.format_exc()

Definition at line 5718 of file cmsHarvester.py.

string cmsHarvester.twiki_url = "https://twiki.cern.ch/twiki/bin/view/CMS/CmsHarvester"

Definition at line 41 of file cmsHarvester.py.

string cmsHarvester.type = "string"

Definition at line 2058 of file cmsHarvester.py.

tuple cmsHarvester.use_es_prefer = (self.harvesting_type == "RelVal")

Definition at line 4859 of file cmsHarvester.py.

cmsHarvester.use_refs = use_es_prefer or\

Definition at line 4860 of file cmsHarvester.py.

cmsHarvester.UserName = output

Definition at line 4332 of file cmsHarvester.py.

cmsHarvester.workflow_name = dataset_name

Definition at line 4888 of file cmsHarvester.py.