CMS 3D CMS Logo

List of all members | Public Member Functions | Public Attributes | Private Member Functions | Private Attributes | Static Private Attributes
tkal_create_file_lists.FileListCreator Class Reference
Inheritance diagram for tkal_create_file_lists.FileListCreator:

Public Member Functions

def __init__ (self, argv)
 
def create (self)
 

Public Attributes

 rereco
 

Private Member Functions

def _add_file_info (self, container, keys, fileinfo)
 
def _create_dataset_cff (self, name, file_list, json_file=None)
 
def _create_dataset_ini_section (self, name, collection, json_file=None)
 
def _create_dataset_txt (self, name, file_list)
 
def _create_file_lists (self)
 
def _create_hippy_txt (self, name, job_list)
 
def _create_json_file (self, name, first, last=None)
 
def _define_parser (self)
 
def _fulfill_iov_eventcount (self)
 
def _get_iovs (self, runs, useminiiovs=False)
 
def _get_track_collection (self, edm_file)
 
def _prepare_iov_datastructures (self)
 
def _prepare_run_datastructures (self)
 
def _print_eventcounts (self)
 
def _remove_file_info (self, container, keys, fileinfo)
 
def _request_dataset_information (self)
 
def _split_hippy_jobs (self)
 
def _validate_input (self)
 
def _write_file_lists (self)
 

Private Attributes

 _args
 
 _cache
 
 _dataset_regex
 
 _datasets
 
 _events_for_alignment
 
 _events_for_validation
 
 _events_in_dataset
 
 _file_info
 
 _files
 
 _files_alignment
 
 _files_validation
 
 _first_dataset_ini
 
 _formatted_dataset
 
 _hippy_jobs
 
 _iov_info_alignment
 
 _iov_info_validation
 
 _iovs
 
 _max_run
 
 _miniiovs
 
 _output_dir
 
 _parser
 
 _run_info
 

Static Private Attributes

string _dataset_template
 
string _event_count_log = "event_count_info.log"
 

Detailed Description

Create file lists for alignment and validation for a given dataset.

Definition at line 45 of file tkal_create_file_lists.py.

Constructor & Destructor Documentation

def tkal_create_file_lists.FileListCreator.__init__ (   self,
  argv 
)
Constructor taking the command line arguments.

Arguments:
- `args`: command line arguments

Definition at line 49 of file tkal_create_file_lists.py.

49  def __init__(self, argv):
50  """Constructor taking the command line arguments.
51 
52  Arguments:
53  - `args`: command line arguments
54  """
55 
56  self._first_dataset_ini = True
57  self._parser = self._define_parser()
58  self._args = self._parser.parse_args(argv)
59 
60  if not mps_tools.check_proxy():
61  print_msg(
62  "Please create proxy via 'voms-proxy-init -voms cms -rfc'.")
63  sys.exit(1)
64 
65  self._dataset_regex = re.compile(r"^/([^/]+)/([^/]+)/([^/]+)$")
66  self._validate_input()
67 
68  if self._args.test_mode:
70  import Configuration.PyReleaseValidation.relval_production as rvp
71  self._args.datasets = [rvs.steps[rvp.workflows[1000][1][0]]["INPUT"].dataSet]
72  self._validate_input() # ensure that this change is valid
73 
74  self._datasets = sorted([dataset
75  for pattern in self._args.datasets
76  for dataset in get_datasets(pattern)
77  if re.search(self._args.dataset_filter, dataset)])
78  if len(self._datasets) == 0:
79  print_msg("Found no dataset matching the pattern(s):")
80  for d in self._args.datasets: print_msg("\t"+d)
81  sys.exit(1)
82 
84  [re.sub(self._dataset_regex, r"\1_\2_\3", dataset)
85  for dataset in self._datasets])
86  self._output_dir = os.path.join(self._args.output_dir,
87  self._formatted_dataset)
88  self._output_dir = os.path.abspath(self._output_dir)
92 
93  try:
94  os.makedirs(self._output_dir)
95  except OSError as e:
96  if e.args == (17, "File exists"):
97  if self._args.force:
98  pass # do nothing, just clear the existing output
99  elif self._args.use_cache:
100  self._cache.load() # load cache before clearing the output
101  else:
102  print_msg("Directory '{}' already exists from previous runs"
103  " of the script. Use '--use-cache' if you want to"
104  " use the cached DAS-query results Or use "
105  "'--force' to remove it."
106  .format(self._output_dir))
107  sys.exit(1)
108  files = glob.glob(os.path.join(self._output_dir, "*"))
109  for f in files: os.remove(f)
110  else:
111  raise
112 
113 
def print_msg(text, line_break=True, log_file=None)
def get_datasets(dataset_pattern)

Member Function Documentation

def tkal_create_file_lists.FileListCreator._add_file_info (   self,
  container,
  keys,
  fileinfo 
)
private
Add file with `file_name` to `container` using `key`.

Arguments:
- `container`: dictionary holding information on files and event counts
- `keys`: keys to which the info should be added; will be created if not
  existing
- `file_name`: name of a dataset file

Definition at line 299 of file tkal_create_file_lists.py.

References mps_setup.append.

Referenced by tkal_create_file_lists.FileListCreator._fulfill_iov_eventcount().

299  def _add_file_info(self, container, keys, fileinfo):
300  """Add file with `file_name` to `container` using `key`.
301 
302  Arguments:
303  - `container`: dictionary holding information on files and event counts
304  - `keys`: keys to which the info should be added; will be created if not
305  existing
306  - `file_name`: name of a dataset file
307  """
308 
309  for key in keys:
310  if key not in container:
311  container[key] = {"events": 0,
312  "files": []}
313  container[key]["events"] += fileinfo.nevents / len(keys)
314  if fileinfo not in container[key]["files"]:
315  container[key]["files"].append(fileinfo)
316 
317 
def _add_file_info(self, container, keys, fileinfo)
def tkal_create_file_lists.FileListCreator._create_dataset_cff (   self,
  name,
  file_list,
  json_file = None 
)
private
Create configuration fragment to define a dataset.

Arguments:
- `name`: name of the configuration fragment
- `file_list`: list of files to write to `name`
- `json_file`: JSON file to be used for this dataset (optional)

Definition at line 760 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._output_dir, tkal_create_file_lists.get_chunks(), join(), and tkal_create_file_lists.print_msg().

Referenced by tkal_create_file_lists.FileListCreator._write_file_lists().

760  def _create_dataset_cff(self, name, file_list, json_file = None):
761  """
762  Create configuration fragment to define a dataset.
763 
764  Arguments:
765  - `name`: name of the configuration fragment
766  - `file_list`: list of files to write to `name`
767  - `json_file`: JSON file to be used for this dataset (optional)
768  """
769 
770  if json_file is None: json_file = self._args.json # might still be None
771  if json_file is not None:
772  json_file = os.path.join(self._output_dir, json_file)
773 
774  name = "_".join(["Dataset",name, "cff.py"])
775  print_msg("Creating dataset configuration fragment: "+name)
776 
777  file_list_str = ""
778  for sub_list in get_chunks(file_list, 255):
779  file_list_str += ("readFiles.extend([\n'"+
780  "',\n'".join(fileinfo.name for fileinfo in sub_list)+
781  "'\n])\n")
782 
783  fragment = FileListCreator._dataset_template.format(
784  lumi_def = ("import FWCore.PythonUtilities.LumiList as LumiList\n\n"
785  "lumiSecs = cms.untracked.VLuminosityBlockRange()\n"
786  "goodLumiSecs = LumiList.LumiList(filename = "
787  "'{0:s}').getCMSSWString().split(',')"
788  .format(json_file)
789  if json_file else ""),
790  lumi_arg = ("lumisToProcess = lumiSecs,\n "
791  if json_file else ""),
792  lumi_extend = "lumiSecs.extend(goodLumiSecs)" if json_file else "",
793  files = file_list_str)
794 
795  with open(os.path.join(self._output_dir, name), "w") as f:
796  f.write(fragment)
797 
798 
def get_chunks(long_list, chunk_size)
def print_msg(text, line_break=True, log_file=None)
def _create_dataset_cff(self, name, file_list, json_file=None)
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def tkal_create_file_lists.FileListCreator._create_dataset_ini_section (   self,
  name,
  collection,
  json_file = None 
)
private
Write dataset ini snippet.

Arguments:
- `name`: name of the dataset section
- `collection`: track collection of this dataset
- `json_file`: JSON file to be used for this dataset (optional)

Definition at line 532 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._first_dataset_ini, join(), and tkal_create_file_lists.print_msg().

Referenced by tkal_create_file_lists.FileListCreator._write_file_lists().

532  def _create_dataset_ini_section(self, name, collection, json_file = None):
533  """Write dataset ini snippet.
534 
535  Arguments:
536  - `name`: name of the dataset section
537  - `collection`: track collection of this dataset
538  - `json_file`: JSON file to be used for this dataset (optional)
539  """
540 
541  if json_file:
542  splitted = name.split("_since")
543  file_list = "_since".join(splitted[:-1]
544  if len(splitted) > 1
545  else splitted)
546  else:
547  file_list = name
548  output = "[dataset:{}]\n".format(name)
549  output += "collection = {}\n".format(collection)
550  output += "inputFileList = ${{datasetdir}}/{}.txt\n".format(file_list)
551  output += "json = ${{datasetdir}}/{}\n".format(json_file) if json_file else ""
552 
553  if collection in ("ALCARECOTkAlCosmicsCTF0T",
554  "ALCARECOTkAlCosmicsInCollisions"):
555  if self._first_dataset_ini:
556  print_msg("\tDetermined cosmics dataset, i.e. please replace "
557  "'DUMMY_DECO_MODE_FLAG' and 'DUMMY_ZERO_TESLA_FLAG' "
558  "with the correct values.")
559  self._first_dataset_ini = False
560  output += "cosmicsDecoMode = DUMMY_DECO_MODE_FLAG\n"
561  output += "cosmicsZeroTesla = DUMMY_ZERO_TESLA_FLAG\n"
562  output += "\n"
563 
564  return output
565 
566 
def _create_dataset_ini_section(self, name, collection, json_file=None)
def print_msg(text, line_break=True, log_file=None)
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def tkal_create_file_lists.FileListCreator._create_dataset_txt (   self,
  name,
  file_list 
)
private
Write alignment file list to disk.

Arguments:
- `name`: name of the file list
- `file_list`: list of files to write to `name`

Definition at line 739 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._output_dir, join(), and tkal_create_file_lists.print_msg().

Referenced by tkal_create_file_lists.FileListCreator._write_file_lists().

739  def _create_dataset_txt(self, name, file_list):
740  """Write alignment file list to disk.
741 
742  Arguments:
743  - `name`: name of the file list
744  - `file_list`: list of files to write to `name`
745  """
746 
747  name += ".txt"
748  print_msg("Creating dataset file list: "+name)
749  with open(os.path.join(self._output_dir, name), "w") as f:
750  f.write("\n".join(fileinfo.name for fileinfo in file_list))
751 
752 
def _create_dataset_txt(self, name, file_list)
def print_msg(text, line_break=True, log_file=None)
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def tkal_create_file_lists.FileListCreator._create_file_lists (   self)
private
Create file lists for alignment and validation.

Definition at line 390 of file tkal_create_file_lists.py.

Referenced by tkal_create_file_lists.FileListCreator.create().

391  """Create file lists for alignment and validation."""
392 
393  # collect files for alignment until minimal requirements are fulfilled
398 
399  max_range = (0
400  if self._args.events <= 0
401  else int(math.ceil(len(self._files)*self._args.fraction)))
402  use_for_alignment = True
403  for i, fileinfo in enumerate(self._file_info):
404  enough_events = self._events_for_alignment >= self._args.events
405  fraction_exceeded = i >= max_range
406  if enough_events or fraction_exceeded: use_for_alignment = False
407 
408  dataset, f, number_of_events, runs = fileinfo
409 
410  iovs = self._get_iovs(runs)
411  if use_for_alignment:
412  if iovs:
413  self._events_for_alignment += number_of_events
414  self._files_alignment.append(fileinfo)
415  self._add_file_info(self._iov_info_alignment, iovs, fileinfo)
416  else:
417  max_range += 1 # not used -> discard in fraction calculation
418  else:
419  if iovs:
420  self._events_for_validation += number_of_events
421  self._files_validation.append(fileinfo)
422  self._add_file_info(self._iov_info_validation, iovs, fileinfo)
423  if self._args.run_by_run:
424  self._add_file_info(self._run_info, runs, fileinfo)
425 
427 
428  self._split_hippy_jobs()
429 
430 
def _get_iovs(self, runs, useminiiovs=False)
def _add_file_info(self, container, keys, fileinfo)
def tkal_create_file_lists.FileListCreator._create_hippy_txt (   self,
  name,
  job_list 
)
private

Definition at line 753 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._output_dir, join(), and tkal_create_file_lists.print_msg().

Referenced by tkal_create_file_lists.FileListCreator._write_file_lists().

753  def _create_hippy_txt(self, name, job_list):
754  name += "_hippy.txt"
755  print_msg("Creating dataset file list for HipPy: "+name)
756  with open(os.path.join(self._output_dir, name), "w") as f:
757  f.write("\n".join(",".join("'"+fileinfo.name+"'" for fileinfo in job) for job in job_list)+"\n")
758 
759 
def print_msg(text, line_break=True, log_file=None)
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def tkal_create_file_lists.FileListCreator._create_json_file (   self,
  name,
  first,
  last = None 
)
private
Create JSON file with `name` covering runs from `first` to `last`.  If a
global JSON is provided, the resulting file is the intersection of the
file created here and the global one.
Returns the name of the created JSON file.

Arguments:
- `name`: name of the created JSON file
- `first`: first run covered by the JSON file
- `last`: last run covered by the JSON file

Definition at line 567 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._max_run, tkal_create_file_lists.FileListCreator._output_dir, and tkal_create_file_lists.print_msg().

Referenced by tkal_create_file_lists.FileListCreator._write_file_lists().

567  def _create_json_file(self, name, first, last = None):
568  """
569  Create JSON file with `name` covering runs from `first` to `last`. If a
570  global JSON is provided, the resulting file is the intersection of the
571  file created here and the global one.
572  Returns the name of the created JSON file.
573 
574  Arguments:
 575  - `name`: name of the created JSON file
576  - `first`: first run covered by the JSON file
577  - `last`: last run covered by the JSON file
578 
579  """
580 
581  if last is None: last = self._max_run
582  name += "_JSON.txt"
583  print_msg("Creating JSON file: "+name)
584 
585  json_file = LumiList.LumiList(runs = range(first, last+1))
586  if self._args.json:
587  global_json = LumiList.LumiList(filename = self._args.json)
588  json_file = json_file & global_json
589  json_file.writeJSON(os.path.join(self._output_dir, name))
590 
591  return name
592 
593 
def print_msg(text, line_break=True, log_file=None)
def _create_json_file(self, name, first, last=None)
def tkal_create_file_lists.FileListCreator._define_parser (   self)
private
Definition of command line argument parser.

Definition at line 126 of file tkal_create_file_lists.py.

126  def _define_parser(self):
127  """Definition of command line argument parser."""
128 
129  parser = argparse.ArgumentParser(
130  description = "Create file lists for alignment",
131  epilog = ("The tool will create a directory containing all file "
132  "lists and a log file with all relevant event counts "
133  "('{}').".format(FileListCreator._event_count_log)))
134  parser.add_argument("-i", "--input", dest = "datasets", required = True,
135  metavar = "DATASET", action = "append",
136  help = ("CMS dataset name; supports wildcards; "
137  "use multiple times for multiple datasets"))
138  parser.add_argument("--dataset-filter", default = "",
139  help = "regex to match within in the datasets matched,"
140  "in case the wildcard isn't flexible enough")
141  parser.add_argument("-j", "--json", dest = "json", metavar = "PATH",
142  help = "path to JSON file (optional)")
143  parser.add_argument("-f", "--fraction", dest = "fraction",
144  type = float, default = 1,
145  help = "max. fraction of files used for alignment")
146  parser.add_argument("--iov", dest = "iovs", metavar = "RUN", type = int,
147  action = "append", default = [],
148  help = ("define IOV by specifying first run; for "
149  "multiple IOVs use this option multiple "
150  "times; files from runs before the lowest "
151  "IOV are discarded (default: 1)"))
152  parser.add_argument("--miniiov", dest="miniiovs", metavar="RUN", type=int,
153  action="append", default=[],
154  help=("in addition to the standard IOVs, break up hippy jobs "
155  "at these points, so that jobs from before and after "
156  "these runs are not in the same job"))
157  parser.add_argument("-r", "--random", action = "store_true",
158  default = False, help = "select files randomly")
159  parser.add_argument("-n", "--events-for-alignment", "--maxevents",
160  dest = "events", type = int, metavar = "NUMBER",
161  help = ("number of events needed for alignment; the"
162  " remaining events in the dataset are used "
163  "for validation; if n<=0, all events are "
164  "used for validation"))
165  parser.add_argument("--all-events", action = "store_true",
166  help = "Use all events for alignment")
167  parser.add_argument("--tracks-for-alignment", dest = "tracks",
168  type = int, metavar = "NUMBER",
169  help = "number of tracks needed for alignment")
170  parser.add_argument("--track-rate", dest = "rate", type = float,
171  metavar = "NUMBER",
172  help = "number of tracks per event")
173  parser.add_argument("--run-by-run", dest = "run_by_run",
174  action = "store_true", default = False,
175  help = "create validation file list for each run")
176  parser.add_argument("--minimum-events-in-iov",
177  dest = "minimum_events_in_iov", metavar = "NUMBER",
178  type = int, default = 100000,
179  help = ("minimum number of events for alignment per"
180  " IOV; this option has a higher priority "
181  "than '-f/--fraction' "
182  "(default: %(default)s)"))
183  parser.add_argument("--minimum-events-validation",
184  dest = "minimum_events_validation",
185  metavar = "NUMBER", type = int, default = 1,
186  help = ("minimum number of events for validation; "
187  "applies to IOVs; in case of --run-by-run "
188  "it applies to runs runs "
189  "(default: %(default)s)"))
190  parser.add_argument("--use-cache", dest = "use_cache",
191  action = "store_true", default = False,
192  help = "use DAS-query results of previous run")
193  parser.add_argument("-o", "--output-dir", dest = "output_dir",
194  metavar = "PATH", default = os.getcwd(),
195  help = "output base directory (default: %(default)s)")
196  parser.add_argument("--create-ini", dest = "create_ini",
197  action = "store_true", default = False,
198  help = ("create dataset ini file based on the "
199  "created file lists"))
200  parser.add_argument("--force", action = "store_true", default = False,
201  help = ("remove output directory from previous "
202  "runs, if existing"))
203  parser.add_argument("--hippy-events-per-job", type = int, default = 1,
204  help = ("approximate number of events in each job for HipPy"))
205  parser.add_argument("--test-mode", dest = "test_mode",
206  action = "store_true", default = False,
207  help = argparse.SUPPRESS) # hidden option
208  return parser
209 
210 
def tkal_create_file_lists.FileListCreator._fulfill_iov_eventcount (   self)
private
Try to fulfill the requirement on the minimum number of events per IOV
in the alignment file list by picking files from the validation list.

Definition at line 431 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._add_file_info(), tkal_create_file_lists.FileListCreator._events_for_alignment, tkal_create_file_lists.FileListCreator._events_for_validation, tkal_create_file_lists.FileListCreator._files_validation, tkal_create_file_lists.FileListCreator._get_iovs(), tkal_create_file_lists.FileListCreator._iov_info_alignment, tkal_create_file_lists.FileListCreator._iov_info_validation, tkal_create_file_lists.FileListCreator._iovs, tkal_create_file_lists.FileListCreator._remove_file_info(), and tkal_create_file_lists.FileListCreator._run_info.

432  """
433  Try to fulfill the requirement on the minimum number of events per IOV
434  in the alignment file list by picking files from the validation list.
435  """
436 
437  for iov in self._iovs:
438  if self._iov_info_alignment[iov]["events"] >= self._args.minimum_events_in_iov: continue
439  for fileinfo in self._files_validation[:]:
440  dataset, f, number_of_events, runs = fileinfo
441  iovs = self._get_iovs(runs)
442  if iov in iovs:
443  self._files_alignment.append(fileinfo)
444  self._events_for_alignment += number_of_events
445  self._add_file_info(self._iov_info_alignment, iovs, fileinfo)
446 
447  self._events_for_validation -= number_of_events
448  self._remove_file_info(self._iov_info_validation, iovs, fileinfo)
449  if self._args.run_by_run:
450  self._remove_file_info(self._run_info, runs, fileinfo)
451  self._files_validation.remove(fileinfo)
452 
453  if (self._iov_info_alignment[iov]["events"]
454  >= self._args.minimum_events_in_iov):
455  break # break the file loop if already enough events
456 
def _get_iovs(self, runs, useminiiovs=False)
def _add_file_info(self, container, keys, fileinfo)
def _remove_file_info(self, container, keys, fileinfo)
def tkal_create_file_lists.FileListCreator._get_iovs (   self,
  runs,
  useminiiovs = False 
)
private
Return the IOV start for `run`. Returns 'None' if the run is before any
defined IOV.

Arguments:
- `runs`: run numbers

Definition at line 275 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._iovs, and tkal_create_file_lists.FileListCreator._miniiovs.

Referenced by tkal_create_file_lists.FileListCreator._fulfill_iov_eventcount(), and tkal_create_file_lists.FileListCreator._split_hippy_jobs().

275  def _get_iovs(self, runs, useminiiovs=False):
276  """
277  Return the IOV start for `run`. Returns 'None' if the run is before any
278  defined IOV.
279 
280  Arguments:
281  - `runs`: run numbers
282  """
283 
284  iovlist = self._miniiovs if useminiiovs else self._iovs
285 
286  iovs = []
287  for run in runs:
288  iov_index = bisect.bisect(iovlist, run)
289  if iov_index > 0: iovs.append(iovlist[iov_index-1])
290  return iovs
291 
292 
def _get_iovs(self, runs, useminiiovs=False)
def tkal_create_file_lists.FileListCreator._get_track_collection (   self,
  edm_file 
)
private
Extract track collection from given `edm_file`.

Arguments:
- `edm_file`: CMSSW dataset file

Definition at line 594 of file tkal_create_file_lists.py.

References ALCARECOTkAlBeamHalo_cff.filter, tkal_create_file_lists.print_msg(), split, and digitizers_cfi.strip.

Referenced by tkal_create_file_lists.FileListCreator._write_file_lists().

594  def _get_track_collection(self, edm_file):
595  """Extract track collection from given `edm_file`.
596 
597  Arguments:
598  - `edm_file`: CMSSW dataset file
599  """
600 
601  # use global redirector to allow also files not yet at your site:
602  cmd = ["edmDumpEventContent", r"root://cms-xrd-global.cern.ch/"+edm_file]
603  try:
604  event_content = subprocess.check_output(cmd).split("\n")
605  except subprocess.CalledProcessError as e:
606  splitted = edm_file.split("/")
607  try:
608  alcareco = splitted[splitted.index("ALCARECO")+1].split("-")[0]
609  alcareco = alcareco.replace("TkAlCosmics0T", "TkAlCosmicsCTF0T")
610  alcareco = "ALCARECO" + alcareco
611  print_msg("\tDetermined track collection as '{}'.".format(alcareco))
612  return alcareco
613  except ValueError:
614  if "RECO" in splitted:
615  print_msg("\tDetermined track collection as 'generalTracks'.")
616  return "generalTracks"
617  else:
618  print_msg("\tCould not determine track collection "
619  "automatically.")
620  print_msg("\tPlease replace 'DUMMY_TRACK_COLLECTION' with "
621  "the correct value.")
622  return "DUMMY_TRACK_COLLECTION"
623 
624  track_collections = []
625  for line in event_content:
626  splitted = line.split()
627  if len(splitted) > 0 and splitted[0] == r"vector<reco::Track>":
628  track_collections.append(splitted[1].strip().strip('"'))
629  if len(track_collections) == 0:
630  print_msg("No track collection found in file '{}'.".format(edm_file))
631  sys.exit(1)
632  elif len(track_collections) == 1:
633  print_msg("\tDetermined track collection as "
634  "'{}'.".format(track_collections[0]))
635  return track_collections[0]
636  else:
637  alcareco_tracks = filter(lambda x: x.startswith("ALCARECO"),
638  track_collections)
639  if len(alcareco_tracks) == 0 and "generalTracks" in track_collections:
640  print_msg("\tDetermined track collection as 'generalTracks'.")
641  return "generalTracks"
642  elif len(alcareco_tracks) == 1:
643  print_msg("\tDetermined track collection as "
644  "'{}'.".format(alcareco_tracks[0]))
645  return alcareco_tracks[0]
646  print_msg("\tCould not unambiguously determine track collection in "
647  "file '{}':".format(edm_file))
648  print_msg("\tPlease replace 'DUMMY_TRACK_COLLECTION' with "
649  "the correct value from the following list.")
650  for collection in track_collections:
651  print_msg("\t - "+collection)
652  return "DUMMY_TRACK_COLLECTION"
653 
654 
def print_msg(text, line_break=True, log_file=None)
double split
Definition: MVATrainer.cc:139
def tkal_create_file_lists.FileListCreator._prepare_iov_datastructures (   self)
private
Create the needed objects for IOV handling.

Definition at line 262 of file tkal_create_file_lists.py.

263  """Create the needed objects for IOV handling."""
264 
265  self._iovs = sorted(set(self._args.iovs))
266  if len(self._iovs) == 0: self._iovs.append(1)
267  self._iov_info_alignment = {iov: {"events": 0, "files": []}
268  for iov in self._iovs}
269  self._iov_info_validation = {iov: {"events": 0, "files": []}
270  for iov in self._iovs}
271 
272  self._miniiovs = sorted(set(self._iovs) | set(self._args.miniiovs))
273 
274 
def tkal_create_file_lists.FileListCreator._prepare_run_datastructures (   self)
private
Create the needed objects for run-by-run validation file lists.

Definition at line 293 of file tkal_create_file_lists.py.

294  """Create the needed objects for run-by-run validation file lists."""
295 
296  self._run_info = {}
297 
298 
def tkal_create_file_lists.FileListCreator._print_eventcounts (   self)
private
Print the event counts per file list and per IOV.

Definition at line 485 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._events_for_alignment, tkal_create_file_lists.FileListCreator._events_for_validation, tkal_create_file_lists.FileListCreator._events_in_dataset, tkal_create_file_lists.FileListCreator._iov_info_alignment, tkal_create_file_lists.FileListCreator._iov_info_validation, tkal_create_file_lists.FileListCreator._output_dir, tkal_create_file_lists.FileListCreator._run_info, tkal_create_file_lists.print_msg(), and tkal_create_file_lists.FileListCreator.rereco.

Referenced by tkal_create_file_lists.FileListCreator.create().

486  """Print the event counts per file list and per IOV."""
487 
488  log = os.path.join(self._output_dir, FileListCreator._event_count_log)
489 
490  print_msg("Using {0:d} events for alignment ({1:.2f}%)."
492  100.0*
494  log_file = log)
495  for iov in sorted(self._iov_info_alignment):
496  print_msg(("Approximate events" if self.rereco else "Events") + " for alignment in IOV since {0:d}: {1:d}"
497  .format(iov, self._iov_info_alignment[iov]["events"]),
498  log_file = log)
499 
500  print_msg("Using {0:d} events for validation ({1:.2f}%)."
502  100.0*
504  log_file = log)
505 
506  for iov in sorted(self._iov_info_validation):
507  msg = ("Approximate events" if self.rereco else "Events") + " for validation in IOV since {0:d}: {1:d}".format(
508  iov, self._iov_info_validation[iov]["events"])
509  if (self._iov_info_validation[iov]["events"]
510  < self._args.minimum_events_validation):
511  msg += " (not enough events -> no dataset file will be created)"
512  print_msg(msg, log_file = log)
513 
514  for run in sorted(self._run_info):
515  msg = ("Approximate events" if self.rereco else "Events") + " for validation in run {0:d}: {1:d}".format(
516  run, self._run_info[run]["events"])
517  if (self._run_info[run]["events"]
518  < self._args.minimum_events_validation):
519  msg += " (not enough events -> no dataset file will be created)"
520  print_msg(msg, log_file = log)
521 
522  unused_events = (self._events_in_dataset
524  - self._events_for_alignment)
525  if unused_events > 0 != self._events_in_dataset:
526  print_msg("Unused events: {0:d} ({1:.2f}%)"
527  .format(unused_events,
528  100.0*unused_events/self._events_in_dataset),
529  log_file = log)
530 
531 
def print_msg(text, line_break=True, log_file=None)
def tkal_create_file_lists.FileListCreator._remove_file_info (   self,
  container,
  keys,
  fileinfo 
)
private
Remove file with `file_name` from `container` using `key`.

Arguments:
- `container`: dictionary holding information on files and event counts
- `keys`: keys from which the info should be removed
- `file_name`: name of a dataset file
- `event_count`: number of events in `file_name`

Definition at line 318 of file tkal_create_file_lists.py.

Referenced by tkal_create_file_lists.FileListCreator._fulfill_iov_eventcount().

318  def _remove_file_info(self, container, keys, fileinfo):
319  """Remove file with `file_name` to `container` using `key`.
320 
321  Arguments:
322  - `container`: dictionary holding information on files and event counts
323  - `keys`: keys from which the info should be removed
324  - `file_name`: name of a dataset file
325  - `event_count`: number of events in `file_name`
326  """
327 
328  for key in keys:
329  if key not in container: continue
330  try:
331  index = container[key]["files"].index(fileinfo)
332  except ValueError: # file not found
333  return
334  del container[key]["files"][index]
335  container[key]["events"] -= fileinfo.nevents / len(keys)
336 
337 
def _remove_file_info(self, container, keys, fileinfo)
def tkal_create_file_lists.FileListCreator._request_dataset_information (   self)
private
Retrieve general dataset information and create file list.

Definition at line 338 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._events_in_dataset, tkal_create_file_lists.FileListCreator._file_info, tkal_create_file_lists.FileListCreator._files, tkal_create_file_lists.FileListCreator._max_run, and tkal_create_file_lists.print_msg().

Referenced by tkal_create_file_lists.FileListCreator.create().

339  """Retrieve general dataset information and create file list."""
340 
341  if not self._cache.empty:
342  print_msg("Using cached information.")
343  (self._events_in_dataset,
344  self._files,
345  self._file_info,
346  self._max_run) = self._cache.get()
347  self.rereco = any(len(fileinfo.runs)>1 for fileinfo in self._file_info)
348  if self._args.random: random.shuffle(self._files)
349  return
350 
351  # workaround to deal with KeyboardInterrupts in the worker processes:
352  # - ignore interrupt signals in workers (see initializer)
353  # - use a timeout of size sys.maxsize to avoid a bug in multiprocessing
354  number_of_processes = multiprocessing.cpu_count() - 1
355  number_of_processes = (number_of_processes
356  if number_of_processes > 0
357  else 1)
358  pool = multiprocessing.Pool(
359  processes = number_of_processes,
360  initializer = lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
361 
362  print_msg("Requesting information for the following dataset(s):")
363  for d in self._datasets: print_msg("\t"+d)
364  print_msg("This may take a while...")
365 
366  result = pool.map_async(get_events_per_dataset, self._datasets).get(sys.maxsize)
367  self._events_in_dataset = sum(result)
368 
369  result = pool.map_async(get_max_run, self._datasets).get(sys.maxsize)
370  self._max_run = max(result)
371 
372  result = sum(pool.map_async(get_file_info, self._datasets).get(sys.maxint), [])
373  files = pool.map_async(_make_file_info, result).get(sys.maxint)
374  self._file_info = sorted(fileinfo for fileinfo in files)
375 
376  self.rereco = any(len(fileinfo.runs)>1 for fileinfo in self._file_info)
377 
378  if self._args.test_mode:
379  self._file_info = self._file_info[-200:] # take only last chunk of files
380  self._files = [fileinfo.name for fileinfo in self._file_info]
381 
382  # write information to cache
383  self._cache.set(self._events_in_dataset, self._files, self._file_info,
384  self._max_run)
385  self._cache.dump()
386  if self._args.random:
387  random.shuffle(self._file_info)
388  self._files = [fileinfo.name for fileinfo in self._file_info]
389 
bool any(const std::vector< T > &v, const T &what)
Definition: ECalSD.cc:37
def print_msg(text, line_break=True, log_file=None)
T get(const Candidate &c)
Definition: component.h:55
def tkal_create_file_lists.FileListCreator._split_hippy_jobs (   self)
private

Definition at line 457 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._datasets, tkal_create_file_lists.FileListCreator._files_alignment, tkal_create_file_lists.FileListCreator._get_iovs(), tkal_create_file_lists.FileListCreator._miniiovs, and objects.autophobj.float.

# Group the alignment input files into HipPy job batches: build one list of
# jobs per (dataset, mini-IOV) pair, then merge the mini-IOV lists into the
# full IOVs stored in self._hippy_jobs.
 457  def _split_hippy_jobs(self):
 458  hippyjobs = {}
 459  for dataset, miniiov in itertools.product(self._datasets, self._miniiovs):
 460  jobsforminiiov = []
 461  hippyjobs[dataset,miniiov] = jobsforminiiov
# Seed the running event count with infinity so the first matching file
# always opens a new job.
 462  eventsinthisjob = float("inf")
 463  for fileinfo in self._files_alignment:
 464  if fileinfo.dataset != dataset: continue
 465  miniiovs = set(self._get_iovs(fileinfo.runs, useminiiovs=True))
 466  if miniiov not in miniiovs: continue
# Subtle aliasing: if any file spans more than one mini-IOV, the dict entry
# is replaced with a fresh empty list, but the appends below still go into
# the orphaned local 'jobsforminiiov' — so jobs built for this
# (dataset, mini-IOV) no longer appear in the dict.  Presumably this drops
# cross-IOV files on purpose (rereco case) — confirm against full source.
 467  if len(miniiovs) > 1:
 468  hippyjobs[dataset,miniiov] = []
# Start a new job once the current one has reached hippy_events_per_job.
 469  if eventsinthisjob >= self._args.hippy_events_per_job:
 470  currentjob = []
 471  jobsforminiiov.append(currentjob)
 472  eventsinthisjob = 0
 473  currentjob.append(fileinfo)
 474  currentjob.sort()
 475  eventsinthisjob += fileinfo.nevents
 476 
# Merge mini-IOV job lists into full IOVs: a mini-IOV is assigned to the
# largest IOV boundary that does not exceed it.
 477  self._hippy_jobs = {
 478  (dataset, iov): sum((hippyjobs[dataset, miniiov]
 479  for miniiov in self._miniiovs
 480  if iov == max(_ for _ in self._iovs if _ <= miniiov)), []
 481  )
 482  for dataset, iov in itertools.product(self._datasets, self._iovs)
 483  }
484 
def _get_iovs(self, runs, useminiiovs=False)
def tkal_create_file_lists.FileListCreator._validate_input (   self)
private
Validate command line arguments.

Definition at line 211 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._dataset_regex, objects.autophobj.float, createfilelist.int, and tkal_create_file_lists.print_msg().

211  def _validate_input(self):
212  """Validate command line arguments."""
213 
214  if self._args.events is None:
215  if self._args.all_events:
216  self._args.events = float("inf")
217  print_msg("Using all tracks for alignment")
218  elif (self._args.tracks is None) and (self._args.rate is None):
219  msg = ("either -n/--events-for-alignment, --all-events, or both of "
220  "--tracks-for-alignment and --track-rate are required")
221  self._parser.error(msg)
222  elif (((self._args.tracks is not None) and (self._args.rate is None)) or
223  ((self._args.rate is not None)and (self._args.tracks is None))):
224  msg = ("--tracks-for-alignment and --track-rate must be used "
225  "together")
226  self._parser.error(msg)
227  else:
228  self._args.events = int(math.ceil(self._args.tracks /
229  self._args.rate))
230  print_msg("Requested {0:d} tracks with {1:.2f} tracks/event "
231  "-> {2:d} events for alignment."
232  .format(self._args.tracks, self._args.rate,
233  self._args.events))
234  else:
235  if (self._args.tracks is not None) or (self._args.rate is not None) or self._args.all_events:
236  msg = ("-n/--events-for-alignment must not be used with "
237  "--tracks-for-alignment, --track-rate, or --all-events")
238  self._parser.error(msg)
239  print_msg("Requested {0:d} events for alignment."
240  .format(self._args.events))
241 
242  for dataset in self._args.datasets:
243  if not re.match(self._dataset_regex, dataset):
244  print_msg("Dataset pattern '"+dataset+"' is not in CMS format.")
245  sys.exit(1)
246 
247  nonzero_events_per_iov = (self._args.minimum_events_in_iov > 0)
248  if nonzero_events_per_iov and self._args.fraction <= 0:
249  print_msg("Setting minimum number of events per IOV for alignment "
250  "to 0 because a non-positive fraction of alignment events"
251  " is chosen: {}".format(self._args.fraction))
252  nonzero_events_per_iov = False
253  self._args.minimum_events_in_iov = 0
254  if nonzero_events_per_iov and self._args.events <= 0:
255  print_msg("Setting minimum number of events per IOV for alignment "
256  "to 0 because a non-positive number of alignment events"
257  " is chosen: {}".format(self._args.events))
258  nonzero_events_per_iov = False
259  self._args.minimum_events_in_iov = 0
260 
261 
def print_msg(text, line_break=True, log_file=None)
def tkal_create_file_lists.FileListCreator._write_file_lists (   self)
private
Write file lists to disk.

Definition at line 655 of file tkal_create_file_lists.py.

References tkal_create_file_lists.FileListCreator._create_dataset_cff(), tkal_create_file_lists.FileListCreator._create_dataset_ini_section(), tkal_create_file_lists.FileListCreator._create_dataset_txt(), tkal_create_file_lists.FileListCreator._create_hippy_txt(), tkal_create_file_lists.FileListCreator._create_json_file(), tkal_create_file_lists.FileListCreator._datasets, tkal_create_file_lists.FileListCreator._files, tkal_create_file_lists.FileListCreator._files_alignment, tkal_create_file_lists.FileListCreator._files_validation, tkal_create_file_lists.FileListCreator._formatted_dataset, tkal_create_file_lists.FileListCreator._get_track_collection(), tkal_create_file_lists.FileListCreator._hippy_jobs, tkal_create_file_lists.FileListCreator._iov_info_alignment, tkal_create_file_lists.FileListCreator._iov_info_validation, tkal_create_file_lists.FileListCreator._iovs, tkal_create_file_lists.FileListCreator._output_dir, tkal_create_file_lists.FileListCreator._run_info, join(), tkal_create_file_lists.print_msg(), tkal_create_file_lists.FileListCreator.rereco, and str.

Referenced by tkal_create_file_lists.FileListCreator.create().

# Write every output product for the dataset: HipPy job lists, the
# Alignment/Validation cff fragments, the optional dataset ini files, and the
# per-IOV and per-run variants of each.
 655  def _write_file_lists(self):
 656  """Write file lists to disk."""
 657 
# NOTE(review): this listing jumps from line 657 to 659 — line 658 of the
# original file (likely a _create_dataset_txt call for the full dataset) is
# not shown in this rendering; confirm against the source file.
 659  self._create_hippy_txt(self._formatted_dataset, sum(self._hippy_jobs.values(), []))
 660  self._create_dataset_cff(
 661  "_".join(["Alignment", self._formatted_dataset]),
 662  self._files_alignment)
 663 
 664  self._create_dataset_cff(
 665  "_".join(["Validation", self._formatted_dataset]),
 666  self._files_validation)
 667 
 668 
# Build the [general] section shared by the dataset ini and the IOV-wise ini.
 669  if self._args.create_ini:
 670  dataset_ini_general = "[general]\n"
 671  dataset_ini_general += "datasetdir = {}\n".format(self._output_dir)
 672  dataset_ini_general += ("json = {}\n\n".format(self._args.json)
 673  if self._args.json
 674  else "\n")
 675 
 676  ini_path = self._formatted_dataset + ".ini"
 677  print_msg("Creating dataset ini file: " + ini_path)
 678  ini_path = os.path.join(self._output_dir, ini_path)
 679 
# The track collection is probed from the first input file and reused for
# every ini section.
 680  collection = self._get_track_collection(self._files[0])
 681 
 682  with open(ini_path, "w") as f:
 683  f.write(dataset_ini_general)
 684  f.write(self._create_dataset_ini_section(
 685  self._formatted_dataset, collection))
 686 
 687  iov_wise_ini = dataset_ini_general
 688 
# Per-IOV outputs; iov_str becomes "<dataset>_since<run>".
 689  for i,iov in enumerate(sorted(self._iovs)):
 690  iov_str = "since{0:d}".format(iov)
 691  iov_str = "_".join([self._formatted_dataset, iov_str])
 692 
# In the rereco case each IOV gets its own JSON restricted to [iov, next-1]
# (open-ended for the last IOV).
# NOTE(review): sorted(self._iovs) is recomputed inside the loop on every
# iteration — could be hoisted; confirm before touching.
 693  if self.rereco:
 694  if i == len(self._iovs) - 1:
 695  last = None
 696  else:
 697  last = sorted(self._iovs)[i+1] - 1
 698  local_json = self._create_json_file(iov_str, iov, last)
 699  else:
 700  local_json = None
 701 
 702  if self._args.create_ini:
 703  iov_wise_ini += self._create_dataset_ini_section(iov_str,
 704  collection,
 705  local_json)
 706 
 707  self._create_dataset_txt(iov_str,
 708  self._iov_info_alignment[iov]["files"])
 709  self._create_hippy_txt(iov_str, sum((self._hippy_jobs[dataset,iov] for dataset in self._datasets), []))
 710  self._create_dataset_cff(
 711  "_".join(["Alignment", iov_str]),
 712  self._iov_info_alignment[iov]["files"],
 713  json_file=local_json)
 714 
# Skip the validation cff for IOVs with too few validation events.
 715  if (self._iov_info_validation[iov]["events"]
 716  < self._args.minimum_events_validation):
 717  continue
 718  self._create_dataset_cff(
 719  "_".join(["Validation", iov_str]),
 720  self._iov_info_validation[iov]["files"],
 721  json_file=local_json)
 722 
# Only write the IOV-wise ini when at least one IOV section was appended.
 723  if self._args.create_ini and iov_wise_ini != dataset_ini_general:
 724  ini_path = self._formatted_dataset + "_IOVs.ini"
 725  print_msg("Creating dataset ini file: " + ini_path)
 726  ini_path = os.path.join(self._output_dir, ini_path)
 727  with open(ini_path, "w") as f: f.write(iov_wise_ini)
 728 
# Per-run validation cffs (skipped entirely in the rereco case).
# NOTE(review): 'args' is not defined in this method's scope — every other
# access goes through self._args, and the flag is stored as self.rereco, so
# this line presumably should read 'self.rereco'; as written it would raise
# NameError unless a module-level 'args' exists. Confirm against full source.
 729  for run in sorted(self._run_info):
 730  if args.rereco: continue #need to implement more jsons
 731  if (self._run_info[run]["events"]
 732  < self._args.minimum_events_validation):
 733  continue
 734  self._create_dataset_cff(
 735  "_".join(["Validation", self._formatted_dataset, str(run)]),
 736  self._run_info[run]["files"])
 737 
738 
def _create_dataset_txt(self, name, file_list)
def _create_dataset_ini_section(self, name, collection, json_file=None)
def print_msg(text, line_break=True, log_file=None)
def _create_json_file(self, name, first, last=None)
def _create_dataset_cff(self, name, file_list, json_file=None)
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
#define str(s)
def tkal_create_file_lists.FileListCreator.create (   self)

Member Data Documentation

tkal_create_file_lists.FileListCreator._args
private
tkal_create_file_lists.FileListCreator._cache
private
tkal_create_file_lists.FileListCreator._dataset_regex
private
string tkal_create_file_lists.FileListCreator._dataset_template
staticprivate
Initial value:
1 = """\
2 import FWCore.ParameterSet.Config as cms
3 {lumi_def:s}
4 readFiles = cms.untracked.vstring()
5 source = cms.Source("PoolSource",
6  {lumi_arg:s}fileNames = readFiles)
7 {files:s}{lumi_extend:s}
8 maxEvents = cms.untracked.PSet(input = cms.untracked.int32(-1))
9 """

Definition at line 799 of file tkal_create_file_lists.py.

tkal_create_file_lists.FileListCreator._datasets
private
string tkal_create_file_lists.FileListCreator._event_count_log = "event_count_info.log"
staticprivate

Definition at line 123 of file tkal_create_file_lists.py.

tkal_create_file_lists.FileListCreator._events_for_alignment
private
tkal_create_file_lists.FileListCreator._events_for_validation
private
tkal_create_file_lists.FileListCreator._events_in_dataset
private
tkal_create_file_lists.FileListCreator._file_info
private
tkal_create_file_lists.FileListCreator._files
private
tkal_create_file_lists.FileListCreator._files_alignment
private
tkal_create_file_lists.FileListCreator._files_validation
private
tkal_create_file_lists.FileListCreator._first_dataset_ini
private
tkal_create_file_lists.FileListCreator._formatted_dataset
private
tkal_create_file_lists.FileListCreator._hippy_jobs
private
tkal_create_file_lists.FileListCreator._iov_info_alignment
private
tkal_create_file_lists.FileListCreator._iov_info_validation
private
tkal_create_file_lists.FileListCreator._iovs
private
tkal_create_file_lists.FileListCreator._max_run
private
tkal_create_file_lists.FileListCreator._miniiovs
private
tkal_create_file_lists.FileListCreator._output_dir
private
tkal_create_file_lists.FileListCreator._parser
private

Definition at line 57 of file tkal_create_file_lists.py.

tkal_create_file_lists.FileListCreator._run_info
private
tkal_create_file_lists.FileListCreator.rereco