CMS 3D CMS Logo

dataset.py
Go to the documentation of this file.
1 from __future__ import print_function
2 from __future__ import absolute_import
3 # idea stolen from:
4 # http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
5 # PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
6 from builtins import range
7 import bisect
8 import datetime
9 import json
10 import os
11 import re
12 import sys
13 
14 import Utilities.General.cmssw_das_client as das_client
15 from FWCore.PythonUtilities.LumiList import LumiList
16 
17 from .helperFunctions import cache
18 from .TkAlExceptions import AllInOneError
19 
20 class Dataset(object):
21  def __init__( self, datasetName, dasLimit = 0, tryPredefinedFirst = True,
22  cmssw = os.environ["CMSSW_BASE"], cmsswrelease = os.environ["CMSSW_RELEASE_BASE"],
23  magneticfield = None, dasinstance = None):
24  self.__name = datasetName
25  self.__origName = datasetName
26  self.__dasLimit = dasLimit
27  self.__dasinstance = dasinstance
28  self.__cmssw = cmssw
29  self.__cmsswrelease = cmsswrelease
30  self.__firstusedrun = None
31  self.__lastusedrun = None
32  self.__parentDataset = None
33 
34  # check, if dataset name matches CMS dataset naming scheme
35  if re.match( r'/.+/.+/.+', self.__name ):
36  self.__official = True
37  fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
38  else:
39  self.__official = False
40  fileName = self.__name + "_cff.py"
41 
42  searchPath1 = os.path.join( self.__cmssw, "python",
43  "Alignment", "OfflineValidation",
44  fileName )
45  searchPath2 = os.path.join( self.__cmssw, "src",
46  "Alignment", "OfflineValidation",
47  "python", fileName )
48  searchPath3 = os.path.join( self.__cmsswrelease,
49  "python", "Alignment",
50  "OfflineValidation", fileName )
51  if self.__official and not tryPredefinedFirst:
52  self.__predefined = False
53  elif os.path.exists( searchPath1 ):
54  self.__predefined = True
55  self.__filename = searchPath1
56  elif os.path.exists( searchPath2 ):
57  msg = ("The predefined dataset '%s' does exist in '%s', but "
58  "you need to run 'scram b' first."
59  %( self.__name, searchPath2 ))
60  if self.__official:
61  print(msg)
62  print("Getting the data from DAS again. To go faster next time, run scram b.")
63  else:
64  raise AllInOneError( msg )
65  elif os.path.exists( searchPath3 ):
66  self.__predefined = True
67  self.__filename = searchPath3
68  elif self.__official:
69  self.__predefined = False
70  else:
71  msg = ("The predefined dataset '%s' does not exist. Please "
72  "create it first or check for typos."%( self.__name ))
73  raise AllInOneError( msg )
74 
75  if self.__predefined and self.__official:
76  self.__name = "Dataset" + self.__name.replace("/","_")
77 
78  if magneticfield is not None:
79  try:
80  magneticfield = float(magneticfield)
81  except ValueError:
82  raise AllInOneError("Bad magneticfield {} which can't be converted to float".format(magneticfield))
83  self.__inputMagneticField = magneticfield
84 
85  self.__dataType = self.__getDataType()
87 
88 
89  def __chunks( self, theList, n ):
90  """ Yield successive n-sized chunks from theList.
91  """
92  for i in range( 0, len( theList ), n ):
93  yield theList[i:i+n]
94 
    # Template of the python source fragment for a dataset. It is filled by
    # %-formatting with a dict (see __createSnippet) that has to provide the keys
    # header, importCms, goodLumiSecStr, process, lumiStr, tab, files,
    # lumiSecExtend, nEvents and skipEventsString.
    __source_template= ("%(header)s"
                        "%(importCms)s"
                        "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                        "%(goodLumiSecStr)s"
                        "readFiles = cms.untracked.vstring()\n"
                        "secFiles = cms.untracked.vstring()\n"
                        "%(process)ssource = cms.Source(\"PoolSource\",\n"
                        "%(lumiStr)s"
                        "%(tab)s secondaryFileNames ="
                        "secFiles,\n"
                        "%(tab)s fileNames = readFiles\n"
                        ")\n"
                        "%(files)s\n"
                        "%(lumiSecExtend)s\n"
                        "%(process)smaxEvents = cms.untracked.PSet( "
                        "input = cms.untracked.int32(%(nEvents)s) )\n"
                        "%(skipEventsString)s\n")
112 
    # Template used by __createSnippet when crab is true: the source reads a single
    # placeholder file 'dummy_File.root' instead of a real file list. The formatting
    # dict has to provide the keys process, tab, nEvents and skipEventsString.
    __dummy_source_template = ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s secondaryFileNames ="
                               "secFiles,\n"
                               "%(tab)s fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "%(skipEventsString)s\n")
124 
    def __lumiSelectionSnippet( self, jsonPath = None, firstRun = None, lastRun = None ):
        """Build the text that fills the lumi section selection of the source.

        Returns a string of 'lumiSecs.extend( [...] )' statements, the (possibly
        run-range restricted) contents of a lumi selection cff snippet, or ""
        when no selection was requested. As a side effect, the first and last
        used runs are stored in self.__firstusedrun and self.__lastusedrun.

        Raises AllInOneError if `jsonPath` is neither a valid json file nor a
        lumi selection cff snippet, or if the selection leaves no runs.
        """
        lumiSecExtend = ""
        if firstRun or lastRun or jsonPath:
            if not jsonPath:
                # only a run range: select whole runs of the dataset's run list
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") <= lastRun ]
                lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-" \
                             + str( self.__findInJson(run, "run_number") ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = None
                try:
                    theLumiList = LumiList ( filename = jsonPath )
                except ValueError:
                    # not readable as json; handled below as a possible cff snippet
                    pass

                if theLumiList is not None:
                    # drop the runs of the json file outside [firstRun, lastRun]
                    allRuns = theLumiList.getRuns()
                    runsToRemove = []
                    for run in allRuns:
                        if firstRun and int( run ) < firstRun:
                            runsToRemove.append( run )
                        if lastRun and int( run ) > lastRun:
                            runsToRemove.append( run )
                    theLumiList.removeRuns( runsToRemove )
                    splitLumiList = list( self.__chunks(
                        theLumiList.getCMSSWString().split(','), 255 ) )
                    # an empty selection is normalized to None
                    if not (splitLumiList and splitLumiList[0] and splitLumiList[0][0]):
                        splitLumiList = None
                else:
                    with open(jsonPath) as f:
                        jsoncontents = f.read()
                    if "process.source.lumisToProcess" in jsoncontents:
                        msg = "%s is not a json file, but it seems to be a CMSSW lumi selection cff snippet. Trying to use it" % jsonPath
                        if firstRun or lastRun:
                            msg += ("\n (after applying firstRun and/or lastRun)")
                        msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                        msg += "\nCheck your config file to make sure that it worked properly."
                        print(msg)

                        runlist = self.__getRunList()
                        if firstRun or lastRun:
                            # -1 marks "not yet set"; forcerunrange updates both
                            # attributes while it rewrites each range of the snippet
                            self.__firstusedrun = -1
                            self.__lastusedrun = -1
                            jsoncontents = re.sub(r"\d+:(\d+|max)(-\d+:(\d+|max))?", self.getForceRunRangeFunction(firstRun, lastRun), jsoncontents)
                            # remove the empty strings left behind by dropped ranges
                            jsoncontents = (jsoncontents.replace("'',\n","").replace("''\n","")
                                                        .replace('"",\n','').replace('""\n',''))
                            # the used runs cannot extend beyond the runs of the dataset
                            self.__firstusedrun = max(self.__firstusedrun, int(self.__findInJson(runlist[0],"run_number")))
                            self.__lastusedrun = min(self.__lastusedrun, int(self.__findInJson(runlist[-1],"run_number")))
                            if self.__lastusedrun < self.__firstusedrun:
                                # nothing left; leads to the "without any runs" error below
                                jsoncontents = None
                        else:
                            self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                            self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))
                        lumiSecExtend = jsoncontents
                        splitLumiList = None
                    else:
                        raise AllInOneError("%s is not a valid json file!" % jsonPath)

            if splitLumiList and splitLumiList[0] and splitLumiList[0][0]:
                # one extend statement per chunk of at most 255 lumi ranges
                lumiSecStr = [ "',\n'".join( lumis ) \
                               for lumis in splitLumiList ]
                lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                               for lumis in lumiSecStr ]
                lumiSecExtend = "\n".join( lumiSecStr )
                runlist = self.__getRunList()
                self.__firstusedrun = max(int(splitLumiList[0][0].split(":")[0]), int(self.__findInJson(runlist[0],"run_number")))
                self.__lastusedrun = min(int(splitLumiList[-1][-1].split(":")[0]), int(self.__findInJson(runlist[-1],"run_number")))
            elif lumiSecExtend:
                # the cff snippet branch above already produced the text
                pass
            else:
                msg = "You are trying to run a validation without any runs! Check that:"
                if firstRun or lastRun:
                    msg += "\n - firstRun/begin and lastRun/end are correct for this dataset, and there are runs in between containing data"
                if jsonPath:
                    msg += "\n - your JSON file is correct for this dataset, and the runs contain data"
                if (firstRun or lastRun) and jsonPath:
                    msg += "\n - firstRun/begin and lastRun/end are consistent with your JSON file"
                raise AllInOneError(msg)

        else:
            if self.__inputMagneticField is not None:
                pass #never need self.__firstusedrun or self.__lastusedrun
            else:
                # no selection: the used range is the full run list of the dataset
                runlist = self.__getRunList()
                self.__firstusedrun = int(self.__findInJson(self.__getRunList()[0],"run_number"))
                self.__lastusedrun = int(self.__findInJson(self.__getRunList()[-1],"run_number"))

        return lumiSecExtend
220 
221  def __fileListSnippet(self, crab=False, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
222  if crab:
223  files = ""
224  else:
225  splitFileList = list( self.__chunks( self.fileList(firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
226  if not splitFileList:
227  raise AllInOneError("No files found for dataset {}. Check the spelling, or maybe specify another das instance?".format(self.__name))
228  fileStr = [ "',\n'".join( files ) for files in splitFileList ]
229  fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
230  for files in fileStr ]
231  files = "\n".join( fileStr )
232 
233  if parent:
234  splitParentFileList = list( self.__chunks( self.fileList(parent=True, firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
235  parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
236  parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \
237  for parentFiles in parentFileStr ]
238  parentFiles = "\n".join( parentFileStr )
239  files += "\n\n" + parentFiles
240 
241  return files
242 
243  def __createSnippet( self, jsonPath = None, begin = None, end = None,
244  firstRun = None, lastRun = None, repMap = None,
245  crab = False, parent = False ):
246 
247  if firstRun:
248  firstRun = int( firstRun )
249  if lastRun:
250  lastRun = int( lastRun )
251  if ( begin and firstRun ) or ( end and lastRun ):
252  msg = ( "The Usage of "
253  + "'begin' & 'firstRun' " * int( bool( begin and
254  firstRun ) )
255  + "and " * int( bool( ( begin and firstRun ) and
256  ( end and lastRun ) ) )
257  + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
258  + "is ambigous." )
259  raise AllInOneError( msg )
260  if begin or end:
261  ( firstRun, lastRun ) = self.convertTimeToRun(
262  begin = begin, end = end, firstRun = firstRun,
263  lastRun = lastRun )
264  if ( firstRun and lastRun ) and ( firstRun > lastRun ):
265  msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
266  "chosen is greater than the upper time/runrange limit "
267  "('end'/'lastRun').")
268  raise AllInOneError( msg )
269 
270  lumiSecExtend = self.__lumiSelectionSnippet(jsonPath=jsonPath, firstRun=firstRun, lastRun=lastRun)
271  lumiStr = goodLumiSecStr = ""
272  if lumiSecExtend:
273  goodLumiSecStr = "lumiSecs = cms.untracked.VLuminosityBlockRange()\n"
274  lumiStr = " lumisToProcess = lumiSecs,\n"
275 
276  files = self.__fileListSnippet(crab=crab, parent=parent, firstRun=firstRun, lastRun=lastRun, forcerunselection=False)
277 
278  theMap = repMap
279  theMap["files"] = files
280  theMap["json"] = jsonPath
281  theMap["lumiStr"] = lumiStr
282  theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
283  theMap["lumiSecExtend"] = lumiSecExtend
284  if crab:
285  dataset_snippet = self.__dummy_source_template%( theMap )
286  else:
287  dataset_snippet = self.__source_template%( theMap )
288  return dataset_snippet
289 
290  def __find_lt( self, a, x ):
291  'Find rightmost value less than x'
292  i = bisect.bisect_left( a, x )
293  if i:
294  return i-1
295  raise ValueError
296 
297  def __find_ge( self, a, x):
298  'Find leftmost item greater than or equal to x'
299  i = bisect.bisect_left( a, x )
300  if i != len( a ):
301  return i
302  raise ValueError
303 
304  def __findInJson(self, jsondict, strings):
305  if isinstance(strings, str):
306  strings = [ strings ]
307 
308  if len(strings) == 0:
309  return jsondict
310  if isinstance(jsondict,dict):
311  if strings[0] in jsondict:
312  try:
313  return self.__findInJson(jsondict[strings[0]], strings[1:])
314  except KeyError:
315  pass
316  else:
317  for a in jsondict:
318  if strings[0] in a:
319  try:
320  return self.__findInJson(a[strings[0]], strings[1:])
321  except (TypeError, KeyError): #TypeError because a could be a string and contain strings[0]
322  pass
323  #if it's not found
324  raise KeyError("Can't find " + strings[0])
325 
326  def forcerunrange(self, firstRun, lastRun, s):
327  """s must be in the format run1:lum1-run2:lum2"""
328  s = s.group()
329  run1 = s.split("-")[0].split(":")[0]
330  lum1 = s.split("-")[0].split(":")[1]
331  try:
332  run2 = s.split("-")[1].split(":")[0]
333  lum2 = s.split("-")[1].split(":")[1]
334  except IndexError:
335  run2 = run1
336  lum2 = lum1
337  if int(run2) < firstRun or int(run1) > lastRun:
338  return ""
339  if int(run1) < firstRun or firstRun < 0:
340  run1 = firstRun
341  lum1 = 1
342  if int(run2) > lastRun:
343  run2 = lastRun
344  lum2 = "max"
345  if int(run1) < self.__firstusedrun or self.__firstusedrun < 0:
346  self.__firstusedrun = int(run1)
347  if int(run2) > self.__lastusedrun:
348  self.__lastusedrun = int(run2)
349  return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)
350 
351  def getForceRunRangeFunction(self, firstRun, lastRun):
352  def forcerunrangefunction(s):
353  return self.forcerunrange(firstRun, lastRun, s)
354  return forcerunrangefunction
355 
356  def __getData( self, dasQuery, dasLimit = 0 ):
357  dasData = das_client.get_data(dasQuery, dasLimit)
358  if isinstance(dasData, str):
359  jsondict = json.loads( dasData )
360  else:
361  jsondict = dasData
362  # Check, if the DAS query fails
363  try:
364  error = self.__findInJson(jsondict,["data","error"])
365  except KeyError:
366  error = None
367  if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
368  try:
369  jsonstr = self.__findInJson(jsondict,"reason")
370  except KeyError:
371  jsonstr = str(jsondict)
372  if len(jsonstr) > 10000:
373  jsonfile = "das_query_output_%i.txt"
374  i = 0
375  while os.path.lexists(jsonfile % i):
376  i += 1
377  jsonfile = jsonfile % i
378  theFile = open( jsonfile, "w" )
379  theFile.write( jsonstr )
380  theFile.close()
381  msg = "The DAS query returned an error. The output is very long, and has been stored in:\n" + jsonfile
382  else:
383  msg = "The DAS query returned a error. Here is the output\n" + jsonstr
384  msg += "\nIt's possible that this was a server error. If so, it may work if you try again later"
385  raise AllInOneError(msg)
386  return self.__findInJson(jsondict,"data")
387 
388  def __getDataType( self ):
389  if self.__predefined:
390  with open(self.__filename) as f:
391  datatype = None
392  for line in f.readlines():
393  if line.startswith("#data type: "):
394  if datatype is not None:
395  raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
396  datatype = line.replace("#data type: ", "").replace("\n","")
397  return datatype
398  return "unknown"
399 
400  dasQuery_type = ( 'dataset dataset=%s instance=%s detail=true | grep dataset.datatype,'
401  'dataset.name'%( self.__name, self.__dasinstance ) )
402  data = self.__getData( dasQuery_type )
403 
404  try:
405  return self.__findInJson(data, ["dataset", "datatype"])
406  except KeyError:
407  print ("Cannot find the datatype of the dataset '%s'\n"
408  "It may not be possible to automatically find the magnetic field,\n"
409  "and you will not be able run in CRAB mode"
410  %( self.name() ))
411  return "unknown"
412 
413  def __getParentDataset( self ):
414  dasQuery = "parent dataset=" + self.__name + " instance="+self.__dasinstance
415  data = self.__getData( dasQuery )
416  try:
417  return self.__findInJson(data, ["parent", "name"])
418  except KeyError:
419  raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
420  "Here is the DAS output:\n" + str(jsondict) +
421  "\nIt's possible that this was a server error. If so, it may work if you try again later")
422 
    def __getMagneticField( self ):
        """Return the name of the magnetic field configuration for this dataset.

        The result is one of the 'MagneticField...' names (a _cff.py file name in
        Configuration/StandardSequences without its '_cff.py' ending) or "unknown".
        The sources are tried in this order: the magnetic field given to the
        constructor, the '#magnetic field: ' line of a predefined dataset, the
        data type, the DAS information of the dataset, and finally the dataset name.

        Raises ValueError for a constructor magnetic field other than 3.8 or 0,
        and AllInOneError if a predefined file has duplicated header lines.
        """
        # list the magnetic field configurations of the local area, or of the
        # release if the local area has no StandardSequences directory
        Bfieldlocation = os.path.join( self.__cmssw, "python", "Configuration", "StandardSequences" )
        if not os.path.isdir(Bfieldlocation):
            Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
        Bfieldlist = [ f.replace("_cff.py",'') \
                       for f in os.listdir(Bfieldlocation) \
                       if f.startswith("MagneticField_") and f.endswith("_cff.py") ]
        Bfieldlist.sort( key = lambda Bfield: -len(Bfield) ) #Put it in order of decreasing length, so that searching in the name gives the longer match

        if self.__inputMagneticField is not None:
            if self.__inputMagneticField == 3.8:
                return "MagneticField"
            elif self.__inputMagneticField == 0:
                return "MagneticField_0T"
            else:
                raise ValueError("Unknown input magnetic field {}".format(self.__inputMagneticField))

        if self.__predefined:
            # read the header comments of the predefined _cff.py file
            with open(self.__filename) as f:
                datatype = None
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                        datatype = datatype.split("#")[0].strip()
                    if line.startswith("#magnetic field: "):
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                        Bfield = Bfield.split("#")[0].strip()
            if Bfield is not None:
                # only the part before the first comma is the configuration name
                Bfield = Bfield.split(",")[0]
                if Bfield in Bfieldlist or Bfield == "unknown":
                    return Bfield
                else:
                    print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                    print("Using Bfield='unknown' - this will revert to the default")
                    return "unknown"
            elif datatype == "data":
                return "MagneticField" #this should be in the "#magnetic field" line, but for safety in case it got messed up
            else:
                return "unknown"

        if self.__dataType == "data":
            return "MagneticField"

        #try to find the magnetic field from DAS
        #it seems to be there for the newer (7X) MC samples, except cosmics
        dasQuery_B = ('dataset dataset=%s instance=%s'%(self.__name, self.__dasinstance))
        data = self.__getData( dasQuery_B )

        try:
            Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
            if Bfield in Bfieldlist:
                return Bfield
            elif Bfield == "38T" or Bfield == "38T_PostLS1":
                return "MagneticField"
            elif "MagneticField_" + Bfield in Bfieldlist:
                return "MagneticField_" + Bfield
            elif Bfield == "":
                # no information in DAS; continue with the search in the name
                pass
            else:
                print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                print("Using Bfield='unknown' - this will revert to the default magnetic field")
                return "unknown"
        except KeyError:
            pass

        for possibleB in Bfieldlist:
            if (possibleB != "MagneticField"
                and possibleB.replace("MagneticField_","") in self.__name.replace("TkAlCosmics0T", "")):
                #final attempt - try to identify the dataset from the name
                #all cosmics dataset names contain "TkAlCosmics0T"
                if possibleB == "MagneticField_38T" or possibleB == "MagneticField_38T_PostLS1":
                    return "MagneticField"
                return possibleB

        return "unknown"
503 
    def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
        """For MC, this returns the same as the previous function.
           For data, it gets the magnetic field from the runs. This is important for
           deciding which template to use for offlinevalidation

           Returns a number, or a string starting with "unknown" followed by an
           explanation. With run <= 0 the fields of the first and last used run are
           compared and their mean is returned if they agree within `tolerance`.
        """
        if self.__dataType == "mc" and self.__magneticField == "MagneticField":
            return 3.8 #For 3.8T MC the default MagneticField is used
        if self.__inputMagneticField is not None:
            return self.__inputMagneticField
        if "T" in self.__magneticField:
            # take the number in front of the first "T" of the configuration name
            Bfield = self.__magneticField.split("T")[0].replace("MagneticField_","")
            try:
                return float(Bfield) / 10.0 #e.g. 38T and 38T_PostLS1 both return 3.8
            except ValueError:
                pass
        if self.__predefined:
            # a predefined file can store the value after the comma of its
            # '#magnetic field: ' line
            with open(self.__filename) as f:
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#magnetic field: ") and "," in line:
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        return float(line.replace("#magnetic field: ", "").split(",")[1].split("#")[0].strip())

        if run > 0:
            dasQuery = ('run=%s instance=%s detail=true'%(run, self.__dasinstance)) #for data
            data = self.__getData(dasQuery)
            try:
                return self.__findInJson(data, ["run","bfield"])
            except KeyError:
                return "unknown Can't get the magnetic field for run %s from DAS" % run

        #run < 0 - find B field for the first and last runs, and make sure they're compatible
        # (to within tolerance)
        #NOT FOOLPROOF! The magnetic field might go up and then down, or vice versa
        if self.__firstusedrun is None or self.__lastusedrun is None:
            return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
        firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
        lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
        try:
            if abs(firstrunB - lastrunB) <= tolerance:
                return .5*(firstrunB + lastrunB)
            print(firstrunB, lastrunB, tolerance)
            return ("unknown The beginning and end of your run range for %s\n"
                    "have different magnetic fields (%s, %s)!\n"
                    "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                    "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
        except TypeError:
            # at least one of the two values is not a number; pass on the
            # "unknown ..." text if the first value contains one
            try:
                if "unknown" in firstrunB:
                    return firstrunB
                else:
                    return lastrunB
            except TypeError:
                return lastrunB
559 
560  @cache
561  def __getFileInfoList( self, dasLimit, parent = False ):
562  if self.__predefined:
563  if parent:
564  extendstring = "secFiles.extend"
565  else:
566  extendstring = "readFiles.extend"
567  with open(self.__fileName) as f:
568  files = []
569  copy = False
570  for line in f.readlines():
571  if "]" in line:
572  copy = False
573  if copy:
574  files.append({name: line.translate(None, "', " + '"')})
575  if extendstring in line and "[" in line and "]" not in line:
576  copy = True
577  return files
578 
579  if parent:
580  searchdataset = self.parentDataset()
581  else:
582  searchdataset = self.__name
583  dasQuery_files = ( 'file dataset=%s instance=%s detail=true | grep file.name, file.nevents, '
584  'file.creation_time, '
585  'file.modification_time'%( searchdataset, self.__dasinstance ) )
586  print("Requesting file information for '%s' from DAS..."%( searchdataset ), end=' ')
587  sys.stdout.flush()
588  data = self.__getData( dasQuery_files, dasLimit )
589  print("Done.")
590  data = [ self.__findInJson(entry,"file") for entry in data ]
591  if len( data ) == 0:
592  msg = ("No files are available for the dataset '%s'. This can be "
593  "due to a typo or due to a DAS problem. Please check the "
594  "spelling of the dataset and/or retry to run "
595  "'validateAlignments.py'."%( self.name() ))
596  raise AllInOneError( msg )
597  fileInformationList = []
598  for file in data:
599  fileName = 'unknown'
600  try:
601  fileName = self.__findInJson(file, "name")
602  fileCreationTime = self.__findInJson(file, "creation_time")
603  fileNEvents = self.__findInJson(file, "nevents")
604  except KeyError:
605  print(("DAS query gives bad output for file '%s'. Skipping it.\n"
606  "It may work if you try again later.") % fileName)
607  fileNEvents = 0
608  # select only non-empty files
609  if fileNEvents == 0:
610  continue
611  fileDict = { "name": fileName,
612  "creation_time": fileCreationTime,
613  "nevents": fileNEvents
614  }
615  fileInformationList.append( fileDict )
616  fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
617  return fileInformationList
618 
619  @cache
620  def __getRunList( self ):
621  dasQuery_runs = ( 'run dataset=%s instance=%s | grep run.run_number,'
622  'run.creation_time'%( self.__name, self.__dasinstance ) )
623  print("Requesting run information for '%s' from DAS..."%( self.__name ), end=' ')
624  sys.stdout.flush()
625  data = self.__getData( dasQuery_runs )
626  print("Done.")
627  data = [ self.__findInJson(entry,"run") for entry in data ]
628  data.sort( key = lambda run: self.__findInJson(run, "run_number") )
629  return data
630 
631  def __datetime(self, stringForDas):
632  if len(stringForDas) != 8:
633  raise AllInOneError(stringForDas + " is not a valid date string.\n"
634  + "DAS accepts dates in the form 'yyyymmdd'")
635  year = stringForDas[:4]
636  month = stringForDas[4:6]
637  day = stringForDas[6:8]
638  return datetime.date(int(year), int(month), int(day))
639 
640  def __dateString(self, date):
641  return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)
642 
    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
        """Translate the dates `begin` and `end` into run numbers of this dataset.

        `begin` and `end` are date strings of the form 'yyyymmdd'. Returns
        (firstRun, lastRun) if `shortTuple` is true and
        (begin, end, firstRun, lastRun) otherwise; a date that has been
        translated is returned as None.

        Raises AllInOneError if a date and the corresponding run are both given,
        if no runs are found in the searched time window, or if the date lies
        outside the run range of the dataset.
        """
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The Usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambigous." )
            raise AllInOneError( msg )

        if begin or end:
            runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

        if begin:
            lastdate = begin
            # each query covers the window following the previous one
            for delta in [ 1, 5, 10, 20, 30 ]: #try searching for about 2 months after begin
                firstdate = lastdate
                lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                dasQuery_begin = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                begindata = self.__getData(dasQuery_begin)
                if len(begindata) > 0:
                    begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        # first run of the dataset at or after the earliest run found
                        runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'begin' is after the creation time of the last "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    firstRun = runList[runIndex]
                    begin = None
                    break

        # begin is still set if none of the windows contained a run
        if begin:
            raise AllInOneError("No runs within a reasonable time interval after your 'begin'."
                                "Try using a 'begin' that has runs soon after it (within 2 months at most)")

        if end:
            firstdate = end
            # each query covers the window preceding the previous one
            for delta in [ 1, 5, 10, 20, 30 ]: #try searching for about 2 months before end
                lastdate = firstdate
                firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                dasQuery_end = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                enddata = self.__getData(dasQuery_end)
                if len(enddata) > 0:
                    enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        # last run of the dataset before the latest run found
                        runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'end' is before the creation time of the first "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    lastRun = runList[runIndex]
                    end = None
                    break

        # end is still set if none of the windows contained a run
        if end:
            raise AllInOneError("No runs within a reasonable time interval before your 'end'."
                                "Try using an 'end' that has runs soon before it (within 2 months at most)")

        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun
709 
710  def dataType( self ):
711  if not self.__dataType:
712  self.__dataType = self.__getDataType()
713  return self.__dataType
714 
715  def magneticField( self ):
716  if not self.__magneticField:
717  self.__magneticField = self.__getMagneticField()
718  return self.__magneticField
719 
720  def magneticFieldForRun( self, run = -1 ):
721  return self.__getMagneticFieldForRun(run)
722 
723  def parentDataset( self ):
724  if not self.__parentDataset:
725  self.__parentDataset = self.__getParentDataset()
726  return self.__parentDataset
727 
728  def datasetSnippet( self, jsonPath = None, begin = None, end = None,
729  firstRun = None, lastRun = None, crab = False, parent = False ):
730  if not firstRun: firstRun = None
731  if not lastRun: lastRun = None
732  if not begin: begin = None
733  if not end: end = None
734  if self.__predefined and (jsonPath or begin or end or firstRun or lastRun):
735  msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
736  "only work for official datasets, not predefined _cff.py files" )
737  raise AllInOneError( msg )
738  if self.__predefined and parent:
739  with open(self.__filename) as f:
740  if "secFiles.extend" not in f.read():
741  msg = ("The predefined dataset '%s' does not contain secondary files, "
742  "which your validation requires!") % self.__name
743  if self.__official:
744  self.__name = self.__origName
745  self.__predefined = False
746  print(msg)
747  print ("Retreiving the files from DAS. You will be asked if you want "
748  "to overwrite the old dataset.\n"
749  "It will still be compatible with validations that don't need secondary files.")
750  else:
751  raise AllInOneError(msg)
752 
753  if self.__predefined:
754  snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
755  "process.maxEvents = cms.untracked.PSet(\n"
756  " input = cms.untracked.int32(.oO[nEvents]Oo. / .oO[parallelJobs]Oo.)\n"
757  ")\n"
758  "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)"
759  %(self.__name))
760  if not parent:
761  with open(self.__filename) as f:
762  if "secFiles.extend" in f.read():
763  snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
764  return snippet
765  theMap = { "process": "process.",
766  "tab": " " * len( "process." ),
767  "nEvents": ".oO[nEvents]Oo. / .oO[parallelJobs]Oo.",
768  "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)\n",
769  "importCms": "",
770  "header": ""
771  }
772  datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
773  begin = begin,
774  end = end,
775  firstRun = firstRun,
776  lastRun = lastRun,
777  repMap = theMap,
778  crab = crab,
779  parent = parent )
780  if jsonPath == "" and begin == "" and end == "" and firstRun == "" and lastRun == "":
781  try:
782  self.dump_cff(parent = parent)
783  except AllInOneError as e:
784  print("Can't store the dataset as a cff:")
785  print(e)
786  print("This may be inconvenient in the future, but will not cause a problem for this validation.")
787  return datasetSnippet
788 
789  @cache
790  def dump_cff( self, outName = None, jsonPath = None, begin = None,
791  end = None, firstRun = None, lastRun = None, parent = False ):
792  if outName == None:
793  outName = "Dataset" + self.__name.replace("/", "_")
794  packageName = os.path.join( "Alignment", "OfflineValidation" )
795  if not os.path.exists( os.path.join(
796  self.__cmssw, "src", packageName ) ):
797  msg = ("You try to store the predefined dataset'%s'.\n"
798  "For that you need to check out the package '%s' to your "
799  "private relase area in\n"%( outName, packageName )
800  + self.__cmssw )
801  raise AllInOneError( msg )
802  theMap = { "process": "",
803  "tab": "",
804  "nEvents": str( -1 ),
805  "skipEventsString": "",
806  "importCms": "import FWCore.ParameterSet.Config as cms\n",
807  "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
808  "#%(name)s\n"
809  "#data type: %(dataType)s\n"
810  "#magnetic field: .oO[magneticField]Oo.\n" #put in magnetic field later
811  %{"name": self.__name, #need to create the snippet before getting the magnetic field
812  "dataType": self.__dataType} #so that we know the first and last runs
813  }
814  dataset_cff = self.__createSnippet( jsonPath = jsonPath,
815  begin = begin,
816  end = end,
817  firstRun = firstRun,
818  lastRun = lastRun,
819  repMap = theMap,
820  parent = parent)
821  magneticField = self.__magneticField
822  if magneticField == "MagneticField":
823  magneticField = "%s, %s #%s" % (magneticField,
824  str(self.__getMagneticFieldForRun()).replace("\n"," ").split("#")[0].strip(),
825  "Use MagneticField_cff.py; the number is for determining which track selection to use."
826  )
827  dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
828  filePath = os.path.join( self.__cmssw, "src", packageName,
829  "python", outName + "_cff.py" )
830  if os.path.exists( filePath ):
831  existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
832  askString = "Do you want to overwrite it? [y/n]\n"
833  inputQuery = existMsg + askString
834  while True:
835  userInput = raw_input( inputQuery ).lower()
836  if userInput == "y":
837  break
838  elif userInput == "n":
839  return
840  else:
841  inputQuery = askString
842  print ( "The predefined dataset '%s' will be stored in the file\n"
843  %( outName )
844  + filePath +
845  "\nFor future use you have to do 'scram b'." )
846  print()
847  theFile = open( filePath, "w" )
848  theFile.write( dataset_cff )
849  theFile.close()
850  return
851 
852  def createdatasetfile_hippy(self, filename, filesperjob, firstrun, lastrun):
853  with open(filename, "w") as f:
854  for job in self.__chunks(self.fileList(firstRun=firstrun, lastRun=lastrun, forcerunselection=True), filesperjob):
855  f.write(",".join("'{}'".format(file) for file in job)+"\n")
856 
857  @staticmethod
859  parts = filename.split("/")
860  result = error = None
861  if parts[0] != "" or parts[1] != "store":
862  error = "does not start with /store"
863  elif parts[2] in ["mc", "relval"]:
864  result = 1
865  elif not parts[-1].endswith(".root"):
866  error = "does not end with something.root"
867  elif len(parts) != 12:
868  error = "should be exactly 11 slashes counting the first one"
869  else:
870  runnumberparts = parts[-5:-2]
871  if not all(len(part)==3 for part in runnumberparts):
872  error = "the 3 directories {} do not have length 3 each".format("/".join(runnumberparts))
873  try:
874  result = int("".join(runnumberparts))
875  except ValueError:
876  error = "the 3 directories {} do not form an integer".format("/".join(runnumberparts))
877 
878  if error:
879  error = "could not figure out which run number this file is from:\n{}\n{}".format(filename, error)
880  raise AllInOneError(error)
881 
882  return result
883 
884  @cache
885  def fileList(self, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
886  fileList = [ self.__findInJson(fileInfo,"name")
887  for fileInfo in self.fileInfoList(parent) ]
888 
889  if firstRun or lastRun:
890  if not firstRun: firstRun = -1
891  if not lastRun: lastRun = float('infinity')
892  unknownfilenames, reasons = [], set()
893  for filename in fileList[:]:
894  try:
895  if not firstRun <= self.getrunnumberfromfilename(filename) <= lastRun:
896  fileList.remove(filename)
897  except AllInOneError as e:
898  if forcerunselection: raise
899  unknownfilenames.append(e.message.split("\n")[1])
900  reasons .add (e.message.split("\n")[2])
901  if reasons:
902  if len(unknownfilenames) == len(fileList):
903  print("Could not figure out the run numbers of any of the filenames for the following reason(s):")
904  else:
905  print("Could not figure out the run numbers of the following filenames:")
906  for filename in unknownfilenames:
907  print(" "+filename)
908  print("for the following reason(s):")
909  for reason in reasons:
910  print(" "+reason)
911  print("Using the files anyway. The runs will be filtered at the CMSSW level.")
912  return fileList
913 
914  def fileInfoList( self, parent = False ):
915  return self.__getFileInfoList( self.__dasLimit, parent )
916 
917  def name( self ):
918  return self.__name
919 
920  def predefined( self ):
921  return self.__predefined
922 
923  @cache
924  def runList( self ):
925  return self.__getRunList()
926 
927 
if __name__ == '__main__':
    # Manual smoke test: print the snippet for a 2012 MinimumBias ALCARECO
    # dataset, then store the same selection as a predefined cff file.
    print("Start testing...")
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    # Both calls below use the same JSON file, first run and end date.
    selection = { "jsonPath": jsonFile,
                  "firstRun": "207800",
                  "end": "20121128" }
    dataset = Dataset( datasetName )
    snippet = dataset.datasetSnippet( **selection )
    print(snippet)
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      **selection )
def __getRunList(self)
Definition: dataset.py:620
def __lumiSelectionSnippet(self, jsonPath=None, firstRun=None, lastRun=None)
Definition: dataset.py:125
def __getFileInfoList(self, dasLimit, parent=False)
Definition: dataset.py:561
def __getDataType(self)
Definition: dataset.py:388
def __getMagneticFieldForRun(self, run=-1, tolerance=0.5)
Definition: dataset.py:504
def datasetSnippet(self, jsonPath=None, begin=None, end=None, firstRun=None, lastRun=None, crab=False, parent=False)
Definition: dataset.py:729
def __createSnippet(self, jsonPath=None, begin=None, end=None, firstRun=None, lastRun=None, repMap=None, crab=False, parent=False)
Definition: dataset.py:245
def __findInJson(self, jsondict, strings)
Definition: dataset.py:304
def magneticFieldForRun(self, run=-1)
Definition: dataset.py:720
def replace(string, replacements)
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:66
def fileInfoList(self, parent=False)
Definition: dataset.py:914
def __getMagneticField(self)
Definition: dataset.py:423
def forcerunrange(self, firstRun, lastRun, s)
Definition: dataset.py:326
def parentDataset(self)
Definition: dataset.py:723
def getrunnumberfromfilename(filename)
Definition: dataset.py:858
Abs< T >::type abs(const T &t)
Definition: Abs.h:22
def magneticField(self)
Definition: dataset.py:715
T min(T a, T b)
Definition: MathUtil.h:58
def getForceRunRangeFunction(self, firstRun, lastRun)
Definition: dataset.py:351
def createdatasetfile_hippy(self, filename, filesperjob, firstrun, lastrun)
Definition: dataset.py:852
def convertTimeToRun(self, begin=None, end=None, firstRun=None, lastRun=None, shortTuple=True)
Definition: dataset.py:645
def __find_lt(self, a, x)
Definition: dataset.py:290
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def __datetime(self, stringForDas)
Definition: dataset.py:631
tuple __dummy_source_template
Definition: dataset.py:113
def dataType(self)
Definition: dataset.py:710
def __dateString(self, date)
Definition: dataset.py:640
def dump_cff(self, outName=None, jsonPath=None, begin=None, end=None, firstRun=None, lastRun=None, parent=False)
Definition: dataset.py:791
def predefined(self)
Definition: dataset.py:920
def __chunks(self, theList, n)
Definition: dataset.py:89
def __getParentDataset(self)
Definition: dataset.py:413
def __fileListSnippet(self, crab=False, parent=False, firstRun=None, lastRun=None, forcerunselection=False)
Definition: dataset.py:221
def __find_ge(self, a, x)
Definition: dataset.py:297
def name(self)
Definition: dataset.py:917
#define str(s)
double split
Definition: MVATrainer.cc:139
def __getData(self, dasQuery, dasLimit=0)
Definition: dataset.py:356
def runList(self)
Definition: dataset.py:924
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run
def fileList(self, parent=False, firstRun=None, lastRun=None, forcerunselection=False)
Definition: dataset.py:885