dataset.py
# idea stolen from:
# http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
# PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
import das_client
import json
import os
import bisect
import re
import datetime
from FWCore.PythonUtilities.LumiList import LumiList
from TkAlExceptions import AllInOneError


class Dataset:
    def __init__( self, datasetName, dasLimit = 0, tryPredefinedFirst = True,
                  cmssw = os.environ["CMSSW_BASE"], cmsswrelease = os.environ["CMSSW_RELEASE_BASE"]):
        self.__name = datasetName
        self.__origName = datasetName
        self.__dasLimit = dasLimit
        self.__fileList = None
        self.__fileInfoList = None
        self.__runList = None
        self.__alreadyStored = False
        self.__cmssw = cmssw
        self.__cmsswrelease = cmsswrelease
        self.__firstusedrun = None
        self.__lastusedrun = None
        self.__parentDataset = None
        self.__parentFileList = None
        self.__parentFileInfoList = None

        # check if the dataset name matches the CMS dataset naming scheme
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__official = True
            fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
        else:
            self.__official = False
            fileName = self.__name + "_cff.py"

        searchPath1 = os.path.join( self.__cmssw, "python",
                                    "Alignment", "OfflineValidation",
                                    fileName )
        searchPath2 = os.path.join( self.__cmssw, "src",
                                    "Alignment", "OfflineValidation",
                                    "python", fileName )
        searchPath3 = os.path.join( self.__cmsswrelease,
                                    "python", "Alignment",
                                    "OfflineValidation", fileName )
        if self.__official and not tryPredefinedFirst:
            self.__predefined = False
        elif os.path.exists( searchPath1 ):
            self.__predefined = True
            self.__filename = searchPath1
        elif os.path.exists( searchPath2 ):
            msg = ("The predefined dataset '%s' does exist in '%s', but "
                   "you need to run 'scram b' first."
                   %( self.__name, searchPath2 ))
            if self.__official:
                print msg
                print "Getting the data from DAS again.  To go faster next time, run scram b."
            else:
                raise AllInOneError( msg )
        elif os.path.exists( searchPath3 ):
            self.__predefined = True
            self.__filename = searchPath3
        elif self.__official:
            self.__predefined = False
        else:
            msg = ("The predefined dataset '%s' does not exist. Please "
                   "create it first or check for typos."%( self.__name ))
            raise AllInOneError( msg )

        if self.__predefined and self.__official:
            self.__name = "Dataset" + self.__name.replace("/","_")

        self.__dataType = self.__getDataType()
        self.__magneticField = self.__getMagneticField()

    def __chunks( self, theList, n ):
        """Yield successive n-sized chunks from theList."""
        for i in xrange( 0, len( theList ), n ):
            yield theList[i:i+n]
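    # Usage sketch (illustrative values only): the 255-entry chunking used
    # below keeps each generated readFiles.extend(...) / lumiSecs.extend(...)
    # call to a manageable size, e.g.
    #   list( self.__chunks( range(7), 3 ) )  ->  [[0, 1, 2], [3, 4, 5], [6]]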

    __source_template = ("%(header)s"
                         "%(importCms)s"
                         "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                         "%(goodLumiSecStr)s"
                         "readFiles = cms.untracked.vstring()\n"
                         "secFiles = cms.untracked.vstring()\n"
                         "%(process)ssource = cms.Source(\"PoolSource\",\n"
                         "%(lumiStr)s"
                         "%(tab)s secondaryFileNames = "
                         "secFiles,\n"
                         "%(tab)s fileNames = readFiles\n"
                         ")\n"
                         "%(files)s\n"
                         "%(lumiSecExtend)s\n"
                         "%(process)smaxEvents = cms.untracked.PSet( "
                         "input = cms.untracked.int32(%(nEvents)s) )\n"
                         "%(skipEventsString)s\n")

    __dummy_source_template = ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s secondaryFileNames = "
                               "secFiles,\n"
                               "%(tab)s fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "%(skipEventsString)s\n")

    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False, parent = False ):
        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun')." )
            raise AllInOneError( msg )
        if self.predefined() and (jsonPath or begin or end or firstRun or lastRun):
            msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
                    "only work for official datasets, not predefined _cff.py files" )
            raise AllInOneError( msg )
        goodLumiSecStr = ""
        lumiStr = ""
        lumiSecExtend = ""
        if firstRun or lastRun or jsonPath:
            goodLumiSecStr = ( "lumiSecs = cms.untracked."
                               "VLuminosityBlockRange()\n" )
            lumiStr = " lumisToProcess = lumiSecs,\n"
            if not jsonPath:
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") <= lastRun ]
                lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-" \
                             + str( self.__findInJson(run, "run_number") ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = None
                try:
                    theLumiList = LumiList ( filename = jsonPath )
                except ValueError:
                    pass

                if theLumiList is not None:
                    allRuns = theLumiList.getRuns()
                    runsToRemove = []
                    for run in allRuns:
                        if firstRun and int( run ) < firstRun:
                            runsToRemove.append( run )
                        if lastRun and int( run ) > lastRun:
                            runsToRemove.append( run )
                    theLumiList.removeRuns( runsToRemove )
                    splitLumiList = list( self.__chunks(
                        theLumiList.getCMSSWString().split(','), 255 ) )
                else:
                    with open(jsonPath) as f:
                        jsoncontents = f.read()
                        if "process.source.lumisToProcess" in jsoncontents:
                            msg = "%s is not a JSON file, but it seems to be a CMSSW lumi selection cff snippet.  Trying to use it" % jsonPath
                            if firstRun or lastRun:
                                msg += ("\n (after applying firstRun and/or lastRun)")
                            msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                            msg += "\nCheck your config file to make sure that it worked properly."
                            print msg

                            runlist = self.__getRunList()
                            if firstRun or lastRun:
                                self.__firstusedrun = -1
                                self.__lastusedrun = -1
                                jsoncontents = re.sub(r"\d+:(\d+|max)-\d+:(\d+|max)",
                                                      self.getForceRunRangeFunction(firstRun, lastRun),
                                                      jsoncontents)
                                self.__firstusedrun = max(self.__firstusedrun, int(self.__findInJson(runlist[0],"run_number")))
                                self.__lastusedrun = min(self.__lastusedrun, int(self.__findInJson(runlist[-1],"run_number")))
                            else:
                                self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                                self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))
                            lumiSecExtend = jsoncontents
                            splitLumiList = [[""]]

            if splitLumiList and splitLumiList[0]:
                if splitLumiList[0][0]:
                    lumiSecStr = [ "',\n'".join( lumis ) \
                                   for lumis in splitLumiList ]
                    lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                                   for lumis in lumiSecStr ]
                    lumiSecExtend = "\n".join( lumiSecStr )
                    runlist = self.__getRunList()
                    self.__firstusedrun = max(int(splitLumiList[0][0].split(":")[0]), int(self.__findInJson(runlist[0],"run_number")))
                    self.__lastusedrun = min(int(splitLumiList[-1][-1].split(":")[0]), int(self.__findInJson(runlist[-1],"run_number")))
            else:
                msg = "You are trying to run a validation without any runs!  Check that:"
                if firstRun or lastRun:
                    msg += "\n - firstRun and lastRun are correct for this dataset, and there are runs in between containing data"
                if jsonPath:
                    msg += "\n - your JSON file is correct for this dataset, and the runs contain data"
                if (firstRun or lastRun) and jsonPath:
                    msg += "\n - firstRun and lastRun are consistent with your JSON file"
                if begin:
                    msg = msg.replace("firstRun", "begin")
                if end:
                    msg = msg.replace("lastRun", "end")
                raise AllInOneError(msg)

        else:
            runlist = self.__getRunList()
            self.__firstusedrun = int(self.__findInJson(self.__getRunList()[0],"run_number"))
            self.__lastusedrun = int(self.__findInJson(self.__getRunList()[-1],"run_number"))

        if crab:
            files = ""
        else:
            splitFileList = list( self.__chunks( self.fileList(), 255 ) )
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                        for files in fileStr ]
            files = "\n".join( fileStr )

            if parent:
                splitParentFileList = list( self.__chunks( self.fileList(parent = True), 255 ) )
                parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
                parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \
                                  for parentFiles in parentFileStr ]
                parentFiles = "\n".join( parentFileStr )
                files += "\n\n" + parentFiles

        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet

    def __find_lt( self, a, x ):
        'Find the index of the rightmost value less than x'
        i = bisect.bisect_left( a, x )
        if i:
            return i-1
        raise ValueError

    def __find_ge( self, a, x):
        'Find the index of the leftmost item greater than or equal to x'
        i = bisect.bisect_left( a, x )
        if i != len( a ):
            return i
        raise ValueError
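    # Both helpers assume a sorted list.  Worked examples (illustrative):
    #   self.__find_lt( [1, 3, 5], 4 )  ->  1   (a[1] == 3, rightmost value < 4)
    #   self.__find_ge( [1, 3, 5], 4 )  ->  2   (a[2] == 5, leftmost value >= 4)
    # Both raise ValueError when no such element exists; convertTimeToRun
    # catches that to report a 'begin'/'end' outside the dataset's run range.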

    def __findInJson(self, jsondict, strings):
        if isinstance(strings, str):
            strings = [ strings ]

        if len(strings) == 0:
            return jsondict
        if isinstance(jsondict,dict):
            if strings[0] in jsondict:
                try:
                    return self.__findInJson(jsondict[strings[0]], strings[1:])
                except KeyError:
                    pass
        else:
            for a in jsondict:
                if strings[0] in a:
                    try:
                        return self.__findInJson(a[strings[0]], strings[1:])
                    except (TypeError, KeyError):  #TypeError because a could be a string and contain strings[0]
                        pass
        #if it's not found
        raise KeyError("Can't find " + strings[0])
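    # A small worked example (hypothetical DAS-like structure):
    #   self.__findInJson( {"run": {"run_number": 194050}}, ["run", "run_number"] )
    # returns 194050.  Each element of 'strings' peels off one level of
    # nesting; lists encountered along the way are searched entry by entry.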

    def forcerunrange(self, firstRun, lastRun, s):
        """s is a regex match object whose text has the format run1:lum1-run2:lum2"""
        s = s.group()
        run1 = s.split("-")[0].split(":")[0]
        lum1 = s.split("-")[0].split(":")[1]
        run2 = s.split("-")[1].split(":")[0]
        lum2 = s.split("-")[1].split(":")[1]
        if int(run2) < firstRun or int(run1) > lastRun:
            return ""
        if int(run1) < firstRun or firstRun < 0:
            run1 = firstRun
            lum1 = 1
        if int(run2) > lastRun:
            run2 = lastRun
            lum2 = "max"
        if int(run1) < self.__firstusedrun or self.__firstusedrun < 0:
            self.__firstusedrun = int(run1)
        if int(run2) > self.__lastusedrun:
            self.__lastusedrun = int(run2)
        return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)

    def getForceRunRangeFunction(self, firstRun, lastRun):
        def forcerunrangefunction(s):
            return self.forcerunrange(firstRun, lastRun, s)
        return forcerunrangefunction
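    # Usage sketch (run numbers are illustrative): __createSnippet uses the
    # returned closure as a re.sub callback to clip every lumi range to
    # [firstRun, lastRun]:
    #   re.sub( r"\d+:(\d+|max)-\d+:(\d+|max)",
    #           self.getForceRunRangeFunction( 194050, 194075 ),
    #           "194050:1-194533:max" )
    # yields "194050:1-194075:max"; ranges entirely outside the window are
    # replaced by the empty string.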

    def __getData( self, dasQuery, dasLimit = 0 ):
        dasData = das_client.get_data( 'https://cmsweb.cern.ch',
                                       dasQuery, 0, dasLimit, False )
        if isinstance(dasData, str):
            jsondict = json.loads( dasData )
        else:
            jsondict = dasData
        # check whether the DAS query failed
        try:
            error = self.__findInJson(jsondict,["data","error"])
        except KeyError:
            error = None
        if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
            jsonstr = str(jsondict)
            if len(jsonstr) > 10000:
                jsonfile = "das_query_output_%i.txt"
                i = 0
                while os.path.lexists(jsonfile % i):
                    i += 1
                jsonfile = jsonfile % i
                theFile = open( jsonfile, "w" )
                theFile.write( jsonstr )
                theFile.close()
                msg = "The DAS query returned an error.  The output is very long, and has been stored in:\n" + jsonfile
            else:
                msg = "The DAS query returned an error.  Here is the output\n" + jsonstr
            msg += "\nIt's possible that this was a server error.  If so, it may work if you try again later"
            raise AllInOneError(msg)
        return self.__findInJson(jsondict,"data")
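    # All DAS lookups below go through this helper with a query string such as
    #   self.__getData( 'run dataset=/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO' )
    # and receive the parsed "data" field of the DAS response (a list of dicts).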

    def __getDataType( self ):
        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                if datatype is None:
                    datatype = "unknown"
                return datatype

        dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
                          'dataset.name'%( self.__name ) )
        data = self.__getData( dasQuery_type )

        try:
            return self.__findInJson(data, ["dataset", "datatype"])
        except KeyError:
            print ("Cannot find the datatype of the dataset '%s'\n"
                   "It may not be possible to automatically find the magnetic field,\n"
                   "and you will not be able to run in CRAB mode"
                   %( self.name() ))
            return "unknown"

    def __getParentDataset( self ):
        dasQuery = "parent dataset=" + self.__name
        data = self.__getData( dasQuery )
        try:
            return self.__findInJson(data, ["parent", "name"])
        except KeyError:
            raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
                                "Here is the DAS output:\n" + str(data) +
                                "\nIt's possible that this was a server error.  If so, it may work if you try again later")

    def __getMagneticField( self ):
        Bfieldlocation = os.path.join( self.__cmssw, "python", "Configuration", "StandardSequences" )
        if not os.path.isdir(Bfieldlocation):
            Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
        Bfieldlist = [ f.replace("_cff.py",'') \
                       for f in os.listdir(Bfieldlocation) \
                       if f.startswith("MagneticField_") and f.endswith("_cff.py") ]
        Bfieldlist.sort( key = lambda Bfield: -len(Bfield) ) #sort by decreasing length, so that searching in the name gives the longest match

        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                        datatype = datatype.split("#")[0].strip()
                    if line.startswith("#magnetic field: "):
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                        Bfield = Bfield.split("#")[0].strip()
                if Bfield is not None:
                    Bfield = Bfield.split(",")[0]
                    if Bfield in Bfieldlist or Bfield == "unknown":
                        return Bfield
                    #===========================================================================
                    #For compatibility with already written datasets - remove this at some point
                    #(until the next === line)
                    #It's currently June 2015, anytime starting in 2016 is more than safe
                    elif Bfield == "AutoFromDBCurrent":
                        return "MagneticField"
                    elif "MagneticField_" + Bfield in Bfieldlist:
                        return "MagneticField_" + Bfield
                    #===========================================================================
                    else:
                        print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                        print "Using Bfield='unknown' - this will revert to the default"
                        return "unknown"
                elif datatype == "data":
                    return "MagneticField"           #this should be in the "#magnetic field" line, but for safety in case it got messed up
                else:
                    return "unknown"

        if self.__dataType == "data":
            return "MagneticField"

        dasQuery_B = ( 'dataset dataset=%s'%( self.__name ) )   #try to find the magnetic field from DAS
        data = self.__getData( dasQuery_B )                     #it seems to be there for the newer (7X) MC samples, except cosmics

        try:
            Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
            if Bfield in Bfieldlist:
                return Bfield
            elif Bfield == "38T" or Bfield == "38T_PostLS1":
                return "MagneticField"
            elif "MagneticField_" + Bfield in Bfieldlist:
                return "MagneticField_" + Bfield
            elif Bfield == "":
                pass
            else:
                print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                print "Using Bfield='unknown' - this will revert to the default magnetic field"
                return "unknown"
        except KeyError:
            pass

        for possibleB in Bfieldlist:
            if (possibleB != "MagneticField"
                  and possibleB.replace("MagneticField_","") in self.__name.replace("TkAlCosmics0T", "")):
                #final attempt - try to identify the dataset from the name
                #all cosmics dataset names contain "TkAlCosmics0T"
                if possibleB == "MagneticField_38T" or possibleB == "MagneticField_38T_PostLS1":
                    return "MagneticField"
                return possibleB

        return "unknown"

    def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
        """For MC, this returns the same as __getMagneticField.
        For data, it gets the magnetic field from the runs.  This is important
        for deciding which template to use for offlinevalidation.
        """
        if self.__dataType == "mc" and self.__magneticField == "MagneticField":
            return 3.8                                          #For 3.8T MC the default MagneticField is used
        if "T" in self.__magneticField:
            Bfield = self.__magneticField.split("T")[0].replace("MagneticField_","")
            try:
                return float(Bfield) / 10.0                     #e.g. 38T and 38T_PostLS1 both return 3.8
            except ValueError:
                pass
        if self.__predefined:
            with open(self.__filename) as f:
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#magnetic field: ") and "," in line:
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        return float(line.replace("#magnetic field: ", "").split(",")[1].split("#")[0].strip())

        if run > 0:
            dasQuery = ('run = %s'%run)                         #for data
            data = self.__getData(dasQuery)
            try:
                return self.__findInJson(data, ["run","bfield"])
            except KeyError:
                return "unknown Can't get the magnetic field for run %s from DAS" % run

        #run < 0 - find B field for the first and last runs, and make sure they're compatible
        #  (to within tolerance)
        #NOT FOOLPROOF!  The magnetic field might go up and then down, or vice versa
        if self.__firstusedrun is None or self.__lastusedrun is None:
            return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
        firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
        lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
        try:
            if abs(firstrunB - lastrunB) <= tolerance:
                return .5*(firstrunB + lastrunB)
            print firstrunB, lastrunB, tolerance
            return ("unknown The beginning and end of your run range for %s\n"
                    "have different magnetic fields (%s, %s)!\n"
                    "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                    "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
        except TypeError:
            try:
                if "unknown" in firstrunB:
                    return firstrunB
                else:
                    return lastrunB
            except TypeError:
                return lastrunB
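    # Return convention (as used in the code above): a float on success, e.g.
    # 3.8 for "38T"/"38T_PostLS1", otherwise a string starting with "unknown"
    # followed by an explanation; callers test for the "unknown" prefix.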

    def __getFileInfoList( self, dasLimit, parent = False ):
        if self.__predefined:
            if parent:
                extendstring = "secFiles.extend"
            else:
                extendstring = "readFiles.extend"
            with open(self.__filename) as f:
                files = []
                copy = False
                for line in f.readlines():
                    if "]" in line:
                        copy = False
                    if copy:
                        files.append({"name": line.translate(None, "', " + '"')})
                    if extendstring in line and "[" in line and "]" not in line:
                        copy = True
            return files

        if self.__fileInfoList and not parent:
            return self.__fileInfoList
        if self.__parentFileInfoList and parent:
            return self.__parentFileInfoList

        if parent:
            searchdataset = self.parentDataset()
        else:
            searchdataset = self.__name
        dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
                           'file.creation_time, '
                           'file.modification_time'%( searchdataset ) )
        print "Requesting file information for '%s' from DAS..."%( searchdataset ),
        data = self.__getData( dasQuery_files, dasLimit )
        print "Done."
        data = [ self.__findInJson(entry,"file") for entry in data ]
        if len( data ) == 0:
            msg = ("No files are available for the dataset '%s'.  This can be "
                   "due to a typo or due to a DAS problem.  Please check the "
                   "spelling of the dataset and/or retry to run "
                   "'validateAlignments.py'."%( self.name() ))
            raise AllInOneError( msg )
        fileInformationList = []
        for file in data:
            fileName = 'unknown'
            try:
                fileName = self.__findInJson(file, "name")
                fileCreationTime = self.__findInJson(file, "creation_time")
                fileNEvents = self.__findInJson(file, "nevents")
            except KeyError:
                print ("DAS query gives bad output for file '%s'.  Skipping it.\n"
                       "It may work if you try again later.") % fileName
                fileNEvents = 0
            # select only non-empty files
            if fileNEvents == 0:
                continue
            fileDict = { "name": fileName,
                         "creation_time": fileCreationTime,
                         "nevents": fileNEvents
                       }
            fileInformationList.append( fileDict )
        fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
        if parent:
            self.__parentFileInfoList = fileInformationList
        else:
            self.__fileInfoList = fileInformationList
        return fileInformationList
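    # Each entry of the returned list is a dict of the form (values illustrative):
    #   { "name": "/store/data/.../file.root",
    #     "creation_time": 1355000000,
    #     "nevents": 12345 }
    # sorted by file name; empty files have already been filtered out.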

    def __getRunList( self ):
        if self.__runList:
            return self.__runList
        dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
                          'run.creation_time'%( self.__name ) )
        print "Requesting run information for '%s' from DAS..."%( self.__name ),
        data = self.__getData( dasQuery_runs )
        print "Done."
        data = [ self.__findInJson(entry,"run") for entry in data ]
        data.sort( key = lambda run: self.__findInJson(run, "run_number") )
        self.__runList = data
        return data

    def __datetime(self, stringForDas):
        if len(stringForDas) != 8:
            raise AllInOneError(stringForDas + " is not a valid date string.\n"
                                + "DAS accepts dates in the form 'yyyymmdd'")
        year = stringForDas[:4]
        month = stringForDas[4:6]
        day = stringForDas[6:8]
        return datetime.date(int(year), int(month), int(day))

    def __dateString(self, date):
        return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)
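    # Round-trip examples (dates chosen arbitrarily):
    #   self.__datetime( "20121128" )                    ->  datetime.date(2012, 11, 28)
    #   self.__dateString( datetime.date(2012, 1, 5) )   ->  "20120105"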

    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )

        if begin or end:
            runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

        if begin:
            lastdate = begin
            for delta in [ 1, 5, 10, 20, 30 ]:                 #try searching for about 2 months after begin
                firstdate = lastdate
                lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                dasQuery_begin = "run date between[%s,%s]" % (firstdate, lastdate)
                begindata = self.__getData(dasQuery_begin)
                if len(begindata) > 0:
                    begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'begin' is after the creation time of the last "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    firstRun = runList[runIndex]
                    begin = None
                    break

        if begin:
            raise AllInOneError("No runs within a reasonable time interval after your 'begin'.\n"
                                "Try using a 'begin' that has runs soon after it (within 2 months at most)")

        if end:
            firstdate = end
            for delta in [ 1, 5, 10, 20, 30 ]:                 #try searching for about 2 months before end
                lastdate = firstdate
                firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                dasQuery_end = "run date between[%s,%s]" % (firstdate, lastdate)
                enddata = self.__getData(dasQuery_end)
                if len(enddata) > 0:
                    enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'end' is before the creation time of the first "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    lastRun = runList[runIndex]
                    end = None
                    break

        if end:
            raise AllInOneError("No runs within a reasonable time interval before your 'end'.\n"
                                "Try using an 'end' that has runs soon before it (within 2 months at most)")

        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun
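    # Usage sketch (hypothetical date): with only a 'begin' given,
    #   self.convertTimeToRun( begin = "20121128" )
    # queries DAS for runs in a growing window (up to ~2 months) after that
    # date and returns (firstRun, None); __createSnippet then filters on run
    # numbers instead of dates.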

    def dataType( self ):
        if not self.__dataType:
            self.__dataType = self.__getDataType()
        return self.__dataType

    def magneticField( self ):
        if not self.__magneticField:
            self.__magneticField = self.__getMagneticField()
        return self.__magneticField

    def magneticFieldForRun( self, run = -1 ):
        return self.__getMagneticFieldForRun(run)

    def parentDataset( self ):
        if not self.__parentDataset:
            self.__parentDataset = self.__getParentDataset()
        return self.__parentDataset

    def datasetSnippet( self, jsonPath = None, begin = None, end = None,
                        firstRun = None, lastRun = None, crab = False, parent = False ):
        if self.__predefined and parent:
            with open(self.__filename) as f:
                if "secFiles.extend" not in f.read():
                    msg = ("The predefined dataset '%s' does not contain secondary files, "
                           "which your validation requires!") % self.__name
                    if self.__official:
                        self.__name = self.__origName
                        self.__predefined = False
                        print msg
                        print ("Retrieving the files from DAS.  You will be asked if you want "
                               "to overwrite the old dataset.\n"
                               "It will still be compatible with validations that don't need secondary files.")
                    else:
                        raise AllInOneError(msg)

        if self.__predefined:
            snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                       "process.maxEvents = cms.untracked.PSet(\n"
                       "    input = cms.untracked.int32(.oO[nEvents]Oo. / .oO[parallelJobs]Oo.)\n"
                       ")\n"
                       "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)"
                       %(self.__name))
            if not parent:
                with open(self.__filename) as f:
                    if "secFiles.extend" in f.read():
                        snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
            return snippet
        theMap = { "process": "process.",
                   "tab": " " * len( "process." ),
                   "nEvents": ".oO[nEvents]Oo. / .oO[parallelJobs]Oo.",
                   "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)\n",
                   "importCms": "",
                   "header": ""
                 }
        datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                               begin = begin,
                                               end = end,
                                               firstRun = firstRun,
                                               lastRun = lastRun,
                                               repMap = theMap,
                                               crab = crab,
                                               parent = parent )
        if jsonPath == "" and begin == "" and end == "" and firstRun == "" and lastRun == "":
            try:
                self.dump_cff(parent = parent)
            except AllInOneError as e:
                print "Can't store the dataset as a cff:"
                print e
                print "This may be inconvenient in the future, but will not cause a problem for this validation."
        return datasetSnippet

    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None, parent = False ):
        if self.__alreadyStored:
            return
        self.__alreadyStored = True
        if outName == None:
            outName = "Dataset" + self.__name.replace("/", "_")
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        if not os.path.exists( os.path.join(
            self.__cmssw, "src", packageName ) ):
            msg = ("You are trying to store the predefined dataset '%s'.\n"
                   "For that you need to check out the package '%s' to your "
                   "private release area in\n"%( outName, packageName )
                   + self.__cmssw )
            raise AllInOneError( msg )
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "skipEventsString": "",
                   "importCms": "import FWCore.ParameterSet.Config as cms\n",
                   "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
                             "#%(name)s\n"
                             "#data type: %(dataType)s\n"
                             "#magnetic field: .oO[magneticField]Oo.\n"  #put in the magnetic field later
                             %{"name": self.__name,                      #need to create the snippet before getting the magnetic field
                               "dataType": self.__dataType}              #so that we know the first and last runs
                 }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap,
                                            parent = parent)
        magneticField = self.__magneticField
        if magneticField == "MagneticField":
            magneticField = "%s, %s #%s" % (magneticField,
                                            str(self.__getMagneticFieldForRun()).replace("\n"," ").split("#")[0].strip(),
                                            "Use MagneticField_cff.py; the number is for determining which track selection to use."
                                           )
        dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
        filePath = os.path.join( self.__cmssw, "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = raw_input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print ( "The predefined dataset '%s' will be stored in the file\n"
                %( outName )
                + filePath +
                "\nFor future use you have to do 'scram b'." )
        print
        theFile = open( filePath, "w" )
        theFile.write( dataset_cff )
        theFile.close()
        return

    def fileList( self, parent = False ):
        if self.__fileList and not parent:
            return self.__fileList
        if self.__parentFileList and parent:
            return self.__parentFileList

        fileList = [ self.__findInJson(fileInfo,"name") \
                     for fileInfo in self.fileInfoList(parent) ]

        if not parent:
            self.__fileList = fileList
        else:
            self.__parentFileList = fileList
        return fileList

    def fileInfoList( self, parent = False ):
        return self.__getFileInfoList( self.__dasLimit, parent )

    def name( self ):
        return self.__name

    def predefined( self ):
        return self.__predefined

    def runList( self ):
        if self.__runList:
            return self.__runList
        return self.__getRunList()

if __name__ == '__main__':
    print "Start testing..."
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print dataset.datasetSnippet( jsonPath = jsonFile,
                                  firstRun = "207800",
                                  end = "20121128" )
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207800",
                      end = "20121128" )