dataset.py
# idea stolen from:
# http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
# PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
import das_client
import json
import os
import bisect
import re
import datetime
from FWCore.PythonUtilities.LumiList import LumiList
from TkAlExceptions import AllInOneError


class Dataset:
    def __init__( self, datasetName, dasLimit = 0, tryPredefinedFirst = True,
                  cmssw = os.environ["CMSSW_BASE"], cmsswrelease = os.environ["CMSSW_RELEASE_BASE"]):
        self.__name = datasetName
        self.__origName = datasetName
        self.__dasLimit = dasLimit
        self.__fileList = None
        self.__fileInfoList = None
        self.__runList = None
        self.__alreadyStored = False
        self.__cmssw = cmssw
        self.__cmsswrelease = cmsswrelease
        self.__firstusedrun = None
        self.__lastusedrun = None
        self.__parentDataset = None
        self.__parentFileList = None
        self.__parentFileInfoList = None

        # check if the dataset name matches the CMS dataset naming scheme
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__official = True
            fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
        else:
            self.__official = False
            fileName = self.__name + "_cff.py"

        searchPath1 = os.path.join( self.__cmssw, "python",
                                    "Alignment", "OfflineValidation",
                                    fileName )
        searchPath2 = os.path.join( self.__cmssw, "src",
                                    "Alignment", "OfflineValidation",
                                    "python", fileName )
        searchPath3 = os.path.join( self.__cmsswrelease,
                                    "python", "Alignment",
                                    "OfflineValidation", fileName )
        if self.__official and not tryPredefinedFirst:
            self.__predefined = False
        elif os.path.exists( searchPath1 ):
            self.__predefined = True
            self.__filename = searchPath1
        elif os.path.exists( searchPath2 ):
            msg = ("The predefined dataset '%s' does exist in '%s', but "
                   "you need to run 'scram b' first."
                   %( self.__name, searchPath2 ))
            if self.__official:
                print msg
                print "Getting the data from DAS again. To go faster next time, run scram b."
                self.__predefined = False
            else:
                raise AllInOneError( msg )
        elif os.path.exists( searchPath3 ):
            self.__predefined = True
            self.__filename = searchPath3
        elif self.__official:
            self.__predefined = False
        else:
            msg = ("The predefined dataset '%s' does not exist. Please "
                   "create it first or check for typos."%( self.__name ))
            raise AllInOneError( msg )

        if self.__predefined and self.__official:
            self.__name = "Dataset" + self.__name.replace("/","_")

        self.__dataType = self.__getDataType()
        self.__magneticField = self.__getMagneticField()

    def __chunks( self, theList, n ):
        """ Yield successive n-sized chunks from theList.
        """
        for i in xrange( 0, len( theList ), n ):
            yield theList[i:i+n]

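    # Illustrative note (not part of the original file): __chunks yields
    # successive pieces of at most n elements, e.g.
    #     list(self.__chunks(['a', 'b', 'c', 'd', 'e'], 2))
    #     --> [['a', 'b'], ['c', 'd'], ['e']]
    # The snippet-building code below relies on this to keep each generated
    # readFiles.extend(...) / lumiSecs.extend(...) call under 255 entries.
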
    __source_template = ("%(header)s"
                         "%(importCms)s"
                         "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                         "%(goodLumiSecStr)s"
                         "readFiles = cms.untracked.vstring()\n"
                         "secFiles = cms.untracked.vstring()\n"
                         "%(process)ssource = cms.Source(\"PoolSource\",\n"
                         "%(lumiStr)s"
                         "%(tab)s secondaryFileNames = secFiles,\n"
                         "%(tab)s fileNames = readFiles\n"
                         ")\n"
                         "%(files)s\n"
                         "%(lumiSecExtend)s\n"
                         "%(process)smaxEvents = cms.untracked.PSet( "
                         "input = cms.untracked.int32(%(nEvents)s) )\n"
                         "%(skipEventsString)s\n")

    __dummy_source_template = ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s secondaryFileNames = secFiles,\n"
                               "%(tab)s fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "%(skipEventsString)s\n")

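    # Illustrative sketch (the repMap values here are assumptions, not taken
    # from the original file): the templates above are filled with ordinary
    # %-dict formatting, e.g.
    #     self.__dummy_source_template % {"process": "process.",
    #                                     "tab": " " * 8,
    #                                     "nEvents": "100",
    #                                     "skipEventsString": ""}
    # which yields a minimal cms.Source("PoolSource", ...) configuration with
    # a placeholder 'dummy_File.root' entry, as used for the CRAB case below.
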
    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False, parent = False ):
        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun')." )
            raise AllInOneError( msg )
        if self.predefined() and (jsonPath or begin or end or firstRun or lastRun):
            msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
                    "only work for official datasets, not predefined _cff.py files" )
            raise AllInOneError( msg )
        goodLumiSecStr = ""
        lumiStr = ""
        lumiSecExtend = ""
        if firstRun or lastRun or jsonPath:
            goodLumiSecStr = ( "lumiSecs = cms.untracked."
                               "VLuminosityBlockRange()\n" )
            lumiStr = " lumisToProcess = lumiSecs,\n"
            if not jsonPath:
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") <= lastRun ]
                lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-" \
                             + str( self.__findInJson(run, "run_number") ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = None
                try:
                    theLumiList = LumiList ( filename = jsonPath )
                except ValueError:
                    pass

                if theLumiList is not None:
                    allRuns = theLumiList.getRuns()
                    runsToRemove = []
                    for run in allRuns:
                        if firstRun and int( run ) < firstRun:
                            runsToRemove.append( run )
                        if lastRun and int( run ) > lastRun:
                            runsToRemove.append( run )
                    theLumiList.removeRuns( runsToRemove )
                    splitLumiList = list( self.__chunks(
                        theLumiList.getCMSSWString().split(','), 255 ) )
                else:
                    with open(jsonPath) as f:
                        jsoncontents = f.read()
                        if "process.source.lumisToProcess" in jsoncontents:
                            msg = "%s is not a json file, but it seems to be a CMSSW lumi selection cff snippet. Trying to use it" % jsonPath
                            if firstRun or lastRun:
                                msg += ("\n (after applying firstRun and/or lastRun)")
                            msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                            msg += "\nCheck your config file to make sure that it worked properly."
                            print msg

                            runlist = self.__getRunList()
                            if firstRun or lastRun:
                                self.__firstusedrun = -1
                                self.__lastusedrun = -1
                                jsoncontents = re.sub(r"\d+:(\d+|max)-\d+:(\d+|max)", self.getForceRunRangeFunction(firstRun, lastRun), jsoncontents)
                                self.__firstusedrun = max(self.__firstusedrun, int(self.__findInJson(runlist[0],"run_number")))
                                self.__lastusedrun = min(self.__lastusedrun, int(self.__findInJson(runlist[-1],"run_number")))
                            else:
                                self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                                self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))
                            lumiSecExtend = jsoncontents
                            splitLumiList = [[""]]

            if not len(splitLumiList[0][0]) == 0:
                lumiSecStr = [ "',\n'".join( lumis ) \
                               for lumis in splitLumiList ]
                lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                               for lumis in lumiSecStr ]
                lumiSecExtend = "\n".join( lumiSecStr )
                runlist = self.__getRunList()
                self.__firstusedrun = max(int(splitLumiList[0][0].split(":")[0]), int(self.__findInJson(runlist[0],"run_number")))
                self.__lastusedrun = min(int(splitLumiList[-1][-1].split(":")[0]), int(self.__findInJson(runlist[-1],"run_number")))
        else:
            runlist = self.__getRunList()
            self.__firstusedrun = int(self.__findInJson(self.__getRunList()[0],"run_number"))
            self.__lastusedrun = int(self.__findInJson(self.__getRunList()[-1],"run_number"))

        if crab:
            files = ""
        else:
            splitFileList = list( self.__chunks( self.fileList(), 255 ) )
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                        for files in fileStr ]
            files = "\n".join( fileStr )

            if parent:
                splitParentFileList = list( self.__chunks( self.fileList(parent = True), 255 ) )
                parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
                parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \
                                  for parentFiles in parentFileStr ]
                parentFiles = "\n".join( parentFileStr )
                files += "\n\n" + parentFiles

        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet

    def __find_lt( self, a, x ):
        'Find the index of the rightmost value less than x'
        i = bisect.bisect_left( a, x )
        if i:
            return i-1
        raise ValueError

    def __find_ge( self, a, x):
        'Find the index of the leftmost item greater than or equal to x'
        i = bisect.bisect_left( a, x )
        if i != len( a ):
            return i
        raise ValueError

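    # Worked example (not part of the original file): both helpers return an
    # *index* into the sorted list a, not the value itself. With a = [1, 3, 5, 7]:
    #     self.__find_lt(a, 6)  --> 2   (a[2] == 5, rightmost value < 6)
    #     self.__find_ge(a, 6)  --> 3   (a[3] == 7, leftmost value >= 6)
    # Each raises ValueError when no such element exists; convertTimeToRun
    # below turns that into an AllInOneError with a readable message.
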
    def __findInJson(self, jsondict, strings):
        if isinstance(strings, str):
            strings = [ strings ]

        if len(strings) == 0:
            return jsondict
        if isinstance(jsondict,dict):
            if strings[0] in jsondict:
                try:
                    return self.__findInJson(jsondict[strings[0]], strings[1:])
                except KeyError:
                    pass
        else:
            for a in jsondict:
                if strings[0] in a:
                    try:
                        return self.__findInJson(a[strings[0]], strings[1:])
                    except (TypeError, KeyError):  #TypeError because a could be a string and contain strings[0]
                        pass
        #if it's not found
        raise KeyError("Can't find " + strings[0])

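    # Worked example (not part of the original file): __findInJson walks nested
    # DAS output, descending through dicts by key and searching lists of dicts:
    #     self.__findInJson({"dataset": {"name": "/A/B/C"}}, ["dataset", "name"])
    #     --> "/A/B/C"
    #     self.__findInJson([{"run": {"run_number": 1}}], ["run", "run_number"])
    #     --> 1
    # A KeyError is raised if the requested path does not exist anywhere.
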
    def forcerunrange(self, firstRun, lastRun, s):
        """s must be in the format run1:lum1-run2:lum2"""
        s = s.group()
        run1 = s.split("-")[0].split(":")[0]
        lum1 = s.split("-")[0].split(":")[1]
        run2 = s.split("-")[1].split(":")[0]
        lum2 = s.split("-")[1].split(":")[1]
        if int(run2) < firstRun or int(run1) > lastRun:
            return ""
        if int(run1) < firstRun or firstRun < 0:
            run1 = firstRun
            lum1 = 1
        if int(run2) > lastRun:
            run2 = lastRun
            lum2 = "max"
        if int(run1) < self.__firstusedrun or self.__firstusedrun < 0:
            self.__firstusedrun = int(run1)
        if int(run2) > self.__lastusedrun:
            self.__lastusedrun = int(run2)
        return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)

    def getForceRunRangeFunction(self, firstRun, lastRun):
        def forcerunrangefunction(s):
            return self.forcerunrange(firstRun, lastRun, s)
        return forcerunrangefunction

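    # Worked example (not part of the original file): getForceRunRangeFunction
    # is a re.sub callback factory that clips every "run1:lumi1-run2:lumi2"
    # range in a lumi-selection string to [firstRun, lastRun]:
    #     re.sub(r"\d+:(\d+|max)-\d+:(\d+|max)",
    #            self.getForceRunRangeFunction(2, 3), "1:5-4:max")
    #     --> "2:1-3:max"
    # Ranges lying entirely outside [firstRun, lastRun] are replaced by "",
    # and self.__firstusedrun/__lastusedrun are updated as a side effect.
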
    def __getData( self, dasQuery, dasLimit = 0 ):
        dasData = das_client.get_data( 'https://cmsweb.cern.ch',
                                       dasQuery, 0, dasLimit, False )
        if isinstance(dasData, str):
            jsondict = json.loads( dasData )
        else:
            jsondict = dasData
        # Check if the DAS query failed
        try:
            error = self.__findInJson(jsondict,["data","error"])
        except KeyError:
            error = None
        if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
            jsonstr = str(jsondict)
            if len(jsonstr) > 10000:
                jsonfile = "das_query_output_%i.txt"
                i = 0
                while os.path.lexists(jsonfile % i):
                    i += 1
                jsonfile = jsonfile % i
                theFile = open( jsonfile, "w" )
                theFile.write( jsonstr )
                theFile.close()
                msg = "The DAS query returned an error. The output is very long, and has been stored in:\n" + jsonfile
            else:
                msg = "The DAS query returned an error. Here is the output\n" + jsonstr
            msg += "\nIt's possible that this was a server error. If so, it may work if you try again later"
            raise AllInOneError(msg)
        return self.__findInJson(jsondict,"data")

    def __getDataType( self ):
        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                return datatype
            return "unknown"

        dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
                          'dataset.name'%( self.__name ) )
        data = self.__getData( dasQuery_type )

        try:
            return self.__findInJson(data, ["dataset", "datatype"])
        except KeyError:
            print ("Cannot find the datatype of the dataset '%s'\n"
                   "It may not be possible to automatically find the magnetic field,\n"
                   "and you will not be able to run in CRAB mode"
                   %( self.name() ))
            return "unknown"

    def __getParentDataset( self ):
        dasQuery = "parent dataset=" + self.__name
        data = self.__getData( dasQuery )
        try:
            return self.__findInJson(data, ["parent", "name"])
        except KeyError:
            raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
                                "Here is the DAS output:\n" + str(data) +
                                "\nIt's possible that this was a server error. If so, it may work if you try again later")

    def __getMagneticField( self ):
        Bfieldlocation = os.path.join( self.__cmssw, "python", "Configuration", "StandardSequences" )
        if not os.path.isdir(Bfieldlocation):
            Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
        Bfieldlist = [ f.replace("_cff.py",'') \
                       for f in os.listdir(Bfieldlocation) \
                       if f.startswith("MagneticField_") and f.endswith("_cff.py") ]
        Bfieldlist.sort( key = lambda Bfield: -len(Bfield) ) #Put it in order of decreasing length, so that searching in the name gives the longest match

        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                        datatype = datatype.split("#")[0].strip()
                    if line.startswith("#magnetic field: "):
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                        Bfield = Bfield.split("#")[0].strip()
                if Bfield is not None:
                    Bfield = Bfield.split(",")[0]
                    if Bfield in Bfieldlist or Bfield == "unknown":
                        return Bfield
                    #===========================================================================
                    #For compatibility with already written datasets - remove this at some point
                    #(until the next === line)
                    #It's currently June 2015, anytime starting in 2016 is more than safe
                    elif Bfield == "AutoFromDBCurrent":
                        return "MagneticField"
                    elif "MagneticField_" + Bfield in Bfieldlist:
                        return "MagneticField_" + Bfield
                    #===========================================================================
                    else:
                        print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                        print "Using Bfield='unknown' - this will revert to the default"
                        return "unknown"
                elif datatype == "data":
                    return "MagneticField" #this should be in the "#magnetic field" line, but for safety in case it got messed up
                else:
                    return "unknown"

        if self.__dataType == "data":
            return "MagneticField"

        dasQuery_B = ( 'dataset dataset=%s'%( self.__name ) ) #try to find the magnetic field from DAS
        data = self.__getData( dasQuery_B )                   #it seems to be there for the newer (7X) MC samples, except cosmics

        try:
            Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
            if Bfield in Bfieldlist:
                return Bfield
            elif Bfield == "38T" or Bfield == "38T_PostLS1":
                return "MagneticField"
            elif "MagneticField_" + Bfield in Bfieldlist:
                return "MagneticField_" + Bfield
            elif Bfield == "":
                pass
            else:
                print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                print "Using Bfield='unknown' - this will revert to the default magnetic field"
                return "unknown"
        except KeyError:
            pass

        for possibleB in Bfieldlist:
            if (possibleB != "MagneticField"
                    and possibleB.replace("MagneticField_","") in self.__name.replace("TkAlCosmics0T", "")):
                #final attempt - try to identify the dataset from the name
                #all cosmics dataset names contain "TkAlCosmics0T"
                if possibleB == "MagneticField_38T" or possibleB == "MagneticField_38T_PostLS1":
                    return "MagneticField"
                return possibleB

        return "unknown"

    def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
        """For MC, this returns the same as the previous function.
           For data, it gets the magnetic field from the runs. This is important for
           deciding which template to use for offlinevalidation.
        """
        if self.__dataType == "mc" and self.__magneticField == "MagneticField":
            return 3.8 #For 3.8T MC the default MagneticField is used
        if "T" in self.__magneticField:
            Bfield = self.__magneticField.split("T")[0].replace("MagneticField_","")
            try:
                return float(Bfield) / 10.0 #e.g. 38T and 38T_PostLS1 both return 3.8
            except ValueError:
                pass
        if self.__predefined:
            with open(self.__filename) as f:
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#magnetic field: ") and "," in line:
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        return float(line.replace("#magnetic field: ", "").split(",")[1].split("#")[0].strip())

        if run > 0:
            dasQuery = ('run = %s'%run) #for data
            data = self.__getData(dasQuery)
            try:
                return self.__findInJson(data, ["run","bfield"])
            except KeyError:
                return "unknown Can't get the magnetic field for run %s from DAS" % run

        #run < 0 - find B field for the first and last runs, and make sure they're compatible
        #          (to within tolerance)
        #NOT FOOLPROOF! The magnetic field might go up and then down, or vice versa
        if self.__firstusedrun is None or self.__lastusedrun is None:
            return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
        firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
        lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
        try:
            if abs(firstrunB - lastrunB) <= tolerance:
                return .5*(firstrunB + lastrunB)
            print firstrunB, lastrunB, tolerance
            return ("unknown The beginning and end of your run range for %s\n"
                    "have different magnetic fields (%s, %s)!\n"
                    "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                    "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
        except TypeError:
            try:
                if "unknown" in firstrunB:
                    return firstrunB
                else:
                    return lastrunB
            except TypeError:
                return lastrunB

    def __getFileInfoList( self, dasLimit, parent = False ):
        if self.__predefined:
            if parent:
                extendstring = "secFiles.extend"
            else:
                extendstring = "readFiles.extend"
            with open(self.__filename) as f:
                files = []
                copy = False
                for line in f.readlines():
                    if "]" in line:
                        copy = False
                    if copy:
                        files.append({"name": line.translate(None, "', " + '"')})
                    if extendstring in line and "[" in line and "]" not in line:
                        copy = True
            return files

        if self.__fileInfoList and not parent:
            return self.__fileInfoList
        if self.__parentFileInfoList and parent:
            return self.__parentFileInfoList

        if parent:
            searchdataset = self.parentDataset()
        else:
            searchdataset = self.__name
        dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
                           'file.creation_time, '
                           'file.modification_time'%( searchdataset ) )
        print "Requesting file information for '%s' from DAS..."%( searchdataset ),
        data = self.__getData( dasQuery_files, dasLimit )
        print "Done."
        data = [ self.__findInJson(entry,"file") for entry in data ]
        if len( data ) == 0:
            msg = ("No files are available for the dataset '%s'. This can be "
                   "due to a typo or due to a DAS problem. Please check the "
                   "spelling of the dataset and/or retry to run "
                   "'validateAlignments.py'."%( self.name() ))
            raise AllInOneError( msg )
        fileInformationList = []
        for file in data:
            fileName = 'unknown'
            try:
                fileName = self.__findInJson(file, "name")
                fileCreationTime = self.__findInJson(file, "creation_time")
                fileNEvents = self.__findInJson(file, "nevents")
            except KeyError:
                print ("DAS query gives bad output for file '%s'. Skipping it.\n"
                       "It may work if you try again later.") % fileName
                fileNEvents = 0
            # select only non-empty files
            if fileNEvents == 0:
                continue
            fileDict = { "name": fileName,
                         "creation_time": fileCreationTime,
                         "nevents": fileNEvents
                       }
            fileInformationList.append( fileDict )
        fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
        if parent:
            self.__parentFileInfoList = fileInformationList
        else:
            self.__fileInfoList = fileInformationList
        return fileInformationList

    def __getRunList( self ):
        if self.__runList:
            return self.__runList
        dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
                          'run.creation_time'%( self.__name ) )
        print "Requesting run information for '%s' from DAS..."%( self.__name ),
        data = self.__getData( dasQuery_runs )
        print "Done."
        data = [ self.__findInJson(entry,"run") for entry in data ]
        data.sort( key = lambda run: self.__findInJson(run, "run_number") )
        self.__runList = data
        return data

    def __datetime(self, stringForDas):
        if len(stringForDas) != 8:
            raise AllInOneError(stringForDas + " is not a valid date string.\n"
                                + "DAS accepts dates in the form 'yyyymmdd'")
        year = stringForDas[:4]
        month = stringForDas[4:6]
        day = stringForDas[6:8]
        return datetime.date(int(year), int(month), int(day))

    def __dateString(self, date):
        return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)

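    # Worked example (not part of the original file): these two helpers convert
    # between the 'yyyymmdd' strings that DAS accepts and datetime.date objects:
    #     self.__datetime("20121128")                     --> datetime.date(2012, 11, 28)
    #     self.__dateString(datetime.date(2012, 11, 28)) --> "20121128"
    # convertTimeToRun below uses them to widen the date search window step by step.
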
    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )

        if begin or end:
            runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

        if begin:
            lastdate = begin
            for delta in [ 1, 5, 10, 20, 30 ]: #try searching for about 2 months after begin
                firstdate = lastdate
                lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                dasQuery_begin = "run date between[%s,%s]" % (firstdate, lastdate)
                begindata = self.__getData(dasQuery_begin)
                if len(begindata) > 0:
                    begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'begin' is after the creation time of the last "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    firstRun = runList[runIndex]
                    begin = None
                    break

        if begin:
            raise AllInOneError("No runs within a reasonable time interval after your 'begin'.\n"
                                "Try using a 'begin' that has runs soon after it (within 2 months at most)")

        if end:
            firstdate = end
            for delta in [ 1, 5, 10, 20, 30 ]: #try searching for about 2 months before end
                lastdate = firstdate
                firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                dasQuery_end = "run date between[%s,%s]" % (firstdate, lastdate)
                enddata = self.__getData(dasQuery_end)
                if len(enddata) > 0:
                    enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'end' is before the creation time of the first "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    lastRun = runList[runIndex]
                    end = None
                    break

        if end:
            raise AllInOneError("No runs within a reasonable time interval before your 'end'.\n"
                                "Try using an 'end' that has runs soon before it (within 2 months at most)")

        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun

    def dataType( self ):
        if not self.__dataType:
            self.__dataType = self.__getDataType()
        return self.__dataType

    def magneticField( self ):
        if not self.__magneticField:
            self.__magneticField = self.__getMagneticField()
        return self.__magneticField

    def magneticFieldForRun( self, run = -1 ):
        return self.__getMagneticFieldForRun(run)

    def parentDataset( self ):
        if not self.__parentDataset:
            self.__parentDataset = self.__getParentDataset()
        return self.__parentDataset

    def datasetSnippet( self, jsonPath = None, begin = None, end = None,
                        firstRun = None, lastRun = None, crab = False, parent = False ):
        if self.__predefined and parent:
            with open(self.__filename) as f:
                if "secFiles.extend" not in f.read():
                    msg = ("The predefined dataset '%s' does not contain secondary files, "
                           "which your validation requires!") % self.__name
                    if self.__official:
                        self.__name = self.__origName
                        self.__predefined = False
                        print msg
                        print ("Retrieving the files from DAS. You will be asked if you want "
                               "to overwrite the old dataset.\n"
                               "It will still be compatible with validations that don't need secondary files.")
                    else:
                        raise AllInOneError(msg)

        if self.__predefined:
            snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                       "process.maxEvents = cms.untracked.PSet(\n"
                       " input = cms.untracked.int32(.oO[nEvents]Oo. / .oO[parallelJobs]Oo.)\n"
                       ")\n"
                       "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)"
                       %(self.__name))
            if not parent:
                with open(self.__filename) as f:
                    if "secFiles.extend" in f.read():
                        snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
            return snippet
        theMap = { "process": "process.",
                   "tab": " " * len( "process." ),
                   "nEvents": ".oO[nEvents]Oo. / .oO[parallelJobs]Oo.",
                   "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)\n",
                   "importCms": "",
                   "header": ""
                 }
        datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                               begin = begin,
                                               end = end,
                                               firstRun = firstRun,
                                               lastRun = lastRun,
                                               repMap = theMap,
                                               crab = crab,
                                               parent = parent )
        if jsonPath == "" and begin == "" and end == "" and firstRun == "" and lastRun == "":
            try:
                self.dump_cff(parent = parent)
            except AllInOneError, e:
                print "Can't store the dataset as a cff:"
                print e
                print "This may be inconvenient in the future, but will not cause a problem for this validation."
        return datasetSnippet

    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None, parent = False ):
        if self.__alreadyStored:
            return
        self.__alreadyStored = True
        if outName == None:
            outName = "Dataset" + self.__name.replace("/", "_")
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        if not os.path.exists( os.path.join(
            self.__cmssw, "src", packageName ) ):
            msg = ("You are trying to store the predefined dataset '%s'.\n"
                   "For that you need to check out the package '%s' to your "
                   "private release area in\n"%( outName, packageName )
                   + self.__cmssw )
            raise AllInOneError( msg )
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "skipEventsString": "",
                   "importCms": "import FWCore.ParameterSet.Config as cms\n",
                   "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
                             "#%(name)s\n"
                             "#data type: %(dataType)s\n"
                             "#magnetic field: .oO[magneticField]Oo.\n" #put in the magnetic field later
                             %{"name": self.__name,        #need to create the snippet before getting the magnetic field
                               "dataType": self.__dataType} #so that we know the first and last runs
                 }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap,
                                            parent = parent)
        magneticField = self.__magneticField
        if magneticField == "MagneticField":
            magneticField = "%s, %s #%s" % (magneticField,
                                            str(self.__getMagneticFieldForRun()).replace("\n"," ").split("#")[0].strip(),
                                            "Use MagneticField_cff.py; the number is for determining which track selection to use."
                                           )
        dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
        filePath = os.path.join( self.__cmssw, "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = raw_input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print ( "The predefined dataset '%s' will be stored in the file\n"
                %( outName )
                + filePath +
                "\nFor future use you have to do 'scram b'." )
        print
        theFile = open( filePath, "w" )
        theFile.write( dataset_cff )
        theFile.close()
        return

    def fileList( self, parent = False ):
        if self.__fileList and not parent:
            return self.__fileList
        if self.__parentFileList and parent:
            return self.__parentFileList

        fileList = [ self.__findInJson(fileInfo,"name") \
                     for fileInfo in self.fileInfoList(parent) ]

        if not parent:
            self.__fileList = fileList
        else:
            self.__parentFileList = fileList
        return fileList

    def fileInfoList( self, parent = False ):
        return self.__getFileInfoList( self.__dasLimit, parent )

    def name( self ):
        return self.__name

    def predefined( self ):
        return self.__predefined

    def runList( self ):
        if self.__runList:
            return self.__runList
        return self.__getRunList()


if __name__ == '__main__':
    print "Start testing..."
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print dataset.datasetSnippet( jsonPath = jsonFile,
                                  firstRun = "207800",
                                  end = "20121128" )
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207800",
                      end = "20121128" )