dataset.py
# idea stolen from:
# http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
# PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
import das_client
import json
import os
import bisect
import re
import datetime
from FWCore.PythonUtilities.LumiList import LumiList
from TkAlExceptions import AllInOneError


class Dataset:
    def __init__( self, datasetName, dasLimit = 0, tryPredefinedFirst = True,
                  cmssw = os.environ["CMSSW_BASE"], cmsswrelease = os.environ["CMSSW_RELEASE_BASE"]):
        self.__name = datasetName
        self.__origName = datasetName
        self.__dasLimit = dasLimit
        self.__fileList = None
        self.__fileInfoList = None
        self.__runList = None
        self.__alreadyStored = False
        self.__cmssw = cmssw
        self.__cmsswrelease = cmsswrelease
        self.__firstusedrun = None
        self.__lastusedrun = None
        self.__parentDataset = None
        self.__parentFileList = None
        self.__parentFileInfoList = None

        # check if the dataset name matches the CMS dataset naming scheme
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__official = True
            fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
        else:
            self.__official = False
            fileName = self.__name + "_cff.py"

        searchPath1 = os.path.join( self.__cmssw, "python",
                                    "Alignment", "OfflineValidation",
                                    fileName )
        searchPath2 = os.path.join( self.__cmssw, "src",
                                    "Alignment", "OfflineValidation",
                                    "python", fileName )
        searchPath3 = os.path.join( self.__cmsswrelease,
                                    "python", "Alignment",
                                    "OfflineValidation", fileName )
        if self.__official and not tryPredefinedFirst:
            self.__predefined = False
        elif os.path.exists( searchPath1 ):
            self.__predefined = True
            self.__filename = searchPath1
        elif os.path.exists( searchPath2 ):
            msg = ("The predefined dataset '%s' does exist in '%s', but "
                   "you need to run 'scram b' first."
                   %( self.__name, searchPath2 ))
            if self.__official:
                print msg
                print "Getting the data from DAS again. To go faster next time, run scram b."
            else:
                raise AllInOneError( msg )
        elif os.path.exists( searchPath3 ):
            self.__predefined = True
            self.__filename = searchPath3
        elif self.__official:
            self.__predefined = False
        else:
            msg = ("The predefined dataset '%s' does not exist. Please "
                   "create it first or check for typos."%( self.__name ))
            raise AllInOneError( msg )

        if self.__predefined and self.__official:
            self.__name = "Dataset" + self.__name.replace("/","_")

        self.__dataType = self.__getDataType()
        self.__magneticField = self.__getMagneticField()

    def __chunks( self, theList, n ):
        """Yield successive n-sized chunks from theList."""
        for i in xrange( 0, len( theList ), n ):
            yield theList[i:i+n]
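    # For example (hypothetical input), list(self.__chunks(range(5), 2)) yields
    # [[0, 1], [2, 3], [4]].  It is used below to keep each generated
    # readFiles.extend(...) / lumiSecs.extend(...) call to at most 255 entries.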

    __source_template = ("%(header)s"
                         "%(importCms)s"
                         "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                         "%(goodLumiSecStr)s"
                         "readFiles = cms.untracked.vstring()\n"
                         "secFiles = cms.untracked.vstring()\n"
                         "%(process)ssource = cms.Source(\"PoolSource\",\n"
                         "%(lumiStr)s"
                         "%(tab)s secondaryFileNames = secFiles,\n"
                         "%(tab)s fileNames = readFiles\n"
                         ")\n"
                         "%(files)s\n"
                         "%(lumiSecExtend)s\n"
                         "%(process)smaxEvents = cms.untracked.PSet( "
                         "input = cms.untracked.int32(%(nEvents)s) )\n"
                         "%(skipEventsString)s\n")

    __dummy_source_template = ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s secondaryFileNames = secFiles,\n"
                               "%(tab)s fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "%(skipEventsString)s\n")
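    # Rendered with, e.g., %(process)s = "process.", %(tab)s = eight spaces and
    # %(nEvents)s = "100" (hypothetical values), __dummy_source_template
    # expands to roughly:
    #
    #   readFiles = cms.untracked.vstring()
    #   secFiles = cms.untracked.vstring()
    #   process.source = cms.Source("PoolSource",
    #             secondaryFileNames = secFiles,
    #             fileNames = readFiles
    #   )
    #   readFiles.extend(['dummy_File.root'])
    #   process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(100) )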

    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False, parent = False ):
        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun')." )
            raise AllInOneError( msg )
        if self.predefined() and (jsonPath or begin or end or firstRun or lastRun):
            msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
                    "only work for official datasets, not predefined _cff.py files" )
            raise AllInOneError( msg )
        goodLumiSecStr = ""
        lumiStr = ""
        lumiSecExtend = ""
        if firstRun or lastRun or jsonPath:
            goodLumiSecStr = ( "lumiSecs = cms.untracked."
                               "VLuminosityBlockRange()\n" )
            lumiStr = " lumisToProcess = lumiSecs,\n"
            if not jsonPath:
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") <= lastRun ]
                lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-" \
                             + str( self.__findInJson(run, "run_number") ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = None
                try:
                    theLumiList = LumiList ( filename = jsonPath )
                except ValueError:
                    pass

                if theLumiList is not None:
                    allRuns = theLumiList.getRuns()
                    runsToRemove = []
                    for run in allRuns:
                        if firstRun and int( run ) < firstRun:
                            runsToRemove.append( run )
                        if lastRun and int( run ) > lastRun:
                            runsToRemove.append( run )
                    theLumiList.removeRuns( runsToRemove )
                    splitLumiList = list( self.__chunks(
                                        theLumiList.getCMSSWString().split(','), 255 ) )
                else:
                    with open(jsonPath) as f:
                        jsoncontents = f.read()
                        if "process.source.lumisToProcess" in jsoncontents:
                            msg = "%s is not a JSON file, but it seems to be a CMSSW lumi selection cff snippet. Trying to use it" % jsonPath
                            if firstRun or lastRun:
                                msg += ("\n (after applying firstRun and/or lastRun)")
                            msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                            msg += "\nCheck your config file to make sure that it worked properly."
                            print msg

                            self.__firstUsedRun = float("inf") #track the run range actually kept by forcerunrange;
                            self.__lastUsedRun = -1            #initialized so the min/max updates there can take effect
                            if firstRun or lastRun:
                                jsoncontents = re.sub(r"\d+:(\d+|max)-\d+:(\d+|max)", self.getForceRunRangeFunction(firstRun, lastRun), jsoncontents)
                            lumiSecExtend = jsoncontents
                            splitLumiList = [[""]]

            if splitLumiList[0][0]:
                lumiSecStr = [ "',\n'".join( lumis ) \
                               for lumis in splitLumiList ]
                lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                               for lumis in lumiSecStr ]
                lumiSecExtend = "\n".join( lumiSecStr )
                self.__firstusedrun = splitLumiList[0][0].split(":")[0]
                self.__lastusedrun = splitLumiList[-1][-1].split(":")[0]
        else:
            self.__firstusedrun = self.__findInJson(self.__getRunList()[0],"run_number")
            self.__lastusedrun = self.__findInJson(self.__getRunList()[-1],"run_number")
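        # When a run/JSON selection is given, lumiSecExtend now holds either the
        # verbatim cff snippet from jsonPath or a string of the form (sketch):
        #   lumiSecs.extend( [
        #   '194050:1-194050:max',
        #   '194051:1-194051:max'
        #   ] )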

        if crab:
            files = ""
        else:
            splitFileList = list( self.__chunks( self.fileList(), 255 ) )
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                        for files in fileStr ]
            files = "\n".join( fileStr )

            if parent:
                splitParentFileList = list( self.__chunks( self.fileList(parent = True), 255 ) )
                parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
                parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \
                                  for parentFiles in parentFileStr ]
                parentFiles = "\n".join( parentFileStr )
                files += "\n\n" + parentFiles

        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet

    def __find_lt( self, a, x ):
        'Find rightmost value less than x'
        i = bisect.bisect_left( a, x )
        if i:
            return i-1
        raise ValueError

    def __find_ge( self, a, x):
        'Find leftmost item greater than or equal to x'
        i = bisect.bisect_left( a, x )
        if i != len( a ):
            return i
        raise ValueError
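    # Example: with a sorted run list a = [1, 3, 5],
    #   self.__find_lt(a, 4) returns 1 (index of 3, the rightmost value < 4),
    #   self.__find_ge(a, 4) returns 2 (index of 5, the leftmost value >= 4).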

    def __findInJson(self, jsondict, strings):
        if isinstance(strings, str):
            strings = [ strings ]

        if len(strings) == 0:
            return jsondict
        if isinstance(jsondict,dict):
            if strings[0] in jsondict:
                try:
                    return self.__findInJson(jsondict[strings[0]], strings[1:])
                except KeyError:
                    pass
        else:
            for a in jsondict:
                if strings[0] in a:
                    try:
                        return self.__findInJson(a[strings[0]], strings[1:])
                    except (TypeError, KeyError): #TypeError because a could be a string and contain strings[0]
                        pass
        #if it's not found
        raise KeyError("Can't find " + strings[0])
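    # Example (hypothetical DAS output): with
    #   jsondict = [ {"run": {"run_number": 194050}} ]
    # self.__findInJson(jsondict, ["run", "run_number"]) walks through the
    # nested dicts/lists and returns 194050.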

    def forcerunrange(self, firstRun, lastRun, s):
        """s must be in the format run1:lum1-run2:lum2"""
        s = s.group()
        print s
        run1 = s.split("-")[0].split(":")[0]
        lum1 = s.split("-")[0].split(":")[1]
        run2 = s.split("-")[1].split(":")[0]
        lum2 = s.split("-")[1].split(":")[1]
        if int(run2) < firstRun or int(run1) > lastRun:
            return ""
        if int(run1) < firstRun or firstRun < 0:
            run1 = firstRun
            lum1 = 1
        if int(run2) > lastRun:
            run2 = lastRun
            lum2 = "max"
        if int(run1) < self.__firstUsedRun:
            self.__firstUsedRun = int(run1)
        if int(run2) > self.__lastUsedRun:
            self.__lastUsedRun = int(run2)
        return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)

    def getForceRunRangeFunction(self, firstRun, lastRun):
        def forcerunrangefunction(s):
            return self.forcerunrange(firstRun, lastRun, s)
        return forcerunrangefunction
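    # Example (hypothetical): the function returned for firstRun = 2 and
    # lastRun = 5 rewrites a match on "1:1-3:max" to "2:1-3:max", and replaces
    # a range entirely outside the limits, e.g. "7:1-9:max", by an empty string.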

    def __getData( self, dasQuery, dasLimit = 0 ):
        dasData = das_client.get_data( 'https://cmsweb.cern.ch',
                                       dasQuery, 0, dasLimit, False )
        if isinstance(dasData, str):
            jsondict = json.loads( dasData )
        else:
            jsondict = dasData
        # check if the DAS query failed
        try:
            error = self.__findInJson(jsondict,["data","error"])
        except KeyError:
            error = None
        if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
            msg = ("The DAS query returned an error. Here is the output\n" + str(jsondict) +
                   "\nIt's possible that this was a server error. If so, it may work if you try again later")
            raise AllInOneError(msg)
        return self.__findInJson(jsondict,"data")
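    # Typical usage within this class (sketch):
    #   data = self.__getData( 'dataset dataset=%s | grep dataset.name'%( self.__name ) )
    # The return value is the list stored under the "data" key of the DAS
    # response; each entry is one DAS record.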

    def __getDataType( self ):
        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                if datatype is None:
                    datatype = "unknown"
                return datatype

        dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
                          'dataset.name'%( self.__name ) )
        data = self.__getData( dasQuery_type )

        try:
            return self.__findInJson(data, ["dataset", "datatype"])
        except KeyError:
            print ("Cannot find the datatype of the dataset '%s'\n"
                   "It may not be possible to automatically find the magnetic field,\n"
                   "and you will not be able to run in CRAB mode"
                   %( self.name() ))
            return "unknown"
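    # Predefined _cff.py files are expected to carry header comments of the
    # form written by dump_cff below, e.g. (sketch):
    #   #data type: data
    #   #magnetic field: AutoFromDBCurrent, 3.8
    # which this method and __getMagneticField parse.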

    def __getParentDataset( self ):
        dasQuery = "parent dataset=" + self.__name
        data = self.__getData( dasQuery )
        try:
            return self.__findInJson(data, ["parent", "name"])
        except KeyError:
            raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
                                "Here is the DAS output:\n" + str(data) +
                                "\nIt's possible that this was a server error. If so, it may work if you try again later")

    def __getMagneticField( self ):
        Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
        Bfieldlist = [ f.replace("MagneticField_",'').replace("_cff.py",'') \
                       for f in os.listdir(Bfieldlocation) \
                       if f.startswith("MagneticField_") and f.endswith("_cff.py") and f != "MagneticField_cff.py" ]
        Bfieldlist.sort( key = lambda Bfield: -len(Bfield) ) #put it in order of decreasing length, so that searching in the name gives the longest match

        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                    if line.startswith("#magnetic field: "):
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                if Bfield is not None:
                    Bfield = Bfield.split(",")[0]
                    if Bfield in Bfieldlist or Bfield == "unknown":
                        return Bfield
                    else:
                        print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                        print "Using Bfield='unknown' - this will revert to the default"
                        return "unknown"
                elif datatype == "data":
                    return "AutoFromDBCurrent" #this should be in the "#magnetic field" line, but for safety in case it got messed up
                else:
                    return "unknown"

        if self.__dataType == "data":
            return "AutoFromDBCurrent"

        dasQuery_B = ( 'dataset dataset=%s'%( self.__name ) ) #try to find the magnetic field from DAS
        data = self.__getData( dasQuery_B )                   #it seems to be there for the newer (7X) MC samples, except cosmics

        try:
            Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
            if Bfield in Bfieldlist:
                return Bfield
            elif Bfield == "":
                pass
            else:
                print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                print "Using Bfield='unknown' - this will revert to the default magnetic field"
                return "unknown"
        except KeyError:
            pass

        for possibleB in Bfieldlist:
            if possibleB in self.__name.replace("TkAlCosmics0T", ""): #for some reason all cosmics dataset names contain this string
                return possibleB

        return "unknown"
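    # Example of the name-based fallback (hypothetical dataset name): for
    # '/TkCosmics38T/Summer12-COSMC_LS1_PEAK-v1/GEN-SIM' the loop above returns
    # "38T", provided MagneticField_38T_cff.py exists in the release.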

    def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
        """For MC, this returns the same as __getMagneticField.
        For data, it gets the magnetic field from the runs.  This is important for
        deciding which template to use for offlinevalidation.
        """
        if "T" in self.__magneticField: #for MC
            Bfield = self.__magneticField.split("T")[0]
            return float(Bfield) / 10.0 #e.g. 38T and 38T_PostLS1 both return 3.8
        if self.__predefined:
            with open(self.__filename) as f:
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#magnetic field: ") and "," in line:
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        return float(line.replace("#magnetic field: ", "").split(",")[1])

        if run > 0:
            dasQuery = ('run = %s'%run) #for data
            data = self.__getData(dasQuery)
            try:
                return self.__findInJson(data, ["run","bfield"])
            except KeyError:
                return "unknown Can't get the magnetic field for run %s from DAS" % run

        #run < 0 - find the B field for the first and last runs, and make sure they're compatible
        #          (to within tolerance)
        #NOT FOOLPROOF! The magnetic field might go up and then down, or vice versa
        if self.__firstusedrun is None or self.__lastusedrun is None:
            return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
        firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
        lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
        try:
            if abs(firstrunB - lastrunB) <= tolerance:
                return .5*(firstrunB + lastrunB)
            print firstrunB, lastrunB, tolerance
            return ("unknown The beginning and end of your run range for %s\n"
                    "have different magnetic fields (%s, %s)!\n"
                    "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                    "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
        except TypeError:
            #can't subtract: at least one of them is an "unknown ..." string
            if isinstance(firstrunB, str) and "unknown" in firstrunB:
                return firstrunB
            else:
                return lastrunB
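    # Sketch: for a predefined dataset whose header contains
    #   #magnetic field: AutoFromDBCurrent, 3.8
    # this returns 3.8; for an MC dataset with __magneticField == "38T_PostLS1"
    # it returns 3.8 as well (38/10.).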

    def __getFileInfoList( self, dasLimit, parent = False ):
        if self.__predefined:
            if parent:
                extendstring = "secFiles.extend"
            else:
                extendstring = "readFiles.extend"
            with open(self.__filename) as f:
                files = []
                copy = False
                for line in f.readlines():
                    if "]" in line:
                        copy = False
                    if copy:
                        files.append({"name": line.translate(None, "', \"\n")}) #strip quotes, commas, spaces, and the newline
                    if extendstring in line and "[" in line and "]" not in line:
                        copy = True
            return files

        if self.__fileInfoList and not parent:
            return self.__fileInfoList
        if self.__parentFileInfoList and parent:
            return self.__parentFileInfoList

        if parent:
            searchdataset = self.parentDataset()
        else:
            searchdataset = self.__name
        dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
                           'file.creation_time, '
                           'file.modification_time'%( searchdataset ) )
        print "Requesting file information for '%s' from DAS..."%( searchdataset ),
        data = self.__getData( dasQuery_files, dasLimit )
        print "Done."
        data = [ self.__findInJson(entry,"file") for entry in data ]
        if len( data ) == 0:
            msg = ("No files are available for the dataset '%s'. This can be "
                   "due to a typo or due to a DAS problem. Please check the "
                   "spelling of the dataset and/or retry to run "
                   "'validateAlignments.py'."%( self.name() ))
            raise AllInOneError( msg )
        fileInformationList = []
        for file in data:
            fileName = 'unknown'
            try:
                fileName = self.__findInJson(file, "name")
                fileCreationTime = self.__findInJson(file, "creation_time")
                fileNEvents = self.__findInJson(file, "nevents")
            except KeyError:
                print ("DAS query gives bad output for file '%s'. Skipping it.\n"
                       "It may work if you try again later.") % fileName
                fileNEvents = 0
            # select only non-empty files
            if fileNEvents == 0:
                continue
            fileDict = { "name": fileName,
                         "creation_time": fileCreationTime,
                         "nevents": fileNEvents
                       }
            fileInformationList.append( fileDict )
        fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
        if parent:
            self.__parentFileInfoList = fileInformationList
        else:
            self.__fileInfoList = fileInformationList
        return fileInformationList
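    # Each entry of the returned list is a plain dict, e.g. (hypothetical
    # values):
    #   { "name": "/store/data/Run2012D/MinimumBias/ALCARECO/.../file.root",
    #     "creation_time": 1354060800, "nevents": 24823 }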

    def __getRunList( self ):
        if self.__runList:
            return self.__runList
        dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
                          'run.creation_time'%( self.__name ) )
        print "Requesting run information for '%s' from DAS..."%( self.__name ),
        data = self.__getData( dasQuery_runs )
        print "Done."
        data = [ self.__findInJson(entry,"run") for entry in data ]
        data.sort( key = lambda run: self.__findInJson(run, "run_number") )
        self.__runList = data
        return data

    def __datetime(self, stringForDas):
        if len(stringForDas) != 8:
            raise AllInOneError(stringForDas + " is not a valid date string.\n"
                                + "DAS accepts dates in the form 'yyyymmdd'")
        year = stringForDas[:4]
        month = stringForDas[4:6]
        day = stringForDas[6:8]
        return datetime.date(int(year), int(month), int(day))

    def __dateString(self, date):
        return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)
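    # Round trip example: self.__datetime("20121128") gives
    # datetime.date(2012, 11, 28), and self.__dateString of that date gives
    # back "20121128".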

    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )

        if begin or end:
            runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

        if begin:
            lastdate = begin
            for delta in [ 1, 5, 10, 20, 30 ]: #try searching for about 2 months after begin
                firstdate = lastdate
                lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                dasQuery_begin = "run date between[%s,%s]" % (firstdate, lastdate)
                begindata = self.__getData(dasQuery_begin)
                if len(begindata) > 0:
                    begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'begin' is after the creation time of the last "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    firstRun = runList[runIndex]
                    begin = None
                    break

        if begin:
            raise AllInOneError("No runs within a reasonable time interval after your 'begin'.\n"
                                "Try using a 'begin' that has runs soon after it (within 2 months at most)")

        if end:
            firstdate = end
            for delta in [ 1, 5, 10, 20, 30 ]: #try searching for about 2 months before end
                lastdate = firstdate
                firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                dasQuery_end = "run date between[%s,%s]" % (firstdate, lastdate)
                enddata = self.__getData(dasQuery_end)
                if len(enddata) > 0:
                    enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'end' is before the creation time of the first "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    lastRun = runList[runIndex]
                    end = None
                    break

        if end:
            raise AllInOneError("No runs within a reasonable time interval before your 'end'.\n"
                                "Try using an 'end' that has runs soon before it (within 2 months at most)")

        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun
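    # Example (hypothetical): convertTimeToRun(begin = "20121120", end = "20121128")
    # queries DAS for runs created within those date windows and returns the
    # corresponding (firstRun, lastRun) pair of run numbers.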

    def dataType( self ):
        if not self.__dataType:
            self.__dataType = self.__getDataType()
        return self.__dataType

    def magneticField( self ):
        if not self.__magneticField:
            self.__magneticField = self.__getMagneticField()
        return self.__magneticField

    def magneticFieldForRun( self, run = -1 ):
        return self.__getMagneticFieldForRun(run)

    def parentDataset( self ):
        if not self.__parentDataset:
            self.__parentDataset = self.__getParentDataset()
        return self.__parentDataset

    def datasetSnippet( self, jsonPath = None, begin = None, end = None,
                        firstRun = None, lastRun = None, crab = False, parent = False ):
        if self.__predefined and parent:
            with open(self.__filename) as f:
                if "secFiles.extend" not in f.read():
                    msg = ("The predefined dataset '%s' does not contain secondary files, "
                           "which your validation requires!") % self.__name
                    if self.__official:
                        self.__name = self.__origName
                        self.__predefined = False
                        print msg
                        print ("Retrieving the files from DAS. You will be asked if you want "
                               "to overwrite the old dataset.\n"
                               "It will still be compatible with validations that don't need secondary files.")
                    else:
                        raise AllInOneError(msg)

        if self.__predefined:
            snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                       "process.maxEvents = cms.untracked.PSet(\n"
                       " input = cms.untracked.int32(.oO[nEvents]Oo.)\n"
                       ")\n"
                       "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)"
                       %(self.__name))
            if not parent:
                with open(self.__filename) as f:
                    if "secFiles.extend" in f.read():
                        snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
            return snippet
        theMap = { "process": "process.",
                   "tab": " " * len( "process." ),
                   "nEvents": ".oO[nEvents]Oo.",
                   "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)\n",
                   "importCms": "",
                   "header": ""
                 }
        datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                               begin = begin,
                                               end = end,
                                               firstRun = firstRun,
                                               lastRun = lastRun,
                                               repMap = theMap,
                                               crab = crab,
                                               parent = parent )
        if jsonPath == "" and begin == "" and end == "" and firstRun == "" and lastRun == "":
            try:
                self.dump_cff(parent = parent)
            except AllInOneError as e:
                print "Can't store the dataset as a cff:"
                print e
                print "This may be inconvenient in the future, but will not cause a problem for this validation."
        return datasetSnippet
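    # The .oO[...]Oo. tokens (e.g. .oO[nEvents]Oo.) are placeholders in the
    # all-in-one validation tool's replacement-map convention; they are
    # substituted later, so the returned snippet is a template rather than a
    # finished config fragment.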

    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None, parent = False ):
        if self.__alreadyStored:
            return
        self.__alreadyStored = True
        if outName is None:
            outName = "Dataset" + self.__name.replace("/", "_")
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        if not os.path.exists( os.path.join(
            self.__cmssw, "src", packageName ) ):
            msg = ("You are trying to store the predefined dataset '%s'.\n"
                   "For that you need to check out the package '%s' to your "
                   "private release area in\n"%( outName, packageName )
                   + self.__cmssw )
            raise AllInOneError( msg )
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "skipEventsString": "",
                   "importCms": "import FWCore.ParameterSet.Config as cms\n",
                   "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
                             "#%(name)s\n"
                             "#data type: %(dataType)s\n"
                             "#magnetic field: .oO[magneticField]Oo.\n"  #put in the magnetic field later
                             %{"name": self.__name,                      #need to create the snippet before getting the magnetic field
                               "dataType": self.__dataType}              #so that we know the first and last runs
                 }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap,
                                            parent = parent)
        magneticField = self.__magneticField
        if magneticField == "AutoFromDBCurrent":
            magneticField = "%s, %s" % (magneticField, str(self.__getMagneticFieldForRun()).replace("\n"," "))
        dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
        filePath = os.path.join( self.__cmssw, "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = raw_input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print ( "The predefined dataset '%s' will be stored in the file\n"
                %( outName )
                + filePath +
                "\nFor future use you have to do 'scram b'." )
        print
        with open( filePath, "w" ) as theFile:
            theFile.write( dataset_cff )
        return
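    # The stored _cff.py starts with a header like (sketch):
    #   #Do not delete or (unless you know what you're doing) change these comments
    #   #/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO
    #   #data type: data
    #   #magnetic field: AutoFromDBCurrent, 3.8
    # which is what the predefined-dataset parsing above relies on.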

    def fileList( self, parent = False ):
        if self.__fileList and not parent:
            return self.__fileList
        if self.__parentFileList and parent:
            return self.__parentFileList

        fileList = [ self.__findInJson(fileInfo,"name") \
                     for fileInfo in self.fileInfoList(parent) ]

        if not parent:
            self.__fileList = fileList
        else:
            self.__parentFileList = fileList
        return fileList

    def fileInfoList( self, parent = False ):
        return self.__getFileInfoList( self.__dasLimit, parent )

    def name( self ):
        return self.__name

    def predefined( self ):
        return self.__predefined

    def runList( self ):
        if self.__runList:
            return self.__runList
        return self.__getRunList()


if __name__ == '__main__':
    print "Start testing..."
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print dataset.datasetSnippet( jsonPath = jsonFile,
                                  firstRun = "207800",
                                  end = "20121128" )
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207800",
                      end = "20121128" )