CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
dataset.py
Go to the documentation of this file.
1 # idea stolen from:
2 # http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
3 # PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
4 import das_client
5 import json
6 import os
7 import bisect
8 import re
9 from FWCore.PythonUtilities.LumiList import LumiList
10 from TkAlExceptions import AllInOneError
11 
12 
class Dataset:
    """Gathers information about a CMS dataset from DAS and renders it
    into CMSSW configuration snippets / cff fragments.
    (Idea adapted from PhysicsTools/PatAlgos cmsswVersionTools.py.)
    """

    def __init__( self, datasetName, dasLimit = 0 ):
        """Create a Dataset.

        Arguments:
        - `datasetName`: either a full CMS dataset path
          ('/Primary/Processed/Tier', looked up in DAS) or the name of a
          predefined '<name>_cff.py' fragment in
          Alignment/OfflineValidation.
        - `dasLimit`: maximum number of entries to request from DAS
          (0 means no limit).
        """
        self.__name = datasetName
        # check, if dataset name matches CMS dataset naming scheme
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__dataType = self.__getDataType()
            self.__predefined = False
        else:
            fileName = self.__name + "_cff.py"
            searchPath1 = os.path.join( os.environ["CMSSW_BASE"], "python",
                                        "Alignment", "OfflineValidation",
                                        fileName )
            searchPath2 = os.path.join( os.environ["CMSSW_BASE"], "src",
                                        "Alignment", "OfflineValidation",
                                        "python", fileName )
            # CMSSW_RELEASE_BASE may be unset in some environments; fall
            # back to an empty path instead of raising KeyError before
            # the local search paths were even checked.
            searchPath3 = os.path.join( os.environ.get( "CMSSW_RELEASE_BASE", "" ),
                                        "python", "Alignment",
                                        "OfflineValidation", fileName )
            if os.path.exists( searchPath1 ):
                pass
            elif os.path.exists( searchPath2 ):
                msg = ("The predefined dataset '%s' does exist in '%s', but "
                       "you need to run 'scram b' first."
                       %( self.__name, searchPath2 ))
                raise AllInOneError( msg )
            elif os.path.exists( searchPath3 ):
                pass
            else:
                msg = ("The predefined dataset '%s' does not exist. Please "
                       "create it first or check for typos."%( self.__name ))
                raise AllInOneError( msg )
            self.__dataType = "unknown"
            self.__predefined = True
        self.__dasLimit = dasLimit
        self.__fileList = None      # cache for fileList()
        self.__fileInfoList = None  # cache for __getFileInfoList()
        self.__runList = None       # cache for __getRunList()
50 
51  def __chunks( self, theList, n ):
52  """ Yield successive n-sized chunks from theList.
53  """
54  for i in xrange( 0, len( theList ), n ):
55  yield theList[i:i+n]
56 
57  def __createSnippet( self, jsonPath = None, begin = None, end = None,
58  firstRun = None, lastRun = None, repMap = None,
59  crab = False ):
60  if firstRun:
61  firstRun = int( firstRun )
62  if lastRun:
63  lastRun = int( lastRun )
64  if ( begin and firstRun ) or ( end and lastRun ):
65  msg = ( "The Usage of "
66  + "'begin' & 'firstRun' " * int( bool( begin and
67  firstRun ) )
68  + "and " * int( bool( ( begin and firstRun ) and
69  ( end and lastRun ) ) )
70  + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
71  + "is ambigous." )
72  raise AllInOneError( msg )
73  if begin or end:
74  ( firstRun, lastRun ) = self.convertTimeToRun(
75  begin = begin, end = end, firstRun = firstRun,
76  lastRun = lastRun )
77  if ( firstRun and lastRun ) and ( firstRun > lastRun ):
78  msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
79  "chosen is greater than the upper time/runrange limit "
80  "('end'/'lastRun').")
81  raise AllInOneError( msg )
82  goodLumiSecStr = ""
83  lumiStr = ""
84  lumiSecExtend = ""
85  if firstRun or lastRun:
86  goodLumiSecStr = ( "lumiSecs = cms.untracked."
87  "VLuminosityBlockRange()\n" )
88  lumiStr = " lumisToProcess = lumiSecs,\n"
89  if not jsonPath:
90  selectedRunList = self.__getRunList()
91  if firstRun:
92  selectedRunList = [ run for run in selectedRunList \
93  if run["run_number"] >= firstRun ]
94  if lastRun:
95  selectedRunList = [ run for run in selectedRunList \
96  if run["run_number"] <= lastRun ]
97  lumiList = [ str( run["run_number"] ) + ":1-" \
98  + str( run["run_number"] ) + ":max" \
99  for run in selectedRunList ]
100  splitLumiList = list( self.__chunks( lumiList, 255 ) )
101  else:
102  theLumiList = LumiList ( filename = jsonPath )
103  allRuns = theLumiList.getRuns()
104  runsToRemove = []
105  for run in allRuns:
106  if firstRun and int( run ) < firstRun:
107  runsToRemove.append( run )
108  if lastRun and int( run ) > lastRun:
109  runsToRemove.append( run )
110  theLumiList.removeRuns( runsToRemove )
111  splitLumiList = list( self.__chunks(
112  theLumiList.getCMSSWString().split(','), 255 ) )
113  if not len(splitLumiList[0][0]) == 0:
114  lumiSecStr = [ "',\n'".join( lumis ) \
115  for lumis in splitLumiList ]
116  lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
117  for lumis in lumiSecStr ]
118  lumiSecExtend = "\n".join( lumiSecStr )
119  elif jsonPath:
120  goodLumiSecStr = ( "goodLumiSecs = LumiList.LumiList(filename"
121  "= '%(json)s').getCMSSWString().split(',')\n"
122  "lumiSecs = cms.untracked"
123  ".VLuminosityBlockRange()\n"
124  )
125  lumiStr = " lumisToProcess = lumiSecs,\n"
126  lumiSecExtend = "lumiSecs.extend(goodLumiSecs)\n"
127  if crab:
128  files = ""
129  else:
130  splitFileList = list( self.__chunks( self.fileList(), 255 ) )
131  fileStr = [ "',\n'".join( files ) for files in splitFileList ]
132  fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
133  for files in fileStr ]
134  files = "\n".join( fileStr )
135  theMap = repMap
136  theMap["files"] = files
137  theMap["json"] = jsonPath
138  theMap["lumiStr"] = lumiStr
139  theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
140  theMap["lumiSecExtend"] = lumiSecExtend
141  if crab:
142  dataset_snippet = self.__dummy_source_template%( theMap )
143  else:
144  dataset_snippet = self.__source_template%( theMap )
145  return dataset_snippet
146 
147  __dummy_source_template = ("%(process)smaxEvents = cms.untracked.PSet( "
148  "input = cms.untracked.int32(%(nEvents)s) )\n"
149  "readFiles = cms.untracked.vstring()\n"
150  "secFiles = cms.untracked.vstring()\n"
151  "%(process)ssource = cms.Source(\"PoolSource\",\n"
152  "%(tab)s secondaryFileNames ="
153  "secFiles,\n"
154  "%(tab)s fileNames = readFiles\n"
155  ")\n"
156  "readFiles.extend(['dummy_File.root'])\n")
157 
158  def __find_lt( self, a, x ):
159  'Find rightmost value less than x'
160  i = bisect.bisect_left( a, x )
161  if i:
162  return i-1
163  raise ValueError
164 
165  def __find_ge( self, a, x):
166  'Find leftmost item greater than or equal to x'
167  i = bisect.bisect_left( a, x )
168  if i != len( a ):
169  return i
170  raise ValueError
171 
172  def __getData( self, dasQuery, dasLimit = 0 ):
173  dasData = das_client.get_data( 'https://cmsweb.cern.ch',
174  dasQuery, 0, dasLimit, False )
175  if isinstance(dasData, str):
176  jsondict = json.loads( dasData )
177  else:
178  jsondict = dasData
179  # Check, if the DAS query fails
180  if jsondict["status"] != 'ok':
181  msg = "Status not 'ok', but:", jsondict["status"]
182  raise AllInOneError(msg)
183  return jsondict["data"]
184 
185  def __getDataType( self ):
186  dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
187  'dataset.name'%( self.__name ) )
188  data = self.__getData( dasQuery_type )
189  for a in data[0]["dataset"]:
190  if "datatype" in a:
191  return a["datatype"]
192  msg = ("Cannot find the datatype of the dataset '%s'"%( self.name() ))
193  raise AllInOneError( msg )
194 
195  def __getFileInfoList( self, dasLimit ):
196  if self.__fileInfoList:
197  return self.__fileInfoList
198  dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
199  'file.creation_time, '
200  'file.modification_time'%( self.__name ) )
201  print "Requesting file information for '%s' from DAS..."%( self.__name ),
202  data = self.__getData( dasQuery_files, dasLimit )
203  print "Done."
204  data = [ entry["file"] for entry in data ]
205  if len( data ) == 0:
206  msg = ("No files are available for the dataset '%s'. This can be "
207  "due to a typo or due to a DAS problem. Please check the "
208  "spelling of the dataset and/or retry to run "
209  "'validateAlignments.py'."%( self.name() ))
210  raise AllInOneError( msg )
211  fileInformationList = []
212  for file in data:
213  fileName = file[0]["name"]
214  fileCreationTime = file[0]["creation_time"]
215  for ii in range(3):
216  try:
217  fileNEvents = file[ii]["nevents"]
218  except KeyError:
219  continue
220  break
221  # select only non-empty files
222  if fileNEvents == 0:
223  continue
224  fileDict = { "name": fileName,
225  "creation_time": fileCreationTime,
226  "nevents": fileNEvents
227  }
228  fileInformationList.append( fileDict )
229  fileInformationList.sort( key=lambda info: info["name"] )
230  return fileInformationList
231 
232  def __getRunList( self ):
233  if self.__runList:
234  return self.__runList
235  dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
236  'run.creation_time'%( self.__name ) )
237  print "Requesting run information for '%s' from DAS..."%( self.__name ),
238  data = self.__getData( dasQuery_runs )
239  print "Done."
240  data = [ entry["run"][0] for entry in data ]
241  data.sort( key = lambda run: run["creation_time"] )
242  self.__runList = data
243  return data
244 
245  __source_template= ("%(importCms)s"
246  "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
247  "%(goodLumiSecStr)s"
248  "%(process)smaxEvents = cms.untracked.PSet( "
249  "input = cms.untracked.int32(%(nEvents)s) )\n"
250  "readFiles = cms.untracked.vstring()\n"
251  "secFiles = cms.untracked.vstring()\n"
252  "%(process)ssource = cms.Source(\"PoolSource\",\n"
253  "%(lumiStr)s"
254  "%(tab)s secondaryFileNames ="
255  "secFiles,\n"
256  "%(tab)s fileNames = readFiles\n"
257  ")\n"
258  "%(files)s\n"
259  "%(lumiSecExtend)s\n")
260 
261  def convertTimeToRun( self, begin = None, end = None,
262  firstRun = None, lastRun = None,
263  shortTuple = True ):
264  if ( begin and firstRun ) or ( end and lastRun ):
265  msg = ( "The Usage of "
266  + "'begin' & 'firstRun' " * int( bool( begin and
267  firstRun ) )
268  + "and " * int( bool( ( begin and firstRun ) and
269  ( end and lastRun ) ) )
270  + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
271  + "is ambigous." )
272  raise AllInOneError( msg )
273 
274  runList = [ run["run_number"] for run in self.__getRunList() ]
275  runTimeList = [ run["creation_time"] for run in self.__getRunList() ]
276  if begin:
277  try:
278  runIndex = self.__find_ge( runTimeList, begin )
279  except ValueError:
280  msg = ( "Your 'begin' is after the creation time of the last "
281  "run in the dataset\n'%s'"%( self.__name ) )
282  raise AllInOneError( msg )
283  firstRun = runList[runIndex]
284  begin = None
285  if end:
286  try:
287  runIndex = self.__find_lt( runTimeList, end )
288  except ValueError:
289  msg = ( "Your 'end' is before the creation time of the first "
290  "run in the dataset\n'%s'"%( self.__name ) )
291  raise AllInOneError( msg )
292  lastRun = runList[runIndex]
293  end = None
294  if shortTuple:
295  return firstRun, lastRun
296  else:
297  return begin, end, firstRun, lastRun
298 
299  def dataType( self ):
300  return self.__dataType
301 
302  def datasetSnippet( self, jsonPath = None, begin = None, end = None,
303  firstRun = None, lastRun = None, nEvents = None,
304  crab = False ):
305  if self.__predefined:
306  return ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
307  "process.maxEvents = cms.untracked.PSet(\n"
308  " input = cms.untracked.int32(%s)\n"
309  ")"
310  %( self.__name, nEvents ))
311  theMap = { "process": "process.",
312  "tab": " " * len( "process." ),
313  "nEvents": str( nEvents ),
314  "importCms": ""
315  }
316  datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
317  begin = begin,
318  end = end,
319  firstRun = firstRun,
320  lastRun = lastRun,
321  repMap = theMap,
322  crab = crab )
323  return datasetSnippet
324 
325  def dump_cff( self, outName = None, jsonPath = None, begin = None,
326  end = None, firstRun = None, lastRun = None ):
327  if outName == None:
328  outName = "Dataset"
329  packageName = os.path.join( "Alignment", "OfflineValidation" )
330  if not os.path.exists( os.path.join(
331  os.environ["CMSSW_BASE"], "src", packageName ) ):
332  msg = ("You try to store the predefined dataset'%s'.\n"
333  "For that you need to check out the package '%s' to your "
334  "private relase area in\n"%( outName, packageName )
335  + os.environ["CMSSW_BASE"] )
336  raise AllInOneError( msg )
337  theMap = { "process": "",
338  "tab": "",
339  "nEvents": str( -1 ),
340  "importCms": "import FWCore.ParameterSet.Config as cms\n" }
341  dataset_cff = self.__createSnippet( jsonPath = jsonPath,
342  begin = begin,
343  end = end,
344  firstRun = firstRun,
345  lastRun = lastRun,
346  repMap = theMap)
347  filePath = os.path.join( os.environ["CMSSW_BASE"], "src", packageName,
348  "python", outName + "_cff.py" )
349  if os.path.exists( filePath ):
350  existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
351  askString = "Do you want to overwrite it? [y/n]\n"
352  inputQuery = existMsg + askString
353  while True:
354  userInput = raw_input( inputQuery ).lower()
355  if userInput == "y":
356  break
357  elif userInput == "n":
358  return
359  else:
360  inputQuery = askString
361  print ( "The predefined dataset '%s' will be stored in the file\n"
362  %( outName )
363  + filePath +
364  "\nFor future use you have to do 'scram b'." )
365  print
366  theFile = open( filePath, "w" )
367  theFile.write( dataset_cff )
368  theFile.close()
369  return
370 
371  def fileList( self ):
372  if self.__fileList:
373  return self.__fileList
374  fileList = [ fileInfo["name"] \
375  for fileInfo in self.fileInfoList() ]
376  self.__fileList = fileList
377  return fileList
378 
379  def fileInfoList( self ):
380  return self.__getFileInfoList( self.__dasLimit )
381 
382  def name( self ):
383  return self.__name
384 
385  def predefined( self ):
386  return self.__predefined
387 
388  def runList( self ):
389  if self.__runList:
390  return self.__runList
391  return self.__getRunList()
392 
393 
if __name__ == '__main__':
    # Manual smoke test: requires DAS access and a CMSSW environment,
    # so it is not suitable for automated testing.
    print( "Start testing..." )
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print( dataset.datasetSnippet( nEvents = 100, jsonPath = jsonFile,
                                   firstRun = "207983",
                                   end = "2012-11-28 00:00:00" ) )
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207983",
                      end = "2012-11-28 00:00:00" )
def __getFileInfoList
Definition: dataset.py:195
def __createSnippet
Definition: dataset.py:59
def convertTimeToRun
Definition: dataset.py:263
def fileInfoList
Definition: dataset.py:379
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
tuple __dummy_source_template
Definition: dataset.py:147
def datasetSnippet
Definition: dataset.py:304
def __getRunList
Definition: dataset.py:232
double split
Definition: MVATrainer.cc:139
def __getDataType
Definition: dataset.py:185
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run