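"""Dataset helper for the CMS tracker-alignment all-in-one validation tool.

Wraps DAS queries for a dataset's files and runs and renders cms.Source
configuration snippets for the validation jobs.
"""
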
import das_client
import json
import os
import bisect
import re
from FWCore.PythonUtilities.LumiList import LumiList
from TkAlExceptions import AllInOneError


class Dataset:
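    """A CMS dataset, either resolved via DAS (name matching the
    '/primary/processed/tier' scheme) or taken from a predefined _cff.py
    file in Alignment/OfflineValidation.
    """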
    def __init__( self, datasetName, dasLimit = 0 ):
        self.__name = datasetName

        if re.match( r'/.+/.+/.+', self.__name ):
            self.__dataType = self.__getDataType()
            self.__predefined = False
        else:
            fileName = self.__name + "_cff.py"
            searchPath1 = os.path.join( os.environ["CMSSW_BASE"], "python",
                                        "Alignment", "OfflineValidation",
                                        fileName )
            searchPath2 = os.path.join( os.environ["CMSSW_BASE"], "src",
                                        "Alignment", "OfflineValidation",
                                        "python", fileName )
            searchPath3 = os.path.join( os.environ["CMSSW_RELEASE_BASE"],
                                        "python", "Alignment",
                                        "OfflineValidation", fileName )
            if os.path.exists( searchPath1 ):
                pass
            elif os.path.exists( searchPath2 ):
                msg = ("The predefined dataset '%s' does exist in '%s', but "
                       "you need to run 'scram b' first."
                       %( self.__name, searchPath2 ))
                raise AllInOneError( msg )
            elif os.path.exists( searchPath3 ):
                pass
            else:
                msg = ("The predefined dataset '%s' does not exist. Please "
                       "create it first or check for typos."%( self.__name ))
                raise AllInOneError( msg )
            self.__dataType = "unknown"
            self.__predefined = True
        self.__dasLimit = dasLimit
        self.__fileList = None
        self.__fileInfoList = None
        self.__runList = None

    def __chunks( self, theList, n ):
        """ Yield successive n-sized chunks from theList.
        """
        for i in xrange( 0, len( theList ), n ):
            yield theList[i:i+n]

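    # __chunks is used below with n = 255: the generated snippets extend
    # readFiles/lumiSecs in batches of at most 255 entries, presumably to
    # stay clear of the historical 255-element limit on such calls in
    # CMSSW Python configurations.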
    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False ):
        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun')." )
            raise AllInOneError( msg )
        goodLumiSecStr = ""
        lumiStr = ""
        lumiSecExtend = ""
        if firstRun or lastRun:
            goodLumiSecStr = ( "lumiSecs = cms.untracked."
                               "VLuminosityBlockRange()\n" )
            lumiStr = " lumisToProcess = lumiSecs,\n"
            if not jsonPath:
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if run["run_number"] >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if run["run_number"] <= lastRun ]
                lumiList = [ str( run["run_number"] ) + ":1-" \
                             + str( run["run_number"] ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = LumiList( filename = jsonPath )
                allRuns = theLumiList.getRuns()
                runsToRemove = []
                for run in allRuns:
                    if firstRun and int( run ) < firstRun:
                        runsToRemove.append( run )
                    if lastRun and int( run ) > lastRun:
                        runsToRemove.append( run )
                theLumiList.removeRuns( runsToRemove )
                splitLumiList = list( self.__chunks(
                    theLumiList.getCMSSWString().split(','), 255 ) )
            if not len( splitLumiList[0][0] ) == 0:
                lumiSecStr = [ "',\n'".join( lumis ) \
                               for lumis in splitLumiList ]
                lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                               for lumis in lumiSecStr ]
                lumiSecExtend = "\n".join( lumiSecStr )
        elif jsonPath:
            goodLumiSecStr = ( "goodLumiSecs = LumiList.LumiList(filename"
                               "= '%(json)s').getCMSSWString().split(',')\n"
                               "lumiSecs = cms.untracked"
                               ".VLuminosityBlockRange()\n" )
            lumiStr = " lumisToProcess = lumiSecs,\n"
            lumiSecExtend = "lumiSecs.extend(goodLumiSecs)\n"
        if crab:
            files = ""
        else:
            splitFileList = list( self.__chunks( self.fileList(), 255 ) )
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                        for files in fileStr ]
            files = "\n".join( fileStr )
        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet

    __dummy_source_template = ("%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s secondaryFileNames ="
                               "secFiles,\n"
                               "%(tab)s fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n")

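    # The two helpers below follow the "searching sorted lists" recipes
    # from the documentation of the standard-library bisect module.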
    def __find_lt( self, a, x ):
        'Find rightmost value less than x'
        i = bisect.bisect_left( a, x )
        if i:
            return i-1
        raise ValueError

    def __find_ge( self, a, x ):
        'Find leftmost item greater than or equal to x'
        i = bisect.bisect_left( a, x )
        if i != len( a ):
            return i
        raise ValueError

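    # __getData wraps das_client.get_data() from the DAS command-line
    # client (old five-argument form: host, query, idx, limit, debug)
    # and parses the JSON reply.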
    def __getData( self, dasQuery, dasLimit = 0 ):
        dasData = das_client.get_data( 'https://cmsweb.cern.ch',
                                       dasQuery, 0, dasLimit, False )
        jsondict = json.loads( dasData )

        if jsondict["status"] != 'ok':
            msg = "Status not 'ok', but: %s"%( jsondict["status"] )
            raise AllInOneError( msg )
        return jsondict["data"]

    def __getDataType( self ):
        dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
                          'dataset.name'%( self.__name ) )
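        # e.g. for '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO' the
        # query reads:
        #   dataset dataset=/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO
        #       | grep dataset.datatype,dataset.name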
        data = self.__getData( dasQuery_type )
        return data[0]["dataset"][0]["datatype"]

    def __getFileInfoList( self, dasLimit ):
        if self.__fileInfoList:
            return self.__fileInfoList
        dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
                           'file.creation_time, '
                           'file.modification_time'%( self.__name ) )
        print "Requesting file information for '%s' from DAS..."%( self.__name ),
        data = self.__getData( dasQuery_files, dasLimit )
        print "Done."
        data = [ entry["file"] for entry in data ]
        if len( data ) == 0:
            msg = ("No files are available for the dataset '%s'. This can be "
                   "due to a typo or due to a DAS problem. Please check the "
                   "spelling of the dataset and/or retry running "
                   "'validateAlignments.py'."%( self.name() ))
            raise AllInOneError( msg )
        fileInformationList = []
        for file in data:
            fileName = file[0]["name"]
            fileCreationTime = file[0]["creation_time"]
            fileNEvents = 0    # fall back to 0 if DAS provides no 'nevents'
            for ii in range(3):
                try:
                    fileNEvents = file[ii]["nevents"]
                except KeyError:
                    continue
                break

            # skip files without events
            if fileNEvents == 0:
                continue
            fileDict = { "name": fileName,
                         "creation_time": fileCreationTime,
                         "nevents": fileNEvents
                         }
            fileInformationList.append( fileDict )
        fileInformationList.sort( key=lambda info: info["name"] )
        self.__fileInfoList = fileInformationList
        return self.__fileInfoList

    def __getRunList( self ):
        if self.__runList:
            return self.__runList
        dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
                          'run.creation_time'%( self.__name ) )
        print "Requesting run information for '%s' from DAS..."%( self.__name ),
        data = self.__getData( dasQuery_runs )
        print "Done."
        data = [ entry["run"][0] for entry in data ]
        data.sort( key = lambda run: run["creation_time"] )
        self.__runList = data
        return data

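    # For illustration: with repMap entries process = "process.",
    # tab = 8 spaces, nEvents = "100", importCms = "" and no run/lumi
    # selection, the template below renders roughly to:
    #
    #   import FWCore.PythonUtilities.LumiList as LumiList
    #
    #   process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(100) )
    #   readFiles = cms.untracked.vstring()
    #   secFiles = cms.untracked.vstring()
    #   process.source = cms.Source("PoolSource",
    #            secondaryFileNames =secFiles,
    #            fileNames = readFiles
    #   )
    #   readFiles.extend( [
    #   '<file name from DAS>'
    #   ] )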
    __source_template = ("%(importCms)s"
                         "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                         "%(goodLumiSecStr)s"
                         "%(process)smaxEvents = cms.untracked.PSet( "
                         "input = cms.untracked.int32(%(nEvents)s) )\n"
                         "readFiles = cms.untracked.vstring()\n"
                         "secFiles = cms.untracked.vstring()\n"
                         "%(process)ssource = cms.Source(\"PoolSource\",\n"
                         "%(lumiStr)s"
                         "%(tab)s secondaryFileNames ="
                         "secFiles,\n"
                         "%(tab)s fileNames = readFiles\n"
                         ")\n"
                         "%(files)s\n"
                         "%(lumiSecExtend)s\n")

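    # 'begin'/'end' are compared against the run creation-time strings
    # returned by DAS, so they must use the same format, e.g.
    # "2012-11-28 00:00:00" (cf. the test block at the end of this file).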
    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )

        runList = [ run["run_number"] for run in self.__getRunList() ]
        runTimeList = [ run["creation_time"] for run in self.__getRunList() ]
        if begin:
            try:
                runIndex = self.__find_ge( runTimeList, begin )
            except ValueError:
                msg = ( "Your 'begin' is after the creation time of the last "
                        "run in the dataset\n'%s'"%( self.__name ) )
                raise AllInOneError( msg )
            firstRun = runList[runIndex]
            begin = None
        if end:
            try:
                runIndex = self.__find_lt( runTimeList, end )
            except ValueError:
                msg = ( "Your 'end' is before the creation time of the first "
                        "run in the dataset\n'%s'"%( self.__name ) )
                raise AllInOneError( msg )
            lastRun = runList[runIndex]
            end = None
        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun

    def dataType( self ):
        return self.__dataType

    def datasetSnippet( self, jsonPath = None, begin = None, end = None,
                        firstRun = None, lastRun = None, nEvents = None,
                        crab = False ):
        if self.__predefined:
            return ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                    "process.maxEvents = cms.untracked.PSet(\n"
                    "    input = cms.untracked.int32(%s)\n"
                    ")"
                    %( self.__name, nEvents ))
        theMap = { "process": "process.",
                   "tab": " " * len( "process." ),
                   "nEvents": str( nEvents ),
                   "importCms": ""
                   }
        datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                               begin = begin,
                                               end = end,
                                               firstRun = firstRun,
                                               lastRun = lastRun,
                                               repMap = theMap,
                                               crab = crab )
        return datasetSnippet

    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None ):
        if outName is None:
            outName = "Dataset"
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        if not os.path.exists( os.path.join(
            os.environ["CMSSW_BASE"], "src", packageName ) ):
            msg = ("You are trying to store the predefined dataset '%s'.\n"
                   "For that you need to check out the package '%s' to your "
                   "private release area in\n"%( outName, packageName )
                   + os.environ["CMSSW_BASE"] )
            raise AllInOneError( msg )
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "importCms": "import FWCore.ParameterSet.Config as cms\n" }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap )
        filePath = os.path.join( os.environ["CMSSW_BASE"], "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = raw_input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print ( "The predefined dataset '%s' will be stored in the file\n"
                %( outName )
                + filePath +
                "\nFor future use you have to run 'scram b'." )
        print
        theFile = open( filePath, "w" )
        theFile.write( dataset_cff )
        theFile.close()
        return

    def fileList( self ):
        if self.__fileList:
            return self.__fileList
        fileList = [ fileInfo["name"] \
                     for fileInfo in self.fileInfoList() ]
        self.__fileList = fileList
        return fileList

    def fileInfoList( self ):
        return self.__getFileInfoList( self.__dasLimit )

    def name( self ):
        return self.__name

    def predefined( self ):
        return self.__predefined

    def runList( self ):
        if self.__runList:
            return self.__runList
        return self.__getRunList()


if __name__ == '__main__':
    print "Start testing..."
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print dataset.datasetSnippet( nEvents = 100, jsonPath = jsonFile,
                                  firstRun = "207983",
                                  end = "2012-11-28 00:00:00" )
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207983",
                      end = "2012-11-28 00:00:00" )