CMS 3D CMS Logo

/afs/cern.ch/work/a/aaltunda/public/www/CMSSW_6_2_5/src/Alignment/OfflineValidation/python/TkAlAllInOneTool/dataset.py

Go to the documentation of this file.
00001 # idea stolen from:
00002 # http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
00003 #        PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
00004 import das_client
00005 import json
00006 import os
00007 import bisect
00008 import re
00009 from FWCore.PythonUtilities.LumiList import LumiList
00010 from TkAlExceptions import AllInOneError
00011 
00012 
class Dataset:
    def __init__( self, datasetName, dasLimit = 0 ):
        """Represent either a DAS dataset (name of the form
        /primary/processed/tier) or a predefined dataset stored as a
        <name>_cff.py file in Alignment/OfflineValidation.

        Raises AllInOneError if a predefined dataset of that name cannot
        be found (or has not been built with 'scram b' yet).
        """
        self.__name = datasetName
        # A name matching the CMS dataset naming scheme is looked up in DAS.
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__predefined = False
            self.__dataType = self.__getDataType()
        else:
            # Otherwise the name must refer to a predefined *_cff.py file.
            fileName = self.__name + "_cff.py"
            builtPath = os.path.join( os.environ["CMSSW_BASE"], "python",
                                      "Alignment", "OfflineValidation",
                                      fileName )
            sourcePath = os.path.join( os.environ["CMSSW_BASE"], "src",
                                       "Alignment", "OfflineValidation",
                                       "python", fileName )
            releasePath = os.path.join( os.environ["CMSSW_RELEASE_BASE"],
                                        "python", "Alignment",
                                        "OfflineValidation", fileName )
            if not os.path.exists( builtPath ):
                if os.path.exists( sourcePath ):
                    # Present in src/ but not in python/: not built yet.
                    msg = ("The predefined dataset '%s' does exist in '%s', but "
                           "you need to run 'scram b' first."
                           %( self.__name, sourcePath ))
                    raise AllInOneError( msg )
                if not os.path.exists( releasePath ):
                    msg = ("The predefined dataset '%s' does not exist. Please "
                           "create it first or check for typos."%( self.__name ))
                    raise AllInOneError( msg )
            self.__predefined = True
            self.__dataType = "unknown"
        self.__dasLimit = dasLimit
        # Lazily-filled caches for DAS query results.
        self.__fileList = None
        self.__fileInfoList = None
        self.__runList = None
00051     def __chunks( self, theList, n ):
00052         """ Yield successive n-sized chunks from theList.
00053         """
00054         for i in xrange( 0, len( theList ), n ):
00055             yield theList[i:i+n]
00056 
    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False ):
        """Build the python source snippet for this dataset: the file list
        plus an optional lumisToProcess selection.

        begin/end (creation-time strings) are translated to run numbers;
        they are mutually exclusive with firstRun/lastRun.  jsonPath
        selects good lumi sections from a JSON certification file.

        NOTE(review): repMap is required in practice despite the None
        default (a None value would fail at 'theMap["files"] = files'
        below), and it is modified in place — the caller's dict gains the
        files/json/lumiStr/... keys.
        """
        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        # Giving a time limit AND the corresponding run limit is ambiguous.
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The Usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                         ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambigous." )
            raise AllInOneError( msg )
        # Translate time limits into run-number limits.
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun').")
            raise AllInOneError( msg )
        goodLumiSecStr = ""
        lumiStr = ""
        lumiSecExtend = ""
        if firstRun or lastRun:
            # Run-range selection: build an explicit VLuminosityBlockRange.
            goodLumiSecStr = ( "lumiSecs = cms.untracked."
                               "VLuminosityBlockRange()\n" )
            lumiStr = "                    lumisToProcess = lumiSecs,\n"
            if not jsonPath:
                # No JSON: accept all lumi sections of the selected runs.
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if run["run_number"] >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if run["run_number"] <= lastRun ]
                lumiList = [ str( run["run_number"] ) + ":1-" \
                             + str( run["run_number"] ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                # JSON given: take its lumi sections, dropping runs outside
                # the selected run range.
                theLumiList = LumiList ( filename = jsonPath )
                allRuns = theLumiList.getRuns()
                runsToRemove = []
                for run in allRuns:
                    if firstRun and int( run ) < firstRun:
                        runsToRemove.append( run )
                    if lastRun and int( run ) > lastRun:
                        runsToRemove.append( run )
                theLumiList.removeRuns( runsToRemove )
                splitLumiList = list( self.__chunks(
                    theLumiList.getCMSSWString().split(','), 255 ) )
            # Emit one lumiSecs.extend(...) per 255-entry chunk.
            # NOTE(review): presumably splitLumiList is never empty here
            # (an empty list would raise IndexError) — verify upstream.
            if not len(splitLumiList[0][0]) == 0:
                lumiSecStr = [ "',\n'".join( lumis ) \
                               for lumis in splitLumiList ]
                lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                               for lumis in lumiSecStr ]
                lumiSecExtend = "\n".join( lumiSecStr )
        elif jsonPath:
                # JSON only (no run range): let the generated config itself
                # read the JSON file at cmsRun time.
                goodLumiSecStr = ( "goodLumiSecs = LumiList.LumiList(filename"
                                   "= '%(json)s').getCMSSWString().split(',')\n"
                                   "lumiSecs = cms.untracked"
                                   ".VLuminosityBlockRange()\n"
                                   )
                lumiStr = "                    lumisToProcess = lumiSecs,\n"
                lumiSecExtend = "lumiSecs.extend(goodLumiSecs)\n"
        if crab:
            # crab supplies the input files itself; see
            # __dummy_source_template.
            files = ""
        else:
            # Emit one readFiles.extend(...) per 255-entry chunk of files.
            splitFileList = list( self.__chunks( self.fileList(), 255 ) )
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                        for files in fileStr ]
            files = "\n".join( fileStr )
        # Fill the template replacement map (aliases, and thus mutates,
        # the caller-supplied repMap).
        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet
00146 
    # Template used for crab jobs: no real file list is embedded, only a
    # placeholder file, since crab assigns the input files per job.
    # %-interpolated with the keys "process", "nEvents" and "tab".
    __dummy_source_template = ("%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s                    secondaryFileNames ="
                               "secFiles,\n"
                               "%(tab)s                    fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n")
00157         
00158     def __find_lt( self, a, x ):
00159         'Find rightmost value less than x'
00160         i = bisect.bisect_left( a, x )
00161         if i:
00162             return i-1
00163         raise ValueError
00164 
00165     def __find_ge( self, a, x):
00166         'Find leftmost item greater than or equal to x'
00167         i = bisect.bisect_left( a, x )
00168         if i != len( a ):
00169             return i
00170         raise ValueError
00171 
00172     def __getData( self, dasQuery, dasLimit = 0 ):
00173         dasData = das_client.get_data( 'https://cmsweb.cern.ch',
00174                                        dasQuery, 0, dasLimit, False )
00175         jsondict = json.loads( dasData )
00176         # Check, if the DAS query fails
00177         if jsondict["status"] != 'ok':
00178             msg = "Status not 'ok', but:", jsondict["status"]
00179             raise AllInOneError(msg)
00180         return jsondict["data"]
00181 
00182     def __getDataType( self ):
00183         dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
00184                           'dataset.name'%( self.__name ) )
00185         data = self.__getData( dasQuery_type )
00186         return data[0]["dataset"][0]["datatype"]
00187 
00188     def __getFileInfoList( self, dasLimit ):
00189         if self.__fileInfoList:
00190             return self.__fileInfoList
00191         dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
00192                            'file.creation_time, '
00193                            'file.modification_time'%( self.__name ) )
00194         print "Requesting file information for '%s' from DAS..."%( self.__name ),
00195         data = self.__getData( dasQuery_files, dasLimit )
00196         print "Done."
00197         data = [ entry["file"] for entry in data ]
00198         if len( data ) == 0:
00199             msg = ("No files are available for the dataset '%s'. This can be "
00200                    "due to a typo or due to a DAS problem. Please check the "
00201                    "spelling of the dataset and/or retry to run "
00202                    "'validateAlignments.py'."%( self.name() ))
00203             raise AllInOneError( msg )
00204         fileInformationList = []
00205         for file in data:
00206             fileName = file[0]["name"]
00207             fileCreationTime = file[0]["creation_time"]
00208             for ii in range(3):
00209                 try:
00210                     fileNEvents = file[ii]["nevents"]
00211                 except KeyError:
00212                     continue
00213                 break
00214             # select only non-empty files
00215             if fileNEvents == 0:
00216                 continue
00217             fileDict = { "name": fileName,
00218                          "creation_time": fileCreationTime,
00219                          "nevents": fileNEvents
00220                          }
00221             fileInformationList.append( fileDict )
00222         fileInformationList.sort( key=lambda info: info["name"] )
00223         return fileInformationList
00224 
00225     def __getRunList( self ):
00226         if self.__runList:
00227             return self.__runList
00228         dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
00229                           'run.creation_time'%( self.__name ) )
00230         print "Requesting run information for '%s' from DAS..."%( self.__name ),
00231         data = self.__getData( dasQuery_runs )
00232         print "Done."
00233         data = [ entry["run"][0] for entry in data ]
00234         data.sort( key = lambda run: run["creation_time"] )
00235         self.__runList = data
00236         return data
00237 
    # Template for the full process.source snippet.  %-interpolated with
    # the keys filled in by __createSnippet: importCms, goodLumiSecStr,
    # process, nEvents, lumiStr, tab, files and lumiSecExtend.
    __source_template= ("%(importCms)s"
                        "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                        "%(goodLumiSecStr)s"
                        "%(process)smaxEvents = cms.untracked.PSet( "
                        "input = cms.untracked.int32(%(nEvents)s) )\n"
                        "readFiles = cms.untracked.vstring()\n"
                        "secFiles = cms.untracked.vstring()\n"
                        "%(process)ssource = cms.Source(\"PoolSource\",\n"
                        "%(lumiStr)s"
                        "%(tab)s                    secondaryFileNames ="
                        "secFiles,\n"
                        "%(tab)s                    fileNames = readFiles\n"
                        ")\n"
                        "%(files)s\n"
                        "%(lumiSecExtend)s\n")
00253 
00254     def convertTimeToRun( self, begin = None, end = None,
00255                           firstRun = None, lastRun = None,
00256                           shortTuple = True ):
00257         if ( begin and firstRun ) or ( end and lastRun ):
00258             msg = ( "The Usage of "
00259                     + "'begin' & 'firstRun' " * int( bool( begin and
00260                                                            firstRun ) )
00261                     + "and " * int( bool( ( begin and firstRun ) and
00262                                          ( end and lastRun ) ) )
00263                     + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
00264                     + "is ambigous." )
00265             raise AllInOneError( msg )
00266 
00267         runList = [ run["run_number"] for run in self.__getRunList() ]
00268         runTimeList = [ run["creation_time"] for run in self.__getRunList() ]
00269         if begin:
00270             try:
00271                 runIndex = self.__find_ge( runTimeList, begin )
00272             except ValueError:
00273                 msg = ( "Your 'begin' is after the creation time of the last "
00274                         "run in the dataset\n'%s'"%( self.__name ) )
00275                 raise AllInOneError( msg )
00276             firstRun = runList[runIndex]
00277             begin = None
00278         if end:
00279             try:
00280                 runIndex = self.__find_lt( runTimeList, end )
00281             except ValueError:
00282                 msg = ( "Your 'end' is before the creation time of the first "
00283                         "run in the dataset\n'%s'"%( self.__name ) )
00284                 raise AllInOneError( msg )
00285             lastRun = runList[runIndex]
00286             end = None
00287         if shortTuple:
00288             return firstRun, lastRun
00289         else:
00290             return begin, end, firstRun, lastRun
00291 
00292     def dataType( self ):
00293         return self.__dataType
00294     
00295     def datasetSnippet( self, jsonPath = None, begin = None, end = None,
00296                         firstRun = None, lastRun = None, nEvents = None,
00297                         crab = False ):
00298         if self.__predefined:
00299             return ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
00300                     "process.maxEvents = cms.untracked.PSet(\n"
00301                     "    input = cms.untracked.int32(%s)\n"
00302                     ")"
00303                     %( self.__name, nEvents ))
00304         theMap = { "process": "process.",
00305                    "tab": " " * len( "process." ),
00306                    "nEvents": str( nEvents ),
00307                    "importCms": ""
00308                    }
00309         datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
00310                                                begin = begin,
00311                                                end = end,
00312                                                firstRun = firstRun,
00313                                                lastRun = lastRun,
00314                                                repMap = theMap,
00315                                                crab = crab )
00316         return datasetSnippet
00317 
    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None ):
        """Store the dataset snippet as a predefined dataset in
        $CMSSW_BASE/src/Alignment/OfflineValidation/python/<outName>_cff.py.

        Selection arguments mirror __createSnippet; outName defaults to
        "Dataset".  Asks interactively before overwriting an existing
        file.  Raises AllInOneError if the package is not checked out.
        """
        if outName == None:
            outName = "Dataset"
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        # The package must be present in the user's area, otherwise there
        # is nowhere to store the file.
        if not os.path.exists( os.path.join(
            os.environ["CMSSW_BASE"], "src", packageName ) ):
            msg = ("You try to store the predefined dataset'%s'.\n"
                   "For that you need to check out the package '%s' to your "
                   "private relase area in\n"%( outName, packageName )
                   + os.environ["CMSSW_BASE"] )
            raise AllInOneError( msg )
        # Top-level (non-"process.") template keys: the produced file is a
        # cff fragment defining 'source' and 'maxEvents' directly.
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "importCms": "import FWCore.ParameterSet.Config as cms\n" }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap)
        filePath = os.path.join( os.environ["CMSSW_BASE"], "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            # Ask before overwriting; repeat until the answer is "y"
            # (overwrite) or "n" (abort quietly).
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = raw_input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print ( "The predefined dataset '%s' will be stored in the file\n"
                %( outName )
                + filePath +
                "\nFor future use you have to do 'scram b'." )
        print
        theFile = open( filePath, "w" )
        theFile.write( dataset_cff )
        theFile.close()
        return
00363 
00364     def fileList( self ):
00365         if self.__fileList:
00366             return self.__fileList
00367         fileList = [ fileInfo["name"] \
00368                      for fileInfo in self.fileInfoList() ]
00369         self.__fileList = fileList
00370         return fileList
00371     
00372     def fileInfoList( self ):
00373         return self.__getFileInfoList( self.__dasLimit )
00374 
00375     def name( self ):
00376         return self.__name
00377 
00378     def predefined( self ):
00379         return self.__predefined
00380 
00381     def runList( self ):
00382         if self.__runList:
00383             return self.__runList
00384         return self.__getRunList()
00385 
00386 
if __name__ == '__main__':
    # Smoke test: queries DAS and reads a JSON file from AFS, so it only
    # works inside a CMS environment with network access.
    print "Start testing..."
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    # Print a 100-event snippet restricted to runs >= 207983 created
    # before the given 'end' time, then store the same selection as a
    # predefined dataset.
    print dataset.datasetSnippet( nEvents = 100,jsonPath = jsonFile,
                                  firstRun = "207983",
                                  end = "2012-11-28 00:00:00" )
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207983",
                      end = "2012-11-28 00:00:00" )