dataset.py
# idea stolen from:
# http://cmssw.cvs.cern.ch/cgi-bin/cmssw.cgi/CMSSW/
# PhysicsTools/PatAlgos/python/tools/cmsswVersionTools.py
import das_client
import json
import os
import bisect
import re
import datetime
from FWCore.PythonUtilities.LumiList import LumiList
from TkAlExceptions import AllInOneError


class Dataset:
    def __init__( self, datasetName, dasLimit = 0, tryPredefinedFirst = True,
                  cmssw = os.environ["CMSSW_BASE"], cmsswrelease = os.environ["CMSSW_RELEASE_BASE"]):
        self.__name = datasetName
        self.__origName = datasetName
        self.__dasLimit = dasLimit
        self.__fileList = None
        self.__fileInfoList = None
        self.__runList = None
        self.__alreadyStored = False
        self.__cmssw = cmssw
        self.__cmsswrelease = cmsswrelease
        self.__firstusedrun = None
        self.__lastusedrun = None
        self.__parentDataset = None
        self.__parentFileList = None
        self.__parentFileInfoList = None

        # check if the dataset name matches the CMS dataset naming scheme
        if re.match( r'/.+/.+/.+', self.__name ):
            self.__official = True
            fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
        else:
            self.__official = False
            fileName = self.__name + "_cff.py"

        searchPath1 = os.path.join( self.__cmssw, "python",
                                    "Alignment", "OfflineValidation",
                                    fileName )
        searchPath2 = os.path.join( self.__cmssw, "src",
                                    "Alignment", "OfflineValidation",
                                    "python", fileName )
        searchPath3 = os.path.join( self.__cmsswrelease,
                                    "python", "Alignment",
                                    "OfflineValidation", fileName )
        if self.__official and not tryPredefinedFirst:
            self.__predefined = False
        elif os.path.exists( searchPath1 ):
            self.__predefined = True
            self.__filename = searchPath1
        elif os.path.exists( searchPath2 ):
            msg = ("The predefined dataset '%s' does exist in '%s', but "
                   "you need to run 'scram b' first."
                   %( self.__name, searchPath2 ))
            if self.__official:
                print msg
                print "Getting the data from DAS again.  To go faster next time, run scram b."
            else:
                raise AllInOneError( msg )
        elif os.path.exists( searchPath3 ):
            self.__predefined = True
            self.__filename = searchPath3
        elif self.__official:
            self.__predefined = False
        else:
            msg = ("The predefined dataset '%s' does not exist. Please "
                   "create it first or check for typos."%( self.__name ))
            raise AllInOneError( msg )

        if self.__predefined and self.__official:
            self.__name = "Dataset" + self.__name.replace("/","_")

        self.__dataType = self.__getDataType()
        self.__magneticField = self.__getMagneticField()

    def __chunks( self, theList, n ):
        """Yield successive n-sized chunks from theList."""
        for i in xrange( 0, len( theList ), n ):
            yield theList[i:i+n]
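    # Usage sketch (illustrative values only): the 255-entry chunking used
    # below keeps each generated readFiles.extend(...) / lumiSecs.extend(...)
    # call to a manageable size, e.g.
    #   list( self.__chunks( range(7), 3 ) )  ->  [[0, 1, 2], [3, 4, 5], [6]]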

    __source_template = ("%(header)s"
                         "%(importCms)s"
                         "import FWCore.PythonUtilities.LumiList as LumiList\n\n"
                         "%(goodLumiSecStr)s"
                         "readFiles = cms.untracked.vstring()\n"
                         "secFiles = cms.untracked.vstring()\n"
                         "%(process)ssource = cms.Source(\"PoolSource\",\n"
                         "%(lumiStr)s"
                         "%(tab)s secondaryFileNames = "
                         "secFiles,\n"
                         "%(tab)s fileNames = readFiles\n"
                         ")\n"
                         "%(files)s\n"
                         "%(lumiSecExtend)s\n"
                         "%(process)smaxEvents = cms.untracked.PSet( "
                         "input = cms.untracked.int32(%(nEvents)s) )\n"
                         "%(skipEventsString)s\n")

    __dummy_source_template = ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s secondaryFileNames = "
                               "secFiles,\n"
                               "%(tab)s fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "%(skipEventsString)s\n")

    def __createSnippet( self, jsonPath = None, begin = None, end = None,
                         firstRun = None, lastRun = None, repMap = None,
                         crab = False, parent = False ):
        if firstRun:
            firstRun = int( firstRun )
        if lastRun:
            lastRun = int( lastRun )
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )
        if begin or end:
            ( firstRun, lastRun ) = self.convertTimeToRun(
                begin = begin, end = end, firstRun = firstRun,
                lastRun = lastRun )
        if ( firstRun and lastRun ) and ( firstRun > lastRun ):
            msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                    "chosen is greater than the upper time/runrange limit "
                    "('end'/'lastRun')." )
            raise AllInOneError( msg )
        if self.predefined() and (jsonPath or begin or end or firstRun or lastRun):
            msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
                    "only work for official datasets, not predefined _cff.py files" )
            raise AllInOneError( msg )
        goodLumiSecStr = ""
        lumiStr = ""
        lumiSecExtend = ""
        if firstRun or lastRun or jsonPath:
            goodLumiSecStr = ( "lumiSecs = cms.untracked."
                               "VLuminosityBlockRange()\n" )
            lumiStr = " lumisToProcess = lumiSecs,\n"
            if not jsonPath:
                selectedRunList = self.__getRunList()
                if firstRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") >= firstRun ]
                if lastRun:
                    selectedRunList = [ run for run in selectedRunList \
                                        if self.__findInJson(run, "run_number") <= lastRun ]
                lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-" \
                             + str( self.__findInJson(run, "run_number") ) + ":max" \
                             for run in selectedRunList ]
                splitLumiList = list( self.__chunks( lumiList, 255 ) )
            else:
                theLumiList = None
                try:
                    theLumiList = LumiList ( filename = jsonPath )
                except ValueError:
                    pass

                if theLumiList is not None:
                    allRuns = theLumiList.getRuns()
                    runsToRemove = []
                    for run in allRuns:
                        if firstRun and int( run ) < firstRun:
                            runsToRemove.append( run )
                        if lastRun and int( run ) > lastRun:
                            runsToRemove.append( run )
                    theLumiList.removeRuns( runsToRemove )
                    splitLumiList = list( self.__chunks(
                        theLumiList.getCMSSWString().split(','), 255 ) )
                else:
                    with open(jsonPath) as f:
                        jsoncontents = f.read()
                        if "process.source.lumisToProcess" in jsoncontents:
                            msg = "%s is not a JSON file, but it seems to be a CMSSW lumi selection cff snippet.  Trying to use it" % jsonPath
                            if firstRun or lastRun:
                                msg += ("\n (after applying firstRun and/or lastRun)")
                            msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                            msg += "\nCheck your config file to make sure that it worked properly."
                            print msg

                            runlist = self.__getRunList()
                            if firstRun or lastRun:
                                self.__firstusedrun = -1
                                self.__lastusedrun = -1
                                jsoncontents = re.sub(r"\d+:(\d+|max)-\d+:(\d+|max)",
                                                      self.getForceRunRangeFunction(firstRun, lastRun),
                                                      jsoncontents)
                                self.__firstusedrun = max(self.__firstusedrun, int(self.__findInJson(runlist[0],"run_number")))
                                self.__lastusedrun = min(self.__lastusedrun, int(self.__findInJson(runlist[-1],"run_number")))
                            else:
                                self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                                self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))
                            lumiSecExtend = jsoncontents
                            splitLumiList = [[""]]

            if splitLumiList and splitLumiList[0]:
                if splitLumiList[0][0]:
                    lumiSecStr = [ "',\n'".join( lumis ) \
                                   for lumis in splitLumiList ]
                    lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                                   for lumis in lumiSecStr ]
                    lumiSecExtend = "\n".join( lumiSecStr )
                    runlist = self.__getRunList()
                    self.__firstusedrun = max(int(splitLumiList[0][0].split(":")[0]), int(self.__findInJson(runlist[0],"run_number")))
                    self.__lastusedrun = min(int(splitLumiList[-1][-1].split(":")[0]), int(self.__findInJson(runlist[-1],"run_number")))
            else:
                msg = "You are trying to run a validation without any runs!  Check that:"
                if firstRun or lastRun:
                    msg += "\n - firstRun and lastRun are correct for this dataset, and there are runs in between containing data"
                if jsonPath:
                    msg += "\n - your JSON file is correct for this dataset, and the runs contain data"
                if (firstRun or lastRun) and jsonPath:
                    msg += "\n - firstRun and lastRun are consistent with your JSON file"
                if begin:
                    msg = msg.replace("firstRun", "begin")
                if end:
                    msg = msg.replace("lastRun", "end")
                raise AllInOneError(msg)

        else:
            runlist = self.__getRunList()
            self.__firstusedrun = int(self.__findInJson(self.__getRunList()[0],"run_number"))
            self.__lastusedrun = int(self.__findInJson(self.__getRunList()[-1],"run_number"))

        if crab:
            files = ""
        else:
            splitFileList = list( self.__chunks( self.fileList(), 255 ) )
            fileStr = [ "',\n'".join( files ) for files in splitFileList ]
            fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                        for files in fileStr ]
            files = "\n".join( fileStr )

            if parent:
                splitParentFileList = list( self.__chunks( self.fileList(parent = True), 255 ) )
                parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
                parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \
                                  for parentFiles in parentFileStr ]
                parentFiles = "\n".join( parentFileStr )
                files += "\n\n" + parentFiles

        theMap = repMap
        theMap["files"] = files
        theMap["json"] = jsonPath
        theMap["lumiStr"] = lumiStr
        theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
        theMap["lumiSecExtend"] = lumiSecExtend
        if crab:
            dataset_snippet = self.__dummy_source_template%( theMap )
        else:
            dataset_snippet = self.__source_template%( theMap )
        return dataset_snippet

    def __find_lt( self, a, x ):
        'Find the index of the rightmost value less than x'
        i = bisect.bisect_left( a, x )
        if i:
            return i-1
        raise ValueError

    def __find_ge( self, a, x):
        'Find the index of the leftmost item greater than or equal to x'
        i = bisect.bisect_left( a, x )
        if i != len( a ):
            return i
        raise ValueError
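    # Both helpers assume a sorted list.  Worked examples (illustrative):
    #   self.__find_lt( [1, 3, 5], 4 )  ->  1   (a[1] == 3, rightmost value < 4)
    #   self.__find_ge( [1, 3, 5], 4 )  ->  2   (a[2] == 5, leftmost value >= 4)
    # Both raise ValueError when no such element exists; convertTimeToRun
    # catches that to report a 'begin'/'end' outside the dataset's run range.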

    def __findInJson(self, jsondict, strings):
        if isinstance(strings, str):
            strings = [ strings ]

        if len(strings) == 0:
            return jsondict
        if isinstance(jsondict,dict):
            if strings[0] in jsondict:
                try:
                    return self.__findInJson(jsondict[strings[0]], strings[1:])
                except KeyError:
                    pass
        else:
            for a in jsondict:
                if strings[0] in a:
                    try:
                        return self.__findInJson(a[strings[0]], strings[1:])
                    except (TypeError, KeyError):  #TypeError because a could be a string and contain strings[0]
                        pass
        #if it's not found
        raise KeyError("Can't find " + strings[0])
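    # A small worked example (hypothetical DAS-like structure):
    #   self.__findInJson( {"run": {"run_number": 194050}}, ["run", "run_number"] )
    # returns 194050.  Each element of 'strings' peels off one level of
    # nesting; lists encountered along the way are searched entry by entry.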

    def forcerunrange(self, firstRun, lastRun, s):
        """s is a regex match object whose text has the format run1:lum1-run2:lum2"""
        s = s.group()
        run1 = s.split("-")[0].split(":")[0]
        lum1 = s.split("-")[0].split(":")[1]
        run2 = s.split("-")[1].split(":")[0]
        lum2 = s.split("-")[1].split(":")[1]
        if int(run2) < firstRun or int(run1) > lastRun:
            return ""
        if int(run1) < firstRun or firstRun < 0:
            run1 = firstRun
            lum1 = 1
        if int(run2) > lastRun:
            run2 = lastRun
            lum2 = "max"
        if int(run1) < self.__firstusedrun or self.__firstusedrun < 0:
            self.__firstusedrun = int(run1)
        if int(run2) > self.__lastusedrun:
            self.__lastusedrun = int(run2)
        return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)

    def getForceRunRangeFunction(self, firstRun, lastRun):
        def forcerunrangefunction(s):
            return self.forcerunrange(firstRun, lastRun, s)
        return forcerunrangefunction
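    # Usage sketch (run numbers are illustrative): __createSnippet uses the
    # returned closure as a re.sub callback to clip every lumi range to
    # [firstRun, lastRun]:
    #   re.sub( r"\d+:(\d+|max)-\d+:(\d+|max)",
    #           self.getForceRunRangeFunction( 194050, 194075 ),
    #           "194050:1-194533:max" )
    # yields "194050:1-194075:max"; ranges entirely outside the window are
    # replaced by the empty string.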

    def __getData( self, dasQuery, dasLimit = 0 ):
        dasData = das_client.get_data( 'https://cmsweb.cern.ch',
                                       dasQuery, 0, dasLimit, False )
        if isinstance(dasData, str):
            jsondict = json.loads( dasData )
        else:
            jsondict = dasData
        # check whether the DAS query failed
        try:
            error = self.__findInJson(jsondict,["data","error"])
        except KeyError:
            error = None
        if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
            jsonstr = str(jsondict)
            if len(jsonstr) > 10000:
                jsonfile = "das_query_output_%i.txt"
                i = 0
                while os.path.lexists(jsonfile % i):
                    i += 1
                jsonfile = jsonfile % i
                theFile = open( jsonfile, "w" )
                theFile.write( jsonstr )
                theFile.close()
                msg = "The DAS query returned an error.  The output is very long, and has been stored in:\n" + jsonfile
            else:
                msg = "The DAS query returned an error.  Here is the output\n" + jsonstr
            msg += "\nIt's possible that this was a server error.  If so, it may work if you try again later"
            raise AllInOneError(msg)
        return self.__findInJson(jsondict,"data")
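    # All DAS lookups below go through this helper with a query string such as
    #   self.__getData( 'run dataset=/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO' )
    # and receive the parsed "data" field of the DAS response (a list of dicts).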

    def __getDataType( self ):
        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                if datatype is None:
                    datatype = "unknown"
                return datatype

        dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
                          'dataset.name'%( self.__name ) )
        data = self.__getData( dasQuery_type )

        try:
            return self.__findInJson(data, ["dataset", "datatype"])
        except KeyError:
            print ("Cannot find the datatype of the dataset '%s'\n"
                   "It may not be possible to automatically find the magnetic field,\n"
                   "and you will not be able to run in CRAB mode"
                   %( self.name() ))
            return "unknown"

    def __getParentDataset( self ):
        dasQuery = "parent dataset=" + self.__name
        data = self.__getData( dasQuery )
        try:
            return self.__findInJson(data, ["parent", "name"])
        except KeyError:
            raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
                                "Here is the DAS output:\n" + str(data) +
                                "\nIt's possible that this was a server error.  If so, it may work if you try again later")

    def __getMagneticField( self ):
        Bfieldlocation = os.path.join( self.__cmssw, "python", "Configuration", "StandardSequences" )
        if not os.path.isdir(Bfieldlocation):
            Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
        Bfieldlist = [ f.replace("_cff.py",'') \
                       for f in os.listdir(Bfieldlocation) \
                       if f.startswith("MagneticField_") and f.endswith("_cff.py") ]
        Bfieldlist.sort( key = lambda Bfield: -len(Bfield) ) #sort by decreasing length, so that searching in the name gives the longest match

        if self.__predefined:
            with open(self.__filename) as f:
                datatype = None
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#data type: "):
                        if datatype is not None:
                            raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                        datatype = line.replace("#data type: ", "").replace("\n","")
                        datatype = datatype.split("#")[0].strip()
                    if line.startswith("#magnetic field: "):
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                        Bfield = Bfield.split("#")[0].strip()
                if Bfield is not None:
                    Bfield = Bfield.split(",")[0]
                    if Bfield in Bfieldlist or Bfield == "unknown":
                        return Bfield
                    #===========================================================================
                    #For compatibility with already written datasets - remove this at some point
                    #(until the next === line)
                    #It's currently June 2015, anytime starting in 2016 is more than safe
                    elif Bfield == "AutoFromDBCurrent":
                        return "MagneticField"
                    elif "MagneticField_" + Bfield in Bfieldlist:
                        return "MagneticField_" + Bfield
                    #===========================================================================
                    else:
                        print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                        print "Using Bfield='unknown' - this will revert to the default"
                        return "unknown"
                elif datatype == "data":
                    return "MagneticField"           #this should be in the "#magnetic field" line, but for safety in case it got messed up
                else:
                    return "unknown"

        if self.__dataType == "data":
            return "MagneticField"

        dasQuery_B = ( 'dataset dataset=%s'%( self.__name ) )   #try to find the magnetic field from DAS
        data = self.__getData( dasQuery_B )                     #it seems to be there for the newer (7X) MC samples, except cosmics

        try:
            Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
            if Bfield in Bfieldlist:
                return Bfield
            elif Bfield == "38T" or Bfield == "38T_PostLS1":
                return "MagneticField"
            elif "MagneticField_" + Bfield in Bfieldlist:
                return "MagneticField_" + Bfield
            elif Bfield == "":
                pass
            else:
                print "Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield
                print "Using Bfield='unknown' - this will revert to the default magnetic field"
                return "unknown"
        except KeyError:
            pass

        for possibleB in Bfieldlist:
            if (possibleB != "MagneticField"
                  and possibleB.replace("MagneticField_","") in self.__name.replace("TkAlCosmics0T", "")):
                #final attempt - try to identify the dataset from the name
                #all cosmics dataset names contain "TkAlCosmics0T"
                if possibleB == "MagneticField_38T" or possibleB == "MagneticField_38T_PostLS1":
                    return "MagneticField"
                return possibleB

        return "unknown"

    def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
        """For MC, this returns the same as __getMagneticField.
        For data, it gets the magnetic field from the runs.  This is important
        for deciding which template to use for offlinevalidation.
        """
        if self.__dataType == "mc" and self.__magneticField == "MagneticField":
            return 3.8                                          #For 3.8T MC the default MagneticField is used
        if "T" in self.__magneticField:
            Bfield = self.__magneticField.split("T")[0].replace("MagneticField_","")
            try:
                return float(Bfield) / 10.0                     #e.g. 38T and 38T_PostLS1 both return 3.8
            except ValueError:
                pass
        if self.__predefined:
            with open(self.__filename) as f:
                Bfield = None
                for line in f.readlines():
                    if line.startswith("#magnetic field: ") and "," in line:
                        if Bfield is not None:
                            raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                        return float(line.replace("#magnetic field: ", "").split(",")[1].split("#")[0].strip())

        if run > 0:
            dasQuery = ('run = %s'%run)                         #for data
            data = self.__getData(dasQuery)
            try:
                return self.__findInJson(data, ["run","bfield"])
            except KeyError:
                return "unknown Can't get the magnetic field for run %s from DAS" % run

        #run < 0 - find B field for the first and last runs, and make sure they're compatible
        #  (to within tolerance)
        #NOT FOOLPROOF!  The magnetic field might go up and then down, or vice versa
        if self.__firstusedrun is None or self.__lastusedrun is None:
            return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
        firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
        lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
        try:
            if abs(firstrunB - lastrunB) <= tolerance:
                return .5*(firstrunB + lastrunB)
            print firstrunB, lastrunB, tolerance
            return ("unknown The beginning and end of your run range for %s\n"
                    "have different magnetic fields (%s, %s)!\n"
                    "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                    "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
        except TypeError:
            try:
                if "unknown" in firstrunB:
                    return firstrunB
                else:
                    return lastrunB
            except TypeError:
                return lastrunB
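    # Return convention (as used in the code above): a float on success, e.g.
    # 3.8 for "38T"/"38T_PostLS1", otherwise a string starting with "unknown"
    # followed by an explanation; callers test for the "unknown" prefix.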

    def __getFileInfoList( self, dasLimit, parent = False ):
        if self.__predefined:
            if parent:
                extendstring = "secFiles.extend"
            else:
                extendstring = "readFiles.extend"
            with open(self.__filename) as f:
                files = []
                copy = False
                for line in f.readlines():
                    if "]" in line:
                        copy = False
                    if copy:
                        files.append({"name": line.translate(None, "', " + '"')})
                    if extendstring in line and "[" in line and "]" not in line:
                        copy = True
            return files

        if self.__fileInfoList and not parent:
            return self.__fileInfoList
        if self.__parentFileInfoList and parent:
            return self.__parentFileInfoList

        if parent:
            searchdataset = self.parentDataset()
        else:
            searchdataset = self.__name
        dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
                           'file.creation_time, '
                           'file.modification_time'%( searchdataset ) )
        print "Requesting file information for '%s' from DAS..."%( searchdataset ),
        data = self.__getData( dasQuery_files, dasLimit )
        print "Done."
        data = [ self.__findInJson(entry,"file") for entry in data ]
        if len( data ) == 0:
            msg = ("No files are available for the dataset '%s'.  This can be "
                   "due to a typo or due to a DAS problem.  Please check the "
                   "spelling of the dataset and/or retry to run "
                   "'validateAlignments.py'."%( self.name() ))
            raise AllInOneError( msg )
        fileInformationList = []
        for file in data:
            fileName = 'unknown'
            try:
                fileName = self.__findInJson(file, "name")
                fileCreationTime = self.__findInJson(file, "creation_time")
                fileNEvents = self.__findInJson(file, "nevents")
            except KeyError:
                print ("DAS query gives bad output for file '%s'.  Skipping it.\n"
                       "It may work if you try again later.") % fileName
                fileNEvents = 0
            # select only non-empty files
            if fileNEvents == 0:
                continue
            fileDict = { "name": fileName,
                         "creation_time": fileCreationTime,
                         "nevents": fileNEvents
                       }
            fileInformationList.append( fileDict )
        fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
        if parent:
            self.__parentFileInfoList = fileInformationList
        else:
            self.__fileInfoList = fileInformationList
        return fileInformationList
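    # Each entry of the returned list is a dict of the form (values illustrative):
    #   { "name": "/store/data/.../file.root",
    #     "creation_time": 1355000000,
    #     "nevents": 12345 }
    # sorted by file name; empty files have already been filtered out.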

    def __getRunList( self ):
        if self.__runList:
            return self.__runList
        dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
                          'run.creation_time'%( self.__name ) )
        print "Requesting run information for '%s' from DAS..."%( self.__name ),
        data = self.__getData( dasQuery_runs )
        print "Done."
        data = [ self.__findInJson(entry,"run") for entry in data ]
        data.sort( key = lambda run: self.__findInJson(run, "run_number") )
        self.__runList = data
        return data

    def __datetime(self, stringForDas):
        if len(stringForDas) != 8:
            raise AllInOneError(stringForDas + " is not a valid date string.\n"
                                + "DAS accepts dates in the form 'yyyymmdd'")
        year = stringForDas[:4]
        month = stringForDas[4:6]
        day = stringForDas[6:8]
        return datetime.date(int(year), int(month), int(day))

    def __dateString(self, date):
        return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)
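    # Round-trip examples (dates chosen arbitrarily):
    #   self.__datetime( "20121128" )                    ->  datetime.date(2012, 11, 28)
    #   self.__dateString( datetime.date(2012, 1, 5) )   ->  "20120105"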

    def convertTimeToRun( self, begin = None, end = None,
                          firstRun = None, lastRun = None,
                          shortTuple = True ):
        if ( begin and firstRun ) or ( end and lastRun ):
            msg = ( "The usage of "
                    + "'begin' & 'firstRun' " * int( bool( begin and
                                                           firstRun ) )
                    + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                    + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                    + "is ambiguous." )
            raise AllInOneError( msg )

        if begin or end:
            runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

        if begin:
            lastdate = begin
            for delta in [ 1, 5, 10, 20, 30 ]:                 #try searching for about 2 months after begin
                firstdate = lastdate
                lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                dasQuery_begin = "run date between[%s,%s]" % (firstdate, lastdate)
                begindata = self.__getData(dasQuery_begin)
                if len(begindata) > 0:
                    begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'begin' is after the creation time of the last "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    firstRun = runList[runIndex]
                    begin = None
                    break

        if begin:
            raise AllInOneError("No runs within a reasonable time interval after your 'begin'.\n"
                                "Try using a 'begin' that has runs soon after it (within 2 months at most)")

        if end:
            firstdate = end
            for delta in [ 1, 5, 10, 20, 30 ]:                 #try searching for about 2 months before end
                lastdate = firstdate
                firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                dasQuery_end = "run date between[%s,%s]" % (firstdate, lastdate)
                enddata = self.__getData(dasQuery_end)
                if len(enddata) > 0:
                    enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                    try:
                        runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                    except ValueError:
                        msg = ( "Your 'end' is before the creation time of the first "
                                "run in the dataset\n'%s'"%( self.__name ) )
                        raise AllInOneError( msg )
                    lastRun = runList[runIndex]
                    end = None
                    break

        if end:
            raise AllInOneError("No runs within a reasonable time interval before your 'end'.\n"
                                "Try using an 'end' that has runs soon before it (within 2 months at most)")

        if shortTuple:
            return firstRun, lastRun
        else:
            return begin, end, firstRun, lastRun
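    # Usage sketch (hypothetical date): with only a 'begin' given,
    #   self.convertTimeToRun( begin = "20121128" )
    # queries DAS for runs in a growing window (up to ~2 months) after that
    # date and returns (firstRun, None); __createSnippet then filters on run
    # numbers instead of dates.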

    def dataType( self ):
        if not self.__dataType:
            self.__dataType = self.__getDataType()
        return self.__dataType

    def magneticField( self ):
        if not self.__magneticField:
            self.__magneticField = self.__getMagneticField()
        return self.__magneticField

    def magneticFieldForRun( self, run = -1 ):
        return self.__getMagneticFieldForRun(run)

    def parentDataset( self ):
        if not self.__parentDataset:
            self.__parentDataset = self.__getParentDataset()
        return self.__parentDataset

    def datasetSnippet( self, jsonPath = None, begin = None, end = None,
                        firstRun = None, lastRun = None, crab = False, parent = False ):
        if self.__predefined and parent:
            with open(self.__filename) as f:
                if "secFiles.extend" not in f.read():
                    msg = ("The predefined dataset '%s' does not contain secondary files, "
                           "which your validation requires!") % self.__name
                    if self.__official:
                        self.__name = self.__origName
                        self.__predefined = False
                        print msg
                        print ("Retrieving the files from DAS.  You will be asked if you want "
                               "to overwrite the old dataset.\n"
                               "It will still be compatible with validations that don't need secondary files.")
                    else:
                        raise AllInOneError(msg)

        if self.__predefined:
            snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                       "process.maxEvents = cms.untracked.PSet(\n"
                       "    input = cms.untracked.int32(.oO[nEvents]Oo. / .oO[parallelJobs]Oo.)\n"
                       ")\n"
                       "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)"
                       %(self.__name))
            if not parent:
                with open(self.__filename) as f:
                    if "secFiles.extend" in f.read():
                        snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
            return snippet
        theMap = { "process": "process.",
                   "tab": " " * len( "process." ),
                   "nEvents": ".oO[nEvents]Oo. / .oO[parallelJobs]Oo.",
                   "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.)\n",
                   "importCms": "",
                   "header": ""
                 }
        datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                               begin = begin,
                                               end = end,
                                               firstRun = firstRun,
                                               lastRun = lastRun,
                                               repMap = theMap,
                                               crab = crab,
                                               parent = parent )
        if jsonPath == "" and begin == "" and end == "" and firstRun == "" and lastRun == "":
            try:
                self.dump_cff(parent = parent)
            except AllInOneError as e:
                print "Can't store the dataset as a cff:"
                print e
                print "This may be inconvenient in the future, but will not cause a problem for this validation."
        return datasetSnippet

    def dump_cff( self, outName = None, jsonPath = None, begin = None,
                  end = None, firstRun = None, lastRun = None, parent = False ):
        if self.__alreadyStored:
            return
        self.__alreadyStored = True
        if outName == None:
            outName = "Dataset" + self.__name.replace("/", "_")
        packageName = os.path.join( "Alignment", "OfflineValidation" )
        if not os.path.exists( os.path.join(
            self.__cmssw, "src", packageName ) ):
            msg = ("You are trying to store the predefined dataset '%s'.\n"
                   "For that you need to check out the package '%s' to your "
                   "private release area in\n"%( outName, packageName )
                   + self.__cmssw )
            raise AllInOneError( msg )
        theMap = { "process": "",
                   "tab": "",
                   "nEvents": str( -1 ),
                   "skipEventsString": "",
                   "importCms": "import FWCore.ParameterSet.Config as cms\n",
                   "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
                             "#%(name)s\n"
                             "#data type: %(dataType)s\n"
                             "#magnetic field: .oO[magneticField]Oo.\n"  #put in the magnetic field later
                             %{"name": self.__name,                      #need to create the snippet before getting the magnetic field
                               "dataType": self.__dataType}              #so that we know the first and last runs
                 }
        dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                            begin = begin,
                                            end = end,
                                            firstRun = firstRun,
                                            lastRun = lastRun,
                                            repMap = theMap,
                                            parent = parent)
        magneticField = self.__magneticField
        if magneticField == "MagneticField":
            magneticField = "%s, %s #%s" % (magneticField,
                                            str(self.__getMagneticFieldForRun()).replace("\n"," ").split("#")[0].strip(),
                                            "Use MagneticField_cff.py; the number is for determining which track selection to use."
                                           )
        dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
        filePath = os.path.join( self.__cmssw, "src", packageName,
                                 "python", outName + "_cff.py" )
        if os.path.exists( filePath ):
            existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
            askString = "Do you want to overwrite it? [y/n]\n"
            inputQuery = existMsg + askString
            while True:
                userInput = raw_input( inputQuery ).lower()
                if userInput == "y":
                    break
                elif userInput == "n":
                    return
                else:
                    inputQuery = askString
        print ( "The predefined dataset '%s' will be stored in the file\n"
                %( outName )
                + filePath +
                "\nFor future use you have to do 'scram b'." )
        print
        theFile = open( filePath, "w" )
        theFile.write( dataset_cff )
        theFile.close()
        return

    def fileList( self, parent = False ):
        if self.__fileList and not parent:
            return self.__fileList
        if self.__parentFileList and parent:
            return self.__parentFileList

        fileList = [ self.__findInJson(fileInfo,"name") \
                     for fileInfo in self.fileInfoList(parent) ]

        if not parent:
            self.__fileList = fileList
        else:
            self.__parentFileList = fileList
        return fileList

    def fileInfoList( self, parent = False ):
        return self.__getFileInfoList( self.__dasLimit, parent )

    def name( self ):
        return self.__name

    def predefined( self ):
        return self.__predefined

    def runList( self ):
        if self.__runList:
            return self.__runList
        return self.__getRunList()

if __name__ == '__main__':
    print "Start testing..."
    datasetName = '/MinimumBias/Run2012D-TkAlMinBias-v1/ALCARECO'
    jsonFile = ( '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/'
                 'Collisions12/8TeV/Prompt/'
                 'Cert_190456-207898_8TeV_PromptReco_Collisions12_JSON.txt' )
    dataset = Dataset( datasetName )
    print dataset.datasetSnippet( jsonPath = jsonFile,
                                  firstRun = "207800",
                                  end = "20121128" )
    dataset.dump_cff( outName = "Dataset_Test_TkAlMinBias_Run2012D",
                      jsonPath = jsonFile,
                      firstRun = "207800",
                      end = "20121128" )