CMS 3D CMS Logo

Public Member Functions | Private Member Functions | Private Attributes | Static Private Attributes

dataset::Dataset Class Reference

List of all members.

Public Member Functions

def __init__
def convertTimeToRun
def datasetSnippet
def dataType
def dump_cff
def fileInfoList
def fileList
def name
def predefined
def runList

Private Member Functions

def __chunks
def __createSnippet
def __find_ge
def __find_lt
def __getData
def __getDataType
def __getFileInfoList
def __getRunList

Private Attributes

 __dasLimit
 __dataType
 __fileInfoList
 __fileList
 __name
 __predefined
 __runList

Static Private Attributes

tuple __dummy_source_template

Detailed Description

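Represents a CMS dataset used by the alignment validation tools. The dataset is identified either by a name matching the CMS naming scheme ("/.+/.+/.+"), in which case its data type, file list and run list are queried from DAS, or by the name of a predefined "<name>_cff.py" configuration in Alignment/OfflineValidation. The class generates CMSSW source snippets (datasetSnippet) and can store them as a predefined dataset (dump_cff).

Definition at line 13 of file dataset.py.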


Constructor & Destructor Documentation

def dataset::Dataset::__init__ (   self,
  datasetName,
  dasLimit = 0 
)

Definition at line 14 of file dataset.py.

00015                                                    :
00016         self.__name = datasetName
00017         # check, if dataset name matches CMS dataset naming scheme
00018         if re.match( r'/.+/.+/.+', self.__name ):
00019             self.__dataType = self.__getDataType()
00020             self.__predefined = False
00021         else:
00022             fileName = self.__name + "_cff.py"
00023             searchPath1 = os.path.join( os.environ["CMSSW_BASE"], "python",
00024                                         "Alignment", "OfflineValidation",
00025                                         fileName )
00026             searchPath2 = os.path.join( os.environ["CMSSW_BASE"], "src",
00027                                         "Alignment", "OfflineValidation",
00028                                         "python", fileName )
00029             searchPath3 = os.path.join( os.environ["CMSSW_RELEASE_BASE"],
00030                                         "python", "Alignment",
00031                                         "OfflineValidation", fileName )
00032             if os.path.exists( searchPath1 ):
00033                 pass
00034             elif os.path.exists( searchPath2 ):
00035                 msg = ("The predefined dataset '%s' does exist in '%s', but "
00036                        "you need to run 'scram b' first."
00037                        %( self.__name, searchPath2 ))
00038                 raise AllInOneError( msg )
00039             elif os.path.exists( searchPath3 ):
00040                 pass
00041             else:
00042                 msg = ("The predefined dataset '%s' does not exist. Please "
00043                        "create it first or check for typos."%( self.__name ))
00044                 raise AllInOneError( msg )
00045             self.__dataType = "unknown"
00046             self.__predefined = True
00047         self.__dasLimit = dasLimit
00048         self.__fileList = None
00049         self.__fileInfoList = None
00050         self.__runList = None


Member Function Documentation

def dataset::Dataset::__chunks (   self,
  theList,
  n 
) [private]
Yield successive n-sized chunks from theList.

Definition at line 51 of file dataset.py.

00052                                     :
00053         """ Yield successive n-sized chunks from theList.
00054         """
00055         for i in xrange( 0, len( theList ), n ):
00056             yield theList[i:i+n]

def dataset::Dataset::__createSnippet (   self,
  jsonPath = None,
  begin = None,
  end = None,
  firstRun = None,
  lastRun = None,
  repMap = None,
  crab = False 
) [private]

Definition at line 57 of file dataset.py.

00060                                        :
00061         if firstRun:
00062             firstRun = int( firstRun )
00063         if lastRun:
00064             lastRun = int( lastRun )
00065         if ( begin and firstRun ) or ( end and lastRun ):
00066             msg = ( "The Usage of "
00067                     + "'begin' & 'firstRun' " * int( bool( begin and
00068                                                            firstRun ) )
00069                     + "and " * int( bool( ( begin and firstRun ) and
00070                                          ( end and lastRun ) ) )
00071                     + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
00072                     + "is ambigous." )
00073             raise AllInOneError( msg )
00074         if begin or end:
00075             ( firstRun, lastRun ) = self.convertTimeToRun(
00076                 begin = begin, end = end, firstRun = firstRun,
00077                 lastRun = lastRun )
00078         if ( firstRun and lastRun ) and ( firstRun > lastRun ):
00079             msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
00080                     "chosen is greater than the upper time/runrange limit "
00081                     "('end'/'lastRun').")
00082             raise AllInOneError( msg )
00083         goodLumiSecStr = ""
00084         lumiStr = ""
00085         lumiSecExtend = ""
00086         if firstRun or lastRun:
00087             goodLumiSecStr = ( "lumiSecs = cms.untracked."
00088                                "VLuminosityBlockRange()\n" )
00089             lumiStr = "                    lumisToProcess = lumiSecs,\n"
00090             if not jsonPath:
00091                 selectedRunList = self.__getRunList()
00092                 if firstRun:
00093                     selectedRunList = [ run for run in selectedRunList \
00094                                         if run["run_number"] >= firstRun ]
00095                 if lastRun:
00096                     selectedRunList = [ run for run in selectedRunList \
00097                                         if run["run_number"] <= lastRun ]
00098                 lumiList = [ str( run["run_number"] ) + ":1-" \
00099                              + str( run["run_number"] ) + ":max" \
00100                              for run in selectedRunList ]
00101                 splitLumiList = list( self.__chunks( lumiList, 255 ) )
00102             else:
00103                 theLumiList = LumiList ( filename = jsonPath )
00104                 allRuns = theLumiList.getRuns()
00105                 runsToRemove = []
00106                 for run in allRuns:
00107                     if firstRun and int( run ) < firstRun:
00108                         runsToRemove.append( run )
00109                     if lastRun and int( run ) > lastRun:
00110                         runsToRemove.append( run )
00111                 theLumiList.removeRuns( runsToRemove )
00112                 splitLumiList = list( self.__chunks(
00113                     theLumiList.getCMSSWString().split(','), 255 ) )
00114             if not len(splitLumiList[0][0]) == 0:
00115                 lumiSecStr = [ "',\n'".join( lumis ) \
00116                                for lumis in splitLumiList ]
00117                 lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
00118                                for lumis in lumiSecStr ]
00119                 lumiSecExtend = "\n".join( lumiSecStr )
00120         elif jsonPath:
00121                 goodLumiSecStr = ( "goodLumiSecs = LumiList.LumiList(filename"
00122                                    "= '%(json)s').getCMSSWString().split(',')\n"
00123                                    "lumiSecs = cms.untracked"
00124                                    ".VLuminosityBlockRange()\n"
00125                                    )
00126                 lumiStr = "                    lumisToProcess = lumiSecs,\n"
00127                 lumiSecExtend = "lumiSecs.extend(goodLumiSecs)\n"
00128         if crab:
00129             files = ""
00130         else:
00131             splitFileList = list( self.__chunks( self.fileList(), 255 ) )
00132             fileStr = [ "',\n'".join( files ) for files in splitFileList ]
00133             fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
00134                         for files in fileStr ]
00135             files = "\n".join( fileStr )
00136         theMap = repMap
00137         theMap["files"] = files
00138         theMap["json"] = jsonPath
00139         theMap["lumiStr"] = lumiStr
00140         theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
00141         theMap["lumiSecExtend"] = lumiSecExtend
00142         if crab:
00143             dataset_snippet = self.__dummy_source_template%( theMap )
00144         else:
00145             dataset_snippet = self.__source_template%( theMap )
00146         return dataset_snippet

def dataset::Dataset::__find_ge (   self,
  a,
  x 
) [private]

Definition at line 165 of file dataset.py.

00166                               :
00167         'Find leftmost item greater than or equal to x'
00168         i = bisect.bisect_left( a, x )
00169         if i != len( a ):
00170             return i
00171         raise ValueError

def dataset::Dataset::__find_lt (   self,
  a,
  x 
) [private]

Definition at line 158 of file dataset.py.

00159                                :
00160         'Find rightmost value less than x'
00161         i = bisect.bisect_left( a, x )
00162         if i:
00163             return i-1
00164         raise ValueError

def dataset::Dataset::__getData (   self,
  dasQuery,
  dasLimit = 0 
) [private]

Definition at line 172 of file dataset.py.

00173                                                  :
00174         dasData = das_client.get_data( 'https://cmsweb.cern.ch',
00175                                        dasQuery, 0, dasLimit, False )
00176         jsondict = json.loads( dasData )
00177         # Check, if the DAS query fails
00178         if jsondict["status"] != 'ok':
00179             msg = "Status not 'ok', but:", jsondict["status"]
00180             raise AllInOneError(msg)
00181         return jsondict["data"]

def dataset::Dataset::__getDataType (   self) [private]

Definition at line 182 of file dataset.py.

00183                              :
00184         dasQuery_type = ( 'dataset dataset=%s | grep dataset.datatype,'
00185                           'dataset.name'%( self.__name ) )
00186         data = self.__getData( dasQuery_type )
00187         return data[0]["dataset"][0]["datatype"]

def dataset::Dataset::__getFileInfoList (   self,
  dasLimit 
) [private]

Definition at line 188 of file dataset.py.

00189                                            :
00190         if self.__fileInfoList:
00191             return self.__fileInfoList
00192         dasQuery_files = ( 'file dataset=%s | grep file.name, file.nevents, '
00193                            'file.creation_time, '
00194                            'file.modification_time'%( self.__name ) )
00195         print "Requesting file information for '%s' from DAS..."%( self.__name ),
00196         data = self.__getData( dasQuery_files, dasLimit )
00197         print "Done."
00198         data = [ entry["file"] for entry in data ]
00199         if len( data ) == 0:
00200             msg = ("No files are available for the dataset '%s'. This can be "
00201                    "due to a typo or due to a DAS problem. Please check the "
00202                    "spelling of the dataset and/or retry to run "
00203                    "'validateAlignments.py'."%( self.name() ))
00204             raise AllInOneError( msg )
00205         fileInformationList = []
00206         for file in data:
00207             fileName = file[0]["name"]
00208             fileCreationTime = file[0]["creation_time"]
00209             for ii in range(3):
00210                 try:
00211                     fileNEvents = file[ii]["nevents"]
00212                 except KeyError:
00213                     continue
00214                 break
00215             # select only non-empty files
00216             if fileNEvents == 0:
00217                 continue
00218             fileDict = { "name": fileName,
00219                          "creation_time": fileCreationTime,
00220                          "nevents": fileNEvents
00221                          }
00222             fileInformationList.append( fileDict )
00223         fileInformationList.sort( key=lambda info: info["name"] )
00224         return fileInformationList

def dataset::Dataset::__getRunList (   self) [private]

Definition at line 225 of file dataset.py.

00226                             :
00227         if self.__runList:
00228             return self.__runList
00229         dasQuery_runs = ( 'run dataset=%s | grep run.run_number,'
00230                           'run.creation_time'%( self.__name ) )
00231         print "Requesting run information for '%s' from DAS..."%( self.__name ),
00232         data = self.__getData( dasQuery_runs )
00233         print "Done."
00234         data = [ entry["run"][0] for entry in data ]
00235         data.sort( key = lambda run: run["creation_time"] )
00236         self.__runList = data
00237         return data

def dataset::Dataset::convertTimeToRun (   self,
  begin = None,
  end = None,
  firstRun = None,
  lastRun = None,
  shortTuple = True 
)

Definition at line 254 of file dataset.py.

00257                                              :
00258         if ( begin and firstRun ) or ( end and lastRun ):
00259             msg = ( "The Usage of "
00260                     + "'begin' & 'firstRun' " * int( bool( begin and
00261                                                            firstRun ) )
00262                     + "and " * int( bool( ( begin and firstRun ) and
00263                                          ( end and lastRun ) ) )
00264                     + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
00265                     + "is ambigous." )
00266             raise AllInOneError( msg )
00267 
00268         runList = [ run["run_number"] for run in self.__getRunList() ]
00269         runTimeList = [ run["creation_time"] for run in self.__getRunList() ]
00270         if begin:
00271             try:
00272                 runIndex = self.__find_ge( runTimeList, begin )
00273             except ValueError:
00274                 msg = ( "Your 'begin' is after the creation time of the last "
00275                         "run in the dataset\n'%s'"%( self.__name ) )
00276                 raise AllInOneError( msg )
00277             firstRun = runList[runIndex]
00278             begin = None
00279         if end:
00280             try:
00281                 runIndex = self.__find_lt( runTimeList, end )
00282             except ValueError:
00283                 msg = ( "Your 'end' is before the creation time of the first "
00284                         "run in the dataset\n'%s'"%( self.__name ) )
00285                 raise AllInOneError( msg )
00286             lastRun = runList[runIndex]
00287             end = None
00288         if shortTuple:
00289             return firstRun, lastRun
00290         else:
00291             return begin, end, firstRun, lastRun

def dataset::Dataset::datasetSnippet (   self,
  jsonPath = None,
  begin = None,
  end = None,
  firstRun = None,
  lastRun = None,
  nEvents = None,
  crab = False 
)

Definition at line 295 of file dataset.py.

00298                                       :
00299         if self.__predefined:
00300             return ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
00301                     "process.maxEvents = cms.untracked.PSet(\n"
00302                     "    input = cms.untracked.int32(%s)\n"
00303                     ")"
00304                     %( self.__name, nEvents ))
00305         theMap = { "process": "process.",
00306                    "tab": " " * len( "process." ),
00307                    "nEvents": str( nEvents ),
00308                    "importCms": ""
00309                    }
00310         datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
00311                                                begin = begin,
00312                                                end = end,
00313                                                firstRun = firstRun,
00314                                                lastRun = lastRun,
00315                                                repMap = theMap,
00316                                                crab = crab )
00317         return datasetSnippet

def dataset::Dataset::dataType (   self)

Definition at line 292 of file dataset.py.

00293                         :
00294         return self.__dataType
    
def dataset::Dataset::dump_cff (   self,
  outName = None,
  jsonPath = None,
  begin = None,
  end = None,
  firstRun = None,
  lastRun = None 
)

Definition at line 318 of file dataset.py.

00320                                                                :
00321         if outName == None:
00322             outName = "Dataset"
00323         packageName = os.path.join( "Alignment", "OfflineValidation" )
00324         if not os.path.exists( os.path.join(
00325             os.environ["CMSSW_BASE"], "src", packageName ) ):
00326             msg = ("You try to store the predefined dataset'%s'.\n"
00327                    "For that you need to check out the package '%s' to your "
00328                    "private relase area in\n"%( outName, packageName )
00329                    + os.environ["CMSSW_BASE"] )
00330             raise AllInOneError( msg )
00331         theMap = { "process": "",
00332                    "tab": "",
00333                    "nEvents": str( -1 ),
00334                    "importCms": "import FWCore.ParameterSet.Config as cms\n" }
00335         dataset_cff = self.__createSnippet( jsonPath = jsonPath,
00336                                             begin = begin,
00337                                             end = end,
00338                                             firstRun = firstRun,
00339                                             lastRun = lastRun,
00340                                             repMap = theMap)
00341         filePath = os.path.join( os.environ["CMSSW_BASE"], "src", packageName,
00342                                  "python", outName + "_cff.py" )
00343         if os.path.exists( filePath ):
00344             existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
00345             askString = "Do you want to overwrite it? [y/n]\n"
00346             inputQuery = existMsg + askString
00347             while True:
00348                 userInput = raw_input( inputQuery ).lower()
00349                 if userInput == "y":
00350                     break
00351                 elif userInput == "n":
00352                     return
00353                 else:
00354                     inputQuery = askString
00355         print ( "The predefined dataset '%s' will be stored in the file\n"
00356                 %( outName )
00357                 + filePath +
00358                 "\nFor future use you have to do 'scram b'." )
00359         print
00360         theFile = open( filePath, "w" )
00361         theFile.write( dataset_cff )
00362         theFile.close()
00363         return

def dataset::Dataset::fileInfoList (   self)

Definition at line 372 of file dataset.py.

00373                             :
00374         return self.__getFileInfoList( self.__dasLimit )

def dataset::Dataset::fileList (   self)

Definition at line 364 of file dataset.py.

00365                         :
00366         if self.__fileList:
00367             return self.__fileList
00368         fileList = [ fileInfo["name"] \
00369                      for fileInfo in self.fileInfoList() ]
00370         self.__fileList = fileList
00371         return fileList
    
def dataset::Dataset::name (   self)

Definition at line 375 of file dataset.py.

00376                     :
00377         return self.__name

def dataset::Dataset::predefined (   self)

Definition at line 378 of file dataset.py.

00379                           :
00380         return self.__predefined

def dataset::Dataset::runList (   self)

Definition at line 381 of file dataset.py.

00382                        :
00383         if self.__runList:
00384             return self.__runList
00385         return self.__getRunList()
00386 


Member Data Documentation

dataset::Dataset::__dasLimit [private]

Definition at line 14 of file dataset.py.

dataset::Dataset::__dataType [private]

Definition at line 14 of file dataset.py.

tuple dataset::Dataset::__dummy_source_template [static, private]

Initial value:
("%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(%(nEvents)s) )\n"
                               "readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s                    secondaryFileNames ="
                               "secFiles,\n"
                               "%(tab)s                    fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n")

Definition at line 147 of file dataset.py.

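dataset::Dataset::__fileInfoList [private]

Definition at line 14 of file dataset.py.

dataset::Dataset::__fileList [private]

Definition at line 14 of file dataset.py.

dataset::Dataset::__name [private]

Definition at line 14 of file dataset.py.

dataset::Dataset::__predefined [private]

Definition at line 14 of file dataset.py.

dataset::Dataset::__runList [private]

Definition at line 14 of file dataset.py.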