Inheritance diagram for dataset.Dataset:

Public Member Functions
def	__init__ (self, datasetName, dasLimit=0, tryPredefinedFirst=True, cmssw=os.environ["CMSSW_BASE"], cmsswrelease=os.environ["CMSSW_RELEASE_BASE"], magneticfield=None, dasinstance=None)

def	__init__ (self, datasetname, dasinstance=defaultdasinstance)

def	__init__ (self, name, user, pattern='.*root')

def	buildListOfBadFiles (self)

def	buildListOfFiles (self, pattern='.*root')

def	convertTimeToRun (self, begin=None, end=None, firstRun=None, lastRun=None, shortTuple=True)

def	createdatasetfile_hippy (self, filename, filesperjob, firstrun, lastrun)

def	datasetSnippet (self, jsonPath=None, begin=None, end=None, firstRun=None, lastRun=None, crab=False, parent=False)

def	dataType (self)

def	dump_cff (self, outName=None, jsonPath=None, begin=None, end=None, firstRun=None, lastRun=None, parent=False)

def	extractFileSizes (self)

def	fileInfoList (self, parent=False)

def	fileList (self, parent=False, firstRun=None, lastRun=None, forcerunselection=False)

def	forcerunrange (self, firstRun, lastRun, s)

def	getfiles (self, usecache)

def	getForceRunRangeFunction (self, firstRun, lastRun)

def	getPrimaryDatasetEntries (self)

def	headercomment (self)

def	magneticField (self)

def	magneticFieldForRun (self, run=-1)

def	name (self)

def	parentDataset (self)

def	predefined (self)

def	printInfo (self)

def	runList (self)

Public Member Functions inherited from dataset.BaseDataset
def	__init__ (self, name, user, pattern='.*root', run_range=None, dbsInstance=None)
	def init(self, name, user, pattern='. More...

def	buildListOfBadFiles (self)

def	buildListOfFiles (self, pattern)

def	extractFileSizes (self)

def	getPrimaryDatasetEntries (self)

def	listOfFiles (self)

def	listOfGoodFiles (self)

def	listOfGoodFilesWithPrescale (self, prescale)

def	printFiles (self, abspath=True, info=True)

def	printInfo (self)

Public Member Functions inherited from dataset.DatasetBase
def	getfiles (self, usecache)

def	headercomment (self)

def	writefilelist_hippy (self, firstrun, lastrun, runs, eventsperjob, maxevents, outputfile, usecache=True)

def	writefilelist_validation (self, firstrun, lastrun, runs, maxevents, outputfile=None, usecache=True)

Static Public Member Functions
def	getrunnumberfromfilename (filename)

Public Attributes
	bad_files

	castorDir

	dasinstance

	datasetname

	filenamebase

	files

	filesAndSizes

	good_files

	lfnDir

	maskExists

	official

	report

Public Attributes inherited from dataset.BaseDataset
	bad_files

	dbsInstance
	MM. More...

	files

	filesAndSizes

	good_files

	name

	pattern

	primaryDatasetEntries
	MM. More...

	report

	run_range

	user

Private Member Functions
def	__chunks (self, theList, n)

def	__createSnippet (self, jsonPath=None, begin=None, end=None, firstRun=None, lastRun=None, repMap=None, crab=False, parent=False)

def	__dateString (self, date)

def	__datetime (self, stringForDas)

def	__fileListSnippet (self, crab=False, parent=False, firstRun=None, lastRun=None, forcerunselection=False)

def	__find_ge (self, a, x)

def	__find_lt (self, a, x)

def	__findInJson (self, jsondict, strings)

def	__getData (self, dasQuery, dasLimit=0)

def	__getDataType (self)

def	__getFileInfoList (self, dasLimit, parent=False)

def	__getMagneticField (self)

def	__getMagneticFieldForRun (self, run=-1, tolerance=0.5)

def	__getParentDataset (self)

def	__getRunList (self)

def	__lumiSelectionSnippet (self, jsonPath=None, firstRun=None, lastRun=None)

Private Attributes
	__cmssw

	__cmsswrelease

	__dasinstance

	__dasLimit

	__dataType

	__filename

	__firstusedrun

	__inputMagneticField

	__lastusedrun

	__magneticField

	__name

	__official

	__origName

	__parentDataset

	__predefined

Static Private Attributes
tuple	__dummy_source_template

	__source_template

Detailed Description

Definition at line 198 of file dataset.py.

Constructor & Destructor Documentation

◆ __init__() [1/3]

def dataset.Dataset.__init__	(	self,
		datasetname,
		dasinstance = `defaultdasinstance`
	)

Definition at line 199 of file dataset.py.

Referenced by dataset.Dataset.__init__().

   def __init__(self, datasetname, dasinstance=defaultdasinstance):
     self.datasetname = datasetname
     if re.match(r'/.+/.+/.+', datasetname):
       self.official = True
       self.filenamebase = "Dataset" + self.datasetname.replace("/","_")
     else:
       self.official = False
       self.filenamebase = datasetname
 
     self.dasinstance = dasinstance
 

◆ __init__() [2/3]

def dataset.Dataset.__init__	(	self,
		datasetName,
		dasLimit = `0`,
		tryPredefinedFirst = `True`,
		cmssw = `os.environ["CMSSW_BASE"]`,
		cmsswrelease = `os.environ["CMSSW_RELEASE_BASE"]`,
		magneticfield = `None`,
		dasinstance = `None`
	)

Definition at line 23 of file dataset.py.

                   magneticfield = None, dasinstance = None):
         self.__name = datasetName
         self.__origName = datasetName
         self.__dasLimit = dasLimit
         self.__dasinstance = dasinstance
         self.__cmssw = cmssw
         self.__cmsswrelease = cmsswrelease
         self.__firstusedrun = None
         self.__lastusedrun = None
         self.__parentDataset = None
 
         # check, if dataset name matches CMS dataset naming scheme
         if re.match( r'/.+/.+/.+', self.__name ):
             self.__official = True
             fileName = "Dataset" + self.__name.replace("/","_") + "_cff.py"
         else:
             self.__official = False
             fileName = self.__name + "_cff.py"
 
         searchPath1 = os.path.join( self.__cmssw, "python",
                                     "Alignment", "OfflineValidation",
                                     fileName )
         searchPath2 = os.path.join( self.__cmssw, "src",
                                     "Alignment", "OfflineValidation",
                                     "python", fileName )
         searchPath3 = os.path.join( self.__cmsswrelease,
                                     "python", "Alignment",
                                     "OfflineValidation", fileName )
         if self.__official and not tryPredefinedFirst:
             self.__predefined = False
         elif os.path.exists( searchPath1 ):
             self.__predefined = True
             self.__filename = searchPath1
         elif os.path.exists( searchPath2 ):
             msg = ("The predefined dataset '%s' does exist in '%s', but "
                    "you need to run 'scram b' first."
                    %( self.__name, searchPath2 ))
             if self.__official:
                 print(msg)
                 print("Getting the data from DAS again.  To go faster next time, run scram b.")
             else:
                 raise AllInOneError( msg )
         elif os.path.exists( searchPath3 ):
             self.__predefined = True
             self.__filename = searchPath3
         elif self.__official:
             self.__predefined = False
         else:
             msg = ("The predefined dataset '%s' does not exist. Please "
                    "create it first or check for typos."%( self.__name ))
             raise AllInOneError( msg )
 
         if self.__predefined and self.__official:
             self.__name = "Dataset" + self.__name.replace("/","_")
 
         if magneticfield is not None:
             try:
                 magneticfield = float(magneticfield)
             except ValueError:
                 raise AllInOneError("Bad magneticfield {} which can't be converted to float".format(magneticfield))
         self.__inputMagneticField = magneticfield
 
         self.__dataType = self.__getDataType()
         self.__magneticField = self.__getMagneticField()
 
 

◆ __init__() [3/3]

def dataset.Dataset.__init__	(	self,
		name,
		user,
		pattern = `'.*root'`
	)

Definition at line 267 of file dataset.py.

References dataset.Dataset.__init__().

     def __init__(self, name, user, pattern='.*root'):
         self.lfnDir = castorBaseDir(user) + name
         self.castorDir = castortools.lfnToCastor( self.lfnDir )
         self.maskExists = False
         self.report = None
         super(Dataset, self).__init__(name, user, pattern)
         

Member Function Documentation

◆ __chunks()

def dataset.Dataset.__chunks	(	self,
		theList,
		n
	)

private

Yield successive n-sized chunks from theList.

Definition at line 89 of file dataset.py.

References FastTimerService_cff.range.

Referenced by dataset.Dataset.__fileListSnippet(), dataset.Dataset.__lumiSelectionSnippet(), and dataset.Dataset.createdatasetfile_hippy().

     def __chunks( self, theList, n ):
         """ Yield successive n-sized chunks from theList.
         """
         for i in range( 0, len( theList ), n ):
             yield theList[i:i+n]
 

◆ __createSnippet()

def dataset.Dataset.__createSnippet	(	self,
		jsonPath = `None`,
		begin = `None`,
		end = `None`,
		firstRun = `None`,
		lastRun = `None`,
		repMap = `None`,
		crab = `False`,
		parent = `False`
	)

private

Definition at line 245 of file dataset.py.

References dataset.Dataset.__dummy_source_template, dataset.Dataset.__fileListSnippet(), dataset.Dataset.__lumiSelectionSnippet(), dataset.Dataset.__source_template, electrons_cff.bool, dataset.Dataset.convertTimeToRun(), and dataset.int.

Referenced by dataset.Dataset.__fileListSnippet(), dataset.Dataset.datasetSnippet(), and dataset.Dataset.dump_cff().

                          crab = False, parent = False ):
 
         if firstRun:
             firstRun = int( firstRun )
         if lastRun:
             lastRun = int( lastRun )
         if ( begin and firstRun ) or ( end and lastRun ):
             msg = ( "The Usage of "
                     + "'begin' & 'firstRun' " * int( bool( begin and
                                                            firstRun ) )
                     + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                     + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                     + "is ambigous." )
             raise AllInOneError( msg )
         if begin or end:
             ( firstRun, lastRun ) = self.convertTimeToRun(
                 begin = begin, end = end, firstRun = firstRun,
                 lastRun = lastRun )
         if ( firstRun and lastRun ) and ( firstRun > lastRun ):
             msg = ( "The lower time/runrange limit ('begin'/'firstRun') "
                     "chosen is greater than the upper time/runrange limit "
                     "('end'/'lastRun').")
             raise AllInOneError( msg )
 
         lumiSecExtend = self.__lumiSelectionSnippet(jsonPath=jsonPath, firstRun=firstRun, lastRun=lastRun)
         lumiStr = goodLumiSecStr = ""
         if lumiSecExtend:
             goodLumiSecStr = "lumiSecs = cms.untracked.VLuminosityBlockRange()\n"
             lumiStr = "                    lumisToProcess = lumiSecs,\n"
 
         files = self.__fileListSnippet(crab=crab, parent=parent, firstRun=firstRun, lastRun=lastRun, forcerunselection=False)
 
         theMap = repMap
         theMap["files"] = files
         theMap["json"] = jsonPath
         theMap["lumiStr"] = lumiStr
         theMap["goodLumiSecStr"] = goodLumiSecStr%( theMap )
         theMap["lumiSecExtend"] = lumiSecExtend
         if crab:
             dataset_snippet = self.__dummy_source_template%( theMap )
         else:
             dataset_snippet = self.__source_template%( theMap )
         return dataset_snippet
 

◆ __dateString()

def dataset.Dataset.__dateString	(	self,
		date
	)

private

Definition at line 640 of file dataset.py.

References dataset.Dataset.convertTimeToRun(), and str.

Referenced by dataset.Dataset.convertTimeToRun().

     def __dateString(self, date):
         return str(date.year) + str(date.month).zfill(2) + str(date.day).zfill(2)
 

◆ __datetime()

def dataset.Dataset.__datetime	(	self,
		stringForDas
	)

private

Definition at line 631 of file dataset.py.

References dataset.int.

Referenced by dataset.Dataset.convertTimeToRun().

     def __datetime(self, stringForDas):
         if len(stringForDas) != 8:
             raise AllInOneError(stringForDas + " is not a valid date string.\n"
                               + "DAS accepts dates in the form 'yyyymmdd'")
         year = stringForDas[:4]
         month = stringForDas[4:6]
         day = stringForDas[6:8]
         return datetime.date(int(year), int(month), int(day))
 

◆ __fileListSnippet()

def dataset.Dataset.__fileListSnippet	(	self,
		crab = `False`,
		parent = `False`,
		firstRun = `None`,
		lastRun = `None`,
		forcerunselection = `False`
	)

private

Definition at line 221 of file dataset.py.

References dataset.Dataset.__chunks(), dataset.Dataset.__createSnippet(), dataset.Dataset.__name, dataset.Dataset.fileList(), and join().

Referenced by dataset.Dataset.__createSnippet().

     def __fileListSnippet(self, crab=False, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
         if crab:
             files = ""
         else:
             splitFileList = list( self.__chunks( self.fileList(firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
             if not splitFileList:
                 raise AllInOneError("No files found for dataset {}.  Check the spelling, or maybe specify another das instance?".format(self.__name))
             fileStr = [ "',\n'".join( files ) for files in splitFileList ]
             fileStr = [ "readFiles.extend( [\n'" + files + "'\n] )" \
                         for files in fileStr ]
             files = "\n".join( fileStr )
 
             if parent:
                 splitParentFileList = list( self.__chunks( self.fileList(parent=True, firstRun=firstRun, lastRun=lastRun, forcerunselection=forcerunselection), 255 ) )
                 parentFileStr = [ "',\n'".join( parentFiles ) for parentFiles in splitParentFileList ]
                 parentFileStr = [ "secFiles.extend( [\n'" + parentFiles + "'\n] )" \
                             for parentFiles in parentFileStr ]
                 parentFiles = "\n".join( parentFileStr )
                 files += "\n\n" + parentFiles
 
         return files
 

◆ __find_ge()

def dataset.Dataset.__find_ge	(	self,
		a,
		x
	)

private

Definition at line 297 of file dataset.py.

Referenced by dataset.Dataset.convertTimeToRun().

     def __find_ge( self, a, x):
         'Find leftmost item greater than or equal to x'
         i = bisect.bisect_left( a, x )
         if i != len( a ):
             return i
         raise ValueError
 

◆ __find_lt()

def dataset.Dataset.__find_lt	(	self,
		a,
		x
	)

private

Definition at line 290 of file dataset.py.

Referenced by dataset.Dataset.convertTimeToRun().

     def __find_lt( self, a, x ):
         'Find rightmost value less than x'
         i = bisect.bisect_left( a, x )
         if i:
             return i-1
         raise ValueError
 

◆ __findInJson()

def dataset.Dataset.__findInJson	(	self,
		jsondict,
		strings
	)

private

Definition at line 304 of file dataset.py.

References dataset.Dataset.__findInJson().

Referenced by dataset.Dataset.__findInJson(), dataset.Dataset.__getData(), dataset.Dataset.__getDataType(), dataset.Dataset.__getFileInfoList(), dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.__getParentDataset(), dataset.Dataset.__getRunList(), dataset.Dataset.__lumiSelectionSnippet(), dataset.Dataset.convertTimeToRun(), and dataset.Dataset.fileList().

     def __findInJson(self, jsondict, strings):
         if isinstance(strings, str):
             strings = [ strings ]
 
         if len(strings) == 0:
             return jsondict
         if isinstance(jsondict,dict):
             if strings[0] in jsondict:
                 try:
                     return self.__findInJson(jsondict[strings[0]], strings[1:])
                 except KeyError:
                     pass
         else:
             for a in jsondict:
                 if strings[0] in a:
                     try:
                         return self.__findInJson(a[strings[0]], strings[1:])
                     except (TypeError, KeyError):  #TypeError because a could be a string and contain strings[0]
                         pass
         #if it's not found
         raise KeyError("Can't find " + strings[0])
 

◆ __getData()

def dataset.Dataset.__getData	(	self,
		dasQuery,
		dasLimit = `0`
	)

private

Definition at line 356 of file dataset.py.

References dataset.Dataset.__findInJson(), das_client.get_data(), and str.

Referenced by dataset.Dataset.__getDataType(), dataset.Dataset.__getFileInfoList(), dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.__getParentDataset(), dataset.Dataset.__getRunList(), and dataset.Dataset.convertTimeToRun().

     def __getData( self, dasQuery, dasLimit = 0 ):
         dasData = das_client.get_data(dasQuery, dasLimit)
         if isinstance(dasData, str):
             jsondict = json.loads( dasData )
         else:
             jsondict = dasData
         # Check, if the DAS query fails
         try:
             error = self.__findInJson(jsondict,["data","error"])
         except KeyError:
             error = None
         if error or self.__findInJson(jsondict,"status") != 'ok' or "data" not in jsondict:
             try:
                 jsonstr = self.__findInJson(jsondict,"reason")
             except KeyError: 
                 jsonstr = str(jsondict)
             if len(jsonstr) > 10000:
                 jsonfile = "das_query_output_%i.txt"
                 i = 0
                 while os.path.lexists(jsonfile % i):
                     i += 1
                 jsonfile = jsonfile % i
                 theFile = open( jsonfile, "w" )
                 theFile.write( jsonstr )
                 theFile.close()
                 msg = "The DAS query returned an error.  The output is very long, and has been stored in:\n" + jsonfile
             else:
                 msg = "The DAS query returned a error.  Here is the output\n" + jsonstr
             msg += "\nIt's possible that this was a server error.  If so, it may work if you try again later"
             raise AllInOneError(msg)
         return self.__findInJson(jsondict,"data")
 

◆ __getDataType()

def dataset.Dataset.__getDataType ( self )

private

Definition at line 388 of file dataset.py.

Referenced by dataset.Dataset.dataType().

     def __getDataType( self ):
         if self.__predefined:
             with open(self.__filename) as f:
                 datatype = None
                 for line in f.readlines():
                     if line.startswith("#data type: "):
                         if datatype is not None:
                             raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                         datatype = line.replace("#data type: ", "").replace("\n","")
                         return datatype
                 return "unknown"
 
         dasQuery_type = ( 'dataset dataset=%s instance=%s detail=true | grep dataset.datatype,'
                           'dataset.name'%( self.__name, self.__dasinstance ) )
         data = self.__getData( dasQuery_type )
 
         try:
             return self.__findInJson(data, ["dataset", "datatype"])
         except KeyError:
             print ("Cannot find the datatype of the dataset '%s'\n"
                    "It may not be possible to automatically find the magnetic field,\n"
                    "and you will not be able run in CRAB mode"
                    %( self.name() ))
             return "unknown"
 

◆ __getFileInfoList()

def dataset.Dataset.__getFileInfoList	(	self,
		dasLimit,
		parent = `False`
	)

private

Definition at line 561 of file dataset.py.

Referenced by dataset.Dataset.fileInfoList().

     def __getFileInfoList( self, dasLimit, parent = False ):
         if self.__predefined:
             if parent:
                 extendstring = "secFiles.extend"
             else:
                 extendstring = "readFiles.extend"
             with open(self.__fileName) as f:
                 files = []
                 copy = False
                 for line in f.readlines():
                     if "]" in line:
                         copy = False
                     if copy:
                         files.append({name: line.translate(None, "', " + '"')})
                     if extendstring in line and "[" in line and "]" not in line:
                         copy = True
             return files
 
         if parent:
             searchdataset = self.parentDataset()
         else:
             searchdataset = self.__name
         dasQuery_files = ( 'file dataset=%s instance=%s detail=true | grep file.name, file.nevents, '
                            'file.creation_time, '
                            'file.modification_time'%( searchdataset, self.__dasinstance ) )
         print("Requesting file information for '%s' from DAS..."%( searchdataset ), end=' ')
         sys.stdout.flush()
         data = self.__getData( dasQuery_files, dasLimit )
         print("Done.")
         data = [ self.__findInJson(entry,"file") for entry in data ]
         if len( data ) == 0:
             msg = ("No files are available for the dataset '%s'. This can be "
                    "due to a typo or due to a DAS problem. Please check the "
                    "spelling of the dataset and/or retry to run "
                    "'validateAlignments.py'."%( self.name() ))
             raise AllInOneError( msg )
         fileInformationList = []
         for file in data:
             fileName = 'unknown'
             try:
                 fileName = self.__findInJson(file, "name")
                 fileCreationTime = self.__findInJson(file, "creation_time")
                 fileNEvents = self.__findInJson(file, "nevents")
             except KeyError:
                 print(("DAS query gives bad output for file '%s'.  Skipping it.\n"
                        "It may work if you try again later.") % fileName)
                 fileNEvents = 0
             # select only non-empty files
             if fileNEvents == 0:
                 continue
             fileDict = { "name": fileName,
                          "creation_time": fileCreationTime,
                          "nevents": fileNEvents
                          }
             fileInformationList.append( fileDict )
         fileInformationList.sort( key=lambda info: self.__findInJson(info,"name") )
         return fileInformationList
 

◆ __getMagneticField()

def dataset.Dataset.__getMagneticField ( self )

private

Definition at line 423 of file dataset.py.

References dataset.Dataset.__cmssw, dataset.Dataset.__cmsswrelease, dataset.Dataset.__dasinstance, dataset.Dataset.__dataType, dataset.Dataset.__filename, dataset.Dataset.__findInJson(), dataset.Dataset.__getData(), dataset.Dataset.__inputMagneticField, dataset.Dataset.__name, dataset.Dataset.__predefined, print(), python.rootplot.root2matplotlib.replace(), and digitizers_cfi.strip.

Referenced by dataset.Dataset.magneticField().

     def __getMagneticField( self ):
         """Return the name of the magnetic field configuration, or "unknown".

         The sources are tried in this order:
         1. the magnetic field passed in by the user (only 3.8 and 0 are
            accepted, anything else raises ValueError),
         2. the "#magnetic field: " / "#data type: " lines of a predefined
            dataset file,
         3. the data type: "data" gives "MagneticField",
         4. the magField entry of the DAS reply,
         5. the dataset name itself.

         Raises AllInOneError if a predefined file has more than one
         "#data type: " or "#magnetic field: " line.
         """
         # Collect the MagneticField_*_cff.py configurations that are
         # available, from cmssw if the directory exists there and from
         # cmsswrelease otherwise.
         Bfieldlocation = os.path.join( self.__cmssw, "python", "Configuration", "StandardSequences" )
         if not os.path.isdir(Bfieldlocation):
             Bfieldlocation = os.path.join( self.__cmsswrelease, "python", "Configuration", "StandardSequences" )
         Bfieldlist = [ f.replace("_cff.py",'') \
                            for f in os.listdir(Bfieldlocation) \
                                if f.startswith("MagneticField_") and f.endswith("_cff.py") ]
         Bfieldlist.sort( key = lambda Bfield: -len(Bfield) ) #Put it in order of decreasing length, so that searching in the name gives the longer match

         # 1. an explicitly given field takes precedence over everything else
         if self.__inputMagneticField is not None:
             if self.__inputMagneticField == 3.8:
                 return "MagneticField"
             elif self.__inputMagneticField == 0:
                 return "MagneticField_0T"
             else:
                 raise ValueError("Unknown input magnetic field {}".format(self.__inputMagneticField))

         # 2. header lines of the predefined dataset file
         if self.__predefined:
             with open(self.__filename) as f:
                 datatype = None
                 Bfield = None
                 for line in f.readlines():
                     if line.startswith("#data type: "):
                         if datatype is not None:
                             raise AllInOneError(self.__filename + " has multiple 'data type' lines.")
                         datatype = line.replace("#data type: ", "").replace("\n","")
                         # drop a trailing "# ..." comment on the same line
                         datatype = datatype.split("#")[0].strip()
                     if line.startswith("#magnetic field: "):
                         if Bfield is not None:
                             raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                         Bfield = line.replace("#magnetic field: ", "").replace("\n","")
                         Bfield = Bfield.split("#")[0].strip()
                 if Bfield is not None:
                     # only the part before the first comma is the configuration name
                     Bfield = Bfield.split(",")[0]
                     if Bfield in Bfieldlist or Bfield == "unknown":
                         return Bfield
                     else:
                         print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                         print("Using Bfield='unknown' - this will revert to the default")
                         return "unknown"
                 elif datatype == "data":
                     return "MagneticField"           #this should be in the "#magnetic field" line, but for safety in case it got messed up
                 else:
                     return "unknown"

         # 3. data always uses the default configuration
         if self.__dataType == "data":
             return "MagneticField"

         # 4. DAS
         #try to find the magnetic field from DAS
         #it seems to be there for the newer (7X) MC samples, except cosmics
         dasQuery_B = ('dataset dataset=%s instance=%s'%(self.__name, self.__dasinstance))
         data = self.__getData( dasQuery_B )

         try:
             Bfield = self.__findInJson(data, ["dataset", "mcm", "sequences", "magField"])
             if Bfield in Bfieldlist:
                 return Bfield
             elif Bfield == "38T" or Bfield == "38T_PostLS1":
                 return "MagneticField"
             elif "MagneticField_" + Bfield in Bfieldlist:
                 return "MagneticField_" + Bfield
             elif Bfield == "":
                 # an empty entry carries no information: fall through to the name-based guess
                 pass
             else:
                 print("Your dataset has magnetic field '%s', which does not exist in your CMSSW version!" % Bfield)
                 print("Using Bfield='unknown' - this will revert to the default magnetic field")
                 return "unknown"
         except KeyError:
             pass

         # 5. dataset name
         for possibleB in Bfieldlist:
             if (possibleB != "MagneticField"
               and possibleB.replace("MagneticField_","") in self.__name.replace("TkAlCosmics0T", "")):
                 #final attempt - try to identify the dataset from the name
                 #all cosmics dataset names contain "TkAlCosmics0T"
                 if possibleB == "MagneticField_38T" or possibleB == "MagneticField_38T_PostLS1":
                     return "MagneticField"
                 return possibleB

         return "unknown"
 

◆ __getMagneticFieldForRun()

def dataset.Dataset.__getMagneticFieldForRun	(	self,
		run = `-1`,
		tolerance = `0.5`
	)

private

For MC, this returns the same as the previous function.
   For data, it gets the magnetic field from the runs.  This is important for
   deciding which template to use for offlinevalidation

Definition at line 504 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.dump_cff(), and dataset.Dataset.magneticFieldForRun().

     def __getMagneticFieldForRun( self, run = -1, tolerance = 0.5 ):
         """For MC, this returns the same as the previous function.
            For data, it gets the magnetic field from the runs.  This is important for
            deciding which template to use for offlinevalidation
         """
         if self.__dataType == "mc" and self.__magneticField == "MagneticField":
             return 3.8                                        #For 3.8T MC the default MagneticField is used
         if self.__inputMagneticField is not None:
             return self.__inputMagneticField
         if "T" in self.__magneticField:
             Bfield = self.__magneticField.split("T")[0].replace("MagneticField_","")
             try:
                 return float(Bfield) / 10.0                       #e.g. 38T and 38T_PostLS1 both return 3.8
             except ValueError:
                 pass
         if self.__predefined:
             with open(self.__filename) as f:
                 Bfield = None
                 for line in f.readlines():
                     if line.startswith("#magnetic field: ") and "," in line:
                         if Bfield is not None:
                             raise AllInOneError(self.__filename + " has multiple 'magnetic field' lines.")
                         return float(line.replace("#magnetic field: ", "").split(",")[1].split("#")[0].strip())
 
         if run > 0:
             dasQuery = ('run=%s instance=%s detail=true'%(run, self.__dasinstance))   #for data
             data = self.__getData(dasQuery)
             try:
                 return self.__findInJson(data, ["run","bfield"])
             except KeyError:
                 return "unknown Can't get the magnetic field for run %s from DAS" % run
 
         #run < 0 - find B field for the first and last runs, and make sure they're compatible
         #  (to within tolerance)
         #NOT FOOLPROOF!  The magnetic field might go up and then down, or vice versa
         if self.__firstusedrun is None or self.__lastusedrun is None:
             return "unknown Can't get the exact magnetic field for the dataset until data has been retrieved from DAS."
         firstrunB = self.__getMagneticFieldForRun(self.__firstusedrun)
         lastrunB = self.__getMagneticFieldForRun(self.__lastusedrun)
         try:
             if abs(firstrunB - lastrunB) <= tolerance:
                 return .5*(firstrunB + lastrunB)
             print(firstrunB, lastrunB, tolerance)
             return ("unknown The beginning and end of your run range for %s\n"
                     "have different magnetic fields (%s, %s)!\n"
                     "Try limiting the run range using firstRun, lastRun, begin, end, or JSON,\n"
                     "or increasing the tolerance (in dataset.py) from %s.") % (self.__name, firstrunB, lastrunB, tolerance)
         except TypeError:
             try:
                 if "unknown" in firstrunB:
                     return firstrunB
                 else:
                     return lastrunB
             except TypeError:
                 return lastrunB
 

◆ __getParentDataset()

def dataset.Dataset.__getParentDataset ( self )

private

Definition at line 413 of file dataset.py.

References dataset.Dataset.__dasinstance, dataset.Dataset.__findInJson(), dataset.Dataset.__getData(), dataset.Dataset.__name, and str.

Referenced by dataset.Dataset.parentDataset().

     def __getParentDataset( self ):
         dasQuery = "parent dataset=" + self.__name + " instance="+self.__dasinstance
         data = self.__getData( dasQuery )
         try:
             return self.__findInJson(data, ["parent", "name"])
         except KeyError:
             raise AllInOneError("Cannot find the parent of the dataset '" + self.__name + "'\n"
                                 "Here is the DAS output:\n" + str(jsondict) +
                                 "\nIt's possible that this was a server error.  If so, it may work if you try again later")
 

◆ __getRunList()

def dataset.Dataset.__getRunList ( self )

private

Definition at line 620 of file dataset.py.

References dataset.Dataset.__dasinstance, dataset.Dataset.__findInJson(), dataset.Dataset.__getData(), dataset.Dataset.__name, and print().

Referenced by dataset.Dataset.__lumiSelectionSnippet(), dataset.Dataset.convertTimeToRun(), and dataset.Dataset.runList().

     def __getRunList( self ):
         dasQuery_runs = ( 'run dataset=%s instance=%s | grep run.run_number,'
                           'run.creation_time'%( self.__name, self.__dasinstance ) )
         print("Requesting run information for '%s' from DAS..."%( self.__name ), end=' ')
         sys.stdout.flush()
         data = self.__getData( dasQuery_runs )
         print("Done.")
         data = [ self.__findInJson(entry,"run") for entry in data ]
         data.sort( key = lambda run: self.__findInJson(run, "run_number") )
         return data
 

◆ __lumiSelectionSnippet()

def dataset.Dataset.__lumiSelectionSnippet	(	self,
		jsonPath = `None`,
		firstRun = `None`,
		lastRun = `None`
	)

private

Definition at line 125 of file dataset.py.

References dataset.Dataset.__chunks(), dataset.Dataset.__findInJson(), dataset.Dataset.__firstusedrun, dataset.Dataset.__getRunList(), dataset.Dataset.__inputMagneticField, dataset.Dataset.__lastusedrun, dataset.Dataset.getForceRunRangeFunction(), dataset.int, join(), SiStripPI.max, SiStripPI.min, print(), python.rootplot.root2matplotlib.replace(), submitPVValidationJobs.split(), and str.

Referenced by dataset.Dataset.__createSnippet().

     def __lumiSelectionSnippet( self, jsonPath = None, firstRun = None, lastRun = None ):
         lumiSecExtend = ""
         if firstRun or lastRun or jsonPath:
             if not jsonPath:
                 selectedRunList = self.__getRunList()
                 if firstRun:
                     selectedRunList = [ run for run in selectedRunList \
                                         if self.__findInJson(run, "run_number") >= firstRun ]
                 if lastRun:
                     selectedRunList = [ run for run in selectedRunList \
                                         if self.__findInJson(run, "run_number") <= lastRun ]
                 lumiList = [ str( self.__findInJson(run, "run_number") ) + ":1-" \
                              + str( self.__findInJson(run, "run_number") ) + ":max" \
                              for run in selectedRunList ]
                 splitLumiList = list( self.__chunks( lumiList, 255 ) )
             else:
                 theLumiList = None
                 try:
                     theLumiList = LumiList ( filename = jsonPath )
                 except ValueError:
                     pass
 
                 if theLumiList is not None:
                     allRuns = theLumiList.getRuns()
                     runsToRemove = []
                     for run in allRuns:
                         if firstRun and int( run ) < firstRun:
                             runsToRemove.append( run )
                         if lastRun and int( run ) > lastRun:
                             runsToRemove.append( run )
                     theLumiList.removeRuns( runsToRemove )
                     splitLumiList = list( self.__chunks(
                         theLumiList.getCMSSWString().split(','), 255 ) )
                     if not (splitLumiList and splitLumiList[0] and splitLumiList[0][0]):
                         splitLumiList = None
                 else:
                     with open(jsonPath) as f:
                         jsoncontents = f.read()
                         if "process.source.lumisToProcess" in jsoncontents:
                             msg = "%s is not a json file, but it seems to be a CMSSW lumi selection cff snippet.  Trying to use it" % jsonPath
                             if firstRun or lastRun:
                                 msg += ("\n  (after applying firstRun and/or lastRun)")
                             msg += ".\nPlease note that, depending on the format of this file, it may not work as expected."
                             msg += "\nCheck your config file to make sure that it worked properly."
                             print(msg)
 
                             runlist = self.__getRunList()
                             if firstRun or lastRun:
                                 self.__firstusedrun = -1
                                 self.__lastusedrun = -1
                                 jsoncontents = re.sub(r"\d+:(\d+|max)(-\d+:(\d+|max))?", self.getForceRunRangeFunction(firstRun, lastRun), jsoncontents)
                                 jsoncontents = (jsoncontents.replace("'',\n","").replace("''\n","")
                                                             .replace('"",\n','').replace('""\n',''))
                                 self.__firstusedrun = max(self.__firstusedrun, int(self.__findInJson(runlist[0],"run_number")))
                                 self.__lastusedrun = min(self.__lastusedrun, int(self.__findInJson(runlist[-1],"run_number")))
                                 if self.__lastusedrun < self.__firstusedrun:
                                     jsoncontents = None
                             else:
                                 self.__firstusedrun = int(self.__findInJson(runlist[0],"run_number"))
                                 self.__lastusedrun = int(self.__findInJson(runlist[-1],"run_number"))
                             lumiSecExtend = jsoncontents
                             splitLumiList = None
                         else:
                             raise AllInOneError("%s is not a valid json file!" % jsonPath)
 
             if splitLumiList and splitLumiList[0] and splitLumiList[0][0]:
                 lumiSecStr = [ "',\n'".join( lumis ) \
                                for lumis in splitLumiList ]
                 lumiSecStr = [ "lumiSecs.extend( [\n'" + lumis + "'\n] )" \
                                for lumis in lumiSecStr ]
                 lumiSecExtend = "\n".join( lumiSecStr )
                 runlist = self.__getRunList()
                 self.__firstusedrun = max(int(splitLumiList[0][0].split(":")[0]), int(self.__findInJson(runlist[0],"run_number")))
                 self.__lastusedrun = min(int(splitLumiList[-1][-1].split(":")[0]), int(self.__findInJson(runlist[-1],"run_number")))
             elif lumiSecExtend:
                 pass
             else:
                 msg = "You are trying to run a validation without any runs!  Check that:"
                 if firstRun or lastRun:
                     msg += "\n - firstRun/begin and lastRun/end are correct for this dataset, and there are runs in between containing data"
                 if jsonPath:
                     msg += "\n - your JSON file is correct for this dataset, and the runs contain data"
                 if (firstRun or lastRun) and jsonPath:
                     msg += "\n - firstRun/begin and lastRun/end are consistent with your JSON file"
                 raise AllInOneError(msg)
 
         else:
             if self.__inputMagneticField is not None:
                 pass  #never need self.__firstusedrun or self.__lastusedrun
             else:
                 runlist = self.__getRunList()
                 self.__firstusedrun = int(self.__findInJson(self.__getRunList()[0],"run_number"))
                 self.__lastusedrun = int(self.__findInJson(self.__getRunList()[-1],"run_number"))
 
         return lumiSecExtend
 

◆ buildListOfBadFiles()

def dataset.Dataset.buildListOfBadFiles ( self )

fills the list of bad files from the IntegrityCheck log.

When the integrity check file is not available,
files are considered as good.

Definition at line 278 of file dataset.py.

     def buildListOfBadFiles(self):
         '''fills the list of bad files from the IntegrityCheck log.
 
         When the integrity check file is not available,
         files are considered as good.'''
         mask = "IntegrityCheck"
            
         self.bad_files = {}
         self.good_files = []
 
         file_mask = castortools.matchingFiles(self.castorDir, '^%s_.*\.txt$' % mask)
         if file_mask:
             # here to avoid circular dependency
             from .edmIntegrityCheck import PublishToFileSystem
             p = PublishToFileSystem(mask)
             report = p.get(self.castorDir)
             if report is not None and report:
                 self.maskExists = True
                 self.report = report
                 dup = report.get('ValidDuplicates',{})
                 for name, status in report['Files'].items():
                     # print name, status
                     if not status[0]:
                         self.bad_files[name] = 'MarkedBad'
                     elif name in dup:
                         self.bad_files[name] = 'ValidDup'
                     else:
                         self.good_files.append( name )
         else:
             raise IntegrityCheckError( "ERROR: IntegrityCheck log file IntegrityCheck_XXXXXXXXXX.txt not found" )
 

◆ buildListOfFiles()

def dataset.Dataset.buildListOfFiles	(	self,
		pattern = `'.*root'`
	)

fills list of files, taking all root files matching the pattern in the castor dir

Definition at line 274 of file dataset.py.

     def buildListOfFiles(self, pattern='.*root'):
         '''fills list of files, taking all root files matching the pattern in the castor dir'''
         self.files = castortools.matchingFiles( self.castorDir, pattern )
                              

◆ convertTimeToRun()

def dataset.Dataset.convertTimeToRun	(	self,
		begin = `None`,
		end = `None`,
		firstRun = `None`,
		lastRun = `None`,
		shortTuple = `True`
	)

Definition at line 645 of file dataset.py.

References dataset.Dataset.__dasinstance, dataset.Dataset.__dateString(), dataset.Dataset.__datetime(), dataset.Dataset.__find_ge(), dataset.Dataset.__find_lt(), dataset.Dataset.__findInJson(), dataset.Dataset.__getData(), dataset.Dataset.__getRunList(), dataset.Dataset.__name, electrons_cff.bool, and dataset.int.

Referenced by dataset.Dataset.__createSnippet(), and dataset.Dataset.__dateString().

                           shortTuple = True ):
         # Convert the optional 'begin'/'end' dates into the run numbers
         # firstRun/lastRun of this dataset, using DAS "run date between" queries.
         # Returns (firstRun, lastRun) when shortTuple is true, otherwise
         # (begin, end, firstRun, lastRun).  Raises AllInOneError when a bound is
         # given both as a date and as a run, or when no matching run is found.
         if ( begin and firstRun ) or ( end and lastRun ):
             # Only the clauses that apply end up in the message: multiplying a
             # string by int(bool(...)) keeps it (1) or drops it (0).
             msg = ( "The Usage of "
                     + "'begin' & 'firstRun' " * int( bool( begin and
                                                            firstRun ) )
                     + "and " * int( bool( ( begin and firstRun ) and
                                          ( end and lastRun ) ) )
                     + "'end' & 'lastRun' " * int( bool( end and lastRun ) )
                     + "is ambigous." )
             raise AllInOneError( msg )

         if begin or end:
             # Run numbers of this dataset, in the order returned by __getRunList().
             runList = [ self.__findInJson(run, "run_number") for run in self.__getRunList() ]

         if begin:
             lastdate = begin
             for delta in [ 1, 5, 10, 20, 30 ]:                       #try searching for about 2 months after begin
                 # Each pass queries the next window: it starts where the previous
                 # one ended and is 'delta' days long (66 days in total).
                 firstdate = lastdate
                 lastdate = self.__dateString(self.__datetime(firstdate) + datetime.timedelta(delta))
                 dasQuery_begin = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                 begindata = self.__getData(dasQuery_begin)
                 if len(begindata) > 0:
                     begindata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                     try:
                         # NOTE(review): __find_ge presumably returns the index of the first
                         # entry of runList >= the given run and raises ValueError if there
                         # is none -- confirm against its definition.
                         runIndex = self.__find_ge( runList, self.__findInJson(begindata[0], ["run", "run_number"]))
                     except ValueError:
                         msg = ( "Your 'begin' is after the creation time of the last "
                                 "run in the dataset\n'%s'"%( self.__name ) )
                         raise AllInOneError( msg )
                     firstRun = runList[runIndex]
                     # Clearing 'begin' marks the conversion as done.
                     begin = None
                     break

         # 'begin' still being set means that none of the windows contained a run.
         if begin:
             raise AllInOneError("No runs within a reasonable time interval after your 'begin'."
                                 "Try using a 'begin' that has runs soon after it (within 2 months at most)")

         if end:
             firstdate = end
             for delta in [ 1, 5, 10, 20, 30 ]:                       #try searching for about 2 months before end
                 # Mirror image of the 'begin' search: the windows move backwards in time.
                 lastdate = firstdate
                 firstdate = self.__dateString(self.__datetime(lastdate) - datetime.timedelta(delta))
                 dasQuery_end = "run date between[%s,%s] instance=%s" % (firstdate, lastdate, self.__dasinstance)
                 enddata = self.__getData(dasQuery_end)
                 if len(enddata) > 0:
                     enddata.sort(key = lambda run: self.__findInJson(run, ["run", "run_number"]))
                     try:
                         # NOTE(review): __find_lt presumably returns the index of the last
                         # entry of runList below the given run -- confirm.
                         runIndex = self.__find_lt( runList, self.__findInJson(enddata[-1], ["run", "run_number"]))
                     except ValueError:
                         msg = ( "Your 'end' is before the creation time of the first "
                                 "run in the dataset\n'%s'"%( self.__name ) )
                         raise AllInOneError( msg )
                     lastRun = runList[runIndex]
                     # Clearing 'end' marks the conversion as done.
                     end = None
                     break

         # 'end' still being set means that none of the windows contained a run.
         if end:
             raise AllInOneError("No runs within a reasonable time interval before your 'end'."
                                 "Try using an 'end' that has runs soon before it (within 2 months at most)")

         if shortTuple:
             return firstRun, lastRun
         else:
             # At this point begin and end are None whenever they were converted above.
             return begin, end, firstRun, lastRun
 

◆ createdatasetfile_hippy()

def dataset.Dataset.createdatasetfile_hippy	(	self,
		filename,
		filesperjob,
		firstrun,
		lastrun
	)

Definition at line 852 of file dataset.py.

References dataset.Dataset.__chunks(), dataset.Dataset.fileList(), and join().

     def createdatasetfile_hippy(self, filename, filesperjob, firstrun, lastrun):
         with open(filename, "w") as f:
             for job in self.__chunks(self.fileList(firstRun=firstrun, lastRun=lastrun, forcerunselection=True), filesperjob):
                 f.write(",".join("'{}'".format(file) for file in job)+"\n")
 

◆ datasetSnippet()

def dataset.Dataset.datasetSnippet	(	self,
		jsonPath = `None`,
		begin = `None`,
		end = `None`,
		firstRun = `None`,
		lastRun = `None`,
		crab = `False`,
		parent = `False`
	)

Definition at line 729 of file dataset.py.

References dataset.Dataset.__createSnippet(), dataset.Dataset.__filename, dataset.Dataset.__name, dataset.Dataset.__official, dataset.Dataset.__origName, dataset.Dataset.__predefined, dataset.Dataset.dump_cff(), and print().

Referenced by dataset.Dataset.parentDataset().

                         firstRun = None, lastRun = None, crab = False, parent = False ):
         # Build the configuration text that loads this dataset: a process.load
         # of the stored _cff for predefined datasets, otherwise the result of
         # __createSnippet() for the requested selection.
         # Normalise every falsy selection value (e.g. "" or 0) to None.
         if not firstRun: firstRun = None
         if not lastRun: lastRun = None
         if not begin: begin = None
         if not end: end = None
         if self.__predefined and (jsonPath or begin or end or firstRun or lastRun):
             msg = ( "The parameters 'JSON', 'begin', 'end', 'firstRun', and 'lastRun' "
                     "only work for official datasets, not predefined _cff.py files" )
             raise AllInOneError( msg )
         if self.__predefined and parent:
             # Secondary (parent) files are required: check that the stored cff provides them.
             with open(self.__filename) as f:
                 if "secFiles.extend" not in f.read():
                     msg = ("The predefined dataset '%s' does not contain secondary files, "
                            "which your validation requires!") % self.__name
                     if self.__official:
                         # Switch back to the original name and stop treating the
                         # dataset as predefined, so the code below builds the snippet.
                         self.__name = self.__origName
                         self.__predefined = False
                         print(msg)
                         print ("Retreiving the files from DAS.  You will be asked if you want "
                                "to overwrite the old dataset.\n"
                                "It will still be compatible with validations that don't need secondary files.")
                     else:
                         raise AllInOneError(msg)

         if self.__predefined:
             # The .oO[...]Oo. markers are left in the text as they are.
             snippet = ("process.load(\"Alignment.OfflineValidation.%s_cff\")\n"
                        "process.maxEvents = cms.untracked.PSet(\n"
                        "    input = cms.untracked.int32(int(.oO[nEvents]Oo. / .oO[parallelJobs]Oo.))\n"
                        ")\n"
                        "process.source.skipEvents=cms.untracked.uint32(int(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.))"
                        %(self.__name))
             if not parent:
                 # The stored cff lists secondary files that are not wanted here: reset them.
                 with open(self.__filename) as f:
                     if "secFiles.extend" in f.read():
                         snippet += "\nprocess.source.secondaryFileNames = cms.untracked.vstring()"
             return snippet
         # Replacement map handed to __createSnippet() for the in-config variant.
         theMap = { "process": "process.",
                    "tab": " " * len( "process." ),
                    "nEvents": ".oO[nEvents]Oo. / .oO[parallelJobs]Oo.",
                    "skipEventsString": "process.source.skipEvents=cms.untracked.uint32(int(.oO[nIndex]Oo.*.oO[nEvents]Oo./.oO[parallelJobs]Oo.))\n",
                    "importCms": "",
                    "header": ""
                    }
         datasetSnippet = self.__createSnippet( jsonPath = jsonPath,
                                                begin = begin,
                                                end = end,
                                                firstRun = firstRun,
                                                lastRun = lastRun,
                                                repMap = theMap,
                                                crab = crab,
                                                parent = parent )
         # NOTE(review): begin/end/firstRun/lastRun were normalised to None at the
         # top, so the comparisons with "" below can never all hold and dump_cff()
         # is never reached from here -- confirm whether that is intended.
         if jsonPath == "" and begin == "" and end == "" and firstRun == "" and lastRun == "":
             try:
                 self.dump_cff(parent = parent)
             except AllInOneError as e:
                 # Storing the cff is best effort; the snippet is returned regardless.
                 print("Can't store the dataset as a cff:")
                 print(e)
                 print("This may be inconvenient in the future, but will not cause a problem for this validation.")
         return datasetSnippet
 

◆ dataType()

def dataset.Dataset.dataType ( self )

Definition at line 710 of file dataset.py.

References dataset.Dataset.__dataType, and dataset.Dataset.__getDataType().

     def dataType( self ):
         if not self.__dataType:
             self.__dataType = self.__getDataType()
         return self.__dataType
 

◆ dump_cff()

def dataset.Dataset.dump_cff	(	self,
		outName = `None`,
		jsonPath = `None`,
		begin = `None`,
		end = `None`,
		firstRun = `None`,
		lastRun = `None`,
		parent = `False`
	)

Definition at line 791 of file dataset.py.

References dataset.Dataset.__cmssw, dataset.Dataset.__createSnippet(), dataset.Dataset.__dataType, dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.__magneticField, dataset.Dataset.__name, print(), python.rootplot.root2matplotlib.replace(), submitPVValidationJobs.split(), str, and digitizers_cfi.strip.

Referenced by dataset.Dataset.datasetSnippet().

                   end = None, firstRun = None, lastRun = None, parent = False ):
         # Write this dataset as <outName>_cff.py into the python directory of
         # Alignment/OfflineValidation below self.__cmssw.  Asks on the console
         # before overwriting an existing file and returns None in every case.
         # Raises AllInOneError when the package is not checked out there.
         if outName == None:
             # Default file name: "Dataset" plus the dataset name with "/" replaced by "_".
             outName = "Dataset" + self.__name.replace("/", "_")
         packageName = os.path.join( "Alignment", "OfflineValidation" )
         if not os.path.exists( os.path.join(
             self.__cmssw, "src", packageName ) ):
             msg = ("You try to store the predefined dataset'%s'.\n"
                    "For that you need to check out the package '%s' to your "
                    "private relase area in\n"%( outName, packageName )
                    + self.__cmssw )
             raise AllInOneError( msg )
         # Replacement map handed to __createSnippet() for the stand-alone cff variant.
         theMap = { "process": "",
                    "tab": "",
                    "nEvents": str( -1 ),
                    "skipEventsString": "",
                    "importCms": "import FWCore.ParameterSet.Config as cms\n",
                    "header": "#Do not delete or (unless you know what you're doing) change these comments\n"
                              "#%(name)s\n"
                              "#data type: %(dataType)s\n"
                              "#magnetic field: .oO[magneticField]Oo.\n"    #put in magnetic field later
                              %{"name": self.__name,                        #need to create the snippet before getting the magnetic field
                                "dataType": self.__dataType}                #so that we know the first and last runs
                    }
         dataset_cff = self.__createSnippet( jsonPath = jsonPath,
                                             begin = begin,
                                             end = end,
                                             firstRun = firstRun,
                                             lastRun = lastRun,
                                             repMap = theMap,
                                             parent = parent)
         magneticField = self.__magneticField
         if magneticField == "MagneticField":
             # Append the value from __getMagneticFieldForRun() (text before any "#",
             # newlines flattened) plus an explanatory comment for the header line.
             magneticField = "%s, %s     #%s" % (magneticField,
                                                 str(self.__getMagneticFieldForRun()).replace("\n"," ").split("#")[0].strip(),
                                                 "Use MagneticField_cff.py; the number is for determining which track selection to use."
                                                )
         dataset_cff = dataset_cff.replace(".oO[magneticField]Oo.",magneticField)
         filePath = os.path.join( self.__cmssw, "src", packageName,
                                  "python", outName + "_cff.py" )
         if os.path.exists( filePath ):
             existMsg = "The predefined dataset '%s' already exists.\n"%( outName )
             askString = "Do you want to overwrite it? [y/n]\n"
             inputQuery = existMsg + askString
             # Keep asking until the answer is y or n; only the first prompt
             # repeats the "already exists" message.
             while True:
                 # NOTE(review): raw_input is a Python 2 builtin; under Python 3 this
                 # needs input() or a compatibility import -- confirm what the file provides.
                 userInput = raw_input( inputQuery ).lower()
                 if userInput == "y":
                     break
                 elif userInput == "n":
                     return
                 else:
                     inputQuery = askString
         print ( "The predefined dataset '%s' will be stored in the file\n"
                 %( outName )
                 + filePath +
                 "\nFor future use you have to do 'scram b'." )
         print()
         # NOTE(review): the file is closed manually; a 'with' block would also
         # close it if write() raises.
         theFile = open( filePath, "w" )
         theFile.write( dataset_cff )
         theFile.close()
         return
 

◆ extractFileSizes()

def dataset.Dataset.extractFileSizes ( self )

Get the file size for each file, from the eos ls -l command.

Definition at line 309 of file dataset.py.

References dataset.EOSDataset.castorDir, and dataset.Dataset.castorDir.

     def extractFileSizes(self):
         '''Get the file size for each file, from the eos ls -l command.'''
         # EOS command does not work in tier3
         lsout = castortools.runXRDCommand(self.castorDir,'dirlist')[0]
         lsout = lsout.split('\n')
         self.filesAndSizes = {}
         for entry in lsout:
             values = entry.split()
             if( len(values) != 5):
                 continue
             # using full abs path as a key.
             file = '/'.join([self.lfnDir, values[4].split("/")[-1]])
             size = values[1]
             self.filesAndSizes[file] = size 
          

◆ fileInfoList()

def dataset.Dataset.fileInfoList	(	self,
		parent = `False`
	)

Definition at line 914 of file dataset.py.

References dataset.Dataset.__dasLimit, and dataset.Dataset.__getFileInfoList().

Referenced by dataset.Dataset.fileList().

     def fileInfoList( self, parent = False ):
         return self.__getFileInfoList( self.__dasLimit, parent )
 

◆ fileList()

def dataset.Dataset.fileList	(	self,
		parent = `False`,
		firstRun = `None`,
		lastRun = `None`,
		forcerunselection = `False`
	)

Definition at line 885 of file dataset.py.

References dataset.Dataset.__findInJson(), dataset.Dataset.fileInfoList(), dqmMemoryStats.float, dataset.Dataset.getrunnumberfromfilename(), and print().

Referenced by dataset.Dataset.__fileListSnippet(), and dataset.Dataset.createdatasetfile_hippy().

     def fileList(self, parent=False, firstRun=None, lastRun=None, forcerunselection=False):
         fileList = [ self.__findInJson(fileInfo,"name")
                      for fileInfo in self.fileInfoList(parent) ]
 
         if firstRun or lastRun:
             if not firstRun: firstRun = -1
             if not lastRun: lastRun = float('infinity')
             unknownfilenames, reasons = [], set()
             for filename in fileList[:]:
                 try:
                     if not firstRun <= self.getrunnumberfromfilename(filename) <= lastRun:
                         fileList.remove(filename)
                 except AllInOneError as e:
                     if forcerunselection: raise
                     unknownfilenames.append(e.message.split("\n")[1])
                     reasons         .add   (e.message.split("\n")[2])
             if reasons:
                 if len(unknownfilenames) == len(fileList):
                     print("Could not figure out the run numbers of any of the filenames for the following reason(s):")
                 else:
                     print("Could not figure out the run numbers of the following filenames:")
                     for filename in unknownfilenames:
                         print("    "+filename)
                     print("for the following reason(s):")
                 for reason in reasons:
                     print("    "+reason)
                 print("Using the files anyway.  The runs will be filtered at the CMSSW level.")
         return fileList
 

◆ forcerunrange()

def dataset.Dataset.forcerunrange	(	self,
		firstRun,
		lastRun,
		s
	)

s must be in the format run1:lum1-run2:lum2

Definition at line 326 of file dataset.py.

References dataset.Dataset.__firstusedrun, dataset.Dataset.__lastusedrun, dataset.int, and submitPVValidationJobs.split().

Referenced by dataset.Dataset.getForceRunRangeFunction().

     def forcerunrange(self, firstRun, lastRun, s):
         """s must be in the format run1:lum1-run2:lum2"""
         s = s.group()
         run1 = s.split("-")[0].split(":")[0]
         lum1 = s.split("-")[0].split(":")[1]
         try:
             run2 = s.split("-")[1].split(":")[0]
             lum2 = s.split("-")[1].split(":")[1]
         except IndexError:
             run2 = run1
             lum2 = lum1
         if int(run2) < firstRun or int(run1) > lastRun:
             return ""
         if int(run1) < firstRun or firstRun < 0:
             run1 = firstRun
             lum1 = 1
         if int(run2) > lastRun:
             run2 = lastRun
             lum2 = "max"
         if int(run1) < self.__firstusedrun or self.__firstusedrun < 0:
             self.__firstusedrun = int(run1)
         if int(run2) > self.__lastusedrun:
             self.__lastusedrun = int(run2)
         return "%s:%s-%s:%s" % (run1, lum1, run2, lum2)
 

◆ getfiles()

def dataset.Dataset.getfiles	(	self,
		usecache
	)

Definition at line 211 of file dataset.py.

References dataset.Dataset.dasinstance, dataset.dasquery(), dataset.Dataset.datasetname, dataset.Dataset.filenamebase, dataset.findinjson(), dataset.int, and print().

   def getfiles(self, usecache):
     filename = os.path.join(os.environ["CMSSW_BASE"], "src", "Alignment", "CommonAlignment", "data", self.filenamebase+".csv")
     if not usecache:
       try:
         os.remove(filename)
       except OSError as e:
         if os.path.exists(filename):
           raise
 
     result = []
     try:
       with open(filename) as f:
         for row in csv.DictReader(f):
           result.append(DataFile(**row))
         return result
     except IOError:
       pass
 
     query = "file dataset={} instance={} detail=true | grep file.name, file.nevents".format(self.datasetname, self.dasinstance)
     dasoutput = dasquery(query)
     if not dasoutput:
       raise DatasetError("No files are available for the dataset '{}'. This can be "
                          "due to a typo or due to a DAS problem. Please check the "
                          "spelling of the dataset and/or try again.".format(datasetname))
     result = [DataFile(findinjson(_, "file", "name"), findinjson(_, "file", "nevents")) for _ in dasoutput if int(findinjson(_, "file", "nevents"))]
     try:
       with open(filename, "w") as f:
         writer = csv.DictWriter(f, ("filename", "nevents", "runs"))
         writer.writeheader()
         for datafile in result:
           writer.writerow(datafile.getdict())
     except Exception as e:
       print("Couldn't write the dataset csv file:\n\n{}".format(e))
     return result
 

◆ getForceRunRangeFunction()

def dataset.Dataset.getForceRunRangeFunction	(	self,
		firstRun,
		lastRun
	)

Definition at line 351 of file dataset.py.

References dataset.Dataset.forcerunrange().

Referenced by dataset.Dataset.__lumiSelectionSnippet().

     def getForceRunRangeFunction(self, firstRun, lastRun):
         def forcerunrangefunction(s):
             return self.forcerunrange(firstRun, lastRun, s)
         return forcerunrangefunction
 

◆ getPrimaryDatasetEntries()

def dataset.Dataset.getPrimaryDatasetEntries ( self )

Definition at line 329 of file dataset.py.

References dataset.int, runall.testit.report, WorkFlowRunner.WorkFlowRunner.report, ALIUtils.report, and dataset.BaseDataset.report.

     def getPrimaryDatasetEntries(self):
         if self.report is not None and self.report:
             return int(self.report.get('PrimaryDatasetEntries',-1))
         return -1
 
 

◆ getrunnumberfromfilename()

def dataset.Dataset.getrunnumberfromfilename ( filename )

static

Definition at line 858 of file dataset.py.

References python.cmstools.all(), dataset.int, and join().

Referenced by dataset.Dataset.fileList().

     def getrunnumberfromfilename(filename):
         parts = filename.split("/")
         result = error = None
         if parts[0] != "" or parts[1] != "store":
             error = "does not start with /store"
         elif parts[2] in ["mc", "relval"]:
             result = 1
         elif not parts[-1].endswith(".root"):
             error = "does not end with something.root"
         elif len(parts) != 12:
             error = "should be exactly 11 slashes counting the first one"
         else:
             runnumberparts = parts[-5:-2]
             if not all(len(part)==3 for part in runnumberparts):
                 error = "the 3 directories {} do not have length 3 each".format("/".join(runnumberparts))
             try:
                 result = int("".join(runnumberparts))
             except ValueError:
                 error = "the 3 directories {} do not form an integer".format("/".join(runnumberparts))
 
         if error:
             error = "could not figure out which run number this file is from:\n{}\n{}".format(filename, error)
             raise AllInOneError(error)
 
         return result
 

◆ headercomment()

def dataset.Dataset.headercomment ( self )

Definition at line 247 of file dataset.py.

References dataset.Dataset.datasetname.

   def headercomment(self):
     return self.datasetname
 

◆ magneticField()

def dataset.Dataset.magneticField ( self )

Definition at line 715 of file dataset.py.

References dataset.Dataset.__getMagneticField(), and dataset.Dataset.__magneticField.

     def magneticField( self ):
         if not self.__magneticField:
             self.__magneticField = self.__getMagneticField()
         return self.__magneticField
 

◆ magneticFieldForRun()

def dataset.Dataset.magneticFieldForRun	(	self,
		run = `-1`
	)

Definition at line 720 of file dataset.py.

References dataset.Dataset.__getMagneticFieldForRun().

     def magneticFieldForRun( self, run = -1 ):
         return self.__getMagneticFieldForRun(run)
 

◆ name()

def dataset.Dataset.name ( self )

Definition at line 917 of file dataset.py.

References dataset.Dataset.__name.

Referenced by config.CFG.__str__(), validation.Sample.digest(), and VIDSelectorBase.VIDSelectorBase.initialize().

     def name( self ):
         return self.__name
 

◆ parentDataset()

def dataset.Dataset.parentDataset ( self )

Definition at line 723 of file dataset.py.

References dataset.Dataset.__getParentDataset(), dataset.Dataset.__parentDataset, and dataset.Dataset.datasetSnippet().

Referenced by dataset.Dataset.__getFileInfoList().

     def parentDataset( self ):
         if not self.__parentDataset:
             self.__parentDataset = self.__getParentDataset()
         return self.__parentDataset
 

◆ predefined()

def dataset.Dataset.predefined ( self )

Definition at line 920 of file dataset.py.

References dataset.Dataset.__predefined.

     def predefined( self ):
         return self.__predefined
 

◆ printInfo()

def dataset.Dataset.printInfo ( self )

Definition at line 324 of file dataset.py.

     def printInfo(self):
         print('sample      :  ' + self.name)
         print('LFN         :  ' + self.lfnDir)
         print('Castor path :  ' + self.castorDir)
 

◆ runList()

def dataset.Dataset.runList ( self )

Definition at line 924 of file dataset.py.

References dataset.Dataset.__getRunList(), and print().

     def runList( self ):
         return self.__getRunList()

Member Data Documentation

◆ __cmssw

dataset.Dataset.__cmssw

private

Definition at line 28 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticField(), and dataset.Dataset.dump_cff().

◆ __cmsswrelease

dataset.Dataset.__cmsswrelease

private

Definition at line 29 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticField().

◆ __dasinstance

dataset.Dataset.__dasinstance

private

Definition at line 27 of file dataset.py.

Referenced by dataset.Dataset.__getDataType(), dataset.Dataset.__getFileInfoList(), dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.__getParentDataset(), dataset.Dataset.__getRunList(), and dataset.Dataset.convertTimeToRun().

◆ __dasLimit

dataset.Dataset.__dasLimit

private

Definition at line 26 of file dataset.py.

Referenced by dataset.Dataset.fileInfoList().

◆ __dataType

dataset.Dataset.__dataType

private

Definition at line 85 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.dataType(), and dataset.Dataset.dump_cff().

◆ __dummy_source_template

tuple dataset.Dataset.__dummy_source_template

staticprivate

Initial value:

=  ("readFiles = cms.untracked.vstring()\n"
                               "secFiles = cms.untracked.vstring()\n"
                               "%(process)ssource = cms.Source(\"PoolSource\",\n"
                               "%(tab)s                    secondaryFileNames ="
                               "secFiles,\n"
                               "%(tab)s                    fileNames = readFiles\n"
                               ")\n"
                               "readFiles.extend(['dummy_File.root'])\n"
                               "%(process)smaxEvents = cms.untracked.PSet( "
                               "input = cms.untracked.int32(int(%(nEvents)s)) )\n"
                               "%(skipEventsString)s\n")

Definition at line 113 of file dataset.py.

Referenced by dataset.Dataset.__createSnippet().

◆ __filename

dataset.Dataset.__filename

private

Definition at line 55 of file dataset.py.

Referenced by dataset.Dataset.__getDataType(), dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), and dataset.Dataset.datasetSnippet().

◆ __firstusedrun

dataset.Dataset.__firstusedrun

private

Definition at line 30 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.__lumiSelectionSnippet(), and dataset.Dataset.forcerunrange().

◆ __inputMagneticField

dataset.Dataset.__inputMagneticField

private

Definition at line 83 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), and dataset.Dataset.__lumiSelectionSnippet().

◆ __lastusedrun

dataset.Dataset.__lastusedrun

private

Definition at line 31 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.__lumiSelectionSnippet(), and dataset.Dataset.forcerunrange().

◆ __magneticField

dataset.Dataset.__magneticField

private

Definition at line 86 of file dataset.py.

Referenced by dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.dump_cff(), and dataset.Dataset.magneticField().

◆ __name

dataset.Dataset.__name

private

Definition at line 24 of file dataset.py.

Referenced by dataset.Dataset.__fileListSnippet(), dataset.Dataset.__getDataType(), dataset.Dataset.__getFileInfoList(), dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.__getParentDataset(), dataset.Dataset.__getRunList(), dataset.Dataset.convertTimeToRun(), dataset.Dataset.datasetSnippet(), dataset.Dataset.dump_cff(), Config.Process.dumpConfig(), Config.Process.dumpPython(), genericValidation.ValidationWithPlotsSummaryBase.SummaryItem.name(), dataset.Dataset.name(), Config.Process.name_(), and Config.Process.splitPython().

◆ __official

dataset.Dataset.__official

private

Definition at line 36 of file dataset.py.

Referenced by dataset.Dataset.datasetSnippet().

◆ __origName

dataset.Dataset.__origName

private

Definition at line 25 of file dataset.py.

Referenced by dataset.Dataset.datasetSnippet().

◆ __parentDataset

dataset.Dataset.__parentDataset

private

Definition at line 32 of file dataset.py.

Referenced by dataset.Dataset.parentDataset().

◆ __predefined

dataset.Dataset.__predefined

private

Definition at line 52 of file dataset.py.

Referenced by dataset.Dataset.__getDataType(), dataset.Dataset.__getFileInfoList(), dataset.Dataset.__getMagneticField(), dataset.Dataset.__getMagneticFieldForRun(), dataset.Dataset.datasetSnippet(), and dataset.Dataset.predefined().

◆ __source_template

dataset.Dataset.__source_template

staticprivate

Definition at line 95 of file dataset.py.

Referenced by dataset.Dataset.__createSnippet().

◆ bad_files

dataset.Dataset.bad_files

Definition at line 285 of file dataset.py.

◆ castorDir

dataset.Dataset.castorDir

Definition at line 269 of file dataset.py.

Referenced by dataset.Dataset.extractFileSizes(), and dataset.Dataset.printInfo().

◆ dasinstance

dataset.Dataset.dasinstance

Definition at line 208 of file dataset.py.

Referenced by dataset.Dataset.getfiles().

◆ datasetname

dataset.Dataset.datasetname

Definition at line 200 of file dataset.py.

Referenced by dataset.Dataset.getfiles(), and dataset.Dataset.headercomment().

◆ filenamebase

dataset.Dataset.filenamebase

Definition at line 203 of file dataset.py.

Referenced by dataset.Dataset.getfiles().

◆ files

dataset.Dataset.files

Definition at line 276 of file dataset.py.

◆ filesAndSizes

dataset.Dataset.filesAndSizes

Definition at line 314 of file dataset.py.

◆ good_files

dataset.Dataset.good_files

Definition at line 286 of file dataset.py.

◆ lfnDir

dataset.Dataset.lfnDir

Definition at line 268 of file dataset.py.

Referenced by dataset.Dataset.printInfo().

◆ maskExists

dataset.Dataset.maskExists

Definition at line 270 of file dataset.py.

◆ official

dataset.Dataset.official

Definition at line 202 of file dataset.py.

◆ report

dataset.Dataset.report

Definition at line 271 of file dataset.py.

Referenced by addOnTests.testit.run().

Public Member Functions

Static Public Member Functions

Public Attributes

Private Member Functions

Private Attributes

Static Private Attributes

Detailed Description

Constructor & Destructor Documentation

◆ __init__() [1/3]

◆ __init__() [2/3]

◆ __init__() [3/3]

Member Function Documentation

◆ __chunks()

◆ __createSnippet()

◆ __dateString()

◆ __datetime()

◆ __fileListSnippet()

◆ __find_ge()

◆ __find_lt()

◆ __findInJson()

◆ __getData()

◆ __getDataType()

◆ __getFileInfoList()

◆ __getMagneticField()

◆ __getMagneticFieldForRun()

◆ __getParentDataset()

◆ __getRunList()

◆ __lumiSelectionSnippet()

◆ buildListOfBadFiles()

◆ buildListOfFiles()

◆ convertTimeToRun()

◆ createdatasetfile_hippy()

◆ datasetSnippet()

◆ dataType()

◆ dump_cff()

◆ extractFileSizes()

◆ fileInfoList()

◆ fileList()

◆ forcerunrange()

◆ getfiles()

◆ getForceRunRangeFunction()

◆ getPrimaryDatasetEntries()

◆ getrunnumberfromfilename()

◆ headercomment()

◆ magneticField()

◆ magneticFieldForRun()

◆ name()

◆ parentDataset()

◆ predefined()

◆ printInfo()

◆ runList()

Member Data Documentation

◆ __cmssw

◆ __cmsswrelease

◆ __dasinstance

◆ __dasLimit

◆ __dataType

◆ __dummy_source_template

◆ __filename

◆ __firstusedrun

◆ __inputMagneticField

◆ __lastusedrun

◆ __magneticField

◆ __name

◆ __official

◆ __origName

◆ __parentDataset

◆ __predefined

◆ __source_template

◆ bad_files

◆ castorDir

◆ dasinstance

◆ datasetname

◆ filenamebase

◆ files

◆ filesAndSizes

◆ good_files

◆ lfnDir

◆ maskExists

◆ official

◆ __init__() [1/3]

◆ __init__() [2/3]

◆ __init__() [3/3]