CMS 3D CMS Logo

List of all members | Public Member Functions | Public Attributes
dataset.Dataset Class Reference
Inheritance diagram for dataset.Dataset:
dataset.BaseDataset dataset.DatasetBase

Public Member Functions

def __init__ (self, datasetname, dasinstance=defaultdasinstance)
 
def __init__ (self, name, user, pattern='.*root')
 
def buildListOfBadFiles (self)
 
def buildListOfFiles (self, pattern='.*root')
 
def extractFileSizes (self)
 
def getfiles (self, usecache)
 
def getPrimaryDatasetEntries (self)
 
def headercomment (self)
 
def printInfo (self)
 
- Public Member Functions inherited from dataset.BaseDataset
def __init__ (self, name, user, pattern='.*root', run_range=None, dbsInstance=None)
 def __init__(self, name, user, pattern='.*root', ...) More...
 
def buildListOfBadFiles (self)
 
def buildListOfFiles (self, pattern)
 
def extractFileSizes (self)
 
def getPrimaryDatasetEntries (self)
 
def listOfFiles (self)
 
def listOfGoodFiles (self)
 
def listOfGoodFilesWithPrescale (self, prescale)
 
def printFiles (self, abspath=True, info=True)
 
def printInfo (self)
 
- Public Member Functions inherited from dataset.DatasetBase
def getfiles (self, usecache)
 
def headercomment (self)
 
def writefilelist_hippy (self, firstrun, lastrun, runs, eventsperjob, maxevents, outputfile, usecache=True)
 
def writefilelist_validation (self, firstrun, lastrun, runs, maxevents, outputfile=None, usecache=True)
 

Public Attributes

 bad_files
 
 castorDir
 
 dasinstance
 
 datasetname
 
 filenamebase
 
 files
 
 filesAndSizes
 
 good_files
 
 lfnDir
 
 maskExists
 
 official
 
 report
 
- Public Attributes inherited from dataset.BaseDataset
 bad_files
 
 dbsInstance
 MM. More...
 
 files
 
 filesAndSizes
 
 good_files
 
 name
 
 pattern
 
 primaryDatasetEntries
 MM. More...
 
 report
 
 run_range
 
 user
 

Detailed Description

Definition at line 198 of file dataset.py.

Constructor & Destructor Documentation

◆ __init__() [1/2]

def dataset.Dataset.__init__ (   self,
  datasetname,
  dasinstance = defaultdasinstance 
)

Definition at line 199 of file dataset.py.

Referenced by dataset.Dataset.__init__().

199  def __init__(self, datasetname, dasinstance=defaultdasinstance):
200  self.datasetname = datasetname
201  if re.match(r'/.+/.+/.+', datasetname):
202  self.official = True
203  self.filenamebase = "Dataset" + self.datasetname.replace("/","_")
204  else:
205  self.official = False
206  self.filenamebase = datasetname
207 
208  self.dasinstance = dasinstance
209 
def __init__(self, dataset, job_number, job_id, job_name, isDA, isMC, applyBOWS, applyEXTRACOND, extraconditions, runboundary, lumilist, intlumi, maxevents, gt, allFromGT, alignmentDB, alignmentTAG, apeDB, apeTAG, bowDB, bowTAG, vertextype, tracktype, refittertype, ttrhtype, applyruncontrol, ptcut, CMSSW_dir, the_dir)
def replace(string, replacements)

◆ __init__() [2/2]

def dataset.Dataset.__init__ (   self,
  name,
  user,
  pattern = '.*root' 
)

Definition at line 267 of file dataset.py.

References dataset.Dataset.__init__().

267  def __init__(self, name, user, pattern='.*root'):
268  self.lfnDir = castorBaseDir(user) + name
269  self.castorDir = castortools.lfnToCastor( self.lfnDir )
270  self.maskExists = False
271  self.report = None
272  super(Dataset, self).__init__(name, user, pattern)
273 
def __init__(self, dataset, job_number, job_id, job_name, isDA, isMC, applyBOWS, applyEXTRACOND, extraconditions, runboundary, lumilist, intlumi, maxevents, gt, allFromGT, alignmentDB, alignmentTAG, apeDB, apeTAG, bowDB, bowTAG, vertextype, tracktype, refittertype, ttrhtype, applyruncontrol, ptcut, CMSSW_dir, the_dir)

Member Function Documentation

◆ buildListOfBadFiles()

def dataset.Dataset.buildListOfBadFiles (   self)
fills the list of bad files from the IntegrityCheck log.

When the integrity check file is not available,
files are considered as good.

Definition at line 278 of file dataset.py.

278  def buildListOfBadFiles(self):
279  '''fills the list of bad files from the IntegrityCheck log.
280 
281  When the integrity check file is not available,
282  files are considered as good.'''
283  mask = "IntegrityCheck"
284 
285  self.bad_files = {}
286  self.good_files = []
287 
288  file_mask = castortools.matchingFiles(self.castorDir, '^%s_.*\.txt$' % mask)
289  if file_mask:
290  # here to avoid circular dependency
291  from .edmIntegrityCheck import PublishToFileSystem
292  p = PublishToFileSystem(mask)
293  report = p.get(self.castorDir)
294  if report is not None and report:
295  self.maskExists = True
296  self.report = report
297  dup = report.get('ValidDuplicates',{})
298  for name, status in report['Files'].items():
299  # print name, status
300  if not status[0]:
301  self.bad_files[name] = 'MarkedBad'
302  elif name in dup:
303  self.bad_files[name] = 'ValidDup'
304  else:
305  self.good_files.append( name )
306  else:
307  raise IntegrityCheckError( "ERROR: IntegrityCheck log file IntegrityCheck_XXXXXXXXXX.txt not found" )
308 

◆ buildListOfFiles()

def dataset.Dataset.buildListOfFiles (   self,
  pattern = '.*root' 
)
fills list of files, taking all root files matching the pattern in the castor dir

Definition at line 274 of file dataset.py.

274  def buildListOfFiles(self, pattern='.*root'):
275  '''fills list of files, taking all root files matching the pattern in the castor dir'''
276  self.files = castortools.matchingFiles( self.castorDir, pattern )
277 

◆ extractFileSizes()

def dataset.Dataset.extractFileSizes (   self)
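Get the file size for each file, from the output of castortools.runXRDCommand(self.castorDir, 'dirlist') (the source docstring still refers to the eos ls -l command).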

Definition at line 309 of file dataset.py.

References dataset.EOSDataset.castorDir, and dataset.Dataset.castorDir.

309  def extractFileSizes(self):
310  '''Get the file size for each file, from the eos ls -l command.'''
311  # EOS command does not work in tier3
312  lsout = castortools.runXRDCommand(self.castorDir,'dirlist')[0]
313  lsout = lsout.split('\n')
314  self.filesAndSizes = {}
315  for entry in lsout:
316  values = entry.split()
317  if( len(values) != 5):
318  continue
319  # using full abs path as a key.
320  file = '/'.join([self.lfnDir, values[4].split("/")[-1]])
321  size = values[1]
322  self.filesAndSizes[file] = size
323 
static std::string join(char **cmd)
Definition: RemoteFile.cc:19

◆ getfiles()

def dataset.Dataset.getfiles (   self,
  usecache 
)

Definition at line 211 of file dataset.py.

References dataset.Dataset.dasinstance, dataset.dasquery(), dataset.Dataset.datasetname, dataset.Dataset.filenamebase, dataset.findinjson(), dataset.int, and print().

211  def getfiles(self, usecache):
212  filename = os.path.join(os.environ["CMSSW_BASE"], "src", "Alignment", "CommonAlignment", "data", self.filenamebase+".csv")
213  if not usecache:
214  try:
215  os.remove(filename)
216  except OSError as e:
217  if os.path.exists(filename):
218  raise
219 
220  result = []
221  try:
222  with open(filename) as f:
223  for row in csv.DictReader(f):
224  result.append(DataFile(**row))
225  return result
226  except IOError:
227  pass
228 
229  query = "file dataset={} instance={} detail=true | grep file.name, file.nevents".format(self.datasetname, self.dasinstance)
230  dasoutput = dasquery(query)
231  if not dasoutput:
232  raise DatasetError("No files are available for the dataset '{}'. This can be "
233  "due to a typo or due to a DAS problem. Please check the "
234  "spelling of the dataset and/or try again.".format(datasetname))
235  result = [DataFile(findinjson(_, "file", "name"), findinjson(_, "file", "nevents")) for _ in dasoutput if int(findinjson(_, "file", "nevents"))]
236  try:
237  with open(filename, "w") as f:
238  writer = csv.DictWriter(f, ("filename", "nevents", "runs"))
239  writer.writeheader()
240  for datafile in result:
241  writer.writerow(datafile.getdict())
242  except Exception as e:
243  print("Couldn't write the dataset csv file:\n\n{}".format(e))
244  return result
245 
def dasquery(dasQuery, dasLimit=0)
Definition: dataset.py:27
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
def findinjson(jsondict, strings)
Definition: dataset.py:95

◆ getPrimaryDatasetEntries()

def dataset.Dataset.getPrimaryDatasetEntries (   self)

Definition at line 329 of file dataset.py.

References dataset.int, runall.testit.report, WorkFlowRunner.WorkFlowRunner.report, ALIUtils.report, and dataset.BaseDataset.report.

329  def getPrimaryDatasetEntries(self):
330  if self.report is not None and self.report:
331  return int(self.report.get('PrimaryDatasetEntries',-1))
332  return -1
333 
334 

◆ headercomment()

def dataset.Dataset.headercomment (   self)

Definition at line 247 of file dataset.py.

References dataset.Dataset.datasetname.

247  def headercomment(self):
248  return self.datasetname
249 

◆ printInfo()

def dataset.Dataset.printInfo (   self)

Definition at line 324 of file dataset.py.

References dataset.EOSDataset.castorDir, dataset.Dataset.castorDir, dataset.Dataset.lfnDir, ElectronMVAID.ElectronMVAID.name, HFRaddamTask.name, HcalOfflineHarvesting.name, LaserTask.name, HcalOnlineHarvesting.name, NoCQTask.name, PedestalTask.name, QIE10Task.name, QIE11Task.name, RecHitTask.name, UMNioTask.name, ZDCTask.name, AlignableObjectId::entry.name, RawTask.name, average.Average.name, counter.Counter.name, TPTask.name, histograms.Histograms.name, DigiTask.name, LEDTask.name, cond::persistency::TAG::NAME.name, cond::persistency::RUN_INFO::RUN_NUMBER.name, TmModule.name, cond::persistency::GTEditorData.name, cond::persistency::GLOBAL_TAG::NAME.name, cond::persistency::TAG::TIME_TYPE.name, cond::persistency::RUN_INFO::START_TIME.name, cond::persistency::GLOBAL_TAG::VALIDITY.name, cond::persistency::TAG::OBJECT_TYPE.name, cond::persistency::RUN_INFO::END_TIME.name, core.autovars.NTupleVariable.name, cond::persistency::GLOBAL_TAG::DESCRIPTION.name, DQMRivetClient::NormOption.name, cond::persistency::TAG::SYNCHRONIZATION.name, cond::persistency::GLOBAL_TAG::RELEASE.name, cond::persistency::TAG::END_OF_VALIDITY.name, MEPSet.name, cond::persistency::GLOBAL_TAG::SNAPSHOT_TIME.name, cond::persistency::O2O_RUN::JOB_NAME.name, cond::persistency::TAG::DESCRIPTION.name, cms::dd::NameValuePair< T >.name, cond::persistency::GLOBAL_TAG::INSERTION_TIME.name, cond::persistency::O2O_RUN::START_TIME.name, cond::persistency::TAG::LAST_VALIDATED_TIME.name, cond::persistency::O2O_RUN::END_TIME.name, cond::persistency::TAG::INSERTION_TIME.name, FWTGeoRecoGeometry::Info.name, cond::persistency::O2O_RUN::STATUS_CODE.name, cond::persistency::TAG::MODIFICATION_TIME.name, cond::persistency::O2O_RUN::LOG.name, ParameterSet.name, nanoaod::MergeableCounterTable::SingleColumn< T >.name, cond::persistency::TAG::PROTECTION_CODE.name, OutputMEPSet.name, PixelDCSObject< T >::Item.name, dataset.BaseDataset.name, AlignmentConstraint.name, cms::dd::ValuePair< T, U >.name, 
personalPlayback.Applet.name, Types._Untracked.name, MagCylinder.name, analyzer.Analyzer.name, DQMRivetClient::LumiOption.name, heppy::ParSet.name, cond::persistency::GTProxyData.name, SingleObjectCondition.name, DQMRivetClient::ScaleFactorOption.name, edm::PathTimingSummary.name, cms::DDAlgoArguments.name, EgHLTOfflineSummaryClient::SumHistBinData.name, Barrel.name, cond::TimeTypeSpecs.name, perftools::EdmEventSize::BranchRecord.name, core.autovars.NTupleObjectType.name, EcalLogicID.name, edm::PathSummary.name, lumi::TriggerInfo.name, XMLProcessor::_loaderBaseConfig.name, PixelEndcapLinkMaker::Item.name, MEtoEDM< T >::MEtoEDMObject.name, FWTableViewManager::TableEntry.name, PixelBarrelLinkMaker::Item.name, ExpressionHisto< T >.name, DQMGenericClient::EfficOption.name, Supermodule.name, TreeCrawler.Package.name, cond::persistency::GLOBAL_TAG_MAP::GLOBAL_TAG_NAME.name, options.ConnectionHLTMenu.name, cond::persistency::GLOBAL_TAG_MAP::RECORD.name, cond::persistency::GLOBAL_TAG_MAP::LABEL.name, cms::DDParsingContext::CompositeMaterial.name, cond::persistency::GLOBAL_TAG_MAP::TAG_NAME.name, cond::Tag_t.name, dqmoffline::l1t::HistDefinition.name, DQMGenericClient::ProfileOption.name, magneticfield::BaseVolumeHandle.name, FastHFShowerLibrary.name, nanoaod::MergeableCounterTable::VectorColumn< T >.name, emtf::Node.name, h4DSegm.name, DQMGenericClient::NormOption.name, core.TriggerMatchAnalyzer.TriggerMatchAnalyzer.name, DQMGenericClient::CDOption.name, CounterChecker.name, cond::TagInfo_t.name, TrackerSectorStruct.name, MuonGeometrySanityCheckPoint.name, PhysicsTools::Calibration::Variable.name, DQMGenericClient::NoFlowOption.name, cond::persistency::PAYLOAD::HASH.name, FCDTask.name, EDMtoMEConverter.name, Mapper::definition< ScannerT >.name, looper.Looper.name, cond::persistency::PAYLOAD::OBJECT_TYPE.name, cond::persistency::PAYLOAD::DATA.name, cond::persistency::PAYLOAD::STREAMER_INFO.name, cond::persistency::PAYLOAD::VERSION.name, 
cond::persistency::PAYLOAD::INSERTION_TIME.name, classes.MonitorData.name, HistogramManager.name, classes.OutputData.name, BPHDecayToResResBuilderBase::DZSelect.name, Crystal.name, h2DSegm.name, options.HLTProcessOptions.name, cond::persistency::IOV::TAG_NAME.name, cond::persistency::IOV::SINCE.name, cond::persistency::IOV::PAYLOAD_HASH.name, cond::persistency::IOV::INSERTION_TIME.name, DQMNet::WaitObject.name, core.TriggerBitAnalyzer.TriggerBitAnalyzer.name, AlpgenParameterName.name, config.Analyzer.name, geometry.Structure.name, core.autovars.NTupleSubObject.name, Capsule.name, core.autovars.NTupleObject.name, Ceramic.name, SiStripMonitorDigi.name, BulkSilicon.name, config.Service.name, APD.name, core.autovars.NTupleCollection.name, BPHRecoBuilder::BPHRecoSource.name, nanoaod::FlatTable::Column.name, BPHRecoBuilder::BPHCompSource.name, StraightTrackAlignment::RPSetPlots.name, cond::persistency::TAG_AUTHORIZATION::TAG_NAME.name, cond::persistency::TAG_AUTHORIZATION::ACCESS_TYPE.name, cond::persistency::TAG_AUTHORIZATION::CREDENTIAL.name, cond::persistency::TAG_AUTHORIZATION::CREDENTIAL_TYPE.name, InnerLayerVolume.name, cond::payloadInspector::TagReference.name, cond::persistency::TAG_LOG::TAG_NAME.name, cond::persistency::TAG_LOG::EVENT_TIME.name, cond::persistency::TAG_LOG::USER_NAME.name, cond::persistency::TAG_LOG::HOST_NAME.name, cond::persistency::TAG_LOG::COMMAND.name, cond::persistency::TAG_LOG::ACTION.name, cond::persistency::TAG_LOG::USER_TEXT.name, personalPlayback.FrameworkJob.name, Grid.name, trklet::TrackletConfigBuilder::DTCinfo.name, Grille.name, BackPipe.name, plotscripts.SawTeethFunction.name, PatchPanel.name, BackCoolTank.name, DryAirTube.name, crabFunctions.CrabTask.name, MBCoolTube.name, MBManif.name, cscdqm::ParHistoDef.name, hTMaxCell.name, SummaryOutputProducer::GenericSummary.name, and print().

324  def printInfo(self):
325  print('sample : ' + self.name)
326  print('LFN : ' + self.lfnDir)
327  print('Castor path : ' + self.castorDir)
328 
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
def printInfo(self, event)

Member Data Documentation

◆ bad_files

dataset.Dataset.bad_files

Definition at line 285 of file dataset.py.

◆ castorDir

dataset.Dataset.castorDir

Definition at line 269 of file dataset.py.

Referenced by dataset.Dataset.extractFileSizes(), and dataset.Dataset.printInfo().

◆ dasinstance

dataset.Dataset.dasinstance

Definition at line 208 of file dataset.py.

Referenced by dataset.Dataset.getfiles().

◆ datasetname

dataset.Dataset.datasetname

Definition at line 200 of file dataset.py.

Referenced by dataset.Dataset.getfiles(), and dataset.Dataset.headercomment().

◆ filenamebase

dataset.Dataset.filenamebase

Definition at line 203 of file dataset.py.

Referenced by dataset.Dataset.getfiles().

◆ files

dataset.Dataset.files

Definition at line 276 of file dataset.py.

◆ filesAndSizes

dataset.Dataset.filesAndSizes

Definition at line 314 of file dataset.py.

◆ good_files

dataset.Dataset.good_files

Definition at line 286 of file dataset.py.

◆ lfnDir

dataset.Dataset.lfnDir

Definition at line 268 of file dataset.py.

Referenced by dataset.Dataset.printInfo().

◆ maskExists

dataset.Dataset.maskExists

Definition at line 270 of file dataset.py.

◆ official

dataset.Dataset.official

Definition at line 202 of file dataset.py.

◆ report

dataset.Dataset.report

Definition at line 271 of file dataset.py.

Referenced by addOnTests.testit.run().