CMS 3D CMS Logo

List of all members | Public Member Functions | Public Attributes
edmIntegrityCheck.IntegrityCheck Class Reference
Inheritance diagram for edmIntegrityCheck.IntegrityCheck:

Public Member Functions

def __init__ (self, dataset, options)
 
def getParseNumberOfEvents (self, output)
 
def listFiles (self, dir)
 
def listRootFiles (self, dir)
 
def query (self)
 
def report (self)
 
def sortByBaseDir (self, files)
 
def stageHost (self)
 
def stripDuplicates (self)
 
def structured (self)
 
def test (self, previous=None, timeout=-1)
 
def testFile (self, lfn)
 
def testFileTimeOut (self, lfn, timeout)
 

Public Attributes

 dataset
 
 directory
 
 eventsSeen
 
 eventsTotal
 
 options
 
 test_result
 
 topdir
 

Detailed Description

Definition at line 71 of file edmIntegrityCheck.py.

Constructor & Destructor Documentation

def edmIntegrityCheck.IntegrityCheck.__init__ (   self,
  dataset,
  options 
)

Definition at line 73 of file edmIntegrityCheck.py.

def __init__(self, dataset, options):
    """Record the dataset and options and derive the storage locations.

    dataset -- dataset name; os.sep is prepended when it is missing
    options -- options object; only its ``user`` attribute is read here
    """
    # Normalise the name so that it always starts with the path separator.
    self.dataset = dataset if dataset.startswith(os.sep) else os.sep + dataset
    self.options = options

    # Top-level storage directory for the user, then the dataset directory
    # built beneath it from the components of the dataset name.
    base_lfn = castorBaseDir(user=options.user)
    self.topdir = castortools.lfnToCastor(base_lfn)
    name_parts = self.dataset.split(os.sep)
    self.directory = os.path.join(self.topdir, *name_parts)

    # Event counters; eventsTotal stays at -1 until query() assigns it.
    self.eventsTotal = -1
    self.eventsSeen = 0

    # Assigned by test().
    self.test_result = None
87 
def __init__(self, dataset, options)

Member Function Documentation

def edmIntegrityCheck.IntegrityCheck.getParseNumberOfEvents (   self,
  output 
)
Parse the output of edmFileUtil to get the number of events found

Definition at line 312 of file edmIntegrityCheck.py.

References edmIntegrityCheck.int.

Referenced by edmIntegrityCheck.IntegrityCheck.testFile().

def getParseNumberOfEvents(self,output):
    """Parse the output of edmFileUtil to get the number of events found

    The count is taken from the fourth-from-last space-separated token of
    *output*.

    Returns that token converted to int, or -2 when *output* has fewer than
    four tokens or the token is not an integer.
    """
    tokens = output.split(' ')
    try:
        return int(tokens[-4])
    except (IndexError, ValueError):
        # Short or unexpected output: hand back the "could not parse" value
        # rather than letting the exception escape to the caller.
        return -2
321 
def edmIntegrityCheck.IntegrityCheck.listFiles (   self,
  dir 
)
Recursively list a file or directory on castor

Definition at line 293 of file edmIntegrityCheck.py.

Referenced by edmIntegrityCheck.IntegrityCheck.listRootFiles().

def listFiles(self,dir):
    """Recursively list a file or directory on castor"""
    # NOTE(review): the option attribute is spelled 'resursive'; presumably
    # this matches how the option is declared elsewhere -- confirm before
    # renaming it.
    recurse = self.options.resursive
    return castortools.listFiles(dir, recurse)
296 
def edmIntegrityCheck.IntegrityCheck.listRootFiles (   self,
  dir 
)
filter out filenames so that they only contain root files

Definition at line 297 of file edmIntegrityCheck.py.

References edmIntegrityCheck.IntegrityCheck.listFiles().

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def listRootFiles(self,dir):
    """filter out filenames so that they only contain root files

    Returns the entries of self.listFiles(dir) whose lower-cased name ends
    with '.root', in their original order.
    """
    root_files = []
    for path in self.listFiles(dir):
        if path.lower().endswith('.root'):
            root_files.append(path)
    return root_files
300 
def edmIntegrityCheck.IntegrityCheck.query (   self)
Query DAS to find out how many events are in the dataset

Definition at line 88 of file edmIntegrityCheck.py.

References CalibratedPatElectronProducer.dataset, CalibratedElectronProducer.dataset, edmIntegrityCheck.IntegrityCheck.dataset, genericValidation.GenericValidationData.dataset, edmIntegrityCheck.IntegrityCheck.eventsTotal, FileExportPlugin.FileExportPlugin.options, cmsswPreprocessor.CmsswPreprocessor.options, DOTExport.DotProducer.options, eventsfwlite.Events.options, confdb.HLTProcess.options, and edmIntegrityCheck.IntegrityCheck.options.

Referenced by production_tasks.BaseDataset.run(), and edmIntegrityCheck.IntegrityCheck.test().

def query(self):
    """Query DAS to find out how many events are in the dataset

    Stores the dataset name reported by the task in self.options.name and
    the number of entries in self.eventsTotal.

    Raises Exception when the task output contains no 'Das' entry (or the
    entry is None).
    """
    from production_tasks import BaseDataset

    task = BaseDataset(self.dataset, self.options.user, self.options)
    task_output = task.run({})

    das_info = None
    if 'Das' in task_output:
        self.options.name = task_output['Name']
        das_info = task_output['Das']

    if das_info is None:
        raise Exception("Dataset '%s' not found in Das. Please check." % self.dataset)

    # get the number of events in the dataset
    opts = self.options
    self.eventsTotal = CMSDataset.findPrimaryDatasetEntries(opts.name, opts.min_run, opts.max_run)
103 
def edmIntegrityCheck.IntegrityCheck.report (   self)

Definition at line 211 of file edmIntegrityCheck.py.

References pat::GenericDuplicateRemover< Comparator, Arbitrator >.duplicates(), LumiList.LumiList.duplicates, edmIntegrityCheck.IntegrityCheck.eventsSeen, edmIntegrityCheck.IntegrityCheck.eventsTotal, edm::RunningAverage.int::test_average::running_average::test(), join(), harvestTrackValidationPlots.str, value_test.ValueTestCase.test(), eventstfile_test.EventsTFileTestCase.test(), pat::Flags.test(), helper::Parser.test(), pftools::CaloBox.test(), reco::PFBlock::Link.test, helper::ScannerBase.test(), DiMuonHistograms.test, edmIntegrityCheck.IntegrityCheck.test(), XMLProcessor.test(), BitArray< N >.test(), DTTFBitArray< N >.test(), TwoObjectVariable< LHS, lLHS, RHS, lRHS, Calculator >::getObject.test, cond::SmallWORMDict.test::SmallWORMDict::test, reco::HitPattern.test::TestHitPattern::test, edmIntegrityCheck.IntegrityCheck.test_result, and edmIntegrityCheck.IntegrityCheck.topdir.

211  def report(self):
212 
213  if self.test_result is None:
214  self.test()
215 
216  print 'DBS Dataset name: %s' % self.options.name
217  print 'Storage path: %s' % self.topdir
218 
219  for dirname, files in self.test_result.iteritems():
220  print 'Directory: %s' % dirname
221  for name, status in files.iteritems():
222  fname = os.path.join(dirname, name)
223  if not fname in self.duplicates:
224  print '\t\t %s: %s' % (name, str(status))
225  else:
226  print '\t\t %s: %s (Valid duplicate)' % (name, str(status))
227  print 'Total entries in DBS: %i' % self.eventsTotal
228  print 'Total entries in processed files: %i' % self.eventsSeen
229  if self.eventsTotal>0:
230  print 'Fraction of dataset processed: %f' % (self.eventsSeen/(1.*self.eventsTotal))
231  else:
232  print 'Total entries in DBS not determined'
233  if self.bad_jobs:
234  print "Bad Crab Jobs: '%s'" % ','.join([str(j) for j in self.bad_jobs])
235 
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def test(self, previous=None, timeout=-1)
def edmIntegrityCheck.IntegrityCheck.sortByBaseDir (   self,
  files 
)
Sort files into directories

Definition at line 301 of file edmIntegrityCheck.py.

References mps_alisetup.append.

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def sortByBaseDir(self,files):
    """Sort files into directories

    Returns a dict mapping each directory name to the list of base names
    found in it, in the order they appear in *files*.
    """
    by_dir = {}
    for path in files:
        head, tail = os.path.split(path)
        by_dir.setdefault(head, []).append(tail)
    return by_dir
310 
311 
def edmIntegrityCheck.IntegrityCheck.stageHost (   self)
Returns the CASTOR instance to use

Definition at line 289 of file edmIntegrityCheck.py.

Referenced by edmIntegrityCheck.IntegrityCheck.structured().

def stageHost(self):
    """Returns the CASTOR instance to use

    The value of the STAGE_HOST environment variable is used when it is
    set; otherwise 'castorcms' is returned.
    """
    try:
        return os.environ['STAGE_HOST']
    except KeyError:
        return 'castorcms'
292 
def edmIntegrityCheck.IntegrityCheck.stripDuplicates (   self)

Definition at line 104 of file edmIntegrityCheck.py.

References mps_alisetup.append, edmIntegrityCheck.int, list(), hpstanc_transforms.max, and min().

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def stripDuplicates(self):
    """Find superseded and missing CRAB job outputs in self.test_result.

    A file counts as a CRAB output when its base name (extension removed)
    ends in '_<digits>_<digits>_<word>'; the two numbers are read as the
    job id and the retry index. For every job id between the smallest and
    largest one seen, the good files (status[0] true) are ordered by retry
    index and all but the last are reported as duplicates.

    Returns a tuple (good_duplicates, bad_jobs, sum_dup):
      good_duplicates -- dict mapping each superseded file path to its
                         status[1] value
      bad_jobs        -- ascending list of job ids in the seen range that
                         have no good file
      sum_dup         -- sum of the status[1] values of the superseded files
    """
    import re

    # Raw string, because the pattern relies on regex escapes.
    crab_pattern = re.compile(r".*_\d+_\d+_\w+$")

    # Flatten {dirname: {name: status}} into {full path: status}.
    filemask = {}
    for dirname, files in self.test_result.items():
        for name, status in files.items():
            filemask[os.path.join(dirname, name)] = status

    good_by_job = {}   # job id -> [(retry, path), ...] for good files only
    job_ids = set()    # every job id seen, whether its file is good or bad
    for path, status in filemask.items():
        base = os.path.splitext(os.path.basename(path))[0]
        if crab_pattern.match(base) is None:
            continue
        tokens = base.split('_')
        jobid, retry = int(tokens[-3]), int(tokens[-2])
        job_ids.add(jobid)
        if status[0]:
            good_by_job.setdefault(jobid, []).append((retry, path))

    good_duplicates = {}
    bad_jobs = []
    sum_dup = 0
    if job_ids:
        # Walk the whole id range so that ids with no file at all are also
        # reported as bad; a counter avoids materialising the range.
        jobid, last = min(job_ids), max(job_ids)
        while jobid <= last:
            candidates = good_by_job.get(jobid)
            if candidates is None:
                bad_jobs.append(jobid)
            else:
                # Highest (retry, path) is kept; the rest are duplicates.
                candidates.sort()
                for _, path in candidates[:-1]:
                    events = filemask[path][1]
                    good_duplicates[path] = events
                    sum_dup += events
            jobid += 1
    return good_duplicates, bad_jobs, sum_dup
158 
T min(T a, T b)
Definition: MathUtil.h:58
How EventSelector::AcceptEvent() decides whether to accept an event for output: a single or multiple positive criteria, where the trigger will pass if any such matching triggers are PASS or EXCEPTION. [A criterion that matches no triggers at all is detected and causes a throw.] A single negative criterion, with an expectation of appropriate bit checking in the decision, where the trigger will pass if any such matching triggers are FAIL or EXCEPTION. A wildcarded negative criterion that matches more than one trigger in the trigger list ("!*", or "!HLTx*" if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL. It will reject the event if any of the triggers are PASS or EXCEPTION (this matches the behavior of "!*" before the partial wildcard feature was incorporated). Triggers which are in the READY state are completely ignored. (READY should never be returned since the trigger paths have been run.)
def edmIntegrityCheck.IntegrityCheck.structured (   self)

Definition at line 236 of file edmIntegrityCheck.py.

References mps_alisetup.append, CalibratedPatElectronProducer.dataset, CalibratedElectronProducer.dataset, edmIntegrityCheck.IntegrityCheck.dataset, genericValidation.GenericValidationData.dataset, pat::GenericDuplicateRemover< Comparator, Arbitrator >.duplicates(), LumiList.LumiList.duplicates, edmIntegrityCheck.IntegrityCheck.eventsSeen, edmIntegrityCheck.IntegrityCheck.eventsTotal, edm::RunningAverage.int::test_average::running_average::test(), edmIntegrityCheck.IntegrityCheck.stageHost(), value_test.ValueTestCase.test(), eventstfile_test.EventsTFileTestCase.test(), pat::Flags.test(), helper::Parser.test(), pftools::CaloBox.test(), reco::PFBlock::Link.test, helper::ScannerBase.test(), DiMuonHistograms.test, edmIntegrityCheck.IntegrityCheck.test(), XMLProcessor.test(), DTTFBitArray< N >.test(), BitArray< N >.test(), TwoObjectVariable< LHS, lLHS, RHS, lRHS, Calculator >::getObject.test, cond::SmallWORMDict.test::SmallWORMDict::test, reco::HitPattern.test::TestHitPattern::test, edmIntegrityCheck.IntegrityCheck.test_result, and edmIntegrityCheck.IntegrityCheck.topdir.

def structured(self):
    """Return the integrity-check results as a dictionary.

    Runs test() first when no result has been stored yet. The returned
    dict holds the dataset and storage identification, the per-file status
    under 'Files', the tested directories under 'PathList', good/bad file
    counts, the event totals and their ratio ('PrimaryDatasetFraction' is
    -1. when eventsTotal is not positive), the bad jobs, the valid
    duplicates and the run range taken from the options.
    """
    import time

    if self.test_result is None:
        self.test()

    # Creation time as whole seconds since the epoch, kept as a string.
    # Computed explicitly because the "%s" strftime directive is not
    # documented by Python and only works where the C library supports it.
    created = str(int(time.mktime(datetime.datetime.now().timetuple())))

    report = {'data':{},
              'ReportVersion':3,
              'PrimaryDataset':self.options.name,
              'Name':self.dataset,
              'PhysicsGroup':'CMG',
              'Status':'VALID',
              'TierList':[],
              'AlgoList':[],
              'RunList':[],
              'PathList':[],
              'Topdir':self.topdir,
              'StageHost':self.stageHost(),
              'CreatedBy':self.options.user,
              'DateCreated':created,
              'Files':{}}

    totalGood = 0
    totalBad = 0
    for dirname, files in self.test_result.items():
        report['PathList'].append(dirname)
        for name, status in files.items():
            fname = os.path.join(dirname, name)
            report['Files'][fname] = status
            # status[0] is the pass/fail flag of the file.
            if status[0]:
                totalGood += 1
            else:
                totalBad += 1

    report['PrimaryDatasetEntries'] = self.eventsTotal
    if self.eventsTotal>0:
        report['PrimaryDatasetFraction'] = (self.eventsSeen/(1.*self.eventsTotal))
    else:
        report['PrimaryDatasetFraction'] = -1.
    report['FilesEntries'] = self.eventsSeen

    report['FilesGood'] = totalGood
    report['FilesBad'] = totalBad
    report['FilesCount'] = totalGood + totalBad

    report['BadJobs'] = self.bad_jobs
    report['ValidDuplicates'] = self.duplicates

    report['MinRun'] = self.options.min_run
    report['MaxRun'] = self.options.max_run

    return report
288 
def test(self, previous=None, timeout=-1)
def edmIntegrityCheck.IntegrityCheck.test (   self,
  previous = None,
  timeout = -1 
)

Definition at line 159 of file edmIntegrityCheck.py.

References Book.directory, edmIntegrityCheck.IntegrityCheck.directory, pat::GenericDuplicateRemover< Comparator, Arbitrator >.duplicates(), LumiList.LumiList.duplicates, edmIntegrityCheck.IntegrityCheck.eventsSeen, edmIntegrityCheck.IntegrityCheck.listRootFiles(), DbQuery.query, edmIntegrityCheck.IntegrityCheck.query(), confdbOfflineConverter.OfflineConverter.query(), upload_popcon.HTTP.query(), uploadConditions.HTTP.query(), edmIntegrityCheck.IntegrityCheck.sortByBaseDir(), edmIntegrityCheck.IntegrityCheck.stripDuplicates(), edmIntegrityCheck.IntegrityCheck.test_result, and edmIntegrityCheck.IntegrityCheck.testFileTimeOut().

Referenced by edmIntegrityCheck.IntegrityCheck.report(), and edmIntegrityCheck.IntegrityCheck.structured().

159  def test(self, previous = None, timeout = -1):
160  if not castortools.fileExists(self.directory):
161  raise Exception("The top level directory '%s' for this dataset does not exist" % self.directory)
162 
163  self.query()
164 
165  test_results = {}
166 
167  #support updating to speed things up
168  prev_results = {}
169  if previous is not None:
170  for name, status in previous['Files'].iteritems():
171  prev_results[name] = status
172 
173  filesToTest = self.sortByBaseDir(self.listRootFiles(self.directory))
174  for dir, filelist in filesToTest.iteritems():
175  filemask = {}
176  #apply a UNIX wildcard if specified
177  filtered = filelist
178  if self.options.wildcard is not None:
179  filtered = fnmatch.filter(filelist, self.options.wildcard)
180  if not filtered:
181  print >> sys.stderr, "Warning: The wildcard '%s' does not match any files in '%s'. Please check you are using quotes." % (self.options.wildcard,self.directory)
182 
183  count = 0
184  for ff in filtered:
185  fname = os.path.join(dir, ff)
186  lfn = castortools.castorToLFN(fname)
187 
188  #try to update from the previous result if available
189  if lfn in prev_results and prev_results[lfn][0]:
190  if self.options.printout:
191  print '[%i/%i]\t Skipping %s...' % (count, len(filtered),fname),
192  OK, num = prev_results[lfn]
193  else:
194  if self.options.printout:
195  print '[%i/%i]\t Checking %s...' % (count, len(filtered),fname),
196  OK, num = self.testFileTimeOut(lfn, timeout)
197 
198  filemask[ff] = (OK,num)
199  if self.options.printout:
200  print (OK, num)
201  if OK:
202  self.eventsSeen += num
203  count += 1
204  test_results[castortools.castorToLFN(dir)] = filemask
205  self.test_result = test_results
206 
207  self.duplicates, self.bad_jobs, sum_dup = self.stripDuplicates()
208  #remove duplicate entries from the event count
209  self.eventsSeen -= sum_dup
210 
def test(self, previous=None, timeout=-1)
def testFileTimeOut(self, lfn, timeout)
def edmIntegrityCheck.IntegrityCheck.testFile (   self,
  lfn 
)

Definition at line 322 of file edmIntegrityCheck.py.

References communicate(), and edmIntegrityCheck.IntegrityCheck.getParseNumberOfEvents().

Referenced by edmIntegrityCheck.IntegrityCheck.testFileTimeOut().

def testFile(self,lfn):
    """Run edmFileUtil on *lfn* and return an (ok, nevents) pair.

    Returns (False, -1) when the command's standard output contains one of
    the known error messages; otherwise (True, n) where n is the value
    parsed from the output by getParseNumberOfEvents().
    """
    process = subprocess.Popen(['edmFileUtil',lfn], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Only the captured standard output is inspected.
    out = process.communicate()[0]
    failure_markers = ["Fatal Root Error","Could not open file","Not a valid collection"]
    if any(marker in out for marker in failure_markers):
        return (False,-1)
    return (True,self.getParseNumberOfEvents(out))
327 
static void * communicate(void *obj)
Definition: DQMNet.cc:1251
def edmIntegrityCheck.IntegrityCheck.testFileTimeOut (   self,
  lfn,
  timeout 
)

Definition at line 328 of file edmIntegrityCheck.py.

References AlignmentIORootBase.testFile(), edmIntegrityCheck.IntegrityCheck.testFile(), and timeout.timed_out().

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def testFileTimeOut(self,lfn, timeout):
    """Test *lfn* with testFile(), using the timed_out wrapper when asked.

    With a positive *timeout* the check runs through a helper decorated
    with timed_out(timeout); a TimedOutExc is reported on stderr and
    turned into (False, -1). Otherwise testFile() is called directly.
    """
    @timed_out(timeout)
    def guarded(lfn):
        try:
            return self.testFile(lfn)
        except TimedOutExc:
            print >> sys.stderr, "ERROR:\tedmFileUtil timed out for lfn '%s' (%d)" % (lfn,timeout)
            return (False,-1)

    if not timeout > 0:
        return self.testFile(lfn)
    return guarded(lfn)
340 
341 
342 
def timed_out(timeout)
Definition: timeout.py:23
def testFileTimeOut(self, lfn, timeout)

Member Data Documentation

edmIntegrityCheck.IntegrityCheck.dataset
edmIntegrityCheck.IntegrityCheck.directory
edmIntegrityCheck.IntegrityCheck.eventsSeen
edmIntegrityCheck.IntegrityCheck.eventsTotal
edmIntegrityCheck.IntegrityCheck.options
edmIntegrityCheck.IntegrityCheck.test_result
edmIntegrityCheck.IntegrityCheck.topdir