CMS 3D CMS Logo

List of all members | Public Member Functions | Public Attributes
edmIntegrityCheck.IntegrityCheck Class Reference
Inheritance diagram for edmIntegrityCheck.IntegrityCheck:

Public Member Functions

def __init__ (self, dataset, options)
 
def getParseNumberOfEvents (self, output)
 
def listFiles (self, dir)
 
def listRootFiles (self, dir)
 
def query (self)
 
def report (self)
 
def sortByBaseDir (self, files)
 
def stageHost (self)
 
def stripDuplicates (self)
 
def structured (self)
 
def test (self, previous=None, timeout=-1)
 
def testFile (self, lfn)
 
def testFileTimeOut (self, lfn, timeout)
 

Public Attributes

 dataset
 
 directory
 
 eventsSeen
 
 eventsTotal
 
 options
 
 test_result
 
 topdir
 

Detailed Description

Definition at line 73 of file edmIntegrityCheck.py.

Constructor & Destructor Documentation

def edmIntegrityCheck.IntegrityCheck.__init__ (   self,
  dataset,
  options 
)

Definition at line 75 of file edmIntegrityCheck.py.

def __init__(self, dataset, options):
    """Set up an integrity check for one dataset.

    dataset -- dataset name; the path separator is prepended when the
               name does not already start with it.
    options -- options object; this constructor reads ``options.user``.
    """
    # Keep the dataset name in absolute form (leading separator).
    self.dataset = dataset if dataset.startswith(os.sep) else os.sep + dataset
    self.options = options

    # Top directory derived from the user's base directory, and the
    # dataset directory below it.
    user_base = castorBaseDir(user=options.user)
    self.topdir = castortools.lfnToCastor(user_base)
    dataset_parts = self.dataset.split(os.sep)
    self.directory = os.path.join(self.topdir, *dataset_parts)

    # Event counters: eventsTotal is overwritten by query(), eventsSeen
    # is accumulated by test().
    self.eventsTotal = -1
    self.eventsSeen = 0

    # Filled in by test().
    self.test_result = None
def __init__(self, dataset, options)

Member Function Documentation

def edmIntegrityCheck.IntegrityCheck.getParseNumberOfEvents (   self,
  output 
)
Parse the output of edmFileUtil to get the number of events found

Definition at line 313 of file edmIntegrityCheck.py.

References edmIntegrityCheck.int.

Referenced by edmIntegrityCheck.IntegrityCheck.testFile().

def getParseNumberOfEvents(self, output):
    """Parse the output of edmFileUtil to get the number of events found.

    The count is taken from the fourth-from-last space-separated token of
    ``output``, which may be text or bytes.

    Returns the count as an int, or -2 when that token is missing (output
    too short or empty) or is not an integer.
    """
    if isinstance(output, bytes):
        # Pipes from subprocess deliver bytes on Python 3.
        output = output.decode('utf-8', 'replace')

    tokens = output.split(' ')
    try:
        return int(tokens[-4])
    except (IndexError, ValueError):
        # IndexError: fewer than four tokens; ValueError: not a number.
        return -2
def edmIntegrityCheck.IntegrityCheck.listFiles (   self,
  dir 
)
Recursively list a file or directory on castor

Definition at line 294 of file edmIntegrityCheck.py.

Referenced by edmIntegrityCheck.IntegrityCheck.listRootFiles().

def listFiles(self, dir):
    """Recursively list a file or directory on castor"""
    # NOTE(review): 'resursive' is the attribute name read from the options
    # object here; presumably it matches the option definition elsewhere in
    # the file -- confirm before renaming it.
    recurse = self.options.resursive
    return castortools.listFiles(dir, recurse)
def edmIntegrityCheck.IntegrityCheck.listRootFiles (   self,
  dir 
)
Filter the file names so that only ROOT files (names ending in '.root') are kept

Definition at line 298 of file edmIntegrityCheck.py.

References edmIntegrityCheck.IntegrityCheck.listFiles().

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def listRootFiles(self, dir):
    """Return the entries of listFiles(dir) whose name ends in '.root'.

    The suffix comparison is case-insensitive; the order of the entries
    is the order in which listFiles() yields them.
    """
    root_files = []
    for path in self.listFiles(dir):
        if path.lower().endswith('.root'):
            root_files.append(path)
    return root_files
def edmIntegrityCheck.IntegrityCheck.query (   self)
Query DAS to find out how many events are in the dataset

Definition at line 90 of file edmIntegrityCheck.py.

References edmIntegrityCheck.IntegrityCheck.dataset, genericValidation.GenericValidationData.dataset, edmIntegrityCheck.IntegrityCheck.eventsTotal, FileExportPlugin.FileExportPlugin.options, cmsswPreprocessor.CmsswPreprocessor.options, DTCalibrationWorker.DTCalibrationWorker.options, DTWorkflow.DTWorkflow.options, DOTExport.DotProducer.options, TestProcess.TestProcess.options, confdb.HLTProcess.options, and edmIntegrityCheck.IntegrityCheck.options.

Referenced by production_tasks.BaseDataset.run(), and edmIntegrityCheck.IntegrityCheck.test().

def query(self):
    """Query DAS to find out how many events are in the dataset.

    Stores the dataset name reported by the BaseDataset task in
    ``self.options.name`` and the number of entries in ``self.eventsTotal``.

    Raises Exception when the task output has no usable 'Das' entry.
    """
    from production_tasks import BaseDataset

    task = BaseDataset(self.dataset, self.options.user, self.options)
    output = task.run({})

    has_das = 'Das' in output
    if has_das:
        self.options.name = output['Name']
    data = output['Das'] if has_das else None

    if data is None:
        raise Exception("Dataset '%s' not found in Das. Please check." % self.dataset)

    # get the number of events in the dataset
    opts = self.options
    self.eventsTotal = CMSDataset.findPrimaryDatasetEntries(opts.name, opts.min_run, opts.max_run)
def edmIntegrityCheck.IntegrityCheck.report (   self)

Definition at line 212 of file edmIntegrityCheck.py.

References pat::GenericDuplicateRemover< Comparator, Arbitrator >.duplicates(), LumiList.LumiList.duplicates, edmIntegrityCheck.IntegrityCheck.eventsSeen, edmIntegrityCheck.IntegrityCheck.eventsTotal, reco::HitPattern.int::test::TestHitPattern::test(), edm::RunningAverage.int::test_average::running_average::test(), join(), edm.print(), str, value_test.ValueTestCase.test(), eventstfile_test.EventsTFileTestCase.test(), pat::Flags.test(), helper::Parser.test(), reco::PFBlock::Link.test, pftools::CaloBox.test(), helper::ScannerBase.test(), MiniFloatConverter::ReduceMantissaToNbitsRounding.test, DiMuonHistograms.test, edmIntegrityCheck.IntegrityCheck.test(), XMLProcessor.test(), edm::test::TestProcessor.test(), DTTFBitArray< N >.test(), BitArray< N >.test(), TwoObjectVariable< LHS, lLHS, RHS, lRHS, Calculator >::getObject.test, cond::SmallWORMDict.test::SmallWORMDict::test, edmIntegrityCheck.IntegrityCheck.test_result, and edmIntegrityCheck.IntegrityCheck.topdir.

def report(self):
    """Print a summary of the integrity check to standard output.

    Runs test() first when no result is available yet.  For every
    directory in the result each file is listed with its status; files
    recorded in ``self.duplicates`` are marked as valid duplicates.  The
    event totals and any bad job ids follow.
    """
    if self.test_result is None:
        self.test()

    print('DBS Dataset name: %s' % self.options.name)
    print('Storage path: %s' % self.topdir)

    for dirname, files in six.iteritems(self.test_result):
        print('Directory: %s' % dirname)
        for name, status in six.iteritems(files):
            if os.path.join(dirname, name) in self.duplicates:
                print('\t\t %s: %s (Valid duplicate)' % (name, str(status)))
            else:
                print('\t\t %s: %s' % (name, str(status)))

    print('Total entries in DBS: %i' % self.eventsTotal)
    print('Total entries in processed files: %i' % self.eventsSeen)
    if self.eventsTotal > 0:
        fraction = self.eventsSeen / (1. * self.eventsTotal)
        print('Fraction of dataset processed: %f' % fraction)
    else:
        print('Total entries in DBS not determined')

    if self.bad_jobs:
        print("Bad Crab Jobs: '%s'" % ','.join(str(j) for j in self.bad_jobs))
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:65
static std::string join(char **cmd)
Definition: RemoteFile.cc:18
def test(self, previous=None, timeout=-1)
#define str(s)
def edmIntegrityCheck.IntegrityCheck.sortByBaseDir (   self,
  files 
)
Sort files into directories

Definition at line 302 of file edmIntegrityCheck.py.

References mps_setup.append.

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def sortByBaseDir(self, files):
    """Sort files into directories.

    files -- iterable of path strings.

    Returns a dict that maps each directory name to the list of base
    names found in it, in the order they appear in ``files``.
    """
    result = {}
    for f in files:
        # One split yields both the directory and the file name.
        dirname, filename = os.path.split(f)
        result.setdefault(dirname, []).append(filename)
    return result
def edmIntegrityCheck.IntegrityCheck.stageHost (   self)
Returns the CASTOR instance to use

Definition at line 290 of file edmIntegrityCheck.py.

Referenced by edmIntegrityCheck.IntegrityCheck.structured().

def stageHost(self):
    """Returns the CASTOR instance to use.

    The value comes from the STAGE_HOST environment variable; 'castorcms'
    is used when the variable is unset or set to an empty string.
    """
    # "or" also covers a variable that is defined but empty, which would
    # otherwise be returned as an unusable empty host name.
    return os.environ.get('STAGE_HOST') or 'castorcms'
def edmIntegrityCheck.IntegrityCheck.stripDuplicates (   self)

Definition at line 106 of file edmIntegrityCheck.py.

References mps_setup.append, edmIntegrityCheck.int, list(), SiStripPI.max, min(), and edmIntegrityCheck.IntegrityCheck.test_result.

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def stripDuplicates(self):
    """Find good files that are superseded by a later retry of the same job.

    File names whose base matches ``*_<jobid>_<retry>_<suffix>`` are grouped
    by job id.  Within a job only files whose status flag (``status[0]``) is
    true are considered; the one with the highest retry number is kept and
    all others are reported as duplicates.

    Reads ``self.test_result`` ({directory: {file name: (ok, events)}}).

    Returns a tuple ``(good_duplicates, bad_jobs, sum_dup)``:
      good_duplicates -- dict mapping each superseded file path to its
                         event count
      bad_jobs        -- ascending list of job ids between the smallest and
                         largest job id seen that have no good file
      sum_dup         -- sum of the event counts in good_duplicates
    """
    import re

    # Raw string so that \d and \w reach the regex engine unchanged.
    crab_pattern = re.compile(r".*_\d+_\d+_\w+$")

    # Flatten {directory: {name: status}} into {full path: status}.
    filemask = {}
    for dirname, files in self.test_result.items():
        for name, status in files.items():
            filemask[os.path.join(dirname, name)] = status

    def getCrabIndex(path):
        """Return (jobid, retry) for a crab-style file name, else None."""
        base = os.path.splitext(os.path.basename(path))[0]
        if crab_pattern.match(base) is None:
            return None
        tokens = base.split('_')
        try:
            return (int(tokens[-3]), int(tokens[-2]))
        except ValueError:
            # \w+ also matches underscores, so the tokens picked here are
            # not guaranteed to be the numeric ones; skip such names.
            return None

    files = {}
    job_ids = []
    for f, status in filemask.items():
        index = getCrabIndex(f)
        if index is None:
            continue
        jobid, retry = index
        # Every crab-style file widens the job id range, good or not.
        job_ids.append(jobid)
        if status[0]:
            files.setdefault(jobid, []).append((retry, f))

    good_duplicates = {}
    bad_jobs = []
    sum_dup = 0
    if job_ids:
        # range() exists on both Python 2 and 3; xrange only on Python 2.
        for jobid in range(min(job_ids), max(job_ids) + 1):
            if jobid not in files:
                bad_jobs.append(jobid)
                continue
            # Sorted by retry number: the last entry is the one to keep,
            # everything before it is a duplicate.
            for _, dup in sorted(files[jobid])[:-1]:
                good_duplicates[dup] = filemask[dup][1]
                sum_dup += good_duplicates[dup]
    return good_duplicates, bad_jobs, sum_dup
T min(T a, T b)
Definition: MathUtil.h:58
How EventSelector::AcceptEvent() decides whether to accept an event for output otherwise it is excluding the probing of A single or multiple positive and the trigger will pass if any such matching triggers are PASS or EXCEPTION[A criterion thatmatches no triggers at all is detected and causes a throw.] A single negative with an expectation of appropriate bit checking in the decision and the trigger will pass if any such matching triggers are FAIL or EXCEPTION A wildcarded negative criterion that matches more than one trigger in the trigger list("!*","!HLTx*"if it matches 2 triggers or more) will accept the event if all the matching triggers are FAIL.It will reject the event if any of the triggers are PASS or EXCEPTION(this matches the behavior of"!*"before the partial wildcard feature was incorporated).Triggers which are in the READY state are completely ignored.(READY should never be returned since the trigger paths have been run
def edmIntegrityCheck.IntegrityCheck.structured (   self)

Definition at line 237 of file edmIntegrityCheck.py.

References mps_setup.append, edmIntegrityCheck.IntegrityCheck.dataset, genericValidation.GenericValidationData.dataset, pat::GenericDuplicateRemover< Comparator, Arbitrator >.duplicates(), LumiList.LumiList.duplicates, edmIntegrityCheck.IntegrityCheck.eventsSeen, edmIntegrityCheck.IntegrityCheck.eventsTotal, reco::HitPattern.int::test::TestHitPattern::test(), edm::RunningAverage.int::test_average::running_average::test(), edmIntegrityCheck.IntegrityCheck.stageHost(), value_test.ValueTestCase.test(), eventstfile_test.EventsTFileTestCase.test(), pat::Flags.test(), helper::Parser.test(), reco::PFBlock::Link.test, pftools::CaloBox.test(), helper::ScannerBase.test(), MiniFloatConverter::ReduceMantissaToNbitsRounding.test, DiMuonHistograms.test, edmIntegrityCheck.IntegrityCheck.test(), XMLProcessor.test(), edm::test::TestProcessor.test(), DTTFBitArray< N >.test(), BitArray< N >.test(), TwoObjectVariable< LHS, lLHS, RHS, lRHS, Calculator >::getObject.test, cond::SmallWORMDict.test::SmallWORMDict::test, edmIntegrityCheck.IntegrityCheck.test_result, and edmIntegrityCheck.IntegrityCheck.topdir.

def structured(self):
    """Return the result of the integrity check as a dictionary.

    Runs test() first when no result is available yet.  Besides fixed
    header fields, the dictionary holds the status of every file under
    'Files' (keyed by directory joined with file name), the directories
    under 'PathList', the good/bad file counts, the event totals, the bad
    job ids and the valid duplicates.

    'PrimaryDatasetFraction' is eventsSeen / eventsTotal, or -1. when
    eventsTotal is not positive.
    """
    import time

    if self.test_result is None:
        self.test()

    # Seconds since the epoch, as a string.  strftime("%s") only produces
    # this where the C library supports the non-standard %s directive;
    # time.mktime on the local time tuple is the portable equivalent.
    now = datetime.datetime.now()
    date_created = str(int(time.mktime(now.timetuple())))

    report = {'data':{},
              'ReportVersion':3,
              'PrimaryDataset':self.options.name,
              'Name':self.dataset,
              'PhysicsGroup':'CMG',
              'Status':'VALID',
              'TierList':[],
              'AlgoList':[],
              'RunList':[],
              'PathList':[],
              'Topdir':self.topdir,
              'StageHost':self.stageHost(),
              'CreatedBy':self.options.user,
              'DateCreated':date_created,
              'Files':{}}

    totalGood = 0
    totalBad = 0
    for dirname, files in self.test_result.items():
        report['PathList'].append(dirname)
        for name, status in files.items():
            report['Files'][os.path.join(dirname, name)] = status
            # status[0] is the per-file OK flag
            if status[0]:
                totalGood += 1
            else:
                totalBad += 1

    report['PrimaryDatasetEntries'] = self.eventsTotal
    if self.eventsTotal > 0:
        report['PrimaryDatasetFraction'] = (self.eventsSeen/(1.*self.eventsTotal))
    else:
        report['PrimaryDatasetFraction'] = -1.
    report['FilesEntries'] = self.eventsSeen

    report['FilesGood'] = totalGood
    report['FilesBad'] = totalBad
    report['FilesCount'] = totalGood + totalBad

    report['BadJobs'] = self.bad_jobs
    report['ValidDuplicates'] = self.duplicates

    report['MinRun'] = self.options.min_run
    report['MaxRun'] = self.options.max_run

    return report
def test(self, previous=None, timeout=-1)
def edmIntegrityCheck.IntegrityCheck.test (   self,
  previous = None,
  timeout = -1 
)

Definition at line 160 of file edmIntegrityCheck.py.

References Book.directory, edmIntegrityCheck.IntegrityCheck.directory, pat::GenericDuplicateRemover< Comparator, Arbitrator >.duplicates(), LumiList.LumiList.duplicates, edmIntegrityCheck.IntegrityCheck.eventsSeen, edmIntegrityCheck.IntegrityCheck.listRootFiles(), edm.print(), DbQuery.query, edmIntegrityCheck.IntegrityCheck.query(), confdbOfflineConverter.OfflineConverter.query(), upload_popcon.HTTP.query(), uploadConditions.HTTP.query(), edmIntegrityCheck.IntegrityCheck.sortByBaseDir(), edmIntegrityCheck.IntegrityCheck.stripDuplicates(), edmIntegrityCheck.IntegrityCheck.test_result, and edmIntegrityCheck.IntegrityCheck.testFileTimeOut().

Referenced by edmIntegrityCheck.IntegrityCheck.report(), and edmIntegrityCheck.IntegrityCheck.structured().

def test(self, previous = None, timeout = -1):
    """Check every ROOT file below the dataset directory.

    previous -- optional earlier report (a dict with a 'Files' entry that
                maps file name to (ok, events)); files recorded there as
                OK are not checked again and their stored result is reused.
    timeout  -- passed on to testFileTimeOut() for each file that is checked.

    Sets ``self.test_result`` ({directory: {file name: (ok, events)}}),
    ``self.duplicates``, ``self.bad_jobs`` and ``self.eventsSeen``.

    Raises Exception when the dataset directory does not exist.
    """
    if not castortools.fileExists(self.directory):
        raise Exception("The top level directory '%s' for this dataset does not exist" % self.directory)

    self.query()

    # Start from zero so that calling test() more than once on the same
    # object does not add the new count on top of the old one.
    self.eventsSeen = 0

    # support updating to speed things up
    prev_results = {}
    if previous is not None:
        prev_results = dict(previous['Files'])

    test_results = {}
    filesToTest = self.sortByBaseDir(self.listRootFiles(self.directory))
    for dirname, filelist in filesToTest.items():
        # apply a UNIX wildcard if specified
        filtered = filelist
        if self.options.wildcard is not None:
            filtered = fnmatch.filter(filelist, self.options.wildcard)
            if not filtered:
                # Name the directory that was actually filtered, which can
                # be a subdirectory of self.directory.
                print("Warning: The wildcard '%s' does not match any files in '%s'. Please check you are using quotes." % (self.options.wildcard,dirname), file=sys.stderr)

        filemask = {}
        total = len(filtered)
        for count, ff in enumerate(filtered):
            fname = os.path.join(dirname, ff)
            lfn = castortools.castorToLFN(fname)

            # try to update from the previous result if available
            if lfn in prev_results and prev_results[lfn][0]:
                if self.options.printout:
                    print('[%i/%i]\t Skipping %s...' % (count, total,fname), end=' ')
                OK, num = prev_results[lfn]
            else:
                if self.options.printout:
                    print('[%i/%i]\t Checking %s...' % (count, total,fname), end=' ')
                OK, num = self.testFileTimeOut(lfn, timeout)

            filemask[ff] = (OK,num)
            if self.options.printout:
                print((OK, num))
            if OK:
                self.eventsSeen += num
        test_results[castortools.castorToLFN(dirname)] = filemask
    self.test_result = test_results

    self.duplicates, self.bad_jobs, sum_dup = self.stripDuplicates()
    # remove duplicate entries from the event count
    self.eventsSeen -= sum_dup
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:65
def test(self, previous=None, timeout=-1)
def testFileTimeOut(self, lfn, timeout)
def edmIntegrityCheck.IntegrityCheck.testFile (   self,
  lfn 
)

Definition at line 323 of file edmIntegrityCheck.py.

References communicate(), and edmIntegrityCheck.IntegrityCheck.getParseNumberOfEvents().

Referenced by edmIntegrityCheck.IntegrityCheck.testFileTimeOut().

def testFile(self, lfn):
    """Run edmFileUtil on ``lfn`` and report whether the file is usable.

    Returns (False, -1) when the standard output of edmFileUtil contains
    one of the known error messages, otherwise (True, n) where n is what
    getParseNumberOfEvents() extracts from that output.
    """
    process = subprocess.Popen(['edmFileUtil',lfn], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    stdout = process.communicate()[0]
    if isinstance(stdout, bytes):
        # On Python 3 the pipe delivers bytes; the substring checks below
        # need text.  Undecodable bytes are replaced rather than raising.
        stdout = stdout.decode('utf-8', 'replace')

    for error in ("Fatal Root Error","Could not open file","Not a valid collection"):
        if error in stdout:
            return (False,-1)
    return (True,self.getParseNumberOfEvents(stdout))
static void * communicate(void *obj)
Definition: DQMNet.cc:1251
def edmIntegrityCheck.IntegrityCheck.testFileTimeOut (   self,
  lfn,
  timeout 
)

Definition at line 329 of file edmIntegrityCheck.py.

References edm.print(), AlignmentIORootBase.testFile(), edmIntegrityCheck.IntegrityCheck.testFile(), and timeout.timed_out().

Referenced by edmIntegrityCheck.IntegrityCheck.test().

def testFileTimeOut(self, lfn, timeout):
    """Run testFile() on ``lfn``, with a time limit when ``timeout`` > 0.

    With a positive timeout the call goes through a wrapper built by
    timed_out(timeout); a TimedOutExc raised during the check is reported
    on stderr and turned into (False, -1).  Otherwise testFile() is called
    directly.
    """
    def guarded(path):
        try:
            return self.testFile(path)
        except TimedOutExc:
            print("ERROR:\tedmFileUtil timed out for lfn '%s' (%d)" % (path,timeout), file=sys.stderr)
            return (False,-1)

    # Built unconditionally, exactly as the decorator form would do.
    guarded = timed_out(timeout)(guarded)

    if timeout > 0:
        return guarded(lfn)
    return self.testFile(lfn)
S & print(S &os, JobReport::InputFile const &f)
Definition: JobReport.cc:65
def timed_out(timeout)
Definition: timeout.py:23
def testFileTimeOut(self, lfn, timeout)

Member Data Documentation

edmIntegrityCheck.IntegrityCheck.dataset
edmIntegrityCheck.IntegrityCheck.directory
edmIntegrityCheck.IntegrityCheck.eventsSeen
edmIntegrityCheck.IntegrityCheck.eventsTotal
edmIntegrityCheck.IntegrityCheck.options
edmIntegrityCheck.IntegrityCheck.test_result
edmIntegrityCheck.IntegrityCheck.topdir