from __future__ import print_function
from __future__ import absolute_import
from builtins import range

# modules used by the snippets below
import os
import re
import pickle
import random
import six

from .castorBaseDir import castorBaseDir
from . import eostools as castortools
    # IntegrityCheckError.__str__
    def __str__(self):
        return repr(self.value)
    def __init__(self, name, user, pattern='.*root', run_range=None, dbsInstance=None):
    def extractFileSizes(self):
        '''Get the file size for each file,
        from the eos ls -l command.'''
        self.filesAndSizes = {}

    def printFiles(self, abspath=True, info=True):
        if self.files is None:
            self.buildListOfFiles(self.pattern)
        for file in self.files:
            status = 'OK'
            if file in self.bad_files:
                status = self.bad_files[file]
            elif file not in self.good_files:
                status = 'UNKNOWN'
            fileNameToPrint = file
            if not abspath:
                fileNameToPrint = os.path.basename(file)
            if info:
                size = self.filesAndSizes.get(file, 'UNKNOWN').rjust(10)
                print(status.ljust(10), size, '\t', fileNameToPrint)
            else:
                print(fileNameToPrint)
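# --- Illustrative usage, not part of the original file ---
# For a populated dataset object `ds` (hypothetical), printFiles writes one
# line per file: an OK/UNKNOWN/bad-status flag, the size recorded by
# extractFileSizes, and either the full path or just the basename:
#
#     ds.printFiles(abspath=False, info=True)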
    def listOfFiles(self):
        '''Returns all files, even the bad ones.'''
        return self.files

    def listOfGoodFiles(self):
        '''Returns the files flagged as good in the integrity
        check text output; files not present in that output are
        also considered good.'''
        self.good_files = []
        for file in self.files:
            if file not in self.bad_files:
                self.good_files.append( file )
        return self.good_files

    def listOfGoodFilesWithPrescale(self, prescale):
        """Takes the list of good files and selects a random sample
        from them according to the prescale factor.
        E.g. a prescale of 10 will select 1 in 10 files."""
        good_files = self.listOfGoodFiles()
        num_files = int( (len(good_files)/(1.0*prescale)) + 0.5 )
        if num_files > len(good_files):
            num_files = len(good_files)
        subset = set()
        while len(subset) < num_files:
            choice = random.choice(good_files)
            slen = len(subset)
            subset.add(choice)
            # drop picked files so repeated choices cannot stall the loop
            if len(subset) > slen:
                good_files.remove(choice)
        assert len(subset) == num_files, 'The number of files does not match'
        return [f for f in subset]
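# --- Worked example (illustrative, not part of the original file) ---
# The prescale rounds to the nearest integer: with 95 good files and a
# prescale of 10, 95/10.0 + 0.5 = 10.0, so 10 files are kept (about 1 in 10).
num_files_example = int((95 / (1.0 * 10)) + 0.5)
assert num_files_example == 10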
        super(CMSDataset, self).__init__( name, 'CMS', run_range=run_range)
    def buildListOfFilesDBS(self, pattern, begin=-1, end=-1):
        print('buildListOfFilesDBS', begin, end)
        sampleName = self.name.rstrip('/')
        query, qwhat = sampleName, "dataset"
        if "#" in sampleName: qwhat = "block"
        if self.run_range is not None and self.run_range != (-1, -1):
            if self.run_range[0] == self.run_range[1]:
                query += " run=%s" % self.run_range[0]
            else:
                print("WARNING: queries with run ranges are slow in DAS")
                query += " run between [%s,%s]" % (self.run_range[0], self.run_range[1])
        dbs = 'das_client.py --query="file %s=%s"' % (qwhat, query)
        if begin >= 0:
            dbs += ' --index %d' % begin
        if end >= 0:
            dbs += ' --limit %d' % (end - begin + 1)
        print('dbs\t: %s' % dbs)
        dbsOut = os.popen(dbs)
        files = []
        for line in dbsOut:
            if line.find('/store') == -1:
                continue
            files.append(line.rstrip())
        return files
        limit = 10000
        if num_files > limit:
            num_steps = int(num_files / limit) + 1
            self.files = []
            for i in range(num_steps):
                DBSFiles = self.buildListOfFilesDBS(pattern,
                                                    i * limit,
                                                    ((i + 1) * limit) - 1)
                self.files.extend(DBSFiles)
        else:
            self.files = self.buildListOfFilesDBS(pattern)
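# --- Paging sketch (illustrative, not part of the original file) ---
# Reproduces the (begin, end) pairs that buildListOfFiles passes to
# buildListOfFilesDBS when the dataset holds more files than `limit`;
# das_client's --index/--limit options then page through the list.
def _dbs_pages(num_files, limit=10000):
    num_steps = int(num_files / limit) + 1
    return [(i * limit, ((i + 1) * limit) - 1) for i in range(num_steps)]

assert _dbs_pages(25000) == [(0, 9999), (10000, 19999), (20000, 29999)]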
    @staticmethod
    def findPrimaryDatasetEntries(dataset, runmin, runmax):
        query, qwhat = dataset, "dataset"
        if "#" in dataset: qwhat = "block"
        if runmin > 0 or runmax > 0:
            if runmin == runmax:
                query = "%s run=%d" % (query, runmin)
            else:
                print("WARNING: queries with run ranges are slow in DAS")
                query = "%s run between [%d, %d]" % (
                    query,
                    runmin if runmin > 0 else 1,
                    runmax if runmax > 0 else 999999)
        dbs = 'das_client.py --query="summary %s=%s"' % (qwhat, query)
        dbsOut = os.popen(dbs).readlines()
        entries = []
        for line in dbsOut:
            line = line.replace('\n', '')
            if "nevents" in line:
                entries.append(int(line.split(":")[1]))
        if entries:
            return sum(entries)
        return -1
    @staticmethod
    def findPrimaryDatasetNumFiles(dataset, runmin, runmax):
        query, qwhat = dataset, "dataset"
        if "#" in dataset: qwhat = "block"
        if runmin > 0 or runmax > 0:
            if runmin == runmax:
                query = "%s run=%d" % (query, runmin)
            else:
                print("WARNING: queries with run ranges are slow in DAS")
                query = "%s run between [%d, %d]" % (
                    query,
                    runmin if runmin > 0 else 1,
                    runmax if runmax > 0 else 999999)
        dbs = 'das_client.py --query="summary %s=%s"' % (qwhat, query)
        dbsOut = os.popen(dbs).readlines()
        entries = []
        for line in dbsOut:
            line = line.replace('\n', '')
            if "nfiles" in line:
                entries.append(int(line.split(":")[1]))
        if entries:
            return sum(entries)
        return -1
        super(LocalDataset, self).__init__( name, 'LOCAL', pattern)

    def buildListOfFiles(self, pattern='.*root'):
        pat = re.compile( pattern )
        sampleName = self.name.rstrip('/')
        self.dir = ''.join( [self.basedir, sampleName] )
        self.files = []
        for file in sorted(os.listdir( self.dir )):
            if pat.match( file ) is not None:
                self.files.append( '/'.join([self.dir, file]) )
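# --- Illustrative usage (hypothetical paths, not part of the original file) ---
# Collects every file under <basedir><name> whose basename matches the
# regular-expression pattern:
#
#     ds = LocalDataset('/MySample', '/data/store', '.*\.root$')
#     print(ds.listOfFiles())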
class EOSDataset(BaseDataset):
    '''A dataset located in any given eos directory'''

    def __init__(self, name, basedir, pattern):
        self.castorDir = '/'.join([basedir, name])
        if not castortools.isEOSDir(self.castorDir):
            raise ValueError('directory should be a directory on EOS.')
        super(EOSDataset, self).__init__( name, 'EOS', pattern)
        super(Dataset, self).__init__(name, user, pattern)

    def buildListOfFiles(self, pattern='.*root'):
        '''Fills the list of files, taking all root files matching
        the pattern in the castor dir.'''
        self.files = castortools.matchingFiles( self.castorDir, pattern )

    def buildListOfBadFiles(self):
        '''Fills the list of bad files from the IntegrityCheck log.
        When the integrity check file is not available,
        files are considered good.'''
        mask = "IntegrityCheck"
        file_mask = castortools.matchingFiles(self.castorDir, r'^%s_.*\.txt$' % mask)
        if file_mask:
            # imported here to avoid a circular dependency
            from .edmIntegrityCheck import PublishToFileSystem
            p = PublishToFileSystem(mask)
            report = p.get(self.castorDir)
            if report is not None and report:
                dup = report.get('ValidDuplicates', {})
                for name, status in six.iteritems(report['Files']):
                    if not status[0]:
                        self.bad_files[name] = 'MarkedBad'
                    elif name in dup:
                        self.bad_files[name] = 'ValidDup'
                    else:
                        self.good_files.append( name )
        else:
            raise IntegrityCheckError(
                "ERROR: IntegrityCheck log file IntegrityCheck_XXXXXXXXXX.txt not found")
    def extractFileSizes(self):
        '''Get the file size for each file, from the eos ls -l command.'''
        lsout = castortools.runXRDCommand(self.castorDir, 'dirlist')[0]
        lsout = lsout.split('\n')
        self.filesAndSizes = {}
        for entry in lsout:
            values = entry.split()
            if len(values) != 5:
                continue
            # the full LFN is used as the key, the size string as the value
            file = '/'.join([self.lfnDir, values[4].split("/")[-1]])
            self.filesAndSizes[file] = values[1]

    def getPrimaryDatasetEntries(self):
        if self.report is not None and self.report:
            return int(self.report.get('PrimaryDatasetEntries', -1))
        return -1
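# --- Parsing sketch (illustrative, not part of the original file; real ---
# --- 'eos ls -l' output may have a different field layout) ---
values_example = "rw-r--r-- 123456789 2015-01-01 00:00 /eos/cms/store/f.root".split()
size_example = values_example[1]
name_example = values_example[4].split("/")[-1]
assert (size_example, name_example) == ("123456789", "f.root")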
        super(PrivateDataset, self).__init__(name, 'PRIVATE', dbsInstance=dbsInstance)

    def buildListOfFilesDBS(self, name, dbsInstance):
        entries = self.findPrimaryDatasetNumFiles(name, dbsInstance, -1, -1)
        dbs = 'das_client.py --query="file dataset=%s instance=prod/%s" --limit=%s' % (name, dbsInstance, entries)
        dbsOut = os.popen(dbs)
        files = []
        for line in dbsOut:
            if line.find('/store') == -1:
                continue
            files.append(line.rstrip())
        return files
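# --- Filtering sketch (illustrative, not part of the original file) ---
# Only lines containing '/store' (the LFNs) survive; das_client banner or
# status lines are skipped.
dbsOut_example = ['Showing 1-2 out of 2 results\n',
                  '/store/user/x/file_1.root\n']
files_example = [l.rstrip() for l in dbsOut_example if l.find('/store') != -1]
assert files_example == ['/store/user/x/file_1.root']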
    @staticmethod
    def findPrimaryDatasetEntries(dataset, dbsInstance, runmin, runmax):
        query, qwhat = dataset, "dataset"
        if "#" in dataset: qwhat = "block"
        if runmin > 0 or runmax > 0:
            if runmin == runmax:
                query = "%s run=%d" % (query, runmin)
            else:
                print("WARNING: queries with run ranges are slow in DAS")
                query = "%s run between [%d, %d]" % (
                    query,
                    runmin if runmin > 0 else 1,
                    runmax if runmax > 0 else 999999)
        dbs = 'das_client.py --query="summary %s=%s instance=prod/%s"' % (qwhat, query, dbsInstance)
        dbsOut = os.popen(dbs).readlines()
        entries = []
        for line in dbsOut:
            line = line.replace('\n', '')
            if "nevents" in line:
                entries.append(int(line.split(":")[1]))
        if entries:
            return sum(entries)
        return -1
    @staticmethod
    def findPrimaryDatasetNumFiles(dataset, dbsInstance, runmin, runmax):
        query, qwhat = dataset, "dataset"
        if "#" in dataset: qwhat = "block"
        if runmin > 0 or runmax > 0:
            if runmin == runmax:
                query = "%s run=%d" % (query, runmin)
            else:
                print("WARNING: queries with run ranges are slow in DAS")
                query = "%s run between [%d, %d]" % (
                    query,
                    runmin if runmin > 0 else 1,
                    runmax if runmax > 0 else 999999)
        dbs = 'das_client.py --query="summary %s=%s instance=prod/%s"' % (qwhat, query, dbsInstance)
        dbsOut = os.popen(dbs).readlines()
        entries = []
        for line in dbsOut:
            line = line.replace('\n', '')
            if "nfiles" in line:
                entries.append(int(line.split(":")[1]))
        if entries:
            return sum(entries)
        return -1
def getDatasetFromCache(cachename):
    cachedir = '/'.join( [os.environ['HOME'], '.cmgdataset'] )
    pckfile = open( cachedir + "/" + cachename )
    dataset = pickle.load(pckfile)
    return dataset

def writeDatasetToCache(cachename, dataset):
    cachedir = '/'.join( [os.environ['HOME'], '.cmgdataset'] )
    if not os.path.exists(cachedir):
        os.mkdir(cachedir)
    pckfile = open( cachedir + "/" + cachename, 'w')
    pickle.dump(dataset, pckfile)
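# --- Round-trip sketch (illustrative, not part of the original file) ---
# Writes to and reads back from ~/.cmgdataset. Note that the text-mode
# open() calls above are Python 2 idioms; under Python 3, pickle requires
# binary modes ('wb' for dump, 'rb' for load).
#
#     writeDatasetToCache('my_sample.pck', ds)   # `ds` is a hypothetical dataset
#     ds2 = getDatasetFromCache('my_sample.pck')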
def createDataset(user, dataset, pattern, readcache=False,
                  basedir=None, run_range=None):

    def cacheFileName(data, user, pattern):
        return '{user}%{name}%{pattern}.pck'.format(
            user=user, name=data.replace('/', '_'), pattern=pattern)

    def writeCache(dataset):
        writeDatasetToCache(cacheFileName(dataset.name, dataset.user,
                                          dataset.pattern), dataset)

    def readCache(data, user, pattern):
        return getDatasetFromCache(cacheFileName(data, user, pattern))

    if readcache:
        try:
            data = readCache(dataset, user, pattern)
        except IOError:
            readcache = False
    if not readcache:
        if user == 'CMS':
            data = CMSDataset( dataset, run_range=run_range )
        elif user == 'LOCAL':
            data = LocalDataset( dataset, basedir, pattern )
        else:
            data = Dataset( dataset, user, pattern )
        writeCache(data)
    return data
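# --- Illustrative usage (hypothetical dataset name, not part of the ---
# --- original file) ---
#     data = createDataset('CMS', '/SingleMu/Run2012A-v1/AOD', '.*root',
#                          readcache=True)
#     print(data.listOfGoodFiles()[:3])
# The first call builds the dataset (querying DAS) and caches it under
# ~/.cmgdataset; later calls with readcache=True reuse the pickle.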
def createMyDataset(user, dataset, pattern, dbsInstance, readcache=False):

    cachedir = '/'.join( [os.environ['HOME'], '.cmgdataset'] )

    def cacheFileName(data, user, dbsInstance, pattern):
        cf = data.replace('/', '_')
        name = '{dir}/{user}%{dbsInstance}%{name}%{pattern}.pck'.format(
            dir=cachedir,
            user=user,
            dbsInstance=dbsInstance,
            name=cf,
            pattern=pattern)
        return name

    def writeCache(dataset):
        if not os.path.exists(cachedir):
            os.mkdir(cachedir)
        cachename = cacheFileName(dataset.name,
                                  dataset.user,
                                  dataset.dbsInstance,
                                  dataset.pattern)
        pckfile = open( cachename, 'w')
        pickle.dump(dataset, pckfile)

    def readCache(data, user, dbsInstance, pattern):
        cachename = cacheFileName(data, user, dbsInstance, pattern)
        pckfile = open( cachename )
        dataset = pickle.load(pckfile)
        return dataset

    if readcache:
        try:
            data = readCache(dataset, user, dbsInstance, pattern)
        except IOError:
            readcache = False
    if not readcache:
        if user == 'PRIVATE':
            data = PrivateDataset( dataset, dbsInstance )
        writeCache(data)
    return data
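# --- Illustrative usage (hypothetical names, not part of the original file) ---
# Private productions are resolved through a user DBS instance, which is why
# the cache file name also carries the dbsInstance key:
#
#     data = createMyDataset('PRIVATE', '/MyProd/me-v1/USER',
#                            '.*root', 'phys03', readcache=True)

# ---- Index of definitions listed with this file ----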
def __init__(self, name, basedir, pattern)
def __init__(self, name, user, pattern='.*root', run_range=None, dbsInstance=None)
def getPrimaryDatasetEntries(self)
def listOfGoodFiles(self)
def writeDatasetToCache(cachename, dataset)
S & print(S &os, JobReport::InputFile const &f)
def findPrimaryDatasetNumFiles(dataset, runmin, runmax)
def buildListOfFiles(self, pattern)
def getPrimaryDatasetEntries(self)
def getPrimaryDatasetEntries(self)
def findPrimaryDatasetEntries(dataset, dbsInstance, runmin, runmax)
def extractFileSizes(self)
def buildListOfFiles(self, pattern='.*root')
def __init__(self, value)
def extractFileSizes(self)
def createDataset(user, dataset, pattern, readcache=False, basedir=None, run_range=None)
def __init__(self, name, basedir, pattern)
def buildListOfFilesDBS(self, pattern, begin=-1, end=-1)
def listOfGoodFilesWithPrescale(self, prescale)
def getDatasetFromCache(cachename)
def buildListOfFiles(self, pattern='.*root')
def buildListOfFiles(self, pattern='.*root')
def buildListOfFiles(self, pattern='.*root')
def getPrimaryDatasetEntries(self)
def findPrimaryDatasetEntries(dataset, runmin, runmax)
def __init__(self, datasetname, dasinstance=defaultdasinstance)
def createMyDataset(user, dataset, pattern, dbsInstance, readcache=False)
def buildListOfBadFiles(self)
def findPrimaryDatasetNumFiles(dataset, dbsInstance, runmin, runmax)
def __init__(self, name, dbsInstance=None)
def buildListOfFilesDBS(self, name, dbsInstance)
def buildListOfBadFiles(self)
def printFiles(self, abspath=True, info=True)
def __init__(self, name, run_range=None)
def buildListOfFiles(self, pattern='.*root')