00001
00002
00003 import sys
00004 import os
00005
00006 """
00007 arguments [<list-of-processes>]
00008 description:
00009 creates crab.cfg, multicrab.cfg, harvest_*.py
00010 if dbs is set:
00011 prints number of events found in dataset
if no argument is provided looks for all available datasets for release
00013 user can edit multicrab and confirm process list as needed
00014 nuno@cern.ch 09.04
00015 """
00016
def print_def():
    """Print the command-line usage line followed by three example invocations."""
    usage_lines = [
        "Usage: %s [list_of_processes]" % (sys.argv[0],),
        "Examples:",
        "harvestRelVal.py",
        "harvestRelVal.py /RelValTTbar/CMSSW_3_1_0_pre4_STARTUP_30X_v1/GEN-SIM-RECO",
        "harvestRelVal.py <dataset_list.txt>",
    ]
    for text in usage_lines:
        print(text)
00023
def check_dbs():
    """Return 1 if the DBSCMD_HOME environment variable is set, else 0.

    A variable whose value is literally 'NOTSET' counts as not set.
    """
    dbs_home = os.getenv('DBSCMD_HOME', 'NOTSET')
    if dbs_home != 'NOTSET':
        return 1
    return 0
00028
def check_nevts_dset(dset):
    """Return the total number of events of dataset *dset*.

    Adds up the 'NumberOfEvents' entry of every record returned by
    api.listFiles() for the dataset path.  Returns -1 when the module-level
    is_dbs flag is false.
    """
    if is_dbs:
        file_records = api.listFiles(path=str(dset))
        return sum([rec['NumberOfEvents'] for rec in file_records])
    return -1
00038
def make_dqmname(s):
    """Build the DQM output file name for dataset path *s*.

    Each '/' in *s* becomes '__'; the result is placed between the fixed
    prefix 'DQM_V0001_R000000001' and the extension '.root'.
    """
    flattened = '__'.join(s.split('/'))
    return ''.join(['DQM_V0001_R000000001', flattened, '.root'])
00041
def get_name_from_dsetpath(ds):
    """Return the sample name of dataset path *ds*.

    This is the second '/'-separated field of the path with every
    occurrence of 'RelVal' removed.
    """
    primary_field = ds.split('/')[1]
    return ''.join(primary_field.split('RelVal'))
00046
def get_cond_from_dsetpath(ds):
    """Extract the conditions tag (e.g. '30X' or '31X') from dataset path *ds*.

    The third '/'-separated field of the path is stripped of the current
    release name (module-level cmssw_ver plus '_'), of 'IDEAL_', 'STARTUP_'
    and '_FastSim'; whatever precedes the '_v...' version suffix is taken as
    the condition.

    Returns the condition string, or 0 when none can be determined.  Unlike
    the previous implementation this never raises on a malformed path: a
    path with fewer than three fields, an empty tag, or a tag without a 'v'
    is reported and yields 0 instead of an IndexError or a truncated value.
    """
    fields = ds.split('/')
    if len(fields) < 3:
        print("problem extracting condition for %s  :  malformed dataset path" % (ds,))
        return 0

    tag = fields[2].replace(cmssw_ver + '_', '')
    for noise in ('IDEAL_', 'STARTUP_', '_FastSim'):
        tag = tag.replace(noise, '')

    # The condition is everything before the '_' that precedes the first 'v'.
    # find() returns -1 when there is no 'v'; slicing with that value would
    # silently chop two trailing characters, so treat it as "no condition".
    v_pos = tag.find('v')
    if v_pos < 1:
        cond = ''
    else:
        cond = tag[:v_pos - 1]

    # cond[:1] (rather than cond[0]) keeps the empty string safe.
    if cond[:1] == '3' and len(cond) <= 3:
        print("good condition for %s  :  %s (len: %d )" % (ds, cond, len(cond)))
        return cond

    print("problem extracting condition for %s  :  %s (len: %d )" % (ds, cond, len(cond)))
    if '31X' in cond:
        cond = '31X'
    elif '30X' in cond:
        cond = '30X'
    else:
        print("skipping %s" % (cond,))
        return 0
    print("condition found: %s" % (cond,))
    return cond
00063
00064
def make_dbs_list(dbslf):
    """Write the RelVal datasets of the current release to file *dbslf*.

    Does nothing when the module-level is_dbs flag is false.  Otherwise every
    path returned by api.listDatasetPaths() that contains 'RelVal', the
    current release name (module-level cmssw_ver) and '/GEN-SIM' is written
    to *dbslf*, one per line.

    All three markers are required.  The previous 'or' combination accepted
    any dataset matching just one of them, which listed datasets of other
    releases and non-RelVal samples as well.
    """
    if not is_dbs:
        return
    flis = open(dbslf, 'w')
    try:
        for ads in api.listDatasetPaths():
            if 'RelVal' in ads and cmssw_ver in ads and '/GEN-SIM' in ads:
                flis.write(ads + '\n')
    finally:
        # Close the list file even if the DBS query fails part-way.
        flis.close()
    print('Generated dataset list %s from dbs.' % (dbslf,))
00077
00078
00079
00080
00081
def read_ds_file():
    """Append the dataset paths listed in the file named by dsfile to dsetpaths.

    Both dsfile and dsetpaths are module-level names.  The file holds one
    dataset path per line.  Lines containing '#' are reported and skipped.
    Surrounding whitespace (including a trailing '\\r') is removed and blank
    lines are ignored, so that no empty dataset path ends up in the list.

    Exits with status 30 if the file does not exist.
    """
    if not os.path.exists(dsfile):
        print("problem reading file %s" % (dsfile,))
        sys.exit(30)
    fin = open(dsfile, 'r')
    try:
        for raw_line in fin:
            entry = raw_line.strip()
            if not entry:
                # A blank line is not a dataset.
                continue
            if '#' in entry:
                print('skipping: %s' % (entry,))
            else:
                dsetpaths.append(entry)
    finally:
        fin.close()
    print('Using data set list in  %s' % (dsfile,))
00095
def check_dset():
    """Validate the module-level dsetpaths list in place and report on it.

    Steps:
      1. drop every dataset whose path does not contain the current release
         name (module-level cmssw_ver);
      2. drop every dataset for which get_cond_from_dsetpath() returns 0;
      3. exit with status 12 if nothing is left;
      4. print the number of events of each remaining dataset.

    Both filtering passes iterate over a snapshot of the list.  Removing
    items from a list while iterating over that same list makes the loop
    skip the element following each removed one, which previously let
    datasets of other releases slip through step 1.
    """
    for s in list(dsetpaths):
        if s.find(cmssw_ver) == -1:
            dsetpaths.remove(s)
            print('Inconsistency found with datset and cmssw version ( %s ): \t  %s  has been removed.'
                  % (cmssw_ver, s))

    for s in list(dsetpaths):
        if get_cond_from_dsetpath(s) == 0:
            dsetpaths.remove(s)

    nSamples = len(dsetpaths)
    if nSamples == 0:
        print("Empty input list, exit.")
        sys.exit(12)
    print('Processing %s data sets.' % (nSamples,))

    nSampleEvts = [check_nevts_dset(s) for s in dsetpaths]
    print('number of events per dataset: %s' % (nSampleEvts,))
00120
def find_dqmref(ds):
    """Fetch the reference DQM file of the previous release for dataset *ds*.

    Returns 'NONE' immediately when the module-level do_reference flag is
    false.  Otherwise the previous release name is derived from cmssw_ver by
    decrementing its trailing number, the matching file is copied from the
    castor reference directory into the current directory with rfcp, and the
    local file name is returned ('NONE' if no such file ended up locally).

    Only the trailing number of the release name is decremented.  The
    previous code replaced every occurrence of the last digit inside the
    name (CMSSW_3_1_0_pre3 became CMSSW_2_1_0_pre2).  A release name that
    does not end in a positive number has no derivable predecessor and
    yields 'NONE'.
    """
    if not do_reference:
        return 'NONE'

    stem = cmssw_ver.rstrip('0123456789')
    serial = cmssw_ver[len(stem):]
    if not serial or int(serial) < 1:
        print("Cannot derive a previous release from %s" % (cmssw_ver,))
        return 'NONE'
    ref_ver = stem + str(int(serial) - 1)

    ref_dir = "/castor/cern.ch/user/n/nuno/relval/harvest/" + ref_ver + "/"
    ref_dsf = make_dqmname(ds.replace(cmssw_ver, ref_ver))

    # Shell pipeline selecting the root file whose name starts like the
    # expected reference name (its last 25 characters are ignored).
    gls = " | grep root | grep "
    gls += ref_dsf[:-25]
    gls += "| awk '{print $9}' "

    # Copy the matching file from castor into the current directory.
    command = "rfcp " + ref_dir + "`rfdir " + ref_dir + gls + "` ."
    os.system(command)

    # Find out which local file (if any) the copy produced.
    tmpfile = "ref.txt"
    command = "ls -rtl *" + gls + " > " + tmpfile
    os.system(command)

    the_ref = 'NONE'
    if os.path.exists(tmpfile):
        fin = open(tmpfile, 'r')
        try:
            ref = fin.readline().replace('\n', '')
        finally:
            fin.close()
        # An empty line means nothing matched; os.path.exists('') is False.
        if os.path.exists(ref):
            the_ref = ref
    print("Found reference file: %s" % (the_ref,))
    return the_ref
00154
def create_harvest(ds):
    """Generate the harvesting configuration file for dataset *ds*.

    Runs cmsDriver.py with options adapted to the dataset (conditions tag,
    IDEAL vs STARTUP, FastSim, PileUp), renames its output to the name given
    by make_harv_name(ds) and appends the DQM workflow name, the input file
    list (only when is_dbs is true) and, if find_dqmref() finds one, the
    reference file settings.

    Exits with status 50 if no conditions tag can be extracted and with
    status 40 if cmsDriver did not produce the expected file.
    """
    cond = get_cond_from_dsetpath(ds)
    if cond == 0:
        print('unexpected problem with conditions')
        sys.exit(50)

    driver_cmd = "cmsDriver.py harvest -s HARVESTING:validationHarvesting --mc --conditions FrontierConditions_GlobalTag,STARTUP_30X::All --harvesting AtJobEnd --no_exec -n -1"
    driver_cmd = driver_cmd.replace('30X', cond)
    driver_out = "harvest_HARVESTING_STARTUP.py"
    if 'IDEAL' in ds:
        driver_cmd = driver_cmd.replace('STARTUP', 'IDEAL')
        driver_out = driver_out.replace('STARTUP', 'IDEAL')
    # Applied in this order, exactly like the two separate checks they replace.
    for marker, suffix in (('FastSim', 'FS'), ('PileUp', 'PU')):
        if marker in ds:
            driver_cmd = driver_cmd.replace('validationHarvesting',
                                            'validationHarvesting' + suffix)

    # Remove a stale output first so the existence check below is meaningful.
    if os.path.exists(driver_out):
        os.system("rm " + driver_out)
    print("executing cmsdriver command:\n\t%s" % (driver_cmd,))
    os.system(driver_cmd)
    if not os.path.exists(driver_out):
        print('problem with cmsdriver file name')
        sys.exit(40)
    os.system("touch " + driver_out)

    harvest_py = make_harv_name(ds)
    os.system('mv ' + driver_out + " " + harvest_py)

    out = open(harvest_py, 'a')
    out.write("\n\n##additions to cmsDriver output \n")
    out.write("process.dqmSaver.workflow = '" + ds + "'\n")
    if is_dbs:
        out.write("process.source.fileNames = cms.untracked.vstring(\n")
        for record in api.listFiles(path=ds):
            out.write(" '%s',\n" % record['LogicalFileName'])
        out.write(")\n")

    reference = find_dqmref(ds)
    if reference != 'NONE':
        out.write("process.DQMStore.referenceFileName = '" + reference + "'\n")
        out.write("process.dqmSaver.referenceHandling = 'all'\n")

    out.close()
00198
def create_mcrab(set, fcrab, fout):
    """Write the multicrab configuration file *fout*.

    The file starts with a [MULTICRAB] section pointing at crab
    configuration *fcrab* and a [COMMON] section (all events, one job);
    append_sample_mcrab() then adds one section per dataset path in *set*.
    """
    total_events = -1
    job_count = 1
    out = open(fout, 'w')
    header_parts = (
        '[MULTICRAB]',
        '\ncfg=' + fcrab,
        '\n\n[COMMON]',
        '\nCMSSW.total_number_of_events=' + str(total_events),
        '\nCMSSW.number_of_jobs=' + str(job_count),
    )
    for part in header_parts:
        out.write(part)
    for dataset in set:
        append_sample_mcrab(dataset, out)
    out.close()
00211
def make_harv_name(dset):
    """Return the harvesting config file name for *dset*: 'harvest_<sample>.py'.

    <sample> is the value of get_name_from_dsetpath(dset).
    """
    return 'harvest_%s.py' % (get_name_from_dsetpath(dset),)
00214
def append_sample_mcrab(dsetp, fout):
    """Append the multicrab section for dataset path *dsetp* to open file *fout*.

    The section is named after the sample and sets the pset, dataset path
    and output file; when find_dqmref() returns a reference file it is added
    as an additional input file.  Exits with status 17 if the harvesting
    config file for the dataset does not exist.
    """
    dqm_file = make_dqmname(dsetp)
    section = get_name_from_dsetpath(dsetp)
    harvest_py = make_harv_name(dsetp)
    if not os.path.exists(harvest_py):
        print('problem creating multicrab, file %s does not exist' % (harvest_py,))
        sys.exit(17)

    section_lines = (
        '\n\n[' + section + ']',
        '\nCMSSW.pset=' + harvest_py,
        '\nCMSSW.datasetpath=' + dsetp,
        '\nCMSSW.output_file=' + dqm_file,
    )
    for text in section_lines:
        fout.write(text)

    reference = find_dqmref(dsetp)
    if reference != 'NONE':
        fout.write('\nUSER.additional_input_files=' + reference)
00230
def create_crab(ds):
    """Write the crab configuration file (module-level f_crab) for dataset *ds*.

    The file consists of the crab_block template followed by the pset,
    datasetpath and output_file settings, each on its own line.  The
    datasetpath entry used to be written without a leading newline, which
    glued it onto the end of the pset line.
    """
    dqmout = make_dqmname(ds)
    hf = make_harv_name(ds)
    out = open(f_crab, 'w')
    try:
        out.write(crab_block)
        out.write('\npset=' + hf)
        out.write('\ndatasetpath=' + ds)
        out.write('\noutput_file=' + dqmout)
    finally:
        out.close()
00240
# Fixed part of the crab configuration file.  create_crab() writes this text
# first and then appends the pset, datasetpath and output_file settings, so
# those lines follow the [CMSSW] section, which is the last one below.
crab_block = """
[CRAB]
jobtype = cmssw
scheduler = glite

[EDG]
remove_default_blacklist=1
rb = CERN

[USER]
return_data = 1
#copy_data = 1
#storage_element=srm-cms.cern.ch
#storage_path=/srm/managerv2?SFN=/castor/cern.ch
#user_remote_dir=/user/n/nuno/test
publish_data=0
thresholdLevel=70
eMail=nuno@cern.ch

[CMSSW]
total_number_of_events=-1
show_prod = 1
number_of_jobs=1
"""
00265
00266
00267
# ---- command-line handling -------------------------------------------------
# input_type records where the dataset list comes from:
#   'none' - no argument given, datasets are looked up in DBS
#   'file' - the argument names an existing file listing dataset paths
#   'ds'   - the argument is itself a dataset path
input_type = ''
argin = ''
dsfile = ''
# When False, find_dqmref() returns 'NONE' without fetching a reference file.
do_reference = False

if len(sys.argv) > 2:
    print_def()
    sys.exit(10)
elif len(sys.argv) == 1:
    print("Will search for available datasets.")
    input_type = 'none'
elif len(sys.argv) == 2:
    argin = sys.argv[1]
    if os.path.exists(argin):
        dsfile = argin
        input_type = 'file'
    # str.find() returns -1 when the substring is absent, and -1 is truthy,
    # so both results must be compared against -1 explicitly.
    elif argin.find('CMSSW') != -1 and argin.find('RelVal') != -1:
        print('Using specified data set %s' % (argin,))
        input_type = 'ds'
    else:
        print('Invalid argument: process list, dataset or file %s does not exist.'
              % (argin,))
        sys.exit(11)
00291
00292
# ---- DBS setup ---------------------------------------------------------------
# The DBS client modules are imported, and the api object created, only when
# DBSCMD_HOME is set; all DBS-dependent code is guarded by is_dbs.
is_dbs = check_dbs()
if is_dbs:
    print("dbs home: %s" % (os.getenv('DBSCMD_HOME'),))
    from DBSAPI.dbsApi import DbsApi
    from DBSAPI.dbsException import *
    from DBSAPI.dbsApiException import *
    from DBSAPI.dbsOptions import DbsOptionParser
    optManager = DbsOptionParser()
    (opts, args) = optManager.getOpt()

    # The parsed args are discarded; only the service url is passed on.
    args = {'url': "https://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"}
    api = DbsApi(args)
else:
    print("dbs not set!")
00308
00309
# ---- release check -----------------------------------------------------------
# The release name comes from the CMSSW_VERSION environment variable; without
# it the script prints setup instructions and exits with status 12.
cmssw_ver = os.getenv('CMSSW_VERSION', 'NOTSET')
if cmssw_ver != 'NOTSET':
    print("Using cmssw version: %s" % (cmssw_ver,))
else:
    print("""
cmssw not set!
example:
scramv1 p CMSSW CMSSW_3_1_0_pre5
cd CMSSW_3_1_0_pre5/src
eval `scramv1 runtime -sh`
cd -
""")
    sys.exit(12)
00323
00324
00325
# ---- collect the dataset paths to process -----------------------------------
dsetpaths = []

if input_type == 'ds':
    # A single dataset given on the command line.
    dsetpaths.append(argin)
elif input_type == 'file':
    read_ds_file()
elif input_type == 'none':
    if is_dbs:
        # No argument: build the list from DBS, then read it back.
        dsfile = cmssw_ver + "_dbslist.txt"
        make_dbs_list(dsfile)
        read_ds_file()
    else:
        print("no dataset specified, and dbs isn't set...")
        print_def()
        sys.exit(13)
00341
00342
00343
# ---- main flow ----------------------------------------------------------------
# Validate the dataset list (exits if nothing is left).
check_dset()

# Record the datasets that will be processed.
print('data sets: %s' % (dsetpaths,))
dslproc = open("dset_processed.txt", 'w')
dslproc.writelines([s + '\n' for s in dsetpaths])
dslproc.close()

# Harvesting config for the first dataset.
create_harvest(dsetpaths[0])

# crab.cfg is written for the first dataset.
f_crab = 'crab.cfg'
create_crab(dsetpaths[0])

# One harvesting config per dataset (the first one is generated again).
for s in dsetpaths:
    create_harvest(s)

# multicrab.cfg with one section per dataset.
f_multi_crab = 'multicrab.cfg'
create_mcrab(dsetpaths, f_crab, f_multi_crab)

# Summary of what was produced.
harvfilelist = [make_harv_name(s) for s in dsetpaths]

print('\nCreated:\n\t %(pwd)s/%(cf)s \n\t %(pwd)s/%(mc)s'
      % {'pwd': os.environ["PWD"], 'cf': f_crab, 'mc': f_multi_crab})
print("\tIndividual harvest py's:\n\t%s" % (harvfilelist,))

print("Done.")