CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_5_3_1/src/HLTriggerOffline/Common/relval/harvestRelVal.py

Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 
00003 import sys
00004 import os
00005 
00006 """
00007 arguments [<list-of-processes>]
00008 description:
00009 creates crab.cfg, multicrab.cfg, harvest_*.py
00010 if dbs is set:
00011  prints number of events found in dataset
00012  if no argument is provided, looks for all available datasets for the release
00013  user can edit multicrab and confirm process list as needed
00014 nuno@cern.ch 09.04
00015 """
00016 
def print_def():
    """Print the command-line usage line followed by example invocations."""
    usage = " ".join(["Usage:", sys.argv[0], "[list_of_processes]"])
    examples = [
        "harvestRelVal.py",
        "harvestRelVal.py /RelValTTbar/CMSSW_3_1_0_pre4_STARTUP_30X_v1/GEN-SIM-RECO",
        "harvestRelVal.py <dataset_list.txt>",
    ]
    for line in [usage, "Examples:"] + examples:
        print(line)
00023 
def check_dbs():
    """Tell whether the DBS environment is configured.

    Returns 1 when the DBSCMD_HOME environment variable holds anything other
    than the literal 'NOTSET' (which is also the fallback used when the
    variable is absent), and 0 otherwise.
    """
    dbs_home = os.getenv('DBSCMD_HOME', 'NOTSET')
    if dbs_home != 'NOTSET':
        return 1
    return 0
00028 
def check_nevts_dset(dset):
    """Return the number of events DBS reports for dataset *dset*.

    The count is the sum of the 'NumberOfEvents' entry of every record
    returned by api.listFiles for the dataset path. When the module-level
    is_dbs flag is false no query is made and -1 is returned.
    """
    if not is_dbs:
        return -1
    records = api.listFiles(path=str(dset))
    return sum(record['NumberOfEvents'] for record in records)
00038 
def make_dqmname(s):
    """Return the DQM ROOT file name for dataset path *s*.

    Every '/' in *s* becomes '__'; the result is wrapped between the fixed
    'DQM_V0001_R000000001' prefix and the '.root' suffix.
    """
    flattened = '__'.join(s.split('/'))
    return ''.join(['DQM_V0001_R000000001', flattened, '.root'])
00041 
def get_name_from_dsetpath(ds):
    """Return the sample name encoded in dataset path *ds*.

    The name is the field at index 1 of ds.split('/') with every 'RelVal'
    substring removed, e.g. '/RelValTTbar/...' gives 'TTbar'. A path without
    any '/' raises IndexError.
    """
    primary = ds.split('/')[1]
    return primary.replace('RelVal', '')
00046 
def get_cond_from_dsetpath(ds) :
    """Extract the conditions tag (such as '30X') from dataset path *ds*.

    Works on the field at index 2 of ds.split('/'), after removing the
    current release prefix (cmssw_ver + '_') and the 'IDEAL_', 'STARTUP_'
    and '_FastSim' markers. Returns the tag string, or 0 when no usable tag
    is found. Progress and problems are reported on stdout.

    NOTE(review): if the stripped field contains no 'v', find() returns -1
    and the slice below drops the last two characters instead; an empty
    result makes cb[0] raise IndexError. Confirm inputs always carry a
    '_v<N>' suffix.
    """
    ca = ds.split('/')[2].replace(cmssw_ver+'_','').replace('IDEAL_','').replace('STARTUP_','').replace('_FastSim','')
    # Keep what precedes the first 'v', minus one more character
    # (for '30X_v1' that is the '_' separator, leaving '30X').
    cb = ca[:ca.find('v')-1]
    # Accept the candidate only if it starts with '3' and is at most 3 chars.
    if cb[0].find('3') == -1 or len(cb) > 3:
        print "problem extracting condition for", ds, " : ", cb, '(len:',len(cb),')'  
        # Fall back to looking for one of the two known tags anywhere in it.
        if cb.find('31X') != -1:
            cb = '31X'
        elif cb.find('30X') != -1:
            cb = '30X'
        else:
            print "skipping", cb
            # 0 is the "no condition" sentinel that callers compare against.
            return 0
        print "condition found:", cb
    else :
        print "good condition for", ds, " : ", cb, '(len:',len(cb),')'      
    return cb
00063 
00064 
def make_dbs_list(dbslf) :
    """Write a list of dataset paths known to DBS into the file *dbslf*.

    Returns immediately when the module-level is_dbs flag is false.
    Otherwise each path from api.listDatasetPaths() that contains 'RelVal',
    the current cmssw_ver, or '/GEN-SIM' is written on its own line.
    """
    if not is_dbs:
        return
    # NOTE(review): a path is kept as soon as ANY one marker matches. A
    # stricter variant requiring "/GEN-SIM-RECO" as well was left commented
    # out in the original condition -- confirm that OR is what is wanted.
    markers = ('RelVal', cmssw_ver, "/GEN-SIM")
    flis = open(dbslf, 'w')
    for ads in api.listDatasetPaths():
        for marker in markers:
            if ads.find(marker) != -1:
                flis.write(ads + '\n')
                break
    flis.close()
    print(' '.join(['Generated dataset list', dbslf, 'from dbs.']))
    # Command-line equivalents kept for reference, example:
    #dbs lsd --path=/RelVal*/CMSSW_3_1_0_pre5*/GEN-SIM-RECO --url=https://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet > mylist.txt
    #dbslsd = "dbs lsd --path=/RelVal*/" + cmssw_ver + "*/GEN-SIM-RECO --url=https://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    #os.system( '`' + dbslsd + ' > ' + dbslf + '`')
00081 
def read_ds_file() :
    """Load dataset paths from the file named by the global dsfile.

    Each line, with its newline removed, is appended to the global
    dsetpaths list; lines containing '#' anywhere are reported and skipped.
    Exits with status 30 when the file does not exist.
    """
    if not os.path.exists(dsfile):
        print(' '.join(["problem reading file", dsfile]))
        sys.exit(30)
    fin = open(dsfile, 'r')
    for raw in fin.readlines():
        entry = raw.replace('\n', '')
        if '#' in entry:
            print(' '.join(['skipping:', entry]))
            continue
        dsetpaths.append(entry)
    fin.close()
    print(' '.join(['Using data set list in ', dsfile]))
00095 
def check_dset() :
    """Prune the global dsetpaths list in place and report what remains.

    Steps, in order:
      1. drop every path that does not contain the current cmssw_ver;
      2. drop every path for which get_cond_from_dsetpath() returns 0;
      3. exit with status 12 if nothing is left;
      4. print the event count of each remaining dataset
         (check_nevts_dset yields -1 per dataset when DBS is unavailable).
    """
    #check cmssw consistency
    # Iterate over a snapshot: removing from the list that is being iterated
    # shifts the remaining items and silently skips the one that follows
    # each removal, letting inconsistent datasets through.
    for s in dsetpaths[:]:
        if s.find(cmssw_ver) == -1:
            dsetpaths.remove(s)
            print(' '.join(['Inconsistency found with datset and cmssw version (',
                            cmssw_ver, '): \t ', s, ' has been removed.']))
    #check conditions from dsetname
    for s in dsetpaths[:]:
        if get_cond_from_dsetpath(s) == 0:
            dsetpaths.remove(s)
    #check list size
    nSamples = len(dsetpaths)
    if nSamples == 0:
        print("Empty input list, exit.")
        sys.exit(12)
    print(' '.join(['Processing', str(nSamples), 'data sets.']))
    #check event numbers
    nSampleEvts = [check_nevts_dset(s) for s in dsetpaths]
    print(' '.join(['number of events per dataset:', str(nSampleEvts)]))
00120 
def find_dqmref(ds) :
    """Fetch the reference DQM file for dataset *ds* and return its name.

    Returns 'NONE' straight away when the global do_reference flag is
    false. Otherwise the reference release is derived from cmssw_ver by
    decrementing its final digit, the matching file is copied from the
    castor reference directory into the current directory with rfcp, and
    the first matching local file name is returned ('NONE' if there is
    none). Raises ValueError if cmssw_ver does not end in a digit.
    """
    if not do_reference:
        return 'NONE'
    # Rewrite only the trailing character. str.replace() would change every
    # occurrence of that digit (CMSSW_3_1_1 would become CMSSW_3_0_0).
    # NOTE(review): a trailing '0' yields '-1' here, as before -- confirm
    # how the first pre-release of a cycle should be handled.
    cp = cmssw_ver[-1:]
    ref_ver = cmssw_ver[:-1] + str(int(cp) - 1)
    ref_dir = "/castor/cern.ch/user/n/nuno/relval/harvest/" + ref_ver + "/"
    ref_dsf = make_dqmname(ds.replace(cmssw_ver, ref_ver))
    # Match on the name with its last 25 characters cut off, to accept the
    # _1.root suffix crab appends and to skip versions/conditions.
    gls = " | grep root | grep " + ref_dsf[:-25] + "| awk '{print $9}' "
    os.system("rfcp " + ref_dir + "`rfdir " + ref_dir + gls + "` .")
    # List the local candidates into a scratch file and read the first one.
    tmpfile = "ref.txt"
    os.system("ls -rtl *" + gls + " > " + tmpfile)
    the_ref = 'NONE'
    if os.path.exists(tmpfile):
        fin = open(tmpfile, 'r')
        try:
            ref = fin.readline().replace('\n', '')
        finally:
            fin.close()
        if os.path.exists(ref):
            the_ref = ref
    print(' '.join(["Found reference file:", the_ref]))
    return the_ref
00154 
def create_harvest(ds) :
    """Generate the harvesting configuration file for dataset path *ds*.

    Builds a cmsDriver.py command adapted to the dataset (conditions tag,
    IDEAL vs STARTUP, FastSim / PileUp harvesting sequence), runs it, renames
    its output to make_harv_name(ds) and appends dataset-specific settings.
    Exits with status 50 when no conditions tag can be extracted and with
    status 40 when the expected cmsDriver output file is missing.
    """
    # Template command; the substitutions below specialise it for ds.
    raw_cmsdriver = "cmsDriver.py harvest -s HARVESTING:validationHarvesting --mc  --conditions FrontierConditions_GlobalTag,STARTUP_30X::All --harvesting AtJobEnd --no_exec -n -1"
    cmsdriver = raw_cmsdriver
    cond = get_cond_from_dsetpath(ds)
    if cond == 0 :
        print 'unexpected problem with conditions'
        sys.exit(50)
    cmsdriver = cmsdriver.replace('30X',cond)
    # File name the cmsDriver call is expected to produce.
    fin_name="harvest_HARVESTING_STARTUP.py"
    if ds.find('IDEAL') != -1 :
        cmsdriver = cmsdriver.replace('STARTUP','IDEAL')
        fin_name = fin_name.replace('STARTUP','IDEAL')
    if ds.find('FastSim') != -1:
        cmsdriver = cmsdriver.replace('validationHarvesting','validationHarvestingFS')
    # NOTE(review): for a path containing both 'FastSim' and 'PileUp' this
    # second replace acts on the already-modified name and produces
    # 'validationHarvestingPUFS' -- confirm that is the intended sequence.
    if ds.find('PileUp') != -1:
        cmsdriver = cmsdriver.replace('validationHarvesting','validationHarvestingPU')

    #print "=>", cmsdriver, " fs?", ds.find('FastSim')
    # Remove a stale output first so the existence check below is meaningful.
    if os.path.exists(fin_name) : 
        os.system("rm " + fin_name)
    print "executing cmsdriver command:\n\t", cmsdriver
    os.system(cmsdriver)
    if not os.path.exists(fin_name) : 
        print 'problem with cmsdriver file name'
        sys.exit(40)
    os.system("touch " + fin_name)
    hf = make_harv_name(ds)
    os.system('mv ' + fin_name + " " + hf)
    # Append the per-dataset settings after the generated configuration.
    out = open(hf, 'a')
    out.write("\n\n##additions to cmsDriver output \n")
    out.write("process.dqmSaver.workflow = '" + ds + "'\n")
    if is_dbs :
        # List every logical file name DBS reports for the dataset.
        out.write("process.source.fileNames = cms.untracked.vstring(\n")
        for afile in api.listFiles(path=ds):
            out.write("  '%s',\n" % afile['LogicalFileName'])
        out.write(")\n")

    # find_dqmref returns the string 'NONE' when there is no reference file.
    dqmref = find_dqmref(ds);
    if not dqmref == 'NONE' : 
        out.write("process.DQMStore.referenceFileName = '" + dqmref + "'\n")
        out.write("process.dqmSaver.referenceHandling = 'all'\n")

    out.close()
00198 
def create_mcrab(set, fcrab, fout):
    """Write the multicrab configuration file *fout*.

    The file starts with a [MULTICRAB] section pointing at the crab
    configuration *fcrab* and a [COMMON] section fixing the event and job
    counts, followed by one section per dataset path in *set* (written by
    append_sample_mcrab).
    """
    nevt = -1
    njob = 1
    out = open(fout, 'w')
    header = [
        '[MULTICRAB]',
        '\ncfg=' + fcrab,
        '\n\n[COMMON]',
        '\nCMSSW.total_number_of_events=' + str(nevt),
        '\nCMSSW.number_of_jobs=' + str(njob),
    ]
    out.write(''.join(header))
    for dsetp in set:
        append_sample_mcrab(dsetp, out)
    out.close()
00211 
def make_harv_name(dset) :
    """Return the harvesting config file name for *dset*: 'harvest_<sample>.py'."""
    sample = get_name_from_dsetpath(dset)
    return ''.join(['harvest_', sample, '.py'])
00214 
def append_sample_mcrab(dsetp, fout):
    """Append the multicrab section for dataset path *dsetp* to *fout*.

    *fout* is an already open, writable file object. The section names the
    harvesting pset, the dataset path and the DQM output file, plus the
    reference file as additional input when find_dqmref finds one. Exits
    with status 17 if the harvesting pset file does not exist.
    """
    dqm = make_dqmname(dsetp)
    sample = get_name_from_dsetpath(dsetp)
    pset = make_harv_name(dsetp)
    if not os.path.exists(pset):
        print(' '.join(['problem creating multicrab, file', pset, 'does not exist']))
        sys.exit(17)
    section = [
        '\n\n[' + sample + ']',
        '\nCMSSW.pset=' + pset,
        '\nCMSSW.datasetpath=' + dsetp,
        '\nCMSSW.output_file=' + dqm,
    ]
    for entry in section:
        fout.write(entry)

    dqmref = find_dqmref(dsetp)
    if dqmref != 'NONE':
        fout.write('\nUSER.additional_input_files=' + dqmref)
00230 
def create_crab(ds) :
    """Write the crab configuration for dataset path *ds* to the file f_crab.

    The file consists of the fixed crab_block text followed by the pset,
    datasetpath and output_file keys, which therefore fall in the [CMSSW]
    section that ends crab_block.
    """
    dqmout = make_dqmname(ds)
    hf = make_harv_name(ds)
    out = open(f_crab, 'w')
    try:
        out.write(crab_block)
        out.write('\npset=' + hf)
        # Every key goes on its own line: without the leading newline
        # 'datasetpath=' was glued onto the end of the pset line.
        out.write('\ndatasetpath=' + ds)
        out.write('\noutput_file=' + dqmout)
    finally:
        out.close()
00240 
# Fixed leading part of the crab configuration. create_crab() writes this text
# first and then appends the pset / datasetpath / output_file keys, which is
# why the [CMSSW] section is kept last.
crab_block = """
[CRAB]
jobtype = cmssw
scheduler = glite

[EDG]
remove_default_blacklist=1
rb = CERN

[USER]
return_data = 1
#copy_data = 1
#storage_element=srm-cms.cern.ch
#storage_path=/srm/managerv2?SFN=/castor/cern.ch
#user_remote_dir=/user/n/nuno/test
publish_data=0
thresholdLevel=70
eMail=nuno@cern.ch

[CMSSW]
total_number_of_events=-1
show_prod = 1
number_of_jobs=1
"""
00265 
00266 
#Check arg,settings
# input_type ends up as one of:
#   'none' - no argument given, datasets will be looked up in DBS
#   'file' - the argument is an existing file listing dataset paths
#   'ds'   - the argument is itself a dataset path
input_type = ''
argin = ''
dsfile = ''
do_reference = False
if len(sys.argv) > 2:
    print_def()
    sys.exit(10)
elif len(sys.argv) == 1:
    print("Will search for available datasets.")
    input_type = 'none'
elif len(sys.argv) == 2:
    argin = sys.argv[1]
    if os.path.exists(argin):
        dsfile = argin
        input_type = 'file'
    # str.find() returns -1 on a miss, which is truthy, and 0 for a match at
    # the start, which is falsy -- both results must be compared with -1.
    elif argin.find('CMSSW') != -1 and argin.find('RelVal') != -1:
        print(' '.join(['Using specified data set', argin]))
        input_type = 'ds'
    else:
        print(' '.join(['Invalid argument: process list, dataset or file',
                        argin, 'does not exist.']))
        sys.exit(11)
00291 
#dbs
# is_dbs is the flag the functions in this file test before querying DBS.
is_dbs = check_dbs()
if not is_dbs:
    print "dbs not set!"
else:
    print "dbs home:", os.getenv('DBSCMD_HOME')
    # The DBS client modules are imported only on this branch, i.e. only
    # when check_dbs() reported the environment as configured.
    from DBSAPI.dbsApi import DbsApi
    from DBSAPI.dbsException import *
    from DBSAPI.dbsApiException import *
    from DBSAPI.dbsOptions import DbsOptionParser
    optManager  = DbsOptionParser()
    # The parsed (opts, args) are not used afterwards: args is rebound to a
    # fresh dict just below and opts only appears in the disabled call.
    (opts,args) = optManager.getOpt()
    #api = DbsApi(opts.__dict__)
    args={}
    args['url']= "https://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    # api is defined on this branch only; code using it checks is_dbs first.
    api = DbsApi(args)
00308 
#cmssw
# The release name comes from the CMSSW_VERSION environment variable; without
# it the script prints setup instructions and exits with status 12.
cmssw_ver = os.getenv('CMSSW_VERSION', 'NOTSET')
if cmssw_ver == 'NOTSET':
    print("""
    cmssw not set!
    example:
      scramv1 p CMSSW CMSSW_3_1_0_pre5
      cd CMSSW_3_1_0_pre5/src
      eval `scramv1 runtime -sh`
      cd -
    """)
    sys.exit(12)
print(' '.join(["Using cmssw version:", cmssw_ver]))
00323     
00324 
#read datasets
# dsetpaths is the working list of dataset paths; how it is filled depends on
# the input_type chosen while checking the command-line argument.
dsetpaths = []

if input_type == 'file':
    read_ds_file()
elif input_type == 'ds':
    dsetpaths.append(argin)
elif input_type == 'none':
    # Without an argument the list can only come from DBS.
    if not is_dbs:
        print("no dataset specified, and dbs isn't set...")
        print_def()
        sys.exit(13)
    dsfile = cmssw_ver + "_dbslist.txt"
    make_dbs_list(dsfile)
    read_ds_file()
00341 
00342 
#check dataset list: remove incompatible dsets
check_dset()

#print dataset list to be processed, and keep a copy on disk
print(' '.join(['data sets:', str(dsetpaths)]))
dslproc = open("dset_processed.txt", 'w')
dslproc.writelines([s + '\n' for s in dsetpaths])
dslproc.close()


##Create harvest.py template
create_harvest(dsetpaths[0])

##Create crab.cfg template
f_crab = 'crab.cfg'
create_crab(dsetpaths[0])

##Create harvest_n.py for individual datasets
for s in dsetpaths:
    create_harvest(s)

##Create multicrab.cfg
f_multi_crab = 'multicrab.cfg'
create_mcrab(dsetpaths, f_crab, f_multi_crab)

##Print what has been created

harvfilelist = [make_harv_name(s) for s in dsetpaths]

print('\nCreated:\n\t %(pwd)s/%(cf)s \n\t %(pwd)s/%(mc)s'
      % {'pwd' : os.environ["PWD"], 'cf' : f_crab, 'mc' : f_multi_crab})
# The heading ends in a tab, so the list follows it with no extra space.
print("\tIndividual harvest py's:\n\t" + str(harvfilelist))

print("Done.")