CMS 3D CMS Logo

harvestRelVal.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 import sys
5 import os
6 
7 """
8 arguments [<list-of-processes>]
9 description:
10 creates crab.cfg, multicrab.cfg, harvest_*.py
11 if dbs is set:
12  prints number of events found in dataset
13  if no argument is provided looks for all available datasets for release
14  user can edit multicrab and confirm process list as needed
15 nuno@cern.ch 09.04
16 """
17 
def print_def():
    """Print the command-line usage line followed by three example calls."""
    examples = (
        "harvestRelVal.py",
        "harvestRelVal.py /RelValTTbar/CMSSW_3_1_0_pre4_STARTUP_30X_v1/GEN-SIM-RECO",
        "harvestRelVal.py <dataset_list.txt>",
    )
    print("Usage:", sys.argv[0], "[list_of_processes]")
    print("Examples:")
    for example in examples:
        print(example)
24 
def check_dbs():
    """Tell whether the DBS environment is configured.

    Returns 1 when the DBSCMD_HOME environment variable is set to anything
    other than the literal string 'NOTSET', and 0 otherwise.
    """
    dbs_home = os.getenv('DBSCMD_HOME', 'NOTSET')
    return 0 if dbs_home == 'NOTSET' else 1
29 
def check_nevts_dset(dset):
    """Return the total number of events DBS reports for a dataset.

    Adds up the 'NumberOfEvents' entry of every file record returned by
    api.listFiles for the dataset path. Returns -1 when the module-level
    is_dbs flag is false, i.e. no DBS query is possible.
    """
    if not is_dbs:
        return -1
    file_records = api.listFiles(path=str(dset))
    return sum(record['NumberOfEvents'] for record in file_records)
39 
def make_dqmname(s):
    """Build the DQM root-file name for a dataset path.

    Every '/' in the path becomes '__'; the result is prefixed with
    'DQM_V0001_R000000001' and suffixed with '.root'.
    """
    flattened = '__'.join(s.split('/'))
    return 'DQM_V0001_R000000001' + flattened + '.root'
42 
def get_name_from_dsetpath(ds):
    """Return the sample name encoded in a dataset path.

    The name is the first component of the '/'-separated path (the text
    between the first and second '/') with every 'RelVal' removed, e.g.
    '/RelValTTbar/<release>/GEN-SIM-RECO' gives 'TTbar'.

    Raises IndexError when the path contains no '/'.
    """
    primary = ds.split('/')[1]
    return primary.replace('RelVal', '')
47 
def get_cond_from_dsetpath(ds):
    """Extract the conditions tag (e.g. '30X', '31X') from a dataset path.

    Works on the second component of the '/'-separated path: the release
    name (module-level cmssw_ver) plus '_', 'IDEAL_', 'STARTUP_' and
    '_FastSim' are stripped, then everything from the character preceding
    the first 'v' onwards is cut off.

    Returns the tag string, or 0 when the remainder neither looks like a
    tag (starts with '3', at most three characters) nor contains '31X' or
    '30X'. Progress and problems are reported with print.
    """
    ca = (ds.split('/')[2]
          .replace(cmssw_ver + '_', '')
          .replace('IDEAL_', '')
          .replace('STARTUP_', '')
          .replace('_FastSim', ''))
    cb = ca[:ca.find('v') - 1]
    # startswith() also covers an empty remainder, which would make cb[0]
    # raise IndexError; such a path falls through to "skipping" below.
    if not cb.startswith('3') or len(cb) > 3:
        print("problem extracting condition for", ds, " : ", cb, '(len:',len(cb),')')
        if cb.find('31X') != -1:
            cb = '31X'
        elif cb.find('30X') != -1:
            cb = '30X'
        else:
            print("skipping", cb)
            return 0
        print("condition found:", cb)
    else:
        print("good condition for", ds, " : ", cb, '(len:',len(cb),')')
    return cb
64 
65 
def make_dbs_list(dbslf):
    """Write the RelVal dataset paths of the current release to a file.

    Queries api.listDatasetPaths() and keeps only the paths that contain
    'RelVal', the release name (module-level cmssw_ver) and '/GEN-SIM',
    one path per line in the file named dbslf. Does nothing when the
    module-level is_dbs flag is false.
    """
    if not is_dbs:
        return
    with open(dbslf, 'w') as flis:
        for ads in api.listDatasetPaths():
            # All three markers are required; combining them with 'or' would
            # list every GEN-SIM dataset of every release.
            if 'RelVal' in ads and cmssw_ver in ads and "/GEN-SIM" in ads:
                flis.write(ads + '\n')
    print('Generated dataset list', dbslf, 'from dbs.')
    # Equivalent query with the dbs command-line tool, for example:
    #   dbs lsd --path=/RelVal*/CMSSW_3_1_0_pre5*/GEN-SIM-RECO --url=http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet > mylist.txt
82 
def read_ds_file():
    """Load dataset paths from the file named by the module-level dsfile.

    Each line, with its newline removed, is appended to the module-level
    dsetpaths list unless it contains a '#' anywhere, in which case it is
    reported and skipped. Exits with status 30 when the file is missing.
    """
    if not os.path.exists(dsfile):
        print("problem reading file", dsfile)
        sys.exit(30)
    with open(dsfile, 'r') as listing:
        for raw_line in listing:
            entry = raw_line.replace('\n', '')
            if '#' in entry:
                print('skipping:', entry)
            else:
                dsetpaths.append(entry)
    print('Using data set list in ', dsfile)
96 
def check_dset():
    """Filter the module-level dsetpaths list in place and report on it.

    Steps, in order:
      1. drop every path that does not contain the release name
         (module-level cmssw_ver);
      2. drop every path for which get_cond_from_dsetpath returns 0;
      3. exit with status 12 when nothing is left;
      4. print the per-dataset event counts from check_nevts_dset.
    """
    # check cmssw consistency
    # Iterate over a copy: removing from the list being iterated would skip
    # the element that follows each removed one.
    for s in dsetpaths[:]:
        if s.find(cmssw_ver) == -1:
            dsetpaths.remove(s)
            print('Inconsistency found with datset and cmssw version (', cmssw_ver, '): \t ', s, ' has been removed.')
    # check conditions from dsetname (again on a copy, for the same reason)
    for s in dsetpaths[:]:
        cond = get_cond_from_dsetpath(s)
        if cond == 0:
            dsetpaths.remove(s)
    # check list size
    nSamples = len(dsetpaths)
    if nSamples == 0:
        print("Empty input list, exit.")
        sys.exit(12)
    else:
        print('Processing', nSamples, 'data sets.')
    # check event numbers
    nSampleEvts = [check_nevts_dset(s) for s in dsetpaths]
    print('number of events per dataset:', nSampleEvts)
121 
def find_dqmref(ds):
    """Fetch the reference DQM file of the previous release for a dataset.

    The reference release name is the module-level cmssw_ver with its last
    character (expected to be a digit) decreased by one. The matching file
    is copied from the castor reference directory with rfcp/rfdir, then
    looked up locally through 'ls' into ref.txt.

    Returns the local reference file name, or 'NONE' when the module-level
    do_reference flag is false or no such file exists after the copy.
    Raises ValueError when the last character of cmssw_ver is not a digit.
    """
    if not do_reference:
        return 'NONE'
    cp = cmssw_ver[-1:]
    ip = int(cp) - 1
    # Swap only the trailing character. str.replace would rewrite every
    # occurrence of it, e.g. CMSSW_3_1_0_pre3 -> CMSSW_2_1_0_pre2.
    ref_ver = cmssw_ver[:-1] + str(ip)
    ref_dir = "/castor/cern.ch/user/n/nuno/relval/harvest/" + ref_ver + "/"
    ref_dsf = make_dqmname(ds.replace(cmssw_ver, ref_ver))
    gls = " | grep root | grep "
    # Match on a truncated name to accept the _1.root suffix crab appends and
    # to skip the version/conditions part of the name.
    gls += ref_dsf[:-25]
    gls += "| awk '{print $9}' "
    command = "rfcp " + ref_dir + "`rfdir " + ref_dir + gls + "` ."
    os.system(command)
    tmpfile = "ref.txt"
    command = "ls -rtl *" + gls + " > " + tmpfile
    os.system(command)
    the_ref = 'NONE'
    if os.path.exists(tmpfile):
        with open(tmpfile, 'r') as fin:
            ref = fin.readline().replace('\n', '')
        if os.path.exists(ref):
            the_ref = ref
    print("Found reference file:", the_ref)
    return the_ref
155 
def create_harvest(ds):
    """Generate the harvesting configuration file for one dataset.

    Runs cmsDriver.py with the conditions and harvesting sequence derived
    from the dataset path, renames its output to make_harv_name(ds) and
    appends to it: the DQM workflow name, the input file list (only when
    the module-level is_dbs flag is true) and, when find_dqmref returns a
    reference file, the reference-handling setting.

    Exits with status 50 when no condition can be derived from the path,
    and with status 40 when cmsDriver did not produce the expected file.
    """
    cond = get_cond_from_dsetpath(ds)
    if cond == 0:
        print('unexpected problem with conditions')
        sys.exit(50)

    cmsdriver = "cmsDriver.py harvest -s HARVESTING:validationHarvesting --mc --conditions FrontierConditions_GlobalTag,STARTUP_30X::All --harvesting AtJobEnd --no_exec -n -1"
    cmsdriver = cmsdriver.replace('30X', cond)
    fin_name = "harvest_HARVESTING_STARTUP.py"
    if 'IDEAL' in ds:
        cmsdriver = cmsdriver.replace('STARTUP', 'IDEAL')
        fin_name = fin_name.replace('STARTUP', 'IDEAL')
    # Select the harvesting sequence variant named in the dataset path.
    variants = (
        ('FastSim', 'validationHarvestingFS'),
        ('PileUp', 'validationHarvestingPU'),
    )
    for marker, sequence in variants:
        if marker in ds:
            cmsdriver = cmsdriver.replace('validationHarvesting', sequence)

    # Remove a stale cmsDriver output so the existence check below is valid.
    if os.path.exists(fin_name):
        os.system("rm " + fin_name)
    print("executing cmsdriver command:\n\t", cmsdriver)
    os.system(cmsdriver)
    if not os.path.exists(fin_name):
        print('problem with cmsdriver file name')
        sys.exit(40)
    os.system("touch " + fin_name)
    hf = make_harv_name(ds)
    os.system('mv ' + fin_name + " " + hf)

    with open(hf, 'a') as out:
        out.write("\n\n##additions to cmsDriver output \n")
        out.write("process.dqmSaver.workflow = '" + ds + "'\n")
        if is_dbs:
            out.write("process.source.fileNames = cms.untracked.vstring(\n")
            for file_record in api.listFiles(path=ds):
                out.write(" '%s',\n" % file_record['LogicalFileName'])
            out.write(")\n")

        dqmref = find_dqmref(ds)
        if dqmref != 'NONE':
            out.write("process.dqmSaver.referenceHandling = 'all'\n")
198 
def create_mcrab(set, fcrab, fout):
    """Write the multicrab configuration file.

    set   -- iterable of dataset paths, one section is appended per entry
    fcrab -- name of the crab configuration file referenced by 'cfg='
    fout  -- name of the multicrab file to (over)write

    The [COMMON] section requests all events (-1) in a single job.
    """
    total_events = -1
    job_count = 1
    with open(fout, 'w') as out:
        out.write('[MULTICRAB]')
        out.write('\ncfg=' + fcrab)
        out.write('\n\n[COMMON]')
        out.write('\nCMSSW.total_number_of_events=' + str(total_events))
        out.write('\nCMSSW.number_of_jobs=' + str(job_count))
        for dataset in set:
            append_sample_mcrab(dataset, out)
211 
def make_harv_name(dset):
    """Return the harvest configuration file name for a dataset path.

    The name is 'harvest_' + the sample name from get_name_from_dsetpath
    + '.py'.
    """
    sample = get_name_from_dsetpath(dset)
    return ''.join(('harvest_', sample, '.py'))
214 
def append_sample_mcrab(dsetp, fout):
    """Append the multicrab section for one dataset to an open file.

    dsetp -- dataset path
    fout  -- writable file object of the multicrab configuration

    The section is named after the sample and lists the harvest pset, the
    dataset path and the DQM output file; the reference file returned by
    find_dqmref is added as an additional input file unless it is 'NONE'.
    Exits with status 17 when the harvest pset file does not exist.
    """
    dqm_file = make_dqmname(dsetp)
    section = get_name_from_dsetpath(dsetp)
    pset = make_harv_name(dsetp)
    if not os.path.exists(pset):
        print('problem creating multicrab, file', pset, 'does not exist')
        sys.exit(17)

    entries = (
        '\n\n[' + section + ']',
        '\nCMSSW.pset=' + pset,
        '\nCMSSW.datasetpath=' + dsetp,
        '\nCMSSW.output_file=' + dqm_file,
    )
    for entry in entries:
        fout.write(entry)

    reference = find_dqmref(dsetp)
    if reference != 'NONE':
        fout.write('\nUSER.additional_input_files=' + reference)
230 
def create_crab(ds):
    """Write the single-dataset crab configuration file.

    The file named by the module-level f_crab is (over)written with the
    crab_block template followed by the pset, datasetpath and output_file
    keys for dataset path ds. The keys land in the template's last
    section, [CMSSW].
    """
    dqmout = make_dqmname(ds)
    hf = make_harv_name(ds)
    with open(f_crab, 'w') as out:
        out.write(crab_block)
        out.write('\npset=' + hf)
        # Leading newline is required: without it 'datasetpath=' is glued
        # onto the end of the pset line.
        out.write('\ndatasetpath=' + ds)
        out.write('\noutput_file=' + dqmout)

# Static part of crab.cfg; create_crab appends the per-dataset keys after it.
crab_block = """
[CRAB]
jobtype = cmssw
scheduler = glite

[EDG]
remove_default_blacklist=1
rb = CERN

[USER]
return_data = 1
#copy_data = 1
#storage_element=srm-cms.cern.ch
#storage_path=/srm/managerv2?SFN=/castor/cern.ch
#user_remote_dir=/user/n/nuno/test
publish_data=0
thresholdLevel=70
eMail=nuno@cern.ch

[CMSSW]
total_number_of_events=-1
show_prod = 1
number_of_jobs=1
"""
265 
266 
# Check arguments and settings.
# input_type ends up as 'none' (no argument: query DBS), 'file' (argument is
# an existing file listing datasets) or 'ds' (argument is a dataset path).
input_type = ''
argin = ''
dsfile = ''
do_reference = False
if len(sys.argv) > 2:
    print_def()
    sys.exit(10)
elif len(sys.argv) == 1:
    print("Will search for available datasets.")
    input_type = 'none'
elif len(sys.argv) == 2:
    argin = sys.argv[1]
    if os.path.exists(argin):
        dsfile = argin
        input_type = 'file'
    # str.find returns -1 (which is truthy) when the text is absent, so both
    # results must be compared against -1 explicitly.
    elif argin.find('CMSSW') != -1 and argin.find('RelVal') != -1:
        print('Using specified data set', argin)
        input_type = 'ds'
    else:
        print('Invalid argument: process list, dataset or file', argin, 'does not exist.')
        sys.exit(11)
291 
# DBS setup: when DBSCMD_HOME is configured, import the DBS client and create
# the module-level 'api' object bound to the global production instance.
is_dbs = check_dbs()
if is_dbs:
    print("dbs home:", os.getenv('DBSCMD_HOME'))
    from DBSAPI.dbsApi import DbsApi
    from DBSAPI.dbsException import *
    from DBSAPI.dbsApiException import *
    from DBSAPI.dbsOptions import DbsOptionParser
    optManager = DbsOptionParser()
    opts, args = optManager.getOpt()
    # The parsed options are not used; the server url is set explicitly.
    args = {'url': "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"}
    api = DbsApi(args)
else:
    print("dbs not set!")
308 
# CMSSW release: taken from the environment; the script cannot run without it.
cmssw_ver = os.getenv('CMSSW_VERSION','NOTSET')
if cmssw_ver != 'NOTSET':
    print("Using cmssw version:", cmssw_ver)
else:
    # Show how to set up a release area, then stop with status 12.
    print("""
 cmssw not set!
 example:
 scramv1 p CMSSW CMSSW_3_1_0_pre5
 cd CMSSW_3_1_0_pre5/src
 eval `scramv1 runtime -sh`
 cd -
 """)
    sys.exit(12)
323 
324 
# Read datasets into dsetpaths according to the kind of input given.
dsetpaths = []

if input_type == 'file':
    read_ds_file()
elif input_type == 'ds':
    dsetpaths.append(argin)
elif input_type == 'none':
    if is_dbs:
        # No argument: build the dataset list from DBS, then read it back.
        dsfile = cmssw_ver + "_dbslist.txt"
        make_dbs_list(dsfile)
        read_ds_file()
    else:
        print("no dataset specified, and dbs isn't set...")
        print_def()
        sys.exit(13)
341 
342 
# Check dataset list: remove incompatible datasets.
check_dset()

# Print the dataset list to be processed and record it on disk.
print('data sets:', dsetpaths)
with open("dset_processed.txt", 'w') as dslproc:
    for s in dsetpaths:
        dslproc.write(s + '\n')

# crab.cfg is written for the first dataset only.
create_harvest(dsetpaths[0])

f_crab = 'crab.cfg'
create_crab(dsetpaths[0])

# One harvest configuration per dataset.
for s in dsetpaths:
    create_harvest(s)

f_multi_crab = 'multicrab.cfg'
create_mcrab(dsetpaths, f_crab, f_multi_crab)

harvfilelist = [make_harv_name(s) for s in dsetpaths]

summary_fields = {'pwd' : os.environ["PWD"], 'cf' : f_crab, 'mc' : f_multi_crab}
print('\nCreated:\n\t %(pwd)s/%(cf)s \n\t %(pwd)s/%(mc)s' % summary_fields)
print("\tIndividual harvest py's:\n\t", harvfilelist)

print("Done.")
def read_ds_file()
def get_name_from_dsetpath(ds)
def replace(string, replacements)
def create_mcrab(set, fcrab, fout)
void find(edm::Handle< EcalRecHitCollection > &hits, DetId thisDet, std::vector< EcalRecHitCollection::const_iterator > &hit, bool debug=false)
Definition: FindCaloHit.cc:19
def make_dbs_list(dbslf)
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
def make_harv_name(dset)
def append_sample_mcrab(dsetp, fout)
def get_cond_from_dsetpath(ds)
def check_nevts_dset(dset)
def find_dqmref(ds)
def create_harvest(ds)
def create_crab(ds)
#define str(s)
def make_dqmname(s)