cmsBatch.py
#!/usr/bin/env python
# Colin
# batch mode for cmsRun, March 2009

import os, sys, imp, re, pprint, string, time, shutil, copy, pickle, math
from optparse import OptionParser

# particle flow specific
from PhysicsTools.HeppyCore.utils.batchmanager import BatchManager
import PhysicsTools.HeppyCore.utils.eostools as castortools

# cms specific
import FWCore.ParameterSet.Config as cms
from IOMC.RandomEngine.RandomServiceHelper import RandomNumberServiceHelper


def batchScriptCCIN2P3():
    '''prepare the PBS version of the batch script, to run on the CCIN2P3 farm'''
    script = """#!/usr/bin/env bash
#PBS -l platform=LINUX,u_sps_cmsf,M=2000MB,T=2000000
# sets the queue
#PBS -q T
#PBS -eo
#PBS -me
#PBS -V

source $HOME/.bash_profile

echo '***********************'

ulimit -v 3000000

# coming back to the submission dir to set up the env
cd $PBS_O_WORKDIR
eval `scramv1 ru -sh`


# back to the worker
cd -

# copy job dir here
cp -r $PBS_O_WORKDIR .

# go inside
jobdir=`ls`
echo $jobdir

cd $jobdir

cat > sysinfo.sh <<EOF
#!/usr/bin/env bash
echo '************** ENVIRONMENT ****************'

env

echo
echo '************** WORKER *********************'
echo

free
cat /proc/cpuinfo

echo
echo '************** START *********************'
echo
EOF

source sysinfo.sh > sysinfo.txt

cmsRun run_cfg.py

# copy job dir to disk
cd -
cp -r $jobdir $PBS_O_WORKDIR
"""
    return script


def batchScriptCERN( remoteDir, index ):
    '''prepare the LSF version of the batch script, to run on LSF'''
    script = """#!/bin/bash
# sets the queue
#BSUB -q 8nm

echo 'environment:'
echo
env
ulimit -v 3000000
echo 'copying job dir to worker'
cd $CMSSW_BASE/src
eval `scramv1 ru -sh`
cd -
cp -rf $LS_SUBCWD .
ls
cd `find . -type d | grep /`
echo 'running'
%s run_cfg.py
if [ $? != 0 ]; then
    echo wrong exit code! removing all root files
    rm *.root
    exit 1
fi
echo 'sending the job directory back'
""" % prog

    if remoteDir != '':
        remoteDir = castortools.eosToLFN(remoteDir)  #remoteDir.replace('/eos/cms','')
        script += """
for file in *.root; do
newFileName=`echo $file | sed -r -e 's/\./_%s\./'`
fullFileName=%s/$newFileName
#this does cmsStage, but with retries
cmsStageWithFailover.py -f $file $fullFileName
#write the files as user readable but not writable
eos chmod 755 /eos/cms/$fullFileName
done
""" % (index, remoteDir)
        script += 'rm *.root\n'
    script += 'cp -rf * $LS_SUBCWD\n'

    return script
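# Note on the staging loop above: when a remote output directory is given, each
# root file produced by the job is renamed with the job index
# (file.root -> file_<index>.root), copied to EOS with cmsStageWithFailover.py,
# and then deleted on the worker, so that only the logs and the remaining small
# outputs are copied back to the submission directory. Without a remote
# directory, the root files are simply copied back with the rest of the job dir.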


def batchScriptLocal( remoteDir, index ):
    '''prepare a local version of the batch script, to run using nohup'''

    script = """#!/bin/bash
echo 'running'
%s run_cfg.py
if [ $? != 0 ]; then
    echo wrong exit code! removing all root files
    rm *.root
    exit 1
fi
echo 'sending the job directory back'
""" % prog

    if remoteDir != '':
        remoteDir = castortools.eosToLFN(remoteDir)
        script += """
for file in *.root; do
newFileName=`echo $file | sed -r -e 's/\./_%s\./'`
fullFileName=%s/$newFileName
cmsStageWithFailover.py -f $file $fullFileName
eos chmod 755 /eos/cms/$fullFileName
done
""" % (index, remoteDir)
        script += 'rm *.root\n'
    return script

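# Minimal usage sketch for the script generators above (for illustration only,
# not part of the normal flow): they return the batch script as a string, which
# MyBatchManager.PrepareJobUser below writes to <jobDir>/batchScript.sh and
# makes executable. Both functions rely on the module-level 'prog' variable
# (set from the -p option further down), so they can only be called after
# option parsing, e.g.:
#
#   scriptText = batchScriptLocal('', 1)   # empty remoteDir: keep root files locally
#   open('batchScript.sh', 'w').write(scriptText)
#   os.system('chmod +x batchScript.sh')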

class CmsBatchException( Exception ):
    '''Exception class for this script'''

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return str( self.value )


class MyBatchManager( BatchManager ):
    '''Batch manager specific to cmsRun processes.'''

    def PrepareJobUser(self, jobDir, value):
        '''Prepare one job. This function is called by the base class.'''

        process.source = fullSource.clone()

        # prepare the batch script
        scriptFileName = jobDir + '/batchScript.sh'
        scriptFile = open(scriptFileName, 'w')
        storeDir = self.remoteOutputDir_.replace('/castor/cern.ch/cms', '')
        mode = self.RunningMode(options.batch)
        if mode == 'LXPLUS':
            # value is the job index, used to tag the output file names
            scriptFile.write( batchScriptCERN( storeDir, value) )
        elif mode == 'LOCAL':
            scriptFile.write( batchScriptLocal( storeDir, value) )
        scriptFile.close()
        os.system('chmod +x %s' % scriptFileName)

        # prepare the cfg
        # replace the list of fileNames by a chunk of filenames:
        if generator:
            randSvc = RandomNumberServiceHelper(process.RandomNumberGeneratorService)
            randSvc.populate()
        else:
            iFileMin = (value-1)*grouping
            iFileMax = value*grouping
            process.source.fileNames = fullSource.fileNames[iFileMin:iFileMax]
            print process.source
        cfgFile = open(jobDir + '/run_cfg.py', 'w')
        cfgFile.write('import FWCore.ParameterSet.Config as cms\n\n')
        cfgFile.write('import os,sys\n')
        # need to import most of the config from the base directory containing all jobs
        cfgFile.write("sys.path.append('%s')\n" % os.path.dirname(jobDir) )
        cfgFile.write('from base_cfg import *\n')
        cfgFile.write('process.source = ' + process.source.dumpPython() + '\n')
        if generator:
            cfgFile.write('process.RandomNumberGeneratorService = ' + process.RandomNumberGeneratorService.dumpPython() + '\n')
        cfgFile.close()


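# How the chunking in PrepareJobUser works: job number 'value' (1-based) reads
# the slice fullSource.fileNames[(value-1)*grouping : value*grouping]. For
# example (illustration only), with grouping=5, job 3 reads files 10..14 of the
# original source; the last job simply gets whatever files are left.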
batchManager = MyBatchManager()


# record the exact command line used, for bookkeeping
file = open('cmsBatch.txt', 'w')
file.write(string.join(sys.argv) + "\n")
file.close()

batchManager.parser_.usage = """
%prog [options] <number of input files per job> <your_cfg.py>

Submits a number of jobs taking your_cfg.py as a template. your_cfg.py can either read events from input files, or produce them with a generator. In the latter case, the seeds are of course updated for each job.

A local output directory is created. It contains a job directory for each job, and a Logger/ directory containing information on the software you are using.
By default:
- the name of the output directory is generated automatically.
- the output root files end up in the job directories.

Each job directory contains:
- the full python configuration for this job. You can run it interactively by doing:
    cmsRun run_cfg.py
- the batch script to run the job. You can submit it again by calling the batch command yourself, see the -b option.
- while running interactively: nohup.out, where the job stderr and stdout are redirected. To check the status of a job running interactively, do:
    tail nohup.out
- after running:
    o the full nohup.out (your log) and your root files, in case you ran interactively
    o the LSF directory, in case you ran on LSF

Also see fwBatch.py, which is a layer on top of cmsBatch.py adapted to the organization of our samples on the CMST3.

Examples:

First do:
    cd $CMSSW_BASE/src/CMGTools/Common/test

To run on your local machine:
    cmsBatch.py 1 testCMGTools_cfg.py -b 'nohup ./batchScript.sh&'

To run on LSF (you must be logged in on lxplus, not on your interactive machine, so that you have access to LSF):
    cmsBatch.py 1 testCMGTools_cfg.py -b 'bsub -q 8nm < ./batchScript.sh'
"""
batchManager.parser_.add_option("-p", "--program", dest="prog",
                                help="program to run on your cfg file",
                                default="cmsRun")
## batchManager.parser_.add_option("-b", "--batch", dest="batch",
##                                 help="batch command. default is: 'bsub -q 8nh < batchScript.sh'. You can also use 'nohup < ./batchScript.sh &' to run locally.",
##                                 default="bsub -q 8nh < .batchScript.sh")
batchManager.parser_.add_option("-c", "--command-args", dest="cmdargs",
                                help="command line arguments for the job",
                                default=None)
batchManager.parser_.add_option("--notagCVS", dest="tagPackages",
                                default=True, action="store_false",
                                help="do not tag the package on CVS (tagging is on by default)")

(options, args) = batchManager.parser_.parse_args()
batchManager.ParseOptions()

prog = options.prog
doCVSTag = options.tagPackages

if len(args) != 2:
    batchManager.parser_.print_help()
    sys.exit(1)

# testing that we run a sensible batch command. If not, exit.
runningMode = None
try:
    runningMode = batchManager.RunningMode( options.batch )
except CmsBatchException as err:
    print err
    sys.exit(1)

grouping = int(args[0])
nJobs = grouping
cfgFileName = args[1]

print 'Loading cfg'

pycfg_params = options.cmdargs
trueArgv = sys.argv
sys.argv = [cfgFileName]
if pycfg_params:
    sys.argv.extend(pycfg_params.split(' '))
print sys.argv


# load the cfg script
handle = open(cfgFileName, 'r')
cfo = imp.load_source("pycfg", cfgFileName, handle)
process = cfo.process
handle.close()

# restore original sys.argv
sys.argv = trueArgv

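# Note: the cfg was executed via imp.load_source with sys.argv temporarily set
# to [cfgFileName] plus the -c arguments, so a cfg that parses its own command
# line (e.g. with VarParsing) sees the options intended for it rather than the
# ones passed to cmsBatch.py. The original sys.argv is restored just above.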

# keep track of the original source
fullSource = process.source.clone()
generator = False

try:
    process.source.fileNames
except:
    print 'No input file. This is a generator process.'
    generator = True
    listOfValues = [i+1 for i in range( nJobs )]  # job indices run from 1 to nJobs
else:
    print "Number of files in the source:", len(process.source.fileNames), ":"
    pprint.pprint(process.source.fileNames)
    nFiles = len(process.source.fileNames)
    nJobs = nFiles / grouping
    if (nJobs != 0 and (nFiles % grouping) > 0) or nJobs == 0:
        nJobs = nJobs + 1

    print "number of jobs to be created: ", nJobs
    listOfValues = [i+1 for i in range( nJobs )]  # job indices run from 1 to nJobs, not 0 to nJobs-1

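# Job-count arithmetic above, in short: nJobs = ceil(nFiles / grouping), with at
# least one job. For example (illustration only), 11 input files with a grouping
# of 4 give 3 jobs, reading 4, 4 and 3 files respectively.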
batchManager.PrepareJobs( listOfValues )

# preparing the master cfg file

cfgFile = open(batchManager.outputDir_ + '/base_cfg.py', 'w')
cfgFile.write( process.dumpPython() + '\n')
cfgFile.close()

# we used to wait 5 seconds between submissions to give castor some time;
# now on EOS this should be ok, so the wait is reduced to 1 second
waitingTime = 1
if runningMode == 'LOCAL':
    # no need to wait when running with nohup,
    # because we will never have enough processes to saturate castor
    waitingTime = 0
batchManager.SubmitJobs( waitingTime )


# logging

from PhysicsTools.HeppyCore.utils.logger import logger

oldPwd = os.getcwd()
os.chdir(batchManager.outputDir_)
logDir = 'Logger'
os.system( 'mkdir ' + logDir )
log = logger( logDir )

log.logCMSSW()
log.logJobs(nJobs)
#COLIN not so elegant... but tar is behaving in a strange way.
log.addFile( oldPwd + '/' + cfgFileName )

if not batchManager.options_.negate:
    if batchManager.remoteOutputDir_ != "":
        # we don't want to clobber an existing log file on castor
        #COLIN could protect the logger against that.
        log.stageOut( batchManager.remoteOutputDir_ )

os.chdir( oldPwd )