CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
cmsBatch.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # Colin
3 # batch mode for cmsRun, March 2009
4 
5 import os, sys, imp, re, pprint, string, time,shutil,copy,pickle,math
6 from optparse import OptionParser
7 
8 # particle flow specific
9 from PhysicsTools.HeppyCore.utils.batchmanager import BatchManager
10 import PhysicsTools.HeppyCore.utils.eostools as eostools
11 
12 # cms specific
13 import FWCore.ParameterSet.Config as cms
14 from IOMC.RandomEngine.RandomServiceHelper import RandomNumberServiceHelper
15 
16 
def batchScriptCCIN2P3():
    '''Return the PBS version of the batch script.

    The script sets up the CMSSW environment from the submission
    directory ($PBS_O_WORKDIR), copies the job directory to the worker,
    dumps some system information to sysinfo.txt, runs
    "cmsRun run_cfg.py" and copies the job directory back.

    NOTE(review): the "def" line of this function was missing from the
    listing this file was recovered from; the name and the empty
    parameter list are reconstructed -- confirm against the callers.
    '''
    # The first line must start with "#!" to be a valid interpreter line.
    script = """#!/usr/bin/env bash
#PBS -l platform=LINUX,u_sps_cmsf,M=2000MB,T=2000000
# sets the queue
#PBS -q T
#PBS -eo
#PBS -me
#PBS -V

source $HOME/.bash_profile

echo '***********************'

ulimit -v 3000000

# coming back to submission dir do setup the env
cd $PBS_O_WORKDIR
eval `scramv1 ru -sh`


# back to the worker
cd -

# copy job dir here
cp -r $PBS_O_WORKDIR .

# go inside
jobdir=`ls`
echo $jobdir

cd $jobdir

cat > sysinfo.sh <<EOF
#! env bash
echo '************** ENVIRONMENT ****************'

env

echo
echo '************** WORKER *********************'
echo

free
cat /proc/cpuinfo

echo
echo '************** START *********************'
echo
EOF

source sysinfo.sh > sysinfo.txt

cmsRun run_cfg.py

# copy job dir do disk
cd -
cp -r $jobdir $PBS_O_WORKDIR
"""
    return script
76 
77 
78 
def rootfiles_to_eos_script(index, remoteDir):
    '''Return the shell snippet that copies the job's root files to EOS.

    index     -- job index; "_<index>" is inserted before the first "."
                 of each root file name.
    remoteDir -- remote output directory; it is first converted with
                 eostools.eosToLFN.

    For every *.root file in the current directory the snippet copies
    the file under /eos/cms/<remoteDir>, sets its mode to 755 and then
    removes the local copy.
    '''
    remoteDir = eostools.eosToLFN(remoteDir)
    # Only the file that was just handled is removed: removing "*.root"
    # inside the loop would delete the files that are still waiting to
    # be copied.
    return """
for file in *.root; do
newFileName=`echo $file | sed -r -e 's/\\./_{index}\\./'`
fullFileName={remoteDir}/$newFileName
{eos} cp $file /eos/cms/$fullFileName
{eos} chmod 755 /eos/cms/$fullFileName
rm $file
done
""".format(index=index, remoteDir=remoteDir, eos=eostools.eos_select)
90 
91 
def batchScriptCERN( remoteDir, index ):
    '''Build the batch script submitted to LSF.

    remoteDir -- remote output directory; when it is not the empty
                 string, the stage-out snippet produced by
                 rootfiles_to_eos_script is added to the script.
    index     -- job index forwarded to rootfiles_to_eos_script.

    The program that is run on run_cfg.py is the module-level "prog".
    '''
    header = """#!/bin/bash
# sets the queue
#BSUB -q 8nm

echo 'environment:'
echo
env
ulimit -v 3000000
echo 'copying job dir to worker'
cd $CMSSW_BASE/src
eval `scramv1 ru -sh`
cd -
cp -rf $LS_SUBCWD .
ls
cd `find . -type d | grep /`
echo 'running'
{prog} run_cfg.py
if [ $? != 0 ]; then
 echo wrong exit code! removing all root files
 rm *.root
 exit 1
fi
echo 'sending the job directory back'
"""
    pieces = [header.format(prog=prog)]

    # optional stage-out of the root files
    if remoteDir != '':
        pieces.append(rootfiles_to_eos_script(index, remoteDir))

    # everything left in the job directory goes back to the submission dir
    pieces.append('cp -rf * $LS_SUBCWD\n')

    return ''.join(pieces)
125 
def batchScriptLocal( remoteDir, index ):
    '''Build the batch script used for local running (e.g. with nohup).

    remoteDir -- remote output directory; when it is not the empty
                 string, the stage-out snippet produced by
                 rootfiles_to_eos_script is appended to the script.
    index     -- job index forwarded to rootfiles_to_eos_script.

    The program that is run on run_cfg.py is the module-level "prog".
    '''
    header = """#!/bin/bash
echo 'running'
{prog} run_cfg.py
if [ $? != 0 ]; then
 echo wrong exit code! removing all root files
 rm *.root
 exit 1
fi
echo 'sending the job directory back'
"""
    pieces = [header.format(prog=prog)]

    # optional stage-out of the root files
    if remoteDir != '':
        pieces.append(rootfiles_to_eos_script(index, remoteDir))

    return ''.join(pieces)
144 
145 
class CmsBatchException(Exception):
    '''Exception class for this script.

    The object given to the constructor is kept in "value" and is what
    str() of the exception shows.

    NOTE(review): the "class" line was missing from the listing this
    file was recovered from; the name comes from the "except" clause in
    the script body and the Exception base from the docstring --
    confirm against the original.
    '''

    def __init__(self, value):
        # initialise the base class as well, so that "args" is filled
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        return str( self.value)
154 
155 
class MyBatchManager( BatchManager ):
    '''Batch manager specific to cmsRun processes.'''

    def PrepareJobUser(self, jobDir, value ):
        '''Prepare one job. This function is called by the base class.

        jobDir -- directory of the job; batchScript.sh and run_cfg.py
                  are written there.
        value  -- 1-based job index: it selects the chunk of input files
                  and is forwarded to the batch-script builders.

        Relies on the module-level names process, fullSource, options,
        generator and grouping, which the script sets before it calls
        PrepareJobs.
        '''

        process.source = fullSource.clone()

        # prepare the batch script
        scriptFileName = jobDir+'/batchScript.sh'
        storeDir = self.remoteOutputDir_.replace('/castor/cern.ch/cms','')
        mode = self.RunningMode(options.batch)
        with open(scriptFileName,'w') as scriptFile:
            # "value" is the job index used to rename the staged-out files
            if mode == 'LXPLUS':
                scriptFile.write( batchScriptCERN( storeDir, value) )
            elif mode == 'LOCAL':
                scriptFile.write( batchScriptLocal( storeDir, value) )
        # add the execute bits without going through a shell, so that
        # unusual characters in the path cannot break the command
        os.chmod(scriptFileName, os.stat(scriptFileName).st_mode | 0o111)

        # prepare the cfg
        if generator:
            # generator job: refresh the random number service
            randSvc = RandomNumberServiceHelper(process.RandomNumberGeneratorService)
            randSvc.populate()
        else:
            # replace the list of fileNames by a chunk of filenames:
            iFileMin = (value-1)*grouping
            iFileMax = (value)*grouping
            process.source.fileNames = fullSource.fileNames[iFileMin:iFileMax]
            print(process.source)

        with open(jobDir+'/run_cfg.py','w') as cfgFile:
            cfgFile.write('import FWCore.ParameterSet.Config as cms\n\n')
            cfgFile.write('import os,sys\n')
            # need to import most of the config from the base directory containing all jobs
            cfgFile.write("sys.path.append('%s')\n" % os.path.dirname(jobDir) )
            cfgFile.write('from base_cfg import *\n')
            cfgFile.write('process.source = ' + process.source.dumpPython() + '\n')
            if generator:
                cfgFile.write('process.RandomNumberGeneratorService = ' + process.RandomNumberGeneratorService.dumpPython() + '\n')
196 
197 
batchManager = MyBatchManager()


# Keep a record of the command line in cmsBatch.txt (current directory).
# The handle is not called "file" so that the builtin is not shadowed,
# and it is closed even if the write fails.
with open('cmsBatch.txt', 'w') as cmdLogFile:
    cmdLogFile.write(' '.join(sys.argv) + "\n")
204 
# Usage text shown by the option parser.
_usage = """
%prog [options] <number of input files per job> <your_cfg.py>.

Submits a number of jobs taking your_cfg.py as a template. your_cfg.py can either read events from input files, or produce them with a generator. In the later case, the seeds are of course updated for each job.

A local output directory is created locally. This directory contains a job directory for each job, and a Logger/ directory containing information on the software you are using.
By default:
- the name of the output directory is created automatically.
- the output root files end up in the job directories.

Each job directory contains:
- the full python configuration for this job. You can run it interactively by doing:
cmsRun run_cfg.py
- the batch script to run the job. You can submit it again by calling the batch command yourself, see the -b option.
- while running interactively: nohup.out, where the job stderr and stdout are redirected. To check the status of a job running interactively, do:
tail nohup.out
- after running:
 o the full nohup.out (your log) and your root files, in case you ran interactively
 o the LSF directory, in case you ran on LSF

Also see fwBatch.py, which is a layer on top of cmsBatch.py adapted to the organization of our samples on the CMST3.

Examples:

First do:
cd $CMSSW_BASE/src/CMGTools/Common/test

to run on your local machine:
cmsBatch.py 1 testCMGTools_cfg.py -b 'nohup ./batchScript.sh&'

to run on LSF (you must be logged on lxplus, not on your interactive machine, so that you have access to LSF)
cmsBatch.py 1 testCMGTools_cfg.py -b 'bsub -q 8nm < ./batchScript.sh'
"""
batchManager.parser_.usage = _usage

# Options specific to this script, declared in the order in which they
# appear in the help.
# NOTE(review): -b/--batch is not declared here although options.batch
# is read below; presumably BatchManager declares it -- confirm.
_scriptOptions = (
    (("-p", "--program"),
     dict(dest="prog",
          help="program to run on your cfg file",
          default="cmsRun")),
    (("-c", "--command-args"),
     dict(dest="cmdargs",
          help="command line arguments for the job",
          default=None)),
    (("--notagCVS",),
     dict(dest="tagPackages",
          default=True, action="store_false",
          help="tag the package on CVS (True)")),
)
for _flags, _settings in _scriptOptions:
    batchManager.parser_.add_option(*_flags, **_settings)

options, args = batchManager.parser_.parse_args()
batchManager.ParseOptions()
253 
# Exactly two positional arguments are expected: the number of input
# files per job and the cfg file. Otherwise show the help and stop.
if len(args) != 2:
    batchManager.parser_.print_help()
    sys.exit(1)

# values read from the command-line options
prog = options.prog
doCVSTag = options.tagPackages

# testing that we run a sensible batch command. If not, exit.
runningMode = None
try:
    runningMode = batchManager.RunningMode( options.batch )
except CmsBatchException as err:
    print(err)
    sys.exit(1)
268 
# first argument: number of input files per job. It is also the number
# of jobs when the cfg turns out to be a generator process.
grouping = int(args[0])
nJobs = grouping
cfgFileName = args[1]

print('Loading cfg')

# The cfg is loaded with its own command line: sys.argv is replaced by
# the cfg file name followed by the arguments given with -c.
pycfg_params = options.cmdargs
trueArgv = sys.argv
sys.argv = [cfgFileName]
if pycfg_params:
    # split on any run of whitespace, so that repeated spaces do not
    # produce empty arguments
    sys.argv.extend(pycfg_params.split())
print(sys.argv)


# load cfg script
try:
    with open(cfgFileName, 'r') as handle:
        cfo = imp.load_source("pycfg", cfgFileName, handle)
    process = cfo.process
finally:
    # Restore original sys.argv, also when loading the cfg fails
    sys.argv = trueArgv
291 
292 
# keep track of the original source
fullSource = process.source.clone()
generator = False

try:
    process.source.fileNames
except Exception:
    # A source without fileNames is taken to be a generator process;
    # nJobs keeps the value given on the command line.
    # NOTE(review): a narrower exception type may be enough here --
    # confirm what the source raises for a missing parameter.
    print('No input file. This is a generator process.')
    generator = True
else:
    nFiles = len(process.source.fileNames)
    print('Number of files in the source: %s :' % nFiles)
    pprint.pprint(process.source.fileNames)
    # one job per group of "grouping" files, plus one job for the
    # remainder; always at least one job
    nJobs = nFiles // grouping
    if (nFiles % grouping) > 0 or nJobs == 0:
        nJobs = nJobs + 1

    print('number of jobs to be created:  %s' % nJobs)

# job values run from 1 to nJobs
listOfValues = [i+1 for i in range( nJobs )]

batchManager.PrepareJobs( listOfValues )
316 
# preparing master cfg file: the full configuration is dumped once in
# the output directory (the run_cfg.py of each job imports base_cfg)
with open(batchManager.outputDir_+'/base_cfg.py','w') as cfgFile:
    cfgFile.write( process.dumpPython() + '\n')

# Waiting time passed to SubmitJobs.
# It used to be 5 seconds to give castor some time; now on EOS, 1 second
# should be ok.
waitingTime = 1
if runningMode == 'LOCAL':
    # of course, not the case when running with nohup
    # because we will never have enough processes to saturate castor.
    waitingTime = 0
batchManager.SubmitJobs( waitingTime )
331 
332 
# logging

from PhysicsTools.HeppyCore.utils.logger import logger

# The log is produced from inside the output directory; the previous
# working directory is restored afterwards, also if logging fails.
oldPwd = os.getcwd()
os.chdir(batchManager.outputDir_)
try:
    logDir = 'Logger'
    os.system( 'mkdir ' + logDir )
    log = logger( logDir )

    log.logCMSSW()
    log.logJobs(nJobs)
    #COLIN not so elegant... but tar is behaving in a strange way.
    log.addFile( oldPwd + '/' + cfgFileName )

    if not batchManager.options_.negate:
        if batchManager.remoteOutputDir_ != "":
            # we don't want to crush an existing log file on castor
            #COLIN could protect the logger against that.
            log.stageOut( batchManager.remoteOutputDir_ )
finally:
    os.chdir( oldPwd )
355 
356 
def batchScriptLocal
Definition: cmsBatch.py:126
def eosToLFN
Definition: eostools.py:65
def rootfiles_to_eos_script
Definition: cmsBatch.py:79
Definition: logger.py:1
def batchScriptCCIN2P3
Definition: cmsBatch.py:17
def batchScriptCERN
Definition: cmsBatch.py:92