CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_6_1_1/src/DQMOffline/EGamma/python/electronDataDiscovery.py

Go to the documentation of this file.
00001 
00002 #===================================================================
00003 # To get the list of input files, one must call:
00004 #   search(), to get the list of primary files
00005 #   search2(), to get the list of secondary files, if any
00006 # 
00007 # The selection of files is configured thanks to shell
00008 # environment variables: 
00009 # 
00010 #   DD_RELEASE, for example CMSSW_2_2_0_pre1
00011 #   DD_SAMPLE, for example RelValSingleElectronPt35
00012 #   DD_RUN, for example ''
00013 #   DD_COND , for example MC_31X_V2-v1
00014 #   DD_TIER , for example RECO
00015 #   DD_TIER_SECONDARY, for eventual secondary files
00016 #   
00017 #   DD_SOURCE:
00018 #     das: use das
00019 #     dbs: use dbs search
00020 #     lsf: use dbs lsf
00021 #     /castor/cern.ch/cms/...: assumed to be the path of a castor directory containing the input data files
00022 #       for relvals: '/castor/cern.ch/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
00023 #       for harvested dqm: '/castor/cern.ch/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
00024 #     /eos/cms/...: assumed to be the path of an eos directory containing the input data files
00025 #       for relvals: '/eos/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
00026 #       for harvested dqm: '/eos/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
00027 #     /...: assumed to be the path of a text file containing the list of input data files
00028 #
00029 # All except DD_SOURCE can use wildcard *.
00030 #===================================================================
00031 
00032 #import httpslib, urllib, urllib2, types, string, os, sys
00033 import os, sys, re, das_client
00034 
# Give every selection variable a value, so that the rest of the module can
# read os.environ[...] unconditionally. An empty string means "no constraint";
# DD_SOURCE falls back to a DAS query.
os.environ.setdefault('DD_SOURCE', 'das')
os.environ.setdefault('DD_RELEASE', '')
os.environ.setdefault('DD_SAMPLE', '')
os.environ.setdefault('DD_COND', '')
os.environ.setdefault('DD_TIER', '')
os.environ.setdefault('DD_TIER_SECONDARY', '')
os.environ.setdefault('DD_RUN', '')

# Regular expressions derived from the selection variables, frozen at import
# time. Only the wildcard '*' is translated (to '.*'); any other character is
# handed to the regex engine as is.
dd_release_re = re.compile(os.environ['DD_RELEASE'].replace('*', '.*'))
dd_sample_re = re.compile(os.environ['DD_SAMPLE'].replace('*', '.*'))
dd_cond_re = re.compile(os.environ['DD_COND'].replace('*', '.*'))
dd_run_re = re.compile(os.environ['DD_RUN'].replace('*', '.*'))
00054 
def _read_command(command):
  # Run a shell command and return the lines of its standard output.
  pipe = os.popen(command)
  try:
    return pipe.readlines()
  finally:
    pipe.close()


def _keep_paths(lines):
  # Right-strip the lines and keep those starting with '/'.
  stripped = [line.rstrip() for line in lines]
  return [line for line in stripped if line.startswith("/")]


def _passes_filters(line, dd_tier_re):
  # True when the line matches the sample, conditions, tier and run selections.
  filters = (dd_sample_re, dd_cond_re, dd_tier_re, dd_run_re)
  return all(regex.search(line) is not None for regex in filters)


def _dbs_query(dd_tier):
  # Build the "find file where ..." query shared by the dbs and https sources.
  criteria = []
  if os.environ['DD_RELEASE'] != "":
    criteria.append("release = " + os.environ['DD_RELEASE'])
  if os.environ['DD_SAMPLE'] != "":
    criteria.append("primds = " + os.environ['DD_SAMPLE'])
  if os.environ['DD_RUN'] != "":
    criteria.append("run = " + os.environ['DD_RUN'])
  criteria.append("dataset like *" + os.environ['DD_COND'] + "*" + dd_tier + "*")
  return "find file where " + " and ".join(criteria)


def _das_request(query):
  # Send a query to DAS through das_client and return the decoded JSON answer.
  return das_client.json.loads(das_client.get_data('https://cmsweb.cern.ch', query, 0, 0, 0))


def _search_das(dd_tier):
  # Find the single DAS dataset matching the selection, then list its files.
  query = "dataset instance=cms_dbs_prod_global"
  if os.environ['DD_RELEASE'] != "":
    query += " release=" + os.environ['DD_RELEASE']
  if os.environ['DD_SAMPLE'] != "":
    query += " primary_dataset=" + os.environ['DD_SAMPLE']
  if dd_tier != "":
    query += " tier=" + dd_tier
  if os.environ['DD_COND'] != "":
    query += " dataset=*" + os.environ['DD_COND'] + "*"
  if os.environ['DD_RUN'] != "":
    query += " run=" + os.environ['DD_RUN']

  data = _das_request(query)
  if data['nresults'] == 0:
    print('[electronDataDiscovery.py] No DAS dataset for query: ' + query)
    return []

  # Drop leading duplicates of the same dataset name. As soon as the first two
  # records differ, the query is ambiguous: report the records and give up.
  while data['nresults'] > 1:
    records = data['data']
    if records[0]['dataset'][0]['name'] == records[1]['dataset'][0]['name']:
      records.pop(0)
      data['nresults'] -= 1
    else:
      print('[electronDataDiscovery.py] Several DAS datasets for query: ' + query)
      for i in range(data['nresults']):
        print('[electronDataDiscovery.py] dataset[' + str(i) + ']: ' + records[i]['dataset'][0]['name'])
      return []

  dataset = data['data'][0]['dataset'][0]['name']
  data = _das_request("file instance=cms_dbs_prod_global dataset=" + dataset)
  if data['nresults'] == 0:
    print('[electronDataDiscovery.py] No DAS file in dataset: ' + dataset)
    return []

  return [str(data['data'][i]['file'][0]['name']) for i in range(data['nresults'])]


def _search_https(dd_tier):
  # Post the dbs query to the DBS discovery web page and keep the returned paths.
  # urllib/urllib2 are imported here because the module does not import them
  # at top level, and only this source needs them.
  import urllib
  import urllib2

  url = "https://cmsweb.cern.ch:443/dbs_discovery/aSearch"
  final_input = urllib.quote(_dbs_query(dd_tier))

  agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
  ctypes = "text/plain"
  headers = {'User-Agent': agent, 'Accept': ctypes}
  params = {'dbsInst': 'cms_dbs_prod_global',
            'html': 0, 'caseSensitive': 'on', '_idx': 0, 'pagerStep': -1,
            'userInput': final_input,
            'xml': 0, 'details': 0, 'cff': 0, 'method': 'dbsapi'}
  request = urllib2.Request(url, urllib.urlencode(params, doseq=True), headers)

  text = ""
  try:
    response = urllib2.urlopen(request)
    try:
      text = response.read()
    finally:
      response.close()
  except urllib2.HTTPError as e:
    # A 201 answer is only reported; any other HTTP error is propagated.
    if e.code == 201:
      print(e.headers)
      print(e.msg)
    else:
      raise

  # response.read() returns a string, hence splitlines() and not readlines().
  return _keep_paths(text.splitlines())


def _rfdir_names(path):
  # List a castor directory with rfdir and return the 9th field of each line.
  names = []
  for line in _read_command('rfdir ' + path):
    fields = line.split()
    # Blank or truncated lines have no 9th field: skip them instead of failing.
    if len(fields) > 8:
      names.append(fields[8])
  return names


def _search_castor():
  # Collect the files found one level below the castor directory DD_SOURCE.
  castor_dir = os.environ['DD_SOURCE'].replace('/castor/cern.ch/cms/', '/', 1)
  result = []
  for subdir in _rfdir_names('/castor/cern.ch/cms' + castor_dir):
    for name in _rfdir_names('/castor/cern.ch/cms' + castor_dir + '/' + subdir):
      result.append(castor_dir + '/' + subdir + '/' + name)
  return result


def common_search(dd_tier):
  """Return the list of input files selected by the DD_* environment variables.

  dd_tier is the data tier to look for; the wildcard '*' is allowed. The
  way files are searched depends on the value of DD_SOURCE:

    "das"   : query DAS for the unique matching dataset, then for its files
    "dbs"   : run the "dbs search" command line tool
    "https" : post the same query to the DBS discovery web page
    "lsf"   : run "dbs lsf" on /DD_SAMPLE/DD_RELEASE-DD_COND/dd_tier
    /castor/cern.ch/cms/... : list the directory and its subdirectories with rfdir
    /eos/cms/...            : list the directory with "eos.select find -f" and
                              filter the paths on sample, cond, tier and run
    anything else           : name of a text file with one data file per line
                              (environment variables are expanded), filtered
                              on sample, cond, tier and run

  The result is a list of strings. It is empty when nothing is found; the
  "das" and text file sources also print a diagnostic in that case.
  """

  dd_tier_re = re.compile(dd_tier.replace('*', '.*'))
  source = os.environ['DD_SOURCE']

  if source == "das":
    return _search_das(dd_tier)

  if source == "dbs":
    command = 'dbs search --url="https://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" --query "' + _dbs_query(dd_tier) + '"'
    return _keep_paths(_read_command(command))

  if source == "https":
    return _search_https(dd_tier)

  if source == "lsf":
    # The requested tier is used (not DD_TIER), so that search2() really looks
    # for the secondary tier. The path carries no quote of its own: the
    # quoting is done once, when building the command line.
    dbs_path = '/' + os.environ['DD_SAMPLE'] + '/' + os.environ['DD_RELEASE'] + '-' + os.environ['DD_COND'] + '/' + dd_tier
    if __name__ == "__main__":
      print('dbs path: ' + dbs_path)
    return _keep_paths(_read_command('dbs lsf --path="' + dbs_path + '"'))

  if source.startswith('/castor/cern.ch/cms/'):  # assumed to be a castor dir
    return _search_castor()

  if source.startswith('/eos/cms/'):  # assumed to be an eos dir
    result = []
    for line in _read_command('/afs/cern.ch/project/eos/installation/pro/bin/eos.select find -f ' + source):
      line = line.strip().replace('/eos/cms/', '/', 1)
      if line != "" and _passes_filters(line, dd_tier_re):
        result.append(line)
    return result

  # Otherwise DD_SOURCE is assumed to be the name of a text file.
  result = []
  with open(source) as list_file:
    for line in list_file:
      line = os.path.expandvars(line.strip())
      if line != "" and _passes_filters(line, dd_tier_re):
        result.append(line)

  if len(result) == 0:
    diag = '[electronDataDiscovery.py] No more files after filtering with :'
    if os.environ['DD_SAMPLE'] != '': diag += ' ' + os.environ['DD_SAMPLE']
    if os.environ['DD_COND'] != '': diag += ' ' + os.environ['DD_COND']
    if dd_tier != '': diag += ' ' + dd_tier
    if os.environ['DD_RUN'] != '': diag += ' ' + os.environ['DD_RUN']
    print(diag)

  return result
00264 
def search():
  """Return the list of primary input files, i.e. those of tier DD_TIER."""
  primary_tier = os.environ['DD_TIER']
  return common_search(primary_tier)
00267 
def search2():
  """Return the list of secondary input files, i.e. those of tier DD_TIER_SECONDARY."""
  secondary_tier = os.environ['DD_TIER_SECONDARY']
  return common_search(secondary_tier)
00270 
00271         
00272         
00273