CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
electronDataDiscovery.py
Go to the documentation of this file.
1 
2 #===================================================================
3 # To get the list of input files, one must call:
4 # search(), to get the list of primary files
5 # search2(), to get the list of secondary files, if any
6 #
7 # The selection of files is configured through shell
8 # environment variables:
9 #
10 # DD_RELEASE, for example CMSSW_2_2_0_pre1
11 # DD_SAMPLE, for example RelValSingleElectronPt35
12 # DD_RUN, for example ''
13 # DD_COND , for example MC_31X_V2-v1
14 # DD_TIER , for example RECO
15 # DD_TIER_SECONDARY, the tier of the secondary files, if any
16 #
17 # DD_SOURCE:
18 # das: use das
19 # dbs: use dbs search
20 # lsf: use dbs lsf
21 # /castor/cern.ch/cms/...: assumed to be the path of a castor directory containing the input data files
22 # for relvals: '/castor/cern.ch/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
23 # for harvested dqm: '/castor/cern.ch/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
24 # /eos/cms/...: assumed to be the path of an eos directory containing the input data files
25 # for relvals: '/eos/cms/store/relval/${DD_RELEASE}/${DD_SAMPLE}/${DD_TIER}/${DD_COND}/'
26 # for harvested dqm: '/eos/cms/store/unmerged/dqm/${DD_SAMPLE}-${DD_RELEASE}-${DD_COND}-DQM-DQMHarvest-OfflineDQM'
27 # /...: assumed to be the path of a text file containing the list of input data files
28 #
29 # All except DD_SOURCE can use wildcard *.
30 #===================================================================
31 
32 #import httplib, urllib, urllib2, types, string, os, sys
33 import os, sys, re, das_client
34 
# Give every DD_* selection variable a default so that later code can read
# os.environ[...] unconditionally. Values already present in the environment
# are left untouched. setdefault() replaces the Python-2-only has_key() test.
for _dd_name, _dd_default in (
        ('DD_SOURCE', 'das'),
        ('DD_RELEASE', ''),
        ('DD_SAMPLE', ''),
        ('DD_COND', ''),
        ('DD_TIER', ''),
        ('DD_TIER_SECONDARY', ''),
        ('DD_RUN', ''),
):
    os.environ.setdefault(_dd_name, _dd_default)
del _dd_name, _dd_default  # do not leak the loop variables as module attributes

# Selection patterns compiled once at import time; the shell-style wildcard
# '*' is turned into the regular expression '.*'.
dd_release_re = re.compile(os.environ['DD_RELEASE'].replace('*', '.*'))
dd_sample_re = re.compile(os.environ['DD_SAMPLE'].replace('*', '.*'))
dd_cond_re = re.compile(os.environ['DD_COND'].replace('*', '.*'))
dd_run_re = re.compile(os.environ['DD_RUN'].replace('*', '.*'))
54 
def _dd_env(name, default=''):
    """Return the environment variable `name`, or `default` when it is unset."""
    return os.environ.get(name, default)


def _dd_wildcard_re(pattern):
    """Compile a selection pattern, turning the wildcard '*' into '.*'."""
    return re.compile(pattern.replace('*', '.*'))


def _dd_popen_lines(command):
    """Run `command` with os.popen and return its output lines; always close the pipe."""
    pipe = os.popen(command)
    try:
        return pipe.readlines()
    finally:
        pipe.close()


def _dd_absolute_paths(lines):
    """Right-strip `lines` and keep only those starting with '/'."""
    stripped = (line.rstrip() for line in lines)
    return [line for line in stripped if line.startswith('/')]


def _dd_select(candidates, filters):
    """Keep the non-empty candidates that every regular expression in `filters` finds a match in."""
    return [
        candidate for candidate in candidates
        if candidate != "" and all(f.search(candidate) is not None for f in filters)
    ]


def _dd_dbs_query(dd_tier):
    """Build the 'find file where ...' query shared by the dbs and http sources."""
    clauses = []
    if _dd_env('DD_RELEASE') != "":
        clauses.append("release = " + _dd_env('DD_RELEASE'))
    if _dd_env('DD_SAMPLE') != "":
        clauses.append("primds = " + _dd_env('DD_SAMPLE'))
    if _dd_env('DD_RUN') != "":
        clauses.append("run = " + _dd_env('DD_RUN'))
    # The dataset clause is always present, so the query always has a "where".
    clauses.append("dataset like *" + _dd_env('DD_COND') + "*" + dd_tier + "*")
    return "find file where " + " and ".join(clauses)


def _dd_search_das(dd_tier):
    """Query DAS for the single matching dataset and list its files.

    Returns the list of file names, or None when the failure (no dataset,
    several datasets, no file) has already been reported on stdout.
    """
    query = "dataset instance=cms_dbs_prod_global"
    if _dd_env('DD_RELEASE') != "":
        query = query + " release=" + _dd_env('DD_RELEASE')
    if _dd_env('DD_SAMPLE') != "":
        query = query + " primary_dataset=" + _dd_env('DD_SAMPLE')
    if dd_tier != "":
        query = query + " tier=" + dd_tier
    if _dd_env('DD_COND') != "":
        query = query + " dataset=*" + _dd_env('DD_COND') + "*"
    if _dd_env('DD_RUN') != "":
        query = query + " run=" + _dd_env('DD_RUN')

    data = das_client.json.loads(das_client.get_data('https://cmsweb.cern.ch', query, 0, 0, 0))
    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS dataset for query: ' + query)
        return None
    if data['nresults'] > 1:
        print('[electronDataDiscovery.py] Several DAS datasets for query: ' + query)
        return None

    dataset = data['data'][0]['dataset'][0]['name']
    query = "file instance=cms_dbs_prod_global dataset=" + dataset
    data = das_client.json.loads(das_client.get_data('https://cmsweb.cern.ch', query, 0, 0, 0))
    if data['nresults'] == 0:
        print('[electronDataDiscovery.py] No DAS file in dataset: ' + dataset)
        return None

    return [str(data['data'][i]['file'][0]['name']) for i in range(data['nresults'])]


def _dd_search_http(dd_tier):
    """Send the DBS query to the dbs_discovery web service and return the file paths."""
    # Imported here because the module-level import of these modules is
    # commented out; the fallback keeps the code usable on Python 3 as well.
    try:
        from urllib import quote, urlencode
        from urllib2 import HTTPError, Request, urlopen
    except ImportError:
        from urllib.parse import quote, urlencode
        from urllib.request import Request, urlopen
        from urllib.error import HTTPError

    url = "https://cmsweb.cern.ch:443/dbs_discovery/aSearch"
    headers = {
        'User-Agent': "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        'Accept': "text/plain",
    }
    params = {
        'dbsInst': 'cms_dbs_prod_global',
        'html': 0, 'caseSensitive': 'on', '_idx': 0, 'pagerStep': -1,
        'userInput': quote(_dd_dbs_query(dd_tier)),
        'xml': 0, 'details': 0, 'cff': 0, 'method': 'dbsapi',
    }
    body = urlencode(params, doseq=True)
    if not isinstance(body, bytes):  # Python 3 requires a bytes request body
        body = body.encode('ascii')

    data = ""
    try:
        data = urlopen(Request(url, body, headers)).read()
    except HTTPError as e:
        if e.code != 201:
            raise
        # A 201 answer is reported and then treated as an empty reply.
        print(e.headers)
        print(e.msg)

    if not isinstance(data, str):  # Python 3 returns bytes from read()
        data = data.decode('utf-8', 'replace')
    # read() yields one string, not a file object: split it into lines here.
    return _dd_absolute_paths(data.splitlines())


def _dd_search_lsf(dd_tier):
    """List the files of a dataset path with 'dbs lsf' and return the file paths."""
    dbs_path = ('/' + _dd_env('DD_SAMPLE') + '/' + _dd_env('DD_RELEASE')
                + '-' + _dd_env('DD_COND') + '/' + dd_tier)
    if __name__ == "__main__":
        print('dbs path: ' + dbs_path)
    # The closing quote is added exactly once, here, so the command stays balanced.
    return _dd_absolute_paths(_dd_popen_lines('dbs lsf --path="' + dbs_path + '"'))


def _dd_search_castor(source):
    """List, with rfdir, the files found one directory level below a castor directory."""
    castor_dir = source.replace('/castor/cern.ch/cms/', '/', 1)
    result = []
    for entry in _dd_popen_lines('rfdir /castor/cern.ch/cms' + castor_dir):
        fields = entry.split()
        if len(fields) < 9:  # blank or unexpected line: there is no 9th field to read
            continue
        subdir = fields[8]  # the entry name is taken from the 9th whitespace-separated field
        for sub_entry in _dd_popen_lines('rfdir /castor/cern.ch/cms' + castor_dir + '/' + subdir):
            sub_fields = sub_entry.split()
            if len(sub_fields) < 9:
                continue
            result.append(castor_dir + '/' + subdir + '/' + sub_fields[8])
    return result


def _dd_search_eos(source, filters):
    """List an eos directory with 'eos.select find -f' and keep the lines passing `filters`."""
    lines = _dd_popen_lines('/afs/cern.ch/project/eos/installation/pro/bin/eos.select find -f ' + source)
    candidates = (line.strip().replace('/eos/cms/', '/', 1) for line in lines)
    return _dd_select(candidates, filters)


def _dd_search_file_list(source, filters):
    """Read file names from the text file `source` and keep those passing `filters`."""
    with open(source) as listing:
        candidates = [os.path.expandvars(line.strip()) for line in listing]
    return _dd_select(candidates, filters)


def common_search(dd_tier):
    """Return the list of input files selected by the DD_* environment variables.

    dd_tier: the data tier to select (may contain the wildcard '*').

    The back-end is chosen by DD_SOURCE: 'das', 'dbs', 'http', 'lsf', a path
    starting with '/castor/cern.ch/cms/', a path starting with '/eos/cms/', or
    otherwise the path of a text file listing the input files. The environment
    is read at call time, for the queries as well as for the name filters
    applied to the eos and text-file sources.

    Returns a list of file names; it is empty when nothing was found, in which
    case a diagnostic line is printed.
    """
    source = _dd_env('DD_SOURCE', 'das')

    # Name filters used by the sources that return unfiltered listings.
    filters = [
        _dd_wildcard_re(_dd_env('DD_SAMPLE')),
        _dd_wildcard_re(_dd_env('DD_COND')),
        _dd_wildcard_re(dd_tier),
        _dd_wildcard_re(_dd_env('DD_RUN')),
    ]

    if source == "das":
        result = _dd_search_das(dd_tier)
        if result is None:  # already reported by the DAS helper
            return []
    elif source == "dbs":
        command = ('dbs search --url="http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet" --query "'
                   + _dd_dbs_query(dd_tier) + '"')
        result = _dd_absolute_paths(_dd_popen_lines(command))
    elif source == "http":
        result = _dd_search_http(dd_tier)
    elif source == "lsf":
        result = _dd_search_lsf(dd_tier)
    elif source.startswith('/castor/cern.ch/cms/'):  # assumed to be a castor dir
        result = _dd_search_castor(source)
    elif source.startswith('/eos/cms/'):  # assumed to be an eos dir
        result = _dd_search_eos(source, filters)
    else:  # DD_SOURCE is assumed to be a file name
        result = _dd_search_file_list(source, filters)

    if len(result) == 0:
        diag = '[electronDataDiscovery.py] No more files after filtering with :'
        if _dd_env('DD_SAMPLE') != '':
            diag += ' ' + _dd_env('DD_SAMPLE')
        if _dd_env('DD_COND') != '':
            diag += ' ' + _dd_env('DD_COND')
        if dd_tier != '':
            diag += ' ' + dd_tier
        if _dd_env('DD_RUN') != '':
            diag += ' ' + _dd_env('DD_RUN')
        print(diag)

    return result
257 
def search():
    """Return the primary input files: those found for the tier named by DD_TIER."""
    primary_tier = os.environ['DD_TIER']
    return common_search(primary_tier)
260 
def search2():
    """Return the secondary input files: those found for the tier named by DD_TIER_SECONDARY."""
    secondary_tier = os.environ['DD_TIER_SECONDARY']
    return common_search(secondary_tier)
263 
264 
265 
266