CMS 3D CMS Logo

dqmiodatasetharvest.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 from __future__ import print_function
3 import re
4 import json
5 import ROOT
6 import sqlite3
7 import argparse
8 import subprocess
9 import multiprocessing
10 import fnmatch
11 
# Redirector prefix prepended to every LFN returned by DAS before opening
# it with ROOT; the global redirector works from anywhere.
ROOTPREFIX = "root://cms-xrd-global.cern.ch/"
#ROOTPREFIX = "root://eoscms//eos/cms" # for more local files

# Command-line interface: one positional dataset name plus output/parallelism
# options. parse_args() exits with a usage message on bad input.
parser = argparse.ArgumentParser(description="Collect a MEs from DQMIO data, with maximum possible granularity")

parser.add_argument('dataset', help='dataset name, like "/StreamHIExpress/HIRun2018A-Express-v1/DQMIO"')
parser.add_argument('-o', '--output', help='SQLite file to write', default='dqmio.sqlite')
parser.add_argument('-j', '--njobs', help='Number of threads to read files', type=int, default=1)
parser.add_argument('-l', '--limit', help='Only load up to LIMIT files', type=int, default=-1)
args = parser.parse_args()
22 
23 
# We can save a lot of time by scanning only these ME types, provided all
# interesting MEs are known to be of one of them.
interesting_types = {
    "TH1Fs",
    "TH1Ds",
    "TH2Fs",
}

# Histogram name patterns to harvest; fnmatch-style wildcards are allowed.
interesting_mes = [
    "PixelPhase1/Phase1_MechanicalView/PXBarrel/adc_PXLayer*",
]

# Patterns locating the bare inf/nan tokens that ROOT's JSON writer emits,
# so they can later be replaced with finite stand-ins.
inf = re.compile(r"([- \[])inf([,}\]])")
nan = re.compile(r"([- \[])nan([,}\]])")

def check_interesting(mename):
    """Return True if *mename* matches any pattern in interesting_mes."""
    return any(fnmatch.fnmatch(mename, pattern) for pattern in interesting_mes)
46 
def tosqlite(x):
    """Convert a value read from a DQMIO tree into a type sqlite3 can store.

    Numbers pass through unchanged, ROOT strings become str (or raw bytes,
    stored as a BLOB, if they cannot be decoded), and any other ROOT object
    is serialized to a JSON text via TBufferJSON.
    """
    if isinstance(x, ROOT.string):
        # NOTE(review): the original used the Python-2-only names unicode()
        # and buffer(), which raise NameError under the python3 shebang.
        # Under PyROOT/Python 3, std::string::data() is presumably already
        # decoded to str -- the bytes branch is a defensive fallback.
        data = x.data()
        if isinstance(data, bytes):
            return data  # undecodable payload: store as BLOB
        return str(data)
    # bool is a subclass of int and passes through here as well
    if isinstance(x, (int, float)):
        return x
    try:
        rootobj = str(ROOT.TBufferJSON.ConvertToJSON(x))
        # ROOT does not generate valid JSON for NaN/inf; substitute finite
        # stand-ins (0 and 1e38) before parsing.
        clean = nan.sub('\\g<1>0\\g<2>', inf.sub('\\g<1>1e38\\g<2>', rootobj))
        obj = json.loads(clean)
        # re-dump with allow_nan=False to guarantee strictly valid JSON
        jsonobj = json.dumps(obj, allow_nan=False)
        return jsonobj
    except Exception as e:
        # never abort a harvest over one bad object; record the error instead
        return json.dumps({"root2sqlite_error": e.__repr__(), "root2sqlite_object": x.__repr__()})
69 
def dasquery(dataset):
    """Query DAS via dasgoclient for the files of *dataset*.

    Returns a list of logical file names as str. Raises Exception if the
    dataset name does not end in DQMIO, since only that format is readable
    by this tool.
    """
    if not dataset.endswith("DQMIO"):
        raise Exception("This tool probably cannot read the dataset you specified. The name should end with DQMIO.")
    # argv-list form: no shell is involved, so the dataset name is safe
    cmd = ["dasgoclient", "-query=file dataset=%s" % dataset]
    print("Querying das ... %s" % cmd)
    # check_output returns bytes under Python 3; decode so that the caller
    # can concatenate the names with ROOTPREFIX (a str).
    files = subprocess.check_output(cmd).decode("utf-8").splitlines()
    print("Got %d files." % len(files))
    return files
79 
80 
# Maps the numeric ME type id (read from the Indices tree) to the name of
# the TTree in the DQMIO file that holds values of that type.
treenames = {
    0: "Ints",
    1: "Floats",
    2: "Strings",
    3: "TH1Fs",
    4: "TH1Ss",
    5: "TH1Ds",
    6: "TH2Fs",
    7: "TH2Ss",
    8: "TH2Ds",
    9: "TH3Fs",
    10: "TProfiles",
    11: "TProfile2Ds",
}
95 
# Table layout: one row per ME per (run, lumi) range. Columns are untyped --
# sqlite stores whatever tosqlite() produced (number, text, or blob).
maketable = """
    CREATE TABLE IF NOT EXISTS monitorelements (
      name,
      fromrun, fromlumi, torun, tolumi,
      metype,
      value
    ); """
# IF NOT EXISTS added for consistency with the CREATE TABLE above, so that
# re-running with an existing output file does not fail on index creation.
makeindex = """
    CREATE INDEX IF NOT EXISTS runorder ON monitorelements(fromrun, fromlumi);
"""
insertinto = """
    INSERT INTO monitorelements (
      name,
      fromrun, fromlumi, torun, tolumi,
      metype,
      value
    ) VALUES (
      ?, ?, ?, ?, ?, ?, ?
    ); """
dumpmes = """
    SELECT fromlumi, tolumi, fromrun, name, value FROM monitorelements ORDER BY fromrun, fromlumi ASC;
"""
118 
# Open (or create) the output database and ensure the schema exists.
db = sqlite3.connect(args.output)
db.execute(maketable)
db.execute(makeindex)
122 
def harvestfile(fname):
    """Read one DQMIO file and return the rows to store in sqlite.

    Each row is (name, fromrun, fromlumi, torun, tolumi, metype, value),
    matching the insertinto statement. Runs inside a pool worker process;
    the return value travels back to the parent via the pool.
    """
    f = ROOT.TFile.Open(ROOTPREFIX + fname)
    idxtree = getattr(f, "Indices")
    #idxtree.GetEntry._threaded = True # now the blocking call should release the GIL...

    # We have no good way to find out which lumis were processed in a job,
    # so we watch the per-lumi indices and assume that all mentioned lumis
    # are covered in the end-of-job MEs. This might fail if there are no
    # per-lumi MEs.
    knownlumis = set()
    mes_to_store = []

    for i in range(idxtree.GetEntries()):
        idxtree.GetEntry(i)
        run, lumi, metype = idxtree.Run, idxtree.Lumi, idxtree.Type
        if lumi != 0:
            knownlumis.add(lumi)

        # skip index entries whose value tree is not in the scan whitelist
        if not treenames[metype] in interesting_types:
            continue

        endrun = run # assume no multi-run files for now
        if lumi == 0: # per-job ME: assign it the full range of lumis seen so far
            endlumi = max(knownlumis)
            lumi = min(knownlumis)
        else:
            endlumi = lumi

        # inclusive range -- for 0 entries, row is left out
        firstidx, lastidx = idxtree.FirstIndex, idxtree.LastIndex
        metree = getattr(f, treenames[metype])
        metree.GetEntry(0)
        # While filtering, read only the FullName branch; the (heavier)
        # value branch is fetched later, only for accepted MEs.
        metree.SetBranchStatus("*",0)
        metree.SetBranchStatus("FullName",1)

        for x in range(firstidx, lastidx+1):
            metree.GetEntry(x)
            mename = str(metree.FullName)

            # cheap substring vetoes before the fnmatch pattern check
            if mename.find("AlCaReco") != -1:
                continue

            if mename.find("Isolated") != -1:
                continue

            if mename.find("HLT") != -1:
                continue

            if not ((mename.find("SiStrip") >= 0) or (mename.find("OfflinePV") >= 0) or (mename.find("PixelPhase1") >= 0) or (mename.find("Tracking") >= 0 )):
                continue

            if check_interesting(mename):
                # re-read the entry with getall=1 to also load the value branch
                metree.GetEntry(x, 1)
                value = metree.Value

                mes_to_store.append((
                    mename,
                    run, lumi, endrun, endlumi,
                    metype,
                    tosqlite(value),
                ))

    return mes_to_store
186 
# Resolve the dataset to a file list, optionally truncated for test runs.
files = dasquery(args.dataset)
if args.limit > 0: files = files[:args.limit]

# One worker process per file; imap_unordered yields each file's rows as
# soon as that file is done, regardless of submission order.
pool = multiprocessing.Pool(processes=args.njobs)
ctr = 0
for mes_to_store in pool.imap_unordered(harvestfile, files):
#for mes_to_store in map(harvestfile, files):
    # commit after every file so an interrupted run keeps finished work
    db.executemany(insertinto, mes_to_store);
    db.commit()
    ctr += 1
    print("Processed %d files of %d, got %d MEs...\r" % (ctr, len(files), len(mes_to_store)), end='')
print("\nDone.")
199 
# Reference C++ ROOT macro, kept here as a string for convenience only --
# this script never executes it. Paste it into sqlite2tree.C and run it
# with a sufficiently new ROOT to convert the sqlite output back to a TTree.
sqlite2tree = """
// Convert the sqlite format saved above back into a TTree.
// Saving TTrees with objects (TH1's) seems to be close to impossible in Python,
// so we do the roundtrip via SQLite and JSON in a ROOT macro.
// This needs a ROOT with TBufferJSON::FromJSON, which the 6.12 in CMSSW for
// for now does not have. We can load a newer version from SFT (on lxplus6,
// in (!) a cmsenv):
// source /cvmfs/sft.cern.ch/lcg/releases/ROOT/6.16.00-f8770/x86_64-slc6-gcc8-opt/bin/thisroot.sh
// root sqlite2tree.C
// It is rather slow, but the root file is a lot more compact.

int run;
int fromlumi;
int tolumi;
TString* name;
TH2F* value;

int sqlite2tree() {

  auto sql = TSQLiteServer("sqlite:///dev/shm/schneiml/CMSSW_10_5_0_pre1/src/dqmio.sqlite");
  auto query = "SELECT fromlumi, tolumi, fromrun, name, value FROM monitorelements ORDER BY fromrun, fromlumi ASC;";
  auto res = sql.Query(query);

  TFile outfile("/dev/shm/dqmio.root", "RECREATE");
  auto outtree = new TTree("MEs", "MonitorElements by run and lumisection");
  auto nameb   = outtree->Branch("name",     &name);
  auto valueb  = outtree->Branch("value",    &value,128*1024);
  auto runb    = outtree->Branch("run",      &run);
  auto fromlumib = outtree->Branch("fromlumi",&fromlumi);
  auto tolumib   = outtree->Branch("tolumi",  &tolumi);


  while (auto row = res->Next()) {
    fromlumi = atoi(row->GetField(0));
    tolumi   = atoi(row->GetField(1));
    run      = atoi(row->GetField(2));
    name  = new TString(row->GetField(3));
    value = nullptr;
    TBufferJSON::FromJSON(value, row->GetField(4));
    outtree->Fill();
  }
  return 0;
}
"""
244 
245 
void print(TMatrixD &m, const char *label=nullptr, bool mathematicaFormat=false)
Definition: Utilities.cc:47
#define str(s)