CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_6_1_1/src/Utilities/RelMon/python/utils.py

Go to the documentation of this file.
00001 ################################################################################
00002 # RelMon: a tool for automatic Release Comparison                              
00003 # https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
00004 #
00005 # $Author: anorkus $
00006 # $Date: 2012/10/23 15:10:13 $
00007 # $Revision: 1.12 $
00008 #
00009 #                                                                              
00010 # Danilo Piparo CERN - danilo.piparo@cern.ch                                   
00011 #                                                                              
00012 ################################################################################
00013 
00014 
00015 import array
00016 import os
00017 import re
00018 import sys
00019 from cPickle import load
00020 from os.path import dirname,basename,join,isfile
00021 from threading import Thread
00022 from time import asctime
00023 
00024 theargv=sys.argv
00025 sys.argv=[]
00026 from ROOT import *
00027 import ROOT
00028 ROOT.gErrorIgnoreLevel=1001
00029 ROOT.gROOT.SetBatch(True)
00030 sys.argv=theargv
00031 
00032 from urllib2  import Request,build_opener,urlopen
00033 
00034 if os.environ.has_key("RELMON_SA"):
00035   from definitions import *
00036   from authentication import X509CertOpen
00037   from utils import __file__ as this_module_name  
00038 else:
00039   from Utilities.RelMon.definitions import *
00040   from Utilities.RelMon.authentication import X509CertOpen
00041   from Utilities.RelMon.utils import __file__ as this_module_name
00042 
00043 #ROOT.gErrorIgnoreLevel=1001
00044 
00045 
00046 _log_level=10
00047 def logger(msg_level,message):
00048   if msg_level>=_log_level:
00049     print "[%s] %s" %(asctime(),message)
00050 
00051 #-------------------------------------------------------------------------------
00052 def setTDRStyle():  
00053   this_dir=dirname(this_module_name)
00054   this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
00055   #this_dir_two_up=this_dir_one_up[:this_dir_one_up.rfind("/")+1]
00056   style_file=''
00057   if os.environ.has_key("RELMON_SA"):
00058     style_file=this_dir_one_up+"data/tdrstyle_mod.C"
00059   else:
00060     style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
00061   try:
00062     gROOT.ProcessLine(".L %s" %style_file)
00063     gROOT.ProcessLine("setTDRStyle()")
00064   except:
00065     "Print could not set the TDR style. File %s not found?" %style_file
00066     
00067 
00068 #-------------------------------------------------------------------------------
00069 def literal2root (literal,rootType):
00070   bitsarray = array.array('B')
00071   bitsarray.fromstring(literal.decode('hex'))
00072 
00073   tbuffer=0
00074   try:  
00075       tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
00076   except:
00077       print "could not transform to object array:"
00078       print [ i for i in  bitsarray ]
00079   
00080   # replace a couple of shortcuts with the real root class name
00081   if rootType == 'TPROF':
00082       rootType = 'TProfile'
00083   if rootType == 'TPROF2D':
00084       rootType = 'TProfile2D'
00085   
00086   root_class=eval(rootType+'.Class()')
00087   
00088   return tbuffer.ReadObject(root_class)
00089   
00090 #-------------------------------------------------------------------------------
00091 
00092 def getNbins(h):
00093   biny=h.GetNbinsY()
00094   if biny>1:biny+=1
00095   binz=h.GetNbinsZ()
00096   if binz>1:binz+=1
00097   return (h.GetNbinsX()+1)*(biny)*(binz)
00098 
00099 #-------------------------------------------------------------------------------
00100 
00101 
00102 class StatisticalTest(object):
00103   def __init__(self,threshold):
00104     self.name=""
00105     self.h1=None
00106     self.h2=None
00107     self.threshold=float(threshold)
00108     self.rank=-1
00109     self.is_init=False
00110 
00111   def set_operands(self,h1,h2):
00112     self.h1=h1
00113     self.h2=h2
00114 
00115   def get_rank(self):
00116     if not self.is_init:
00117       if self.rank < 0:
00118         type1=type(self.h1)
00119         type2=type(self.h2)
00120         if (type1 != type2):
00121           logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
00122           self.rank=test_codes["DIFF_TYPES"]
00123         elif not self.h2.InheritsFrom("TH1"):
00124           logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
00125           self.rank=test_codes["NO_HIST"]    
00126         # if histos are empty
00127         #elif self.h1.InheritsFrom("TH2") and not "BinToBin" in self.name:
00128           ## 2D!
00129           #return test_codes["2D"]
00130         else:
00131           is_empty1=is_empty(self.h1)
00132           is_empty2=is_empty(self.h2)
00133           are_empty=is_empty1 and is_empty2
00134           one_empty=is_empty1 or is_empty2
00135 
00136           Nbins1= getNbins(self.h1)
00137           Nbins2= getNbins(self.h2)
00138 
00139           if are_empty:
00140             #return -103
00141             # Conversation with JeanRoch and David 5 April
00142             return 1
00143           elif one_empty:
00144             #return -103
00145             # Conversation with JeanRoch and David 5 April
00146             return 1
00147 
00148           # if histos have different number of bins
00149           if Nbins1!=Nbins2:
00150             return test_codes["DIFF_BIN"]
00151 
00152       self.rank=self.do_test()
00153     self.is_init=True
00154     return self.rank
00155 
00156   def get_status(self):
00157     status = SUCCESS
00158     if self.get_rank()<0:
00159       status=NULL
00160       logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))  
00161     elif self.get_rank() < self.threshold:
00162       status=FAIL
00163     logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00164     return status
00165 
00166   def do_test(self):
00167     pass
00168 
00169 #-------------------------------------------------------------------------------
00170 
00171 def is_empty(h):
00172   for i in xrange(1,getNbins(h)):
00173     if h.GetBinContent(i)!=0: return False
00174   return True
00175   #return h.GetSumOfWeights()==0
00176 
00177 #-------------------------------------------------------------------------------
00178 
00179 def is_sparse(h):
00180   filled_bins=0.
00181   nbins=h.GetNbinsX()
00182   for ibin in xrange(nbins):
00183     if h.GetBinContent(ibin)>0:
00184       filled_bins+=1
00185   #print "%s %s --> %s" %(filled_bins,nbins,filled_bins/nbins)
00186   if filled_bins/nbins < .5:
00187     return True
00188   else:
00189     return False
00190 
00191 #-------------------------------------------------------------------------------
00192 
00193 class KS(StatisticalTest):
00194   def __init__(self, threshold):
00195    StatisticalTest.__init__(self,threshold)
00196    self.name="KS"
00197 
00198   def do_test(self):
00199 
00200     # Calculate errors if not there...
00201     for h in self.h1,self.h2:
00202       w2s=h.GetSumw2()
00203       if w2s.GetSize()==0:
00204         h.Sumw2()
00205 
00206     ## If errors are 0:
00207     #zero_errors=True
00208     #for h in self.h1,self.h2:
00209       #for ibin in xrange(Nbins1):
00210         #if h.GetBinError(ibin+1) >0:
00211           #zero_errors=False
00212           #break
00213     #if zero_errors:
00214       #return test_codes["ZERO_ERR"]
00215 
00216     return self.h1.KolmogorovTest(self.h2)
00217 
00218 #-------------------------------------------------------------------------------
00219 import array
00220 def profile2histo(profile):
00221   if not profile.InheritsFrom("TH1"):
00222     return profile
00223     
00224   bin_low_edges=[]
00225   n_bins=profile.GetNbinsX()
00226   
00227   for ibin in xrange(1,n_bins+2):
00228     bin_low_edges.append(profile.GetBinLowEdge(ibin))
00229   bin_low_edges=array.array('f',bin_low_edges)
00230   histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
00231   for ibin in xrange(0,n_bins+1):
00232     histo.SetBinContent(ibin,profile.GetBinContent(ibin))
00233     histo.SetBinError(ibin,profile.GetBinError(ibin))    
00234   
00235   return histo
00236 #-------------------------------------------------------------------------------
00237 
00238 class Chi2(StatisticalTest):
00239   def __init__(self, threshold):
00240     StatisticalTest.__init__(self,threshold)
00241     self.name="Chi2"     
00242 
00243   def check_filled_bins(self,min_filled):
00244     nbins=self.h1.GetNbinsX()
00245     n_filled_l=[]
00246     for h in self.h1,self.h2:
00247       nfilled=0.
00248       for ibin in xrange(1,nbins+1):
00249         if h.GetBinContent(ibin)>0:
00250           nfilled+=1
00251       n_filled_l.append(nfilled)
00252     return len(filter (lambda x:x>=min_filled,n_filled_l) )>0
00253 
00254   def absval(self):
00255     nbins=getNbins(self.h1)
00256     binc=0
00257     for i in xrange(1,nbins):
00258       for h in self.h1,self.h2:
00259         binc=h.GetBinContent(i)
00260         if binc<0:
00261           h.SetBinContent(i,-1*binc)
00262         if h.GetBinError(i)==0 and binc!=0:
00263           #print "Histo ",h.GetName()," Bin:",i,"-Content:",h.GetBinContent(i)," had zero error"
00264           h.SetBinContent(i,0)
00265 
00266   def check_histograms(self, histogram):
00267       if histogram.InheritsFrom("TProfile") or  (histogram.GetEntries()!=histogram.GetSumOfWeights()):
00268           return 'W'
00269       else:
00270           return 'U'
00271 
00272   def do_test(self):
00273     self.absval()
00274     if self.check_filled_bins(3):
00275       #if self.h1.InheritsFrom("TProfile") or  (self.h1.GetEntries()!=self.h1.GetSumOfWeights()):
00276       #  chi2=self.h1.Chi2Test(self.h2,'WW')
00277       #  #if chi2==0: print "DEBUG",self.h1.GetName(),"Chi2 is:", chi2
00278       #  return chi2
00279       #else:
00280       #  return self.h1.Chi2Test(self.h2,'UU')
00281       hist1 = self.check_histograms(self.h1)
00282       hist2 = self.check_histograms(self.h2)
00283       if hist1 =='W' and hist2 =='W': ##in case 
00284           chi2 = self.h1.Chi2Test(self.h2,'WW')     ## the both histograms are weighted
00285           return chi2
00286       elif hist1 == 'U' and hist2 == 'U':
00287           chi2 = self.h1.Chi2Test(self.h2,'UU')    ##the both histograms are unweighted
00288           return chi2
00289       elif hist1 == 'U' and hist2 == 'W':
00290           chi2 = self.h1.Chi2Test(self.h2,'UW')   ## 1st histogram is unweighted, 2nd weighted
00291           return chi2
00292       elif hist1 == 'W' and hist2 == 'U':
00293           chi2 = self.h2.Chi2Test(self.h1,'UW')   ## 1 is wieghted, 2nd unweigthed. so flip order to make a UW comparison
00294           return chi2
00295     else:
00296       return 1
00297       #return test_codes["FEW_BINS"]
00298 
00299 #-------------------------------------------------------------------------------
00300 
00301 class BinToBin(StatisticalTest):
00302   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
00303   bins is different. It is % of corresponding bins otherwhise.
00304   A threshold of 1 is needed to require a 1 to 1 correspondance between 
00305   hisograms.
00306   """
00307   def __init__(self, threshold=1):
00308     StatisticalTest.__init__(self, threshold)
00309     self.name='BinToBin'
00310     self.epsilon= 0.000001
00311 
00312   def checkBinningMatches(self):
00313     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00314            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00315            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00316            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00317            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00318            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00319            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00320            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00321            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00322       return False
00323     return True
00324 
00325   def do_test(self):
00326     # fist check that binning matches
00327     if not self.checkBinningMatches():
00328       return test_codes["DIFF_BIN"]
00329     # then do the real check
00330     equal = 1
00331     nbins = getNbins(self.h1)
00332     n_ok_bins=0.0
00333     for ibin in xrange(0,nbins):
00334       ibin+=1
00335       h1bin=self.h1.GetBinContent(ibin)
00336       h2bin=self.h2.GetBinContent(ibin)
00337       bindiff=h1bin-h2bin
00338 
00339       binavg=.5*(h1bin+h2bin)
00340 
00341       if binavg==0 or abs(bindiff) < self.epsilon:
00342         n_ok_bins+=1
00343         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00344       else:
00345         print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00346 
00347       #if abs(bindiff)!=0 :
00348         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00349     
00350     rank=n_ok_bins/nbins
00351     
00352     if rank!=1:
00353       print "Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00354     
00355     return rank
00356 
00357 #-------------------------------------------------------------------------------
00358 
00359 class BinToBin1percent(StatisticalTest):
00360   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
00361   bins is different. It is % of corresponding bins otherwhise.
00362   A threshold of 1 is needed to require a 1 to 1 correspondance between 
00363   hisograms.
00364   """
00365   def __init__(self, threshold=1):
00366     StatisticalTest.__init__(self, threshold)
00367     self.name='BinToBin1percent'
00368     self.epsilon= 0.000001
00369     self.tolerance= 0.01
00370 
00371   def checkBinningMatches(self):
00372     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00373            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00374            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00375            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00376            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00377            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00378            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00379            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00380            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00381       return False
00382     return True
00383 
00384   def do_test(self):
00385     # fist check that binning matches
00386     if not self.checkBinningMatches():
00387       return test_codes["DIFF_BIN"]
00388     # then do the real check
00389     equal = 1
00390     nbins = getNbins(self.h1)
00391     n_ok_bins=0.0
00392     for ibin in xrange(0,nbins):
00393       ibin+=1
00394       h1bin=self.h1.GetBinContent(ibin)
00395       h2bin=self.h2.GetBinContent(ibin)
00396       bindiff=h1bin-h2bin
00397 
00398       binavg=.5*(h1bin+h2bin)
00399 
00400       if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
00401         n_ok_bins+=1
00402         #print "Bin %i bin: bindiff %s" %(ibin,bindiff)
00403       else:
00404         print "-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin)
00405 
00406       #if abs(bindiff)!=0 :
00407         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00408     
00409     rank=n_ok_bins/nbins
00410     
00411     if rank!=1:
00412       print "%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00413     
00414     return rank
00415 #-------------------------------------------------------------------------------
00416 Statistical_Tests={"KS":KS,
00417                    "Chi2":Chi2,
00418                    "BinToBin":BinToBin,
00419                    "BinToBin1percent":BinToBin1percent,
00420                    "Bin2Bin":BinToBin,
00421                    "b2b":BinToBin,}
00422 #-------------------------------------------------------------------------------  
00423 
00424 def ask_ok(prompt, retries=4, complaint='yes or no'):
00425     while True:
00426         ok = raw_input(prompt)
00427         if ok in ('y', 'ye', 'yes'):
00428             return True
00429         if ok in ('n', 'no'):
00430             return False
00431         retries = retries - 1
00432         if retries < 0:
00433             raise IOError('refusenik user')
00434         print complaint
00435 
00436 #-------------------------------------------------------------------------------
00437 
00438 class unpickler(Thread):
00439   def __init__(self,filename):
00440     Thread.__init__(self)
00441     self.filename=filename
00442     self.directory=""
00443 
00444   def run(self):
00445     print "Reading directory from %s" %(self.filename)
00446     ifile=open(self.filename,"rb")
00447     self.directory=load(ifile) 
00448     ifile.close()
00449 
00450 #-------------------------------------------------------------------------------    
00451 
00452 def wget(url):
00453   """ Fetch the WHOLE file, not in bunches... To be optimised.
00454   """
00455   opener=build_opener(X509CertOpen())  
00456   datareq = Request(url)
00457   datareq.add_header('authenticated_wget', "The ultimate wgetter")    
00458   bin_content=None
00459   try:
00460     filename=basename(url)  
00461     print "Checking existence of file %s on disk..."%filename
00462     if not isfile("./%s"%filename):      
00463       bin_content=opener.open(datareq).read()
00464     else:
00465       print "File %s exists, skipping.." %filename
00466   except ValueError:
00467     print "Error: Unknown url %s" %url
00468   
00469   if bin_content!=None:  
00470     ofile = open(filename, 'wb')
00471     ofile.write(bin_content)
00472     ofile.close()
00473 
00474 #-------------------------------------------------------------------------------
00475 ##-----------------   Make files pairs:  RelValData utils   --------------------
00476 
00477 def get_relvaldata_id(file):
00478     """Returns unique relvaldata ID for a given file."""
00479     run_id = re.search('R\d{9}', file)
00480     run = re.search('_RelVal_([\w\d]*)-v\d__', file)
00481     if not run:
00482         run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
00483     if run_id and run:
00484         return (run_id.group(), run.group(1))
00485     return None
00486 
00487 def get_relvaldata_cmssw_version(file):
00488     """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
00489     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00490     gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
00491     if not gr_r_version:
00492         gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
00493     if cmssw_release and gr_r_version:
00494         return (cmssw_release[0], gr_r_version[0])
00495 
00496 def get_relvaldata_version(file):
00497     """Returns tuple (CMSSW version, run version) for specified file."""
00498     cmssw_version = re.findall('DQM_V(\d*)_', file)
00499     run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
00500     if not run_version:
00501         run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
00502     if cmssw_version and run_version:
00503         return (int(cmssw_version[0]), int(run_version[0]))
00504 
00505 def get_relvaldata_max_version(files):
00506     """Returns file with maximum version at a) beggining of the file,
00507     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00508     max_file = files[0]
00509     max_v = get_relvaldata_version(files[0])
00510     for file in files:
00511         file_v = get_relvaldata_version(file)
00512         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00513             max_file = file
00514             max_v = file_v
00515     return max_file
00516 
00517 ##-------------------   Make files pairs:  RelVal utils   ---------------------
00518 def get_relval_version(file):
00519     """Returns tuple (CMSSW version, run version) for specified file."""
00520     cmssw_version = re.findall('DQM_V(\d*)_', file)
00521     run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
00522     if cmssw_version and run_version:
00523         return (int(cmssw_version[0]), int(run_version[0]))
00524 
00525 def get_relval_max_version(files):
00526     """Returns file with maximum version at a) beggining of the file,
00527     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00528     max_file = files[0]
00529     max_v = get_relval_version(files[0])
00530     for file in files:
00531         file_v = get_relval_version(file)
00532         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00533             max_file = file
00534             max_v = file_v
00535     return max_file
00536 
00537 def get_relval_cmssw_version(file):
00538     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00539     gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
00540     if cmssw_release and gr_r_version:
00541         return (cmssw_release[0], gr_r_version[0])
00542 
00543 def get_relval_id(file):
00544     """Returns unique relval ID (dataset name) for a given file."""
00545     dataset_name = re.findall('R\d{9}__([\w\d]*)__CMSSW_', file)
00546     return dataset_name[0]
00547 
00548 ##-------------------------  Make files pairs --------------------------
00549 def is_relvaldata(files):
00550     is_relvaldata_re = re.compile('_RelVal_')
00551     return any([is_relvaldata_re.search(filename) for filename in files])
00552 
00553 def make_files_pairs(files, verbose=True):
00554     ## Select functions to use
00555     if is_relvaldata(files):
00556         is_relval_data = True
00557         get_cmssw_version = get_relvaldata_cmssw_version
00558         get_id = get_relvaldata_id
00559         get_max_version = get_relvaldata_max_version
00560         # print 'Pairing Data RelVal files.'
00561     else:
00562         is_relval_data = False
00563         get_cmssw_version = get_relval_cmssw_version
00564         get_id = get_relval_id
00565         get_max_version = get_relval_max_version
00566         # print 'Pairing Monte Carlo RelVal files.'
00567 
00568     ## Divide files into groups
00569     versions_files = dict()
00570     for file in files:
00571         version = get_cmssw_version(file)
00572         if versions_files.has_key(version):
00573             versions_files[version].append(file)
00574         else:
00575             versions_files[version] = [file]
00576 
00577     ## Print the division into groups
00578     if verbose:
00579         print '\nFound versions:'
00580         for version in versions_files:
00581             print '%s: %d files' % (str(version),  len(versions_files[version]))
00582 
00583     if len(versions_files.keys()) <= 1:
00584         print '\nFound too little versions, there is nothing to pair. Exiting...\n'
00585         exit()
00586 
00587     ## Select two biggest groups.
00588     versions = versions_files.keys()
00589     sizes = [len(value) for value in versions_files.values()]
00590     v1 = versions[sizes.index(max(sizes))]
00591     versions.remove(v1)
00592     sizes.remove(max(sizes))
00593     v2 = versions[sizes.index(max(sizes))]
00594 
00595     ## Print two biggest groups.
00596     if verbose:
00597         print '\nPairing %s (%d files) and %s (%d files)' % (str(v1),
00598                 len(versions_files[v1]), str(v2), len(versions_files[v2]))
00599 
00600     ## Pairing two versions
00601     print '\nGot pairs:'
00602     pairs = []
00603     for unique_id in set([get_id(file) for file in versions_files[v1]]):
00604         if is_relval_data:
00605             dataset_re = re.compile(unique_id[0]+'_')
00606             run_re = re.compile(unique_id[1])
00607             c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
00608             c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
00609         else:
00610             dataset_re = re.compile(unique_id+'_')
00611             c1_files = [file for file in versions_files[v1] if dataset_re.search(file)]
00612             c2_files = [file for file in versions_files[v2] if dataset_re.search(file)]
00613 
00614         if len(c1_files) > 0 and len(c2_files) > 0:
00615             first_file = get_max_version(c1_files)
00616             second_file = get_max_version(c2_files)
00617             print '%s\n%s\n' % (first_file, second_file)
00618             pairs.extend((first_file, second_file))
00619     if verbose:
00620         print "Paired and got %d files.\n" % len(pairs)
00621     return pairs