CMS 3D CMS Logo

/afs/cern.ch/work/a/aaltunda/public/www/CMSSW_5_3_13_patch3/src/Utilities/RelMon/python/utils.py

Go to the documentation of this file.
00001 ################################################################################
00002 # RelMon: a tool for automatic Release Comparison                              
00003 # https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
00004 #
00005 # $Author: anorkus $
00006 # $Date: 2013/03/18 14:29:46 $
00007 # $Revision: 1.13 $
00008 #
00009 #                                                                              
00010 # Danilo Piparo CERN - danilo.piparo@cern.ch                                   
00011 #                                                                              
00012 ################################################################################
00013 
00014 
00015 import array
00016 import os
00017 import re
00018 import sys
00019 from cPickle import load
00020 from os.path import dirname,basename,join,isfile
00021 from threading import Thread
00022 from time import asctime
00023 
00024 theargv=sys.argv
00025 sys.argv=[]
00026 from ROOT import *
00027 import ROOT
00028 ROOT.gErrorIgnoreLevel=1001
00029 ROOT.gROOT.SetBatch(True)
00030 sys.argv=theargv
00031 
00032 from urllib2  import Request,build_opener,urlopen
00033 
00034 if os.environ.has_key("RELMON_SA"):
00035   from definitions import *
00036   from authentication import X509CertOpen
00037   from utils import __file__ as this_module_name  
00038 else:
00039   from Utilities.RelMon.definitions import *
00040   from Utilities.RelMon.authentication import X509CertOpen
00041   from Utilities.RelMon.utils import __file__ as this_module_name
00042 
00043 #ROOT.gErrorIgnoreLevel=1001
00044 
00045 
00046 _log_level=10
00047 def logger(msg_level,message):
00048   if msg_level>=_log_level:
00049     print "[%s] %s" %(asctime(),message)
00050 
00051 #-------------------------------------------------------------------------------
00052 def setTDRStyle():  
00053   this_dir=dirname(this_module_name)
00054   this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
00055   #this_dir_two_up=this_dir_one_up[:this_dir_one_up.rfind("/")+1]
00056   style_file=''
00057   if os.environ.has_key("RELMON_SA"):
00058     style_file=this_dir_one_up+"data/tdrstyle_mod.C"
00059   else:
00060     style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
00061   try:
00062     gROOT.ProcessLine(".L %s" %style_file)
00063     gROOT.ProcessLine("setTDRStyle()")
00064   except:
00065     "Print could not set the TDR style. File %s not found?" %style_file
00066     
00067 
00068 #-------------------------------------------------------------------------------
00069 def literal2root (literal,rootType):
00070   bitsarray = array.array('B')
00071   bitsarray.fromstring(literal.decode('hex'))
00072 
00073   tbuffer=0
00074   try:  
00075       tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
00076   except:
00077       print "could not transform to object array:"
00078       print [ i for i in  bitsarray ]
00079   
00080   # replace a couple of shortcuts with the real root class name
00081   if rootType == 'TPROF':
00082       rootType = 'TProfile'
00083   if rootType == 'TPROF2D':
00084       rootType = 'TProfile2D'
00085   
00086   root_class=eval(rootType+'.Class()')
00087   
00088   return tbuffer.ReadObject(root_class)
00089   
00090 #-------------------------------------------------------------------------------
00091 
00092 def getNbins(h):
00093   biny=h.GetNbinsY()
00094   if biny>1:biny+=1
00095   binz=h.GetNbinsZ()
00096   if binz>1:binz+=1
00097   return (h.GetNbinsX()+1)*(biny)*(binz)
00098 
00099 #-------------------------------------------------------------------------------
00100 
00101 
00102 class StatisticalTest(object):
00103   def __init__(self,threshold):
00104     self.name=""
00105     self.h1=None
00106     self.h2=None
00107     self.threshold=float(threshold)
00108     self.rank=-1
00109     self.is_init=False
00110 
00111   def set_operands(self,h1,h2):
00112     self.h1=h1
00113     self.h2=h2
00114 
00115   def get_rank(self):
00116     if not self.is_init:
00117       if self.rank < 0:
00118         type1=type(self.h1)
00119         type2=type(self.h2)
00120         if (type1 != type2):
00121           logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
00122           self.rank=test_codes["DIFF_TYPES"]
00123         elif not self.h2.InheritsFrom("TH1"):
00124           logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
00125           self.rank=test_codes["NO_HIST"]    
00126         # if histos are empty
00127         #elif self.h1.InheritsFrom("TH2") and not "BinToBin" in self.name:
00128           ## 2D!
00129           #return test_codes["2D"]
00130         else:
00131           is_empty1=is_empty(self.h1)
00132           is_empty2=is_empty(self.h2)
00133           are_empty=is_empty1 and is_empty2
00134           one_empty=is_empty1 or is_empty2
00135 
00136           Nbins1= getNbins(self.h1)
00137           Nbins2= getNbins(self.h2)
00138 
00139           if are_empty:
00140             #return -103
00141             # Conversation with JeanRoch and David 5 April
00142             return 1
00143           elif one_empty:
00144             #return -103
00145             # Conversation with JeanRoch and David 5 April
00146             return 1
00147 
00148           # if histos have different number of bins
00149           if Nbins1!=Nbins2:
00150             return test_codes["DIFF_BIN"]
00151 
00152       self.rank=self.do_test()
00153     self.is_init=True
00154     return self.rank
00155 
00156   def get_status(self):
00157     status = SUCCESS
00158     if self.get_rank()<0:
00159       status=NULL
00160       logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))  
00161     elif self.get_rank() < self.threshold:
00162       status=FAIL
00163     logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00164     return status
00165 
00166   def do_test(self):
00167     pass
00168 
00169 #-------------------------------------------------------------------------------
00170 
00171 def is_empty(h):
00172   for i in xrange(1,getNbins(h)):
00173     if h.GetBinContent(i)!=0: return False
00174   return True
00175   #return h.GetSumOfWeights()==0
00176 
00177 #-------------------------------------------------------------------------------
00178 
00179 def is_sparse(h):
00180   filled_bins=0.
00181   nbins=h.GetNbinsX()
00182   for ibin in xrange(nbins):
00183     if h.GetBinContent(ibin)>0:
00184       filled_bins+=1
00185   #print "%s %s --> %s" %(filled_bins,nbins,filled_bins/nbins)
00186   if filled_bins/nbins < .5:
00187     return True
00188   else:
00189     return False
00190 
00191 #-------------------------------------------------------------------------------
00192 
00193 class KS(StatisticalTest):
00194   def __init__(self, threshold):
00195    StatisticalTest.__init__(self,threshold)
00196    self.name="KS"
00197 
00198   def do_test(self):
00199 
00200     # Calculate errors if not there...
00201     for h in self.h1,self.h2:
00202       w2s=h.GetSumw2()
00203       if w2s.GetSize()==0:
00204         h.Sumw2()
00205 
00206     ## If errors are 0:
00207     #zero_errors=True
00208     #for h in self.h1,self.h2:
00209       #for ibin in xrange(Nbins1):
00210         #if h.GetBinError(ibin+1) >0:
00211           #zero_errors=False
00212           #break
00213     #if zero_errors:
00214       #return test_codes["ZERO_ERR"]
00215 
00216     return self.h1.KolmogorovTest(self.h2)
00217 
00218 #-------------------------------------------------------------------------------
00219 import array
00220 def profile2histo(profile):
00221   if not profile.InheritsFrom("TH1"):
00222     return profile
00223     
00224   bin_low_edges=[]
00225   n_bins=profile.GetNbinsX()
00226   
00227   for ibin in xrange(1,n_bins+2):
00228     bin_low_edges.append(profile.GetBinLowEdge(ibin))
00229   bin_low_edges=array.array('f',bin_low_edges)
00230   histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
00231   for ibin in xrange(0,n_bins+1):
00232     histo.SetBinContent(ibin,profile.GetBinContent(ibin))
00233     histo.SetBinError(ibin,profile.GetBinError(ibin))    
00234   
00235   return histo
00236 #-------------------------------------------------------------------------------
00237 
00238 class Chi2(StatisticalTest):
00239   def __init__(self, threshold):
00240     StatisticalTest.__init__(self,threshold)
00241     self.name="Chi2"     
00242 
00243   def check_filled_bins(self,min_filled):
00244     nbins=self.h1.GetNbinsX()
00245     n_filled_l=[]
00246     for h in self.h1,self.h2:
00247       nfilled=0.
00248       for ibin in xrange(1,nbins+1):
00249         if h.GetBinContent(ibin)>0:
00250           nfilled+=1
00251       n_filled_l.append(nfilled)
00252     return len(filter (lambda x:x>=min_filled,n_filled_l) )>0
00253 
00254   def absval(self):
00255     nbins=getNbins(self.h1)
00256     binc=0
00257     for i in xrange(1,nbins):
00258       for h in self.h1,self.h2:
00259         binc=h.GetBinContent(i)
00260         if binc<0:
00261           h.SetBinContent(i,-1*binc)
00262         if h.GetBinError(i)==0 and binc!=0:
00263           #print "Histo ",h.GetName()," Bin:",i,"-Content:",h.GetBinContent(i)," had zero error"
00264           h.SetBinContent(i,0)
00265 
00266   def check_histograms(self, histogram):
00267       if histogram.InheritsFrom("TProfile") or  (histogram.GetEntries()!=histogram.GetSumOfWeights()):
00268           return 'W'
00269       else:
00270           return 'U'
00271 
00272   def do_test(self):
00273     self.absval()
00274     if self.check_filled_bins(3):
00275       #if self.h1.InheritsFrom("TProfile") or  (self.h1.GetEntries()!=self.h1.GetSumOfWeights()):
00276       #  chi2=self.h1.Chi2Test(self.h2,'WW')
00277       #  #if chi2==0: print "DEBUG",self.h1.GetName(),"Chi2 is:", chi2
00278       #  return chi2
00279       #else:
00280       #  return self.h1.Chi2Test(self.h2,'UU')
00281       hist1 = self.check_histograms(self.h1)
00282       hist2 = self.check_histograms(self.h2)
00283       if hist1 =='W' and hist2 =='W': ##in case 
00284           chi2 = self.h1.Chi2Test(self.h2,'WW')     ## the both histograms are weighted
00285           return chi2
00286       elif hist1 == 'U' and hist2 == 'U':
00287           chi2 = self.h1.Chi2Test(self.h2,'UU')    ##the both histograms are unweighted
00288           return chi2
00289       elif hist1 == 'U' and hist2 == 'W':
00290           chi2 = self.h1.Chi2Test(self.h2,'UW')   ## 1st histogram is unweighted, 2nd weighted
00291           return chi2
00292       elif hist1 == 'W' and hist2 == 'U':
00293           chi2 = self.h2.Chi2Test(self.h1,'UW')   ## 1 is wieghted, 2nd unweigthed. so flip order to make a UW comparison
00294           return chi2
00295     else:
00296       return 1
00297       #return test_codes["FEW_BINS"]
00298 
00299 #-------------------------------------------------------------------------------
00300 
00301 class BinToBin(StatisticalTest):
00302   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
00303   bins is different. It is % of corresponding bins otherwhise.
00304   A threshold of 1 is needed to require a 1 to 1 correspondance between 
00305   hisograms.
00306   """
00307   def __init__(self, threshold=1):
00308     StatisticalTest.__init__(self, threshold)
00309     self.name='BinToBin'
00310     self.epsilon= 0.000001
00311 
00312   def checkBinningMatches(self):
00313     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00314            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00315            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00316            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00317            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00318            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00319            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00320            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00321            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00322       return False
00323     return True
00324 
00325   def do_test(self):
00326     # fist check that binning matches
00327     if not self.checkBinningMatches():
00328       return test_codes["DIFF_BIN"]
00329     # then do the real check
00330     equal = 1
00331     nbins = getNbins(self.h1)
00332     n_ok_bins=0.0
00333     for ibin in xrange(0,nbins):
00334       h1bin=self.h1.GetBinContent(ibin)
00335       h2bin=self.h2.GetBinContent(ibin)
00336       bindiff=h1bin-h2bin
00337 
00338       binavg=.5*(h1bin+h2bin)
00339 
00340       if binavg==0 or abs(bindiff) < self.epsilon:
00341         n_ok_bins+=1
00342         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00343       else:
00344         print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00345 
00346       #if abs(bindiff)!=0 :
00347         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00348     
00349     rank=n_ok_bins/nbins
00350     
00351     if rank!=1:
00352       print "Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00353     
00354     return rank
00355 
00356 #-------------------------------------------------------------------------------
00357 
00358 class BinToBin1percent(StatisticalTest):
00359   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
00360   bins is different. It is % of corresponding bins otherwhise.
00361   A threshold of 1 is needed to require a 1 to 1 correspondance between 
00362   hisograms.
00363   """
00364   def __init__(self, threshold=1):
00365     StatisticalTest.__init__(self, threshold)
00366     self.name='BinToBin1percent'
00367     self.epsilon= 0.000001
00368     self.tolerance= 0.01
00369 
00370   def checkBinningMatches(self):
00371     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00372            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00373            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00374            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00375            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00376            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00377            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00378            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00379            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00380       return False
00381     return True
00382 
00383   def do_test(self):
00384     # fist check that binning matches
00385     if not self.checkBinningMatches():
00386       return test_codes["DIFF_BIN"]
00387     # then do the real check
00388     equal = 1
00389     nbins = getNbins(self.h1)
00390     n_ok_bins=0.0
00391     for ibin in xrange(0,nbins):
00392       ibin+=1
00393       h1bin=self.h1.GetBinContent(ibin)
00394       h2bin=self.h2.GetBinContent(ibin)
00395       bindiff=h1bin-h2bin
00396 
00397       binavg=.5*(h1bin+h2bin)
00398 
00399       if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
00400         n_ok_bins+=1
00401         #print "Bin %i bin: bindiff %s" %(ibin,bindiff)
00402       else:
00403         print "-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin)
00404 
00405       #if abs(bindiff)!=0 :
00406         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00407     
00408     rank=n_ok_bins/nbins
00409     
00410     if rank!=1:
00411       print "%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00412     
00413     return rank
00414 #-------------------------------------------------------------------------------
00415 Statistical_Tests={"KS":KS,
00416                    "Chi2":Chi2,
00417                    "BinToBin":BinToBin,
00418                    "BinToBin1percent":BinToBin1percent,
00419                    "Bin2Bin":BinToBin,
00420                    "b2b":BinToBin,}
00421 #-------------------------------------------------------------------------------  
00422 
00423 def ask_ok(prompt, retries=4, complaint='yes or no'):
00424     while True:
00425         ok = raw_input(prompt)
00426         if ok in ('y', 'ye', 'yes'):
00427             return True
00428         if ok in ('n', 'no'):
00429             return False
00430         retries = retries - 1
00431         if retries < 0:
00432             raise IOError('refusenik user')
00433         print complaint
00434 
00435 #-------------------------------------------------------------------------------
00436 
00437 class unpickler(Thread):
00438   def __init__(self,filename):
00439     Thread.__init__(self)
00440     self.filename=filename
00441     self.directory=""
00442 
00443   def run(self):
00444     print "Reading directory from %s" %(self.filename)
00445     ifile=open(self.filename,"rb")
00446     self.directory=load(ifile) 
00447     ifile.close()
00448 
00449 #-------------------------------------------------------------------------------    
00450 
00451 def wget(url):
00452   """ Fetch the WHOLE file, not in bunches... To be optimised.
00453   """
00454   opener=build_opener(X509CertOpen())  
00455   datareq = Request(url)
00456   datareq.add_header('authenticated_wget', "The ultimate wgetter")    
00457   bin_content=None
00458   try:
00459     filename=basename(url)  
00460     print "Checking existence of file %s on disk..."%filename
00461     if not isfile("./%s"%filename):      
00462       bin_content=opener.open(datareq).read()
00463     else:
00464       print "File %s exists, skipping.." %filename
00465   except ValueError:
00466     print "Error: Unknown url %s" %url
00467   
00468   if bin_content!=None:  
00469     ofile = open(filename, 'wb')
00470     ofile.write(bin_content)
00471     ofile.close()
00472 
00473 #-------------------------------------------------------------------------------
00474 ##-----------------   Make files pairs:  RelValData utils   --------------------
00475 
00476 def get_relvaldata_id(file):
00477     """Returns unique relvaldata ID for a given file."""
00478     run_id = re.search('R\d{9}', file)
00479     run = re.search('_RelVal_([\w\d]*)-v\d__', file)
00480     if not run:
00481         run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
00482     if run_id and run:
00483         return (run_id.group(), run.group(1))
00484     return None
00485 
00486 def get_relvaldata_cmssw_version(file):
00487     """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
00488     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00489     gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
00490     if not gr_r_version:
00491         gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
00492     if cmssw_release and gr_r_version:
00493         return (cmssw_release[0], gr_r_version[0])
00494 
00495 def get_relvaldata_version(file):
00496     """Returns tuple (CMSSW version, run version) for specified file."""
00497     cmssw_version = re.findall('DQM_V(\d*)_', file)
00498     run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
00499     if not run_version:
00500         run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
00501     if cmssw_version and run_version:
00502         return (int(cmssw_version[0]), int(run_version[0]))
00503 
00504 def get_relvaldata_max_version(files):
00505     """Returns file with maximum version at a) beggining of the file,
00506     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00507     max_file = files[0]
00508     max_v = get_relvaldata_version(files[0])
00509     for file in files:
00510         file_v = get_relvaldata_version(file)
00511         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00512             max_file = file
00513             max_v = file_v
00514     return max_file
00515 
00516 ##-------------------   Make files pairs:  RelVal utils   ---------------------
00517 def get_relval_version(file):
00518     """Returns tuple (CMSSW version, run version) for specified file."""
00519     cmssw_version = re.findall('DQM_V(\d*)_', file)
00520     run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
00521     if cmssw_version and run_version:
00522         return (int(cmssw_version[0]), int(run_version[0]))
00523 
00524 def get_relval_max_version(files):
00525     """Returns file with maximum version at a) beggining of the file,
00526     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00527     max_file = files[0]
00528     max_v = get_relval_version(files[0])
00529     for file in files:
00530         file_v = get_relval_version(file)
00531         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00532             max_file = file
00533             max_v = file_v
00534     return max_file
00535 
00536 def get_relval_cmssw_version(file):
00537     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00538     gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(_[\w\d]*)?-v', file)
00539     if cmssw_release and gr_r_version:
00540         return (cmssw_release[0], gr_r_version[0])
00541 
00542 def get_relval_id(file):
00543     """Returns unique relval ID (dataset name) for a given file."""
00544     dataset_name = re.findall('R\d{9}__([\w\d]*)__CMSSW_', file)
00545     return dataset_name[0]
00546 
00547 ##-------------------------  Make files pairs --------------------------
00548 def is_relvaldata(files):
00549     is_relvaldata_re = re.compile('_RelVal_')
00550     return any([is_relvaldata_re.search(filename) for filename in files])
00551 
00552 def make_files_pairs(files, verbose=True):
00553     ## Select functions to use
00554     if is_relvaldata(files):
00555         is_relval_data = True
00556         get_cmssw_version = get_relvaldata_cmssw_version
00557         get_id = get_relvaldata_id
00558         get_max_version = get_relvaldata_max_version
00559         # print 'Pairing Data RelVal files.'
00560     else:
00561         is_relval_data = False
00562         get_cmssw_version = get_relval_cmssw_version
00563         get_id = get_relval_id
00564         get_max_version = get_relval_max_version
00565         # print 'Pairing Monte Carlo RelVal files.'
00566 
00567     ## Divide files into groups
00568     versions_files = dict()
00569     for file in files:
00570         version = get_cmssw_version(file)
00571         if versions_files.has_key(version):
00572             versions_files[version].append(file)
00573         else:
00574             versions_files[version] = [file]
00575 
00576     ## Print the division into groups
00577     if verbose:
00578         print '\nFound versions:'
00579         for version in versions_files:
00580             print '%s: %d files' % (str(version),  len(versions_files[version]))
00581 
00582     if len(versions_files.keys()) <= 1:
00583         print '\nFound too little versions, there is nothing to pair. Exiting...\n'
00584         exit()
00585 
00586     ## Select two biggest groups.
00587     versions = versions_files.keys()
00588     sizes = [len(value) for value in versions_files.values()]
00589     v1 = versions[sizes.index(max(sizes))]
00590     versions.remove(v1)
00591     sizes.remove(max(sizes))
00592     v2 = versions[sizes.index(max(sizes))]
00593 
00594     ## Print two biggest groups.
00595     if verbose:
00596         print '\nPairing %s (%d files) and %s (%d files)' % (str(v1),
00597                 len(versions_files[v1]), str(v2), len(versions_files[v2]))
00598 
00599     ## Pairing two versions
00600     print '\nGot pairs:'
00601     pairs = []
00602     for unique_id in set([get_id(file) for file in versions_files[v1]]):
00603         if is_relval_data:
00604             dataset_re = re.compile(unique_id[0]+'_')
00605             run_re = re.compile(unique_id[1])
00606             c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run_re.search(file)]
00607             c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run_re.search(file)]
00608         else:
00609             dataset_re = re.compile(unique_id+'_')
00610             c1_files = [file for file in versions_files[v1] if dataset_re.search(file)]
00611             c2_files = [file for file in versions_files[v2] if dataset_re.search(file)]
00612 
00613         if len(c1_files) > 0 and len(c2_files) > 0:
00614             first_file = get_max_version(c1_files)
00615             second_file = get_max_version(c2_files)
00616             print '%s\n%s\n' % (first_file, second_file)
00617             pairs.extend((first_file, second_file))
00618     if verbose:
00619         print "Paired and got %d files.\n" % len(pairs)
00620     return pairs