CMS 3D CMS Logo

/data/refman/pasoursint/CMSSW_5_3_3/src/Utilities/RelMon/python/utils.py

Go to the documentation of this file.
00001 ################################################################################
00002 # RelMon: a tool for automatic Release Comparison                              
00003 # https://twiki.cern.ch/twiki/bin/view/CMSPublic/RelMon
00004 #
00005 # $Author: agimbuta $
00006 # $Date: 2012/07/17 14:55:50 $
00007 # $Revision: 1.5 $
00008 #
00009 #                                                                              
00010 # Danilo Piparo CERN - danilo.piparo@cern.ch                                   
00011 #                                                                              
00012 ################################################################################
00013 
00014 
00015 import array
00016 import os
00017 import re
00018 import sys
00019 from cPickle import load
00020 from os.path import dirname,basename,join,isfile
00021 from threading import Thread
00022 from time import asctime
00023 
00024 theargv=sys.argv
00025 sys.argv=[]
00026 from ROOT import *
00027 import ROOT
00028 ROOT.gErrorIgnoreLevel=1001
00029 ROOT.gROOT.SetBatch(True)
00030 sys.argv=theargv
00031 
00032 from urllib2  import Request,build_opener,urlopen
00033 
00034 if os.environ.has_key("RELMON_SA"):
00035   from definitions import *
00036   from authentication import X509CertOpen
00037   from utils import __file__ as this_module_name  
00038 else:
00039   from Utilities.RelMon.definitions import *
00040   from Utilities.RelMon.authentication import X509CertOpen
00041   from Utilities.RelMon.utils import __file__ as this_module_name
00042 
00043 #ROOT.gErrorIgnoreLevel=1001
00044 
00045 
00046 _log_level=10
00047 def logger(msg_level,message):
00048   if msg_level>=_log_level:
00049     print "[%s] %s" %(asctime(),message)
00050 
00051 #-------------------------------------------------------------------------------
00052 def setTDRStyle():  
00053   this_dir=dirname(this_module_name)
00054   this_dir_one_up=this_dir[:this_dir.rfind("/")+1]
00055   #this_dir_two_up=this_dir_one_up[:this_dir_one_up.rfind("/")+1]
00056   style_file=''
00057   if os.environ.has_key("RELMON_SA"):
00058     style_file=this_dir_one_up+"data/tdrstyle_mod.C"
00059   else:
00060     style_file="%s/src/Utilities/RelMon/data/tdrstyle_mod.C"%(os.environ["CMSSW_BASE"])
00061   try:
00062     gROOT.ProcessLine(".L %s" %style_file)
00063     gROOT.ProcessLine("setTDRStyle()")
00064   except:
00065     "Print could not set the TDR style. File %s not found?" %style_file
00066     
00067 
00068 #-------------------------------------------------------------------------------
00069 def literal2root (literal,rootType):
00070   bitsarray = array.array('B')
00071   bitsarray.fromstring(literal.decode('hex'))
00072 
00073   tbuffer=0
00074   try:  
00075       tbuffer = TBufferFile(TBufferFile.kRead, len(bitsarray), bitsarray, False,0)
00076   except:
00077       print "could not transform to object array:"
00078       print [ i for i in  bitsarray ]
00079   
00080   # replace a couple of shortcuts with the real root class name
00081   if rootType == 'TPROF':
00082       rootType = 'TProfile'
00083   if rootType == 'TPROF2D':
00084       rootType = 'TProfile2D'
00085   
00086   root_class=eval(rootType+'.Class()')
00087   
00088   return tbuffer.ReadObject(root_class)
00089   
00090 #-------------------------------------------------------------------------------
00091 
00092 def getNbins(h):
00093   biny=h.GetNbinsY()
00094   if biny>1:biny+=1
00095   binz=h.GetNbinsZ()
00096   if binz>1:binz+=1
00097   return (h.GetNbinsX()+1)*(biny)*(binz)
00098 
00099 #-------------------------------------------------------------------------------
00100 
00101 
00102 class StatisticalTest(object):
00103   def __init__(self,threshold):
00104     self.name=""
00105     self.h1=None
00106     self.h2=None
00107     self.threshold=float(threshold)
00108     self.rank=-1
00109     self.is_init=False
00110 
00111   def set_operands(self,h1,h2):
00112     self.h1=h1
00113     self.h2=h2
00114 
00115   def get_rank(self):
00116     if not self.is_init:
00117       if self.rank<0:
00118         type1=type(self.h1)
00119         type2=type(self.h2)
00120         if (type1 != type2):
00121           logger(1,"*** ERROR: object types in comparison don't match: %s!=%s" %(type1,type2))
00122           self.rank=test_codes["DIFF_TYPES"]
00123         elif not self.h2.InheritsFrom("TH1"):
00124           logger(1,"*** ERROR: object type is not histogram but a %s" %(type1))
00125           self.rank=test_codes["NO_HIST"]    
00126         # if histos are empty
00127         #elif self.h1.InheritsFrom("TH2") and not "BinToBin" in self.name:
00128           ## 2D!
00129           #return test_codes["2D"]
00130         else:
00131           is_empty1=is_empty(self.h1)
00132           is_empty2=is_empty(self.h2)
00133           are_empty=is_empty1 and is_empty2
00134           one_empty=is_empty1 or is_empty2
00135 
00136           Nbins1= getNbins(self.h1)
00137           Nbins2= getNbins(self.h2)
00138 
00139           if are_empty:
00140             #return -103
00141             # Conversation with JeanRoch and David 5 April
00142             return 1
00143           elif one_empty:
00144             #return -103
00145             # Conversation with JeanRoch and David 5 April
00146             return 1
00147 
00148           # if histos have different number of bins
00149           if Nbins1!=Nbins2:
00150             return test_codes["DIFF_BIN"]
00151 
00152       self.rank=self.do_test()
00153     self.is_init=True
00154     return self.rank
00155 
00156   def get_status(self):
00157     status = SUCCESS
00158     if self.get_rank()<0:
00159       status=NULL
00160       logger(0,"+++ Test %s FAILED: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))  
00161     elif self.get_rank()<=self.threshold:
00162       status=FAIL
00163     logger(0,"+++ Test %s: rank is %s and threshold is %s ==> %s" %(self.name, self.rank, self.threshold, status))
00164     return status
00165 
00166   def do_test(self):
00167     pass
00168 
00169 #-------------------------------------------------------------------------------
00170 
00171 def is_empty(h):
00172   for i in xrange(1,getNbins(h)):
00173     if h.GetBinContent(i)!=0: return False
00174   return True
00175   #return h.GetSumOfWeights()==0
00176 
00177 #-------------------------------------------------------------------------------
00178 
00179 def is_sparse(h):
00180   filled_bins=0.
00181   nbins=h.GetNbinsX()
00182   for ibin in xrange(nbins):
00183     if h.GetBinContent(ibin)>0:
00184       filled_bins+=1
00185   #print "%s %s --> %s" %(filled_bins,nbins,filled_bins/nbins)
00186   if filled_bins/nbins < .5:
00187     return True
00188   else:
00189     return False
00190 
00191 #-------------------------------------------------------------------------------
00192 
00193 class KS(StatisticalTest):
00194   def __init__(self, threshold):
00195    StatisticalTest.__init__(self,threshold)
00196    self.name="KS"
00197 
00198   def do_test(self):
00199 
00200     # Calculate errors if not there...
00201     for h in self.h1,self.h2:
00202       w2s=h.GetSumw2()
00203       if w2s.GetSize()==0:
00204         h.Sumw2()
00205 
00206     ## If errors are 0:
00207     #zero_errors=True
00208     #for h in self.h1,self.h2:
00209       #for ibin in xrange(Nbins1):
00210         #if h.GetBinError(ibin+1) >0:
00211           #zero_errors=False
00212           #break
00213     #if zero_errors:
00214       #return test_codes["ZERO_ERR"]
00215 
00216     return self.h1.KolmogorovTest(self.h2)
00217 
00218 #-------------------------------------------------------------------------------
00219 import array
00220 def profile2histo(profile):
00221   if not profile.InheritsFrom("TH1"):
00222     return profile
00223     
00224   bin_low_edges=[]
00225   n_bins=profile.GetNbinsX()
00226   
00227   for ibin in xrange(1,n_bins+2):
00228     bin_low_edges.append(profile.GetBinLowEdge(ibin))
00229   bin_low_edges=array.array('f',bin_low_edges)
00230   histo=TH1F(profile.GetName(),profile.GetTitle(),n_bins,bin_low_edges)
00231   for ibin in xrange(0,n_bins+1):
00232     histo.SetBinContent(ibin,profile.GetBinContent(ibin))
00233     histo.SetBinError(ibin,profile.GetBinError(ibin))    
00234   
00235   return histo
00236 #-------------------------------------------------------------------------------
00237 
00238 class Chi2(StatisticalTest):
00239   def __init__(self, threshold):
00240     StatisticalTest.__init__(self,threshold)
00241     self.name="Chi2"     
00242 
00243   def check_filled_bins(self,min_filled):
00244     nbins=self.h1.GetNbinsX()
00245     n_filled_l=[]
00246     for h in self.h1,self.h2:
00247       nfilled=0.
00248       for ibin in xrange(1,nbins+1):
00249         if h.GetBinContent(ibin)>0:
00250           nfilled+=1
00251       n_filled_l.append(nfilled)
00252     return len(filter (lambda x:x>=min_filled,n_filled_l) )>0
00253 
00254   def absval(self):
00255     nbins=getNbins(self.h1)
00256     binc=0
00257     for i in xrange(1,nbins):
00258       for h in self.h1,self.h2:
00259         binc=h.GetBinContent(i)
00260         if binc<0:
00261           h.SetBinContent(i,-1*binc)
00262         if h.GetBinError(i)==0 and binc!=0:
00263           #print "Histo ",h.GetName()," Bin:",i,"-Content:",h.GetBinContent(i)," had zero error"
00264           h.SetBinContent(i,0)
00265 
00266 
00267   def do_test(self):
00268     self.absval()
00269     if self.check_filled_bins(3):
00270       if self.h1.InheritsFrom("TProfile") or  (self.h1.GetEntries()!=self.h1.GetSumOfWeights()):
00271         chi2=self.h1.Chi2Test(self.h2,'WW')
00272         #if chi2==0: print "DEBUG",self.h1.GetName(),"Chi2 is:", chi2
00273         return chi2
00274       else:
00275         return self.h1.Chi2Test(self.h2,'UU')
00276     else:
00277       return 1
00278       #return test_codes["FEW_BINS"]
00279 
00280 #-------------------------------------------------------------------------------
00281 
00282 class BinToBin(StatisticalTest):
00283   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
00284   bins is different. It is % of corresponding bins otherwhise.
00285   A threshold of 1 is needed to require a 1 to 1 correspondance between 
00286   hisograms.
00287   """
00288   def __init__(self, threshold=1):
00289     StatisticalTest.__init__(self, threshold)
00290     self.name='BinToBin'
00291     self.epsilon= 0.000001
00292 
00293   def checkBinningMatches(self):
00294     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00295            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00296            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00297            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00298            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00299            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00300            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00301            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00302            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00303       return False
00304     return True
00305 
00306   def do_test(self):
00307     # fist check that binning matches
00308     if not self.checkBinningMatches():
00309       return test_codes["DIFF_BIN"]
00310     # then do the real check
00311     equal = 1
00312     nbins = getNbins(self.h1)
00313     n_ok_bins=0.0
00314     for ibin in xrange(0,nbins):
00315       ibin+=1
00316       h1bin=self.h1.GetBinContent(ibin)
00317       h2bin=self.h2.GetBinContent(ibin)
00318       bindiff=h1bin-h2bin
00319 
00320       binavg=.5*(h1bin+h2bin)
00321 
00322       if binavg==0 or abs(bindiff) < self.epsilon:
00323         n_ok_bins+=1
00324         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00325       else:
00326         print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00327 
00328       #if abs(bindiff)!=0 :
00329         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00330     
00331     rank=n_ok_bins/nbins
00332     
00333     if rank!=1:
00334       print "Histogram %s differs: nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00335     
00336     return rank
00337 
00338 #-------------------------------------------------------------------------------
00339 
00340 class BinToBin1percent(StatisticalTest):
00341   """The bin to bin comparison builds a fake pvalue. It is 0 if the number of 
00342   bins is different. It is % of corresponding bins otherwhise.
00343   A threshold of 1 is needed to require a 1 to 1 correspondance between 
00344   hisograms.
00345   """
00346   def __init__(self, threshold=1):
00347     StatisticalTest.__init__(self, threshold)
00348     self.name='BinToBin1percent'
00349     self.epsilon= 0.000001
00350     self.tolerance= 0.01
00351 
00352   def checkBinningMatches(self):
00353     if self.h1.GetNbinsX() != self.h2.GetNbinsX() \
00354            or self.h1.GetNbinsY() != self.h2.GetNbinsY() \
00355            or self.h1.GetNbinsZ() != self.h2.GetNbinsZ() \
00356            or abs(self.h1.GetXaxis().GetXmin() - self.h2.GetXaxis().GetXmin()) >self.epsilon \
00357            or abs(self.h1.GetYaxis().GetXmin() - self.h2.GetYaxis().GetXmin()) >self.epsilon \
00358            or abs(self.h1.GetZaxis().GetXmin() - self.h2.GetZaxis().GetXmin()) >self.epsilon \
00359            or abs(self.h1.GetXaxis().GetXmax() - self.h2.GetXaxis().GetXmax()) >self.epsilon \
00360            or abs(self.h1.GetYaxis().GetXmax() - self.h2.GetYaxis().GetXmax()) >self.epsilon \
00361            or abs(self.h1.GetZaxis().GetXmax() - self.h2.GetZaxis().GetXmax()) >self.epsilon:
00362       return False
00363     return True
00364 
00365   def do_test(self):
00366     # fist check that binning matches
00367     if not self.checkBinningMatches():
00368       return test_codes["DIFF_BIN"]
00369     # then do the real check
00370     equal = 1
00371     nbins = getNbins(self.h1)
00372     n_ok_bins=0.0
00373     for ibin in xrange(0,nbins):
00374       ibin+=1
00375       h1bin=self.h1.GetBinContent(ibin)
00376       h2bin=self.h2.GetBinContent(ibin)
00377       bindiff=h1bin-h2bin
00378 
00379       binavg=.5*(h1bin+h2bin)
00380 
00381       if binavg==0 or 100*abs(bindiff)/binavg < self.tolerance:
00382         n_ok_bins+=1
00383         #print "Bin %i bin: bindiff %s" %(ibin,bindiff)
00384       else:
00385         print "-->Bin %i bin: bindiff %s (%s - %s )" %(ibin,bindiff,h1bin,h2bin)
00386 
00387       #if abs(bindiff)!=0 :
00388         #print "Bin %ibin: bindiff %s" %(ibin,bindiff)
00389     
00390     rank=n_ok_bins/nbins
00391     
00392     if rank!=1:
00393       print "%s nok: %s ntot: %s" %(self.h1.GetName(),n_ok_bins,nbins)
00394     
00395     return rank
00396 #-------------------------------------------------------------------------------
00397 Statistical_Tests={"KS":KS,
00398                    "Chi2":Chi2,
00399                    "BinToBin":BinToBin,
00400                    "BinToBin1percent":BinToBin1percent,
00401                    "Bin2Bin":BinToBin,
00402                    "b2b":BinToBin,}
00403 #-------------------------------------------------------------------------------  
00404 
00405 def ask_ok(prompt, retries=4, complaint='yes or no'):
00406     while True:
00407         ok = raw_input(prompt)
00408         if ok in ('y', 'ye', 'yes'):
00409             return True
00410         if ok in ('n', 'no'):
00411             return False
00412         retries = retries - 1
00413         if retries < 0:
00414             raise IOError('refusenik user')
00415         print complaint
00416 
00417 #-------------------------------------------------------------------------------
00418 
00419 class unpickler(Thread):
00420   def __init__(self,filename):
00421     Thread.__init__(self)
00422     self.filename=filename
00423     self.directory=""
00424 
00425   def run(self):
00426     print "Reading directory from %s" %(self.filename)
00427     ifile=open(self.filename,"rb")
00428     self.directory=load(ifile) 
00429     ifile.close()
00430 
00431 #-------------------------------------------------------------------------------    
00432 
00433 def wget(url):
00434   """ Fetch the WHOLE file, not in bunches... To be optimised.
00435   """
00436   opener=build_opener(X509CertOpen())  
00437   datareq = Request(url)
00438   datareq.add_header('authenticated_wget', "The ultimate wgetter")    
00439   bin_content=None
00440   try:
00441     filename=basename(url)  
00442     print "Checking existence of file %s on disk..."%filename
00443     if not isfile("./%s"%filename):      
00444       bin_content=opener.open(datareq).read()
00445     else:
00446       print "File %s exists, skipping.." %filename
00447   except ValueError:
00448     print "Error: Unknown url %s" %url
00449   
00450   if bin_content!=None:  
00451     ofile = open(filename, 'wb')
00452     ofile.write(bin_content)
00453     ofile.close()
00454 
00455 #-------------------------------------------------------------------------------
00456 ##-----------------   Make files pairs:  RelValData utils   --------------------
00457 
00458 def get_relvaldata_id(file):
00459     """Returns unique relvaldata ID for a given file."""
00460     run_id = re.search('R\d{9}', file)
00461     run = re.search('_RelVal_([\w\d]*)-v\d__', file)
00462     if not run:
00463         run = re.search('GR_R_\d*_V\d*C?_([\w\d]*)-v\d__', file)
00464     if run_id and run:
00465         return (run_id.group(), run.group(1))
00466     return None
00467 
00468 def get_relvaldata_cmssw_version(file):
00469     """Returns tuple (CMSSW release, GR_R version) for specified RelValData file."""
00470     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00471     gr_r_version = re.findall('-(GR_R_\d*_V\d*\w?)(?:_RelVal)?_', file)
00472     if not gr_r_version:
00473         gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-(\w*)_RelVal_', file)
00474     if cmssw_release and gr_r_version:
00475         return (cmssw_release[0], gr_r_version[0])
00476 
00477 def get_relvaldata_version(file):
00478     """Returns tuple (CMSSW version, run version) for specified file."""
00479     cmssw_version = re.findall('DQM_V(\d*)_', file)
00480     run_version = re.findall('_RelVal_[\w\d]*-v(\d)__', file)
00481     if not run_version:
00482         run_version = re.findall('GR_R_\d*_V\d*C?_[\w\d]*-v(\d)__', file)
00483     if cmssw_version and run_version:
00484         return (int(cmssw_version[0]), int(run_version[0]))
00485 
00486 def get_relvaldata_max_version(files):
00487     """Returns file with maximum version at a) beggining of the file,
00488     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00489     max_file = files[0]
00490     max_v = get_relvaldata_version(files[0])
00491     for file in files:
00492         file_v = get_relvaldata_version(file)
00493         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00494             max_file = file
00495             max_v = file_v
00496     return max_file
00497 
00498 ##-------------------   Make files pairs:  RelVal utils   ---------------------
00499 def get_relval_version(file):
00500     """Returns tuple (CMSSW version, run version) for specified file."""
00501     cmssw_version = re.findall('DQM_V(\d*)_', file)
00502     run_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_[\w\d]*)?-v(\d*)__', file)
00503     if cmssw_version and run_version:
00504         return (int(cmssw_version[0]), int(run_version[0]))
00505 
00506 def get_relval_max_version(files):
00507     """Returns file with maximum version at a) beggining of the file,
00508     e.g. DQM_V000M b) at the end of run, e.g. _run2012-vM. M has to be max."""
00509     max_file = files[0]
00510     max_v = get_relval_version(files[0])
00511     for file in files:
00512         file_v = get_relval_version(file)
00513         if file_v[1] > max_v[1] or ((file_v[1] == max_v[1]) and (file_v[0] > max_v[0])):
00514             max_file = file
00515             max_v = file_v
00516     return max_file
00517 
00518 def get_relval_cmssw_version(file):
00519     cmssw_release = re.findall('(CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?)-', file)
00520     gr_r_version = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-([\w\d]*)_V\d*\w?(?:_[\w\d]*)?-v', file)
00521     if cmssw_release and gr_r_version:
00522         return (cmssw_release[0], gr_r_version[0])
00523 
00524 def get_relval_id(file):
00525     """Returns unique relval ID (dataset name) for a given file."""
00526     dataset_name = re.findall('R\d{9}__([\w\d]*)__CMSSW_', file)
00527     run = re.findall('CMSSW_\d*_\d*_\d*(?:_[\w\d]*)?-[\w\d]*_V\d*\w?(?:_([\w\d]*))?-v\d*__', file)
00528     if run[0]:
00529         return (dataset_name[0], run[0])
00530     else:
00531         return (dataset_name[0], '_V\d*\w?-v\d*__')
00532 
00533 ##-------------------------  Make files pairs --------------------------
00534 def is_relvaldata(files):
00535     is_relvaldata_re = re.compile('_RelVal_')
00536     return any([is_relvaldata_re.search(filename) for filename in files])
00537 
00538 def make_files_pairs(files, verbose=True):
00539     ## Select functions to use
00540     if is_relvaldata(files):
00541         get_cmssw_version = get_relvaldata_cmssw_version
00542         get_id = get_relvaldata_id
00543         get_max_version = get_relvaldata_max_version
00544         # print 'Pairing Data RelVal files.'
00545     else:
00546         get_cmssw_version = get_relval_cmssw_version
00547         get_id = get_relval_id
00548         get_max_version = get_relval_max_version
00549         # print 'Pairing Monte Carlo RelVal files.'
00550 
00551     ## Divide files into groups
00552     versions_files = dict()
00553     for file in files:
00554         version = get_cmssw_version(file)
00555         if versions_files.has_key(version):
00556             versions_files[version].append(file)
00557         else:
00558             versions_files[version] = [file]
00559 
00560     ## Print the division into groups
00561     if verbose:
00562         print '\nFound versions:'
00563         for version in versions_files:
00564             print '%s: %d files' % (str(version),  len(versions_files[version]))
00565 
00566     ## Select two biggest groups.
00567     versions = versions_files.keys()
00568     sizes = [len(value) for value in versions_files.values()]
00569     v1 = versions[sizes.index(max(sizes))]
00570     versions.remove(v1)
00571     sizes.remove(max(sizes))
00572     v2 = versions[sizes.index(max(sizes))]
00573 
00574     ## Print two biggest groups.
00575     if verbose:
00576         print '\nPairing %s (%d files) and %s (%d files)' % (str(v1),
00577                 len(versions_files[v1]), str(v2), len(versions_files[v2]))
00578 
00579     ## Pairing two versions
00580     pairs = []
00581     for unique_id in set([get_id(file) for file in versions_files[v1]]):
00582         dataset_re = re.compile(unique_id[0]+'_')
00583         run = re.compile(unique_id[1])
00584         c1_files = [file for file in versions_files[v1] if dataset_re.search(file) and run.search(file)]
00585         c2_files = [file for file in versions_files[v2] if dataset_re.search(file) and run.search(file)]
00586         if len(c1_files) > 0 and len(c2_files) > 0:
00587             first_file = get_max_version(c1_files)
00588             second_file = get_max_version(c2_files)
00589             pairs.extend((first_file, second_file))
00590     if verbose:
00591         print "\nPaired and got %d files.\n" % len(pairs)
00592     return pairs