isotrackTrainRegressor.py
######################################################################################
# Trains regressor and saves model for evaluation
# Usage:
# python3 isotrackTrainRegressor.py -I isotk_relval_hi.pkl -V 1
# python3 isotrackTrainRegressor.py -I isotk_relval_lo.pkl -V 2
######################################################################################


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import argparse
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from tensorflow.keras import regularizers
from sklearn.metrics import roc_curve, auc
from tensorflow.keras.layers import Activation
from tensorflow.keras import backend as K
from tensorflow.keras.models import save_model


parser = argparse.ArgumentParser()
parser.add_argument("-I", "--input", help="input file", default="isotk_relval_hi.pkl")
parser.add_argument("-V", "--modelv", help="model version (any number)", default="1")


args = parser.parse_args()
fName = args.input
modv = args.modelv
# In[2]:


df = pd.read_pickle(fName)
print("vars in file:", df.keys())
print("events in df original:", df.shape[0])
df = df.loc[df['t_eHcal_y'] > 20]
print("events in df after energy cut:", df.shape[0])
df['t_eHcal_xun'] = df['t_eHcal_x']
df['t_delta_un'] = df['t_delta']
df['t_ieta_un'] = df['t_ieta']

mina = []
maxa = []
keya = []


for i in df.keys():
    keya.append(i)
    mina.append(df[i].min())
    maxa.append(df[i].max())

print('var = ', keya)
print('mina = ', mina)
print('maxa = ', maxa)


#cols_to_stand = ['t_nVtx','t_ieta','t_eHcal10', 't_eHcal30','t_rhoh','t_eHcal_x']
#cols_to_minmax = ['t_delta', 't_hmaxNearP','t_emaxNearP', 't_hAnnular', 't_eAnnular','t_pt']
cols_to_minmax = ['t_delta', 't_hmaxNearP','t_emaxNearP', 't_hAnnular', 't_eAnnular','t_pt','t_nVtx','t_ieta','t_eHcal10', 't_eHcal30','t_rhoh','t_eHcal_x']
#df[cols_to_stand] = df[cols_to_stand].apply(lambda x: (x - x.mean()) /(x.std()))
#df[cols_to_minmax] = df[cols_to_minmax].apply(lambda x: (x - x.mean()) / (x.max() - x.min()))
# #(x.max() - x.min()))
df[cols_to_minmax] = df[cols_to_minmax].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

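# Note (not part of the original script): the per-column minima/maxima printed above
# (mina/maxa) define the scaling applied to cols_to_minmax; any sample fed to the
# trained model later would need the same scaling, e.g.
#   x_scaled = (x - mina[k]) / (maxa[k] - mina[k])
# for the k-th input column.
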
data = df.values
print('data shape:', data.shape)
targets = data[:, 12]
targets.shape


# In[3]:


data = df.values
ntest = 20000
testindx = data.shape[0] - ntest
X_train = data[:testindx, 0:12]
Y_train = data[:testindx, 12]
X_test = data[testindx:, :]
print("shape of X_train:", X_train.shape)
print("shape of Y_train:", Y_train.shape)
print("shape of X_test:", X_test.shape)
meany = np.mean(Y_train)
print("mean y:", meany)
stdy = np.std(Y_train)
print("std y:", stdy)


# In[4]:


############################################# marinas correction form
a0 = [0.973, 0.998, 0.992, 0.965]
a1 = [0, -0.318, -0.261, -0.406]
a2 = [0, 0, 0.047, 0.089]
def fac0(jeta):
    PU_IETA_1 = 9
    PU_IETA_2 = 16
    PU_IETA_3 = 25
    idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
    return a0[idx]
def fac1(jeta):
    PU_IETA_1 = 9
    PU_IETA_2 = 16
    PU_IETA_3 = 25
    idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
    return a1[idx]
def fac2(jeta):
    PU_IETA_1 = 9
    PU_IETA_2 = 16
    PU_IETA_3 = 25
    idx = (int(jeta >= PU_IETA_1) + int(jeta >= PU_IETA_2) + int(jeta >= PU_IETA_3))
    return a2[idx]

vec0 = np.vectorize(fac0)
vec1 = np.vectorize(fac1)
vec2 = np.vectorize(fac2)

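# Worked example (illustration only, not in the original script): for |ieta| = 20 the
# index is int(20 >= 9) + int(20 >= 16) + int(20 >= 25) = 1 + 1 + 0 = 2, so the
# correction uses a0 = 0.992, a1 = -0.261, a2 = 0.047.
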
X_test[:, 17]
eop = (X_test[:, 15]/X_test[:, 13])
dop = (X_test[:, 16]/X_test[:, 13])
#mcorrval = vec0(abs(X_test[:,17])) + vec1(abs(X_test[:,17]))*(X_test[:,15]/X_test[:,13])*(X_test[:,16]/X_test[:,13])*( 1 + vec2(fac(abs(X_test[:,17])))*(X_test[:,16]/X_test[:,13]))

mcorrval = vec0(abs(X_test[:, 17])) + vec1(abs(X_test[:, 17]))*eop*dop*(1 + vec2(abs(X_test[:, 17]))*dop)


# In[5]:


def propweights(y_true):
    # Weight each sample by how far its target lies from the training mean, in units
    # of the training standard deviation; targets within one sigma keep unit weight.
    weight = 1.90*abs(y_true - meany)/stdy #1.25
    # weight[abs(y_true - meany) > stdy] = 1.75*abs((weight[abs(y_true - meany) > stdy]) - meany)/(stdy)
    weight[abs(y_true - meany) < stdy] = 1
    return weight

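# Illustration (not part of the original script): a target 2*stdy above meany gets
# sample weight 1.90*2 = 3.8, while any target within one stdy of meany keeps weight 1.
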

# In[6]:


from tensorflow.keras import optimizers
print("creating model=========>")
model = Sequential()
model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(60, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1))

RMS = optimizers.RMSprop(learning_rate=0.001, rho=0.9)  # defined but unused; the compile step below uses 'adam'
# Compile model
print("compilation up next=======>")
model.compile(loss='logcosh', optimizer='adam')
model.summary()
#fitting
print("fitting now=========>")
history = model.fit(X_train, Y_train, batch_size=5000, epochs=100, validation_split=0.2, verbose=1, sample_weight=propweights(Y_train))


# In[7]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.savefig(modv+'_loss_distributions.png')
plt.close()


preds = model.predict(X_test[:, 0:12])
targets = X_test[:, 12]
uncorrected = X_test[:, 15]
marinascorr = X_test[:, 15]*mcorrval
plt.hist(preds, bins=100, range=(0, 200), label='PU regression', alpha=0.6)
plt.hist(targets, bins=100, range=(0, 200), label='no PU', alpha=0.6)
plt.hist(uncorrected, bins=100, range=(0, 200), label='uncorrected', alpha=0.6)
#plt.hist(marinascorr, bins=100, range=(0, 200), label='marinas correction', alpha=0.6)
plt.yscale('log')
plt.title("Energy distribution")
plt.legend(loc='upper right')
plt.savefig(modv+'_energy_distributions.png')
plt.close()

preds = preds.reshape(preds.shape[0])
print(preds.shape)



#plt.hist(preds, bins=100, range=(0, 100), label='PU regression', alpha=0.6)
#plt.hist(abs(preds - targets)/targets, bins=100, range=(0, 40), label='PU regression', alpha=0.6)
#plt.hist(targets, bins=100, range=(0, 100), label='no PU', alpha=0.6)
plt.hist(abs(uncorrected - targets)/targets*100, bins=100, range=(0, 100), label='uncorrected', alpha=0.6)
#plt.hist(abs(marinascorr - targets)/targets*100, bins=100, range=(0, 100), label='marinas correction', alpha=0.6)
plt.hist(100*abs(preds - targets)/targets, bins=100, range=(0, 100), label='PU correction', alpha=0.6)
#plt.yscale('log')
plt.title("error distribution")
plt.legend(loc='upper right')
plt.savefig(modv+'_errors.png')
plt.close()

#plt.scatter(targets, marinascorr, alpha=0.3, label='marinascorr')
plt.scatter(targets, uncorrected, alpha=0.3, label='uncorr')
plt.scatter(targets, preds, alpha=0.3, label='PUcorr')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.legend(loc='upper right')
lims = [0, 200]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)
plt.savefig(modv+'_energyscatt.png')
plt.close()



pmom = X_test[:, 13]
#get_ipython().run_line_magic('matplotlib', 'inline')
plt.hist(targets/pmom, bins=100, range=(0, 5), label='E/p noPU', alpha=0.6)
#plt.hist(preds/pmom, bins=100, range=(0, 5), label='E/p PUcorr', alpha=0.6)
#plt.hist(uncorrected/pmom, bins=100, range=(0, 5), label='E/p uncorr', alpha=0.6)
plt.hist(marinascorr/pmom, bins=100, range=(0, 5), label='E/p marina corr', alpha=0.6)
#plt.hist(np.exp(preds.reshape((preds.shape[0])))/pmom[0:n_test_events], bins=100, range=(0, 5), label='corrEnp/p', alpha=0.3)
#plt.hist(preds.reshape((preds.shape[0]))/pmom[0:n_test_events], bins=100, range=(0, 5), label='corrEnp/p', alpha=0.3)
plt.legend(loc='upper right')
plt.yscale('log')
plt.title("E/p distributions")
plt.savefig(modv+'_eopdist.png')
plt.close()

# In[8]:


import os
############## save model
if not os.path.exists('models'):
    os.makedirs('models')
model.save('models/model'+modv+'.h5')
#new_model_2 = load_model('my_model.h5')

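# Minimal sketch (an assumption, not part of the original script): reload the saved
# model and re-evaluate it on the held-out test inputs, following the commented
# load_model hint above.
from tensorflow.keras.models import load_model
reloaded_model = load_model('models/model'+modv+'.h5')
reloaded_preds = reloaded_model.predict(X_test[:, 0:12])
print("reloaded-model predictions shape:", reloaded_preds.shape)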