CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
VDTMath.h
Go to the documentation of this file.
1 #ifndef vdt_math_h
2 #define vdt_math_h
3 
4 /*******************************************************************************
5  *
6  * VDT math library: collection of double precision vectorisable trascendental
7  * functions.
8  * The c++11 standard is used: remember to enable it for the compilation.
9  *
10  * The basic idea is to exploit pade polinomials.
11  * A lot of ideas were inspired by the cephes math library (by Stephen L. Moshier
12  * moshier@na-net.ornl.gov) as well as actual code for the exp, log, sin, cos,
13  * tan, asin, acos and atan functions. The Cephes library can be found here:
14  * http://www.netlib.org/cephes/
15  *
16  ******************************************************************************/
17 
18 
19 #include <iostream>
20 #include <cmath>
21 #include <limits>
22 
23 // used to enable some compiler option internally if needed
24 #define CMS_VECTORIZE
25 //#define CMS_VECTORIZE_VERBOSE __attribute__ ((optimize("-ftree-vectorizer-verbose=7")))
26 #define CMS_VECTORIZE_VERBOSE
27 //#define VDT_RESTRICT __restrict__
28 #define VDT_RESTRICT
29 
30 #define VDT_FORCE_INLINE __attribute__((always_inline)) inline
31 
32 namespace vdt {
33 
34 // paramters
35 constexpr double LOG2E = 1.4426950408889634073599; // 1/log(2)
36 constexpr double SQRTH = 0.70710678118654752440;
37 
38 /*
39 Pade' polinomials coefficients and constants needed throughout the code.
40 The result of constexpr *here* is likely to be the same of const static
41 nevertheless we show its power: all the computations will take place at compile
42 time
43 */
44 
45 // exp
46 constexpr double EXP_LIMIT = 708.;
47 constexpr double PX1exp = 1.26177193074810590878E-4;
48 constexpr double PX2exp = 3.02994407707441961300E-2;
49 constexpr double PX3exp = 9.99999999999999999910E-1;
50 constexpr double QX1exp = 3.00198505138664455042E-6;
51 constexpr double QX2exp = 2.52448340349684104192E-3;
52 constexpr double QX3exp = 2.27265548208155028766E-1;
53 constexpr double QX4exp = 2.00000000000000000009E0;
54 
55 // logarithm
56 constexpr double LOG_UPPER_LIMIT = 1e307;
57 constexpr double LOG_LOWER_LIMIT = 1e-307;
58 constexpr double PX1log = 1.01875663804580931796E-4;
59 constexpr double PX2log = 4.97494994976747001425E-1;
60 constexpr double PX3log = 4.70579119878881725854E0;
61 constexpr double PX4log = 1.44989225341610930846E1;
62 constexpr double PX5log = 1.79368678507819816313E1;
63 constexpr double PX6log = 7.70838733755885391666E0;
64 
65 constexpr double QX1log = 1.12873587189167450590E1;
66 constexpr double QX2log = 4.52279145837532221105E1;
67 constexpr double QX3log = 8.29875266912776603211E1;
68 constexpr double QX4log = 7.11544750618563894466E1;
69 constexpr double QX5log = 2.31251620126765340583E1;
70 
71 // Sin and cos
72 constexpr double DP1sc = 7.85398125648498535156E-1;
73 constexpr double DP2sc = 3.77489470793079817668E-8;
74 constexpr double DP3sc = 2.69515142907905952645E-15;
75 constexpr double TWOPI = 2.*M_PI;
76 constexpr double PI = M_PI;
78 constexpr double PIO4 = M_PI_4;
79 
80 // Sin
83 constexpr double C1sin = 1.58962301576546568060E-10;
84 constexpr double C2sin =-2.50507477628578072866E-8;
85 constexpr double C3sin = 2.75573136213857245213E-6;
86 constexpr double C4sin =-1.98412698295895385996E-4;
87 constexpr double C5sin = 8.33333333332211858878E-3;
88 constexpr double C6sin =-1.66666666666666307295E-1;
89 
90 //Cos
91 constexpr double C1cos =-1.13585365213876817300E-11;
92 constexpr double C2cos = 2.08757008419747316778E-9;
93 constexpr double C3cos =-2.75573141792967388112E-7;
94 constexpr double C4cos = 2.48015872888517045348E-5;
95 constexpr double C5cos =-1.38888888888730564116E-3;
96 constexpr double C6cos = 4.16666666666665929218E-2;
97 
98 // Asin and acos
99 
100 constexpr double RX1asin = 2.967721961301243206100E-3;
101 constexpr double RX2asin = -5.634242780008963776856E-1;
102 constexpr double RX3asin = 6.968710824104713396794E0;
103 constexpr double RX4asin = -2.556901049652824852289E1;
104 constexpr double RX5asin = 2.853665548261061424989E1;
105 
106 constexpr double SX1asin = -2.194779531642920639778E1;
107 constexpr double SX2asin = 1.470656354026814941758E2;
108 constexpr double SX3asin = -3.838770957603691357202E2;
109 constexpr double SX4asin = 3.424398657913078477438E2;
110 
111 constexpr double PX1asin = 4.253011369004428248960E-3;
112 constexpr double PX2asin = -6.019598008014123785661E-1;
113 constexpr double PX3asin = 5.444622390564711410273E0;
114 constexpr double PX4asin = -1.626247967210700244449E1;
115 constexpr double PX5asin = 1.956261983317594739197E1;
116 constexpr double PX6asin = -8.198089802484824371615E0;
117 
118 constexpr double QX1asin = -1.474091372988853791896E1;
119 constexpr double QX2asin = 7.049610280856842141659E1;
120 constexpr double QX3asin = -1.471791292232726029859E2;
121 constexpr double QX4asin = 1.395105614657485689735E2;
122 constexpr double QX5asin = -4.918853881490881290097E1;
123 
124 //Tan
125 constexpr double PX1tan=-1.30936939181383777646E4;
126 constexpr double PX2tan=1.15351664838587416140E6;
127 constexpr double PX3tan=-1.79565251976484877988E7;
128 
129 constexpr double QX1tan = 1.36812963470692954678E4;
130 constexpr double QX2tan = -1.32089234440210967447E6;
131 constexpr double QX3tan = 2.50083801823357915839E7;
132 constexpr double QX4tan = -5.38695755929454629881E7;
133 
134 constexpr double DP1tan = 7.853981554508209228515625E-1;
135 constexpr double DP2tan = 7.94662735614792836714E-9;
136 constexpr double DP3tan = 3.06161699786838294307E-17;
138 
139 // Atan
140 constexpr double T3PO8 = 2.41421356237309504880;
141 constexpr double MOREBITS = 6.123233995736765886130E-17;
143 
144 constexpr double PX1atan = -8.750608600031904122785E-1;
145 constexpr double PX2atan = -1.615753718733365076637E1;
146 constexpr double PX3atan = -7.500855792314704667340E1;
147 constexpr double PX4atan = -1.228866684490136173410E2;
148 constexpr double PX5atan = -6.485021904942025371773E1;
149 
150 constexpr double QX1atan = - 2.485846490142306297962E1;
151 constexpr double QX2atan = 1.650270098316988542046E2;
152 constexpr double QX3atan = 4.328810604912902668951E2;
153 constexpr double QX4atan = 4.853903996359136964868E2;
154 constexpr double QX5atan = 1.945506571482613964425E2;
155 
156 constexpr double ATAN_LIMIT = 1e307;
157 
158 // Inverse Sqrt
159 // constexpr unsigned int ISQRT_ITERATIONS = 4;
160 constexpr double SQRT_LIMIT = 1e307;
161 
162 // Service----------------------------------------------------------------------
165 
166 //------------------------------------------------------------------------------
167 
169 typedef union {
170  double d;
171  int i[2];
172  long long ll;
173  unsigned short s[4];
174 } ieee754;
175 
176 //------------------------------------------------------------------------------
177 
179 VDT_FORCE_INLINE double ll2d(unsigned long long x) {
180  ieee754 tmp;
181  tmp.ll=x;
182  return tmp.d;
183 }
184 
185 //------------------------------------------------------------------------------
186 
188 VDT_FORCE_INLINE unsigned long long d2ll(double x) {
189  ieee754 tmp;
190  tmp.d=x;
191  return tmp.ll;
192 }
193 
194 //------------------------------------------------------------------------------
196 VDT_FORCE_INLINE double getMantExponent(double x, double& fe){
197 
198  unsigned long long n = d2ll(x);
199 
200  // shift to the right up to the beginning of the exponent
201  // then with a mask, cut off the sign bit
202  unsigned long long le = ((n >> 52) & 0x7ffLL);
203 
204  // chop the head of the number: an int contains more than 11 bits (32)
205  int e = le; // This is important since sums on ull do not vectorise
206  fe = (e-1023) +1 ; // the plus one to make the result identical to frexp
207 
208  // 13 times f means 52 1. Masking with this means putting to 0 exponent
209  // and sign of a double, leaving the Mantissa, the first 52 bits of a double.
210  n &=0xfffffffffffffLL;
211 
212  // build a mask which is 0.5, i.e. an exponent equal to 1022
213  // which means *2, see the above +1.
214  const unsigned long long p05 = d2ll(0.5);
215  n |= p05;
216  x = ll2d(n);
217  return x;
218 }
219 
220 //------------------------------------------------------------------------------
221 // Now the mathematical functions are encoded.
222 
223 // Exp -------------------------------------------------------------------------
224 // Vectorises in a loop without any change in 4.7
226 VDT_FORCE_INLINE double fast_exp(double x){
227 
228  double initial_x = x;
229 
230 // double px =int(LOG2E * x + 0.5); // std::floor(LOG2E * x + 0.5);
231  double px = std::floor(LOG2E * x + 0.5);
232 
233  int n = px;
234 
235  x -= px * 6.93145751953125E-1;
236  x -= px * 1.42860682030941723212E-6;
237 
238  double xx = x * x;
239 
240  // px = x * P(x**2).
241  px = PX1exp;
242  px *= xx;
243  px += PX2exp;
244  px *= xx;
245  px += PX3exp;
246  px *= x;
247 
248  // Evaluate Q(x**2).
249  double qx = QX1exp;
250  qx *= xx;
251  qx += QX2exp;
252  qx *= xx;
253  qx += QX3exp;
254  qx *= xx;
255  qx += QX4exp;
256 
257  // e**x = 1 + 2x P(x**2)/( Q(x**2) - P(x**2) )
258  x = px / (qx - px);
259  x = 1.0 + 2.0 * x;
260 
261  // Build 2^n in double.
262  ieee754 u;
263  u.d = 0;
264  n += 1023;
265  u.ll = (long long) (n) << 52;
266 
267  double res = x * u.d;
268  if (initial_x > EXP_LIMIT)
270  if (initial_x < -EXP_LIMIT)
271  res = 0.;
272 
273  return res;
274 }
275 
276 
277 // Log -------------------------------------------------------------------------
278 
279 VDT_FORCE_INLINE double fast_log(double x){
280 
281  double input_x=x;
282 
283  /* separate mantissa from exponent */
284  double fe;
285  x = getMantExponent(x,fe);
286 
287  // blending
288  if( x < SQRTH ) {
289  fe-=1;
290  x += x ;
291  }
292  x -= 1.0;
293 
294  /* rational form */
295 
296  double z = x*x;
297  double px = PX1log;
298  px *= x;
299  px += PX2log;
300  px *= x;
301  px += PX3log;
302  px *= x;
303  px += PX4log;
304  px *= x;
305  px += PX5log;
306  px *= x;
307  px += PX6log;
308  //
309  //for the final formula
310  px *= x;
311  px *= z;
312 
313  double qx = x;
314  qx += QX1log;
315  qx *=x;
316  qx += QX2log;
317  qx *=x;
318  qx += QX3log;
319  qx *=x;
320  qx += QX4log;
321  qx *=x;
322  qx += QX5log;
323 
324  double y = px / qx ;
325 
326  y -= fe * 2.121944400546905827679e-4;
327  y -= 0.5 * z ;
328 
329  z = x + y;
330  z += fe * 0.693359375;
331 
332  if (input_x > LOG_UPPER_LIMIT)
334  if (input_x < LOG_LOWER_LIMIT)
336 
337 // std::cout << input_x << " " << std::log(input_x) << " " << z << std::endl;
338 
339  return( z );
340 
341  }
342 
343 //------------------------------------------------------------------------------
345 VDT_FORCE_INLINE double fast_sin(double x){
346 
347  int sign = 1;
348 
349  if (x < 0){
350  x = - x;
351  sign = -1;
352  }
353 
354  if( x > PI ){
355  x = TWOPI - x;
356  sign = - sign;
357  }
358 
359  if( x > PIO2 )
360  x = PI - x ;
361 
362  double y = int( x/PIO4 ); // integer part of x/PIO4
363 
364  int j=0;
365  if (x>PIO4){
366  j=2;
367  y+=1;
368  }
369 
370  /* Extended precision modular arithmetic */
371  double z = ((x - y * DP1sc) - y * DP2sc) - y * DP3sc;
372 
373  double zz = z * z;
374 
375  double px=0;
376 
377  if( j==2 ){
378  px = C1cos;
379  px *= zz;
380  px += C2cos;
381  px *= zz;
382  px += C3cos;
383  px *= zz;
384  px += C4cos;
385  px *= zz;
386  px += C5cos;
387  px *= zz;
388  px += C6cos;
389  y = 1.0 - zz * .5 + zz * zz * px;
390  }
391  else{
392  px = C1sin;
393  px *= zz;
394  px += C2sin;
395  px *= zz;
396  px += C3sin;
397  px *= zz;
398  px += C4sin;
399  px *= zz;
400  px += C5sin;
401  px *= zz;
402  px += C6sin;
403  y = z + z * zz * px;
404  }
405 
406  y *= sign;
407 
408  return y;
409  }
410 
411 //------------------------------------------------------------------------------
412 
413 VDT_FORCE_INLINE double fast_asin(double x){
414 
415  int sign=1;
416  double a = x; //necessary for linear approx
417 
418  if ( x < 0. ){
419  sign *= -1;
420  a *= -1;
421  }
422 
423  double p, z, zz;
424  double px,qx;
425 
426  /* arcsin(1-x) = pi/2 - sqrt(2x)(1+R(x)) */
427  zz = 1.0 - a;
428 
429  px = RX1asin;
430  px*= zz;
431  px+= RX2asin;
432  px*= zz;
433  px+= RX3asin;
434  px*= zz;
435  px+= RX4asin;
436  px*= zz;
437  px+= RX5asin;
438 
439  qx = zz;
440  qx+= SX1asin;
441  qx*= zz;
442  qx+= SX2asin;
443  qx*= zz;
444  qx+= SX3asin;
445  qx*= zz;
446  qx+= SX4asin;
447 
448  p =zz* px/qx;
449 
450 // p = zz * polevl( zz, R, 4)/p1evl( zz, S, 4);
451 
452  zz = std::sqrt(zz+zz);
453  z = PIO4 - zz;
454  zz = zz * p - MOREBITS;
455  z -= zz;
456  z += PIO4;
457 
458  if( a < 0.625 ){
459  zz = a * a;
460  px = PX1asin;
461  px*= zz;
462  px+= PX2asin;
463  px*= zz;
464  px+= PX3asin;
465  px*= zz;
466  px+= PX4asin;
467  px*= zz;
468  px+= PX5asin;
469  px*= zz;
470  px+= PX6asin;
471 
472  qx = zz;
473  qx+= QX1asin;
474  qx*= zz;
475  qx+= QX2asin;
476  qx*= zz;
477  qx+= QX3asin;
478  qx*= zz;
479  qx+= QX4asin;
480  qx*= zz;
481  qx+= QX5asin;
482 
483  z = zz*px/qx;
484 
485  z = a * z + a;
486  }
487 
488  z *= sign;
489 
490  //linear approx, not sooo needed but seable. Price is cheap though
491  if( a < 1.0e-8 )
492  z = a;
493 
494  return z;
495  }
496 
497 //------------------------------------------------------------------------------
498 
499 
501 VDT_FORCE_INLINE double fast_cos(double x){
502 
503  x = std::abs(x);
504 
505  if( x > PI )
506  x = TWOPI - x ;
507 
508  int sign = 1;
509  if( x > PIO2 ){
510  x = PI - x;
511  sign=-1;
512  }
513 
514  double y = int( x/PIO4 ); // integer part of x/PIO4
515 
516  int j=0;
517  if (x>PIO4){
518  j=2;
519  y+=1;
520  sign = -sign;
521  }
522 
523  /* Extended precision modular arithmetic */
524  double z = ((x - y * DP1sc) - y * DP2sc) - y * DP3sc;
525 
526  double zz = z * z;
527 
528  double px=0;
529  if( j==2 ){
530  px = C1sin;
531  px *= zz;
532  px += C2sin;
533  px *= zz;
534  px += C3sin;
535  px *= zz;
536  px += C4sin;
537  px *= zz;
538  px += C5sin;
539  px *= zz;
540  px += C6sin;
541  y = z + z * zz * px;
542  }
543  else{
544  px = C1cos;
545  px *= zz;
546  px += C2cos;
547  px *= zz;
548  px += C3cos;
549  px *= zz;
550  px += C4cos;
551  px *= zz;
552  px += C5cos;
553  px *= zz;
554  px += C6cos;
555  y = 1. - zz * .5 + zz * zz * px;
556  }
557 
558  y *= sign;
559 
560  return y;
561  }
562 
563 // Acos ------------------------------------------------------------------------
564 VDT_FORCE_INLINE double fast_acos(double x){
565  double z;
566 
567 // z = PIO4 - fast_asin(x);
568 // z += MOREBITS;
569 // z += PIO4;
570 
571 // if (x > .5 )
572  z = 2.0 * fast_asin( sqrt(0.5 - 0.5 * x ) ) ;
573 
574  return z;
575  }
576 
577 // Tangent --------------------------------------------------------------------
579 VDT_FORCE_INLINE double fast_tan( double x ){
580 /* DP
581  * Some of the ifs had to be skipped and replaced by calculations. This allowed
582  * the vectorisation but introduced a loss of performance.
583  * A solution should be found
584 */
585 
586  // make argument positive but save the sign
587  // without ifs
588  double abs_x =std::abs(x);
589  int sign = x/abs_x;
590  x = abs_x;
591 
592 // remove this if
593 // if (x > PI)
594 // x = x - PI;
595 // like this:
596  int nPI = x /PI;
597  x = x - nPI * PI;
598 
599 
600 // reflect and flip with if
601 // if (x > PIO2){
602 // x = PI - x ;
603 // sign = - sign;
604 // }
605 // and without
606  int nPIO2 = x/PIO2;
607  int factor = ( 1 - 2* nPIO2);
608  x = nPIO2* PI + factor * x;
609  sign *= factor;
610 
611 
612  /* compute x mod PIO4 */
613  int nPIO4 = x/PIO4;
614  double y = 2 * nPIO4;
615 
616  /* integer and fractional part modulo one octant */
617 
618 // This if can be removed and the expression becomes
619 // if (x > PIO4){
620 // y=2.0;
621 // }
622 // like this:
623 // y = y + nPIO4;
624 
625 
626  double z = ((x - y * DP1tan) - y * DP2tan) - y * DP3tan;
627 
628  double zz = z * z;
629 
630  y=z;
631 
632  if( zz > 1.0e-14 ){
633  double px = PX1tan;
634  px *= zz;
635  px += PX2tan;
636  px *= zz;
637  px += PX3tan;
638 
639  double qx=zz;
640  qx += QX1tan;
641  qx *=zz;
642  qx += QX2tan;
643  qx *=zz;
644  qx += QX3tan;
645  qx *=zz;
646  qx += QX4tan;
647 
648  y = z + z * zz * px / qx;
649  }
650 
651  // here if we are in the second octant we have
652  // y = -1 /y
653  // else we have y!
654  // again a trick not to use ifs...
655  y -= nPIO4 * ( y + 1.0 / y);
656 
657  y *= sign;
658 
659  return y ;
660  }
661 
662 // Atan -------------------------------------------------------------------------
663 // REMEMBER pi/2 == inf!!
664 VDT_FORCE_INLINE double fast_atan(double x){
665 
666  /* make argument positive and save the sign */
667  int sign = 1;
668  if( x < 0.0 ) {
669  x = - x;
670  sign = -1;
671  }
672 
673  /* range reduction */
674  double originalx=x;
675 
676 // This is slower!
677 // double y = 0.0;
678 // double factor = 0.;
679 //
680 // if (x > .66){
681 // y = PIO4;
682 // factor = MOREBITSO2;
683 // x = (x-1.0) / (x+1.0);
684 // }
685 // if( originalx > T3PO8 ) {
686 // y = PIO2;
687 // factor = MOREBITS;
688 // x = -1.0 / originalx ;
689 // }
690 
691  double y = PIO4;
692  double factor = MOREBITSO2;
693  x = (x-1.0) / (x+1.0);
694 
695  if( originalx > T3PO8 ) {
696  y = PIO2;
697  //flag = 1.;
698  factor = MOREBITS;
699  x = -1.0 / originalx ;
700  }
701  if ( originalx <= 0.66 ) {
702  y = 0.0;
703  x = originalx;
704  //flag = 0.;
705  factor = 0.;
706  }
707 
708  double z = x * x;
709 
710  double px = PX1atan;
711  px *= z;
712  px += PX2atan;
713  px *= z;
714  px += PX3atan;
715  px *= z;
716  px += PX4atan;
717  px *= z;
718  px += PX5atan;
719  px *= z; // for the final formula
720 
721  double qx=z;
722  qx += QX1atan;
723  qx *=z;
724  qx += QX2atan;
725  qx *=z;
726  qx += QX3atan;
727  qx *=z;
728  qx += QX4atan;
729  qx *=z;
730  qx += QX5atan;
731 
732 // z = px / qx;
733 // z = x * px / qx + x;
734 
735  y = y +x * px / qx + x +factor;
736 
737  y = sign * y;
738 
739  return y;
740  }
741 
742 //------------------------------------------------------------------------------
743 
744 // Taken from from quake and remixed :-)
745 
746 VDT_FORCE_INLINE double fast_isqrt_general(double x, const unsigned short ISQRT_ITERATIONS) {
747 
748  double x2 = x * 0.5;
749  double y = x;
750  unsigned long long i = d2ll(y);
751  // Evil!
752  i = 0x5fe6eb50c7aa19f9 - ( i >> 1 );
753  y = ll2d(i);
754  for (unsigned int j=0;j<ISQRT_ITERATIONS;++j)
755  y *= 1.5 - ( x2 * y * y ) ;
756 
757  return y;
758 }
759 
760 
761 
762 //------------------------------------------------------------------------------
763 
764 // Four iterations
765 VDT_FORCE_INLINE double fast_isqrt(double x) {return fast_isqrt_general(x,4);}
766 
767 // Two iterations
769 
770 //------------------------------------------------------------------------------
771 
772 VDT_FORCE_INLINE double std_isqrt (double x) {return 1./std::sqrt(x);}
773 
774 //------------------------------------------------------------------------------
775 
776 VDT_FORCE_INLINE double fast_inv (double x) {
777  double sign = 1;
778  if( x < 0.0 ) {
779  x = - x;
780  sign = -1;
781  }
782  double y=fast_isqrt(x);
783  return y*y*sign;
784  }
785 
787  double sign = 1;
788  if( x < 0.0 ) {
789  x = - x;
790  sign = -1;
791  }
792  double y=fast_approx_isqrt(x);
793  return y*y*sign;}
794 
795 VDT_FORCE_INLINE double std_inv (double x) {return 1./x;}
796 
797 //------------------------------------------------------------------------------
798 // Some preprocessor in order to avoid a lot of error prone repetitions
799 // CMS_VECTORIZE_VERBOSE is a preprocessor variable in a preprocessor function
800 
801 // Fast vector functions
802 #define FAST_VECT_FUNC(NAME) __attribute__((always_inline)) inline void NAME##_vect(double const * VDT_RESTRICT input , double * VDT_RESTRICT outupt, const unsigned int arr_size) { \
803  for (unsigned int i=0;i<arr_size;++i) \
804  outupt[i] = NAME ( input[i] ) CMS_VECTORIZE_VERBOSE; \
805  }
806 
808 void fast_exp_vect_46(double const* input, double* output, const unsigned int arr_size);
809 
810 // Profitability threshold = 3
812 
813 void fast_log_vect_46(double const* input, double* output, const unsigned int arr_size);
815 
816 // Profitability threshold = 3
818 
819 // Profitability threshold = 7
821 
822 // Profitability threshold = 3
824 
825 // Profitability threshold = 7
827 
828 // Profitability threshold = 3
830 
831 //Profitability threshold = 3
833 
834 //Profitability threshold = 3
836 
837 //Profitability threshold = 2 (2!!!)
839 
840 //Profitability threshold = 2 (2!!!)
842 
843 //Profitability threshold = 2
845 
846 //Profitability threshold = 2
848 
849 //------------------------------------------------------------------------------
850 // Reference vector functions
851 #define VECT_FUNC(NAME) __attribute__((always_inline)) inline void std_##NAME##_vect(double const * VDT_RESTRICT input , double* VDT_RESTRICT outupt, const unsigned int arr_size) { \
852  for (unsigned int i=0;i<arr_size;++i) \
853  outupt[i] = std::NAME ( input[i] ) CMS_VECTORIZE_VERBOSE; \
854  }
855 
856 VECT_FUNC(exp)
857 
858 VECT_FUNC(log)
859 
860 VECT_FUNC(sin)
861 
862 VECT_FUNC(asin)
863 
864 VECT_FUNC(cos)
865 
866 VECT_FUNC(acos)
867 
868 VECT_FUNC(tan)
869 
870 VECT_FUNC(atan)
871 
873  double* VDT_RESTRICT output,
874  const unsigned int arr_size) CMS_VECTORIZE_VERBOSE{
875  //Profitability threshold = 6
876  for (unsigned int i=0;i<arr_size;++i)
877  output[i] = vdt::std_isqrt(input[i]);
878  }
879 
880 VDT_FORCE_INLINE void std_inv_vect(double const * VDT_RESTRICT input ,
881  double* VDT_RESTRICT output,
882  const unsigned int arr_size) CMS_VECTORIZE_VERBOSE{
883  //Profitability threshold = 6
884  for (unsigned int i=0;i<arr_size;++i)
885  output[i] = vdt::std_inv(input[i]);
886  }
887 
888 
889 //------------------------------------------------------------------------------
890 
891 } // end of vdt namespace
892 
893 #endif
894 
constexpr double PX3tan
Definition: VDTMath.h:127
VDT_FORCE_INLINE double fast_approx_isqrt(double x)
Definition: VDTMath.h:768
constexpr double QX4log
Definition: VDTMath.h:68
constexpr double QX3log
Definition: VDTMath.h:67
constexpr double QX1asin
Definition: VDTMath.h:118
int i
Definition: DBlmapReader.cc:9
constexpr double PI
Definition: VDTMath.h:76
constexpr double C5cos
Definition: VDTMath.h:95
constexpr double QX1tan
Definition: VDTMath.h:129
constexpr double PIO4
Definition: VDTMath.h:78
constexpr double QX4asin
Definition: VDTMath.h:121
constexpr double PX1log
Definition: VDTMath.h:58
double d
Definition: VDTMath.h:170
long long ll
Definition: VDTMath.h:172
constexpr double QX2asin
Definition: VDTMath.h:119
constexpr double SQRT_LIMIT
Definition: VDTMath.h:160
constexpr double PX4asin
Definition: VDTMath.h:114
constexpr double C5sin
Definition: VDTMath.h:87
constexpr double PX2log
Definition: VDTMath.h:59
constexpr double C3cos
Definition: VDTMath.h:93
constexpr double SX2asin
Definition: VDTMath.h:107
constexpr double PX1asin
Definition: VDTMath.h:111
constexpr double PX3asin
Definition: VDTMath.h:113
constexpr double C6cos
Definition: VDTMath.h:96
Sin< T >::type sin(const T &t)
Definition: Sin.h:22
void fast_exp_vect_46(double const *input, double *output, const unsigned int arr_size)
Some tweaks to make it vectorise with gcc46.
Definition: VDTMath.cc:63
#define abs(x)
Definition: mlp_lapack.h:159
VDT_FORCE_INLINE double std_isqrt(double x)
Definition: VDTMath.h:772
VDT_FORCE_INLINE double fast_isqrt(double x)
Definition: VDTMath.h:765
#define M_PI_2
constexpr double RX5asin
Definition: VDTMath.h:104
constexpr double DP2tan
Definition: VDTMath.h:135
constexpr double EXP_LIMIT
Definition: VDTMath.h:46
constexpr double QX5asin
Definition: VDTMath.h:122
constexpr double QX2tan
Definition: VDTMath.h:130
VDT_FORCE_INLINE unsigned long long d2ll(double x)
Converts a double to an unsigned long long.
Definition: VDTMath.h:188
constexpr double PX2exp
Definition: VDTMath.h:48
constexpr double TAN_LIMIT
Definition: VDTMath.h:137
constexpr double C2sin
Definition: VDTMath.h:84
constexpr double QX3tan
Definition: VDTMath.h:131
constexpr double DP3tan
Definition: VDTMath.h:136
constexpr double C3sin
Definition: VDTMath.h:85
double double double z
#define FAST_VECT_FUNC(NAME)
Definition: VDTMath.h:802
constexpr double SX4asin
Definition: VDTMath.h:109
constexpr double PX3atan
Definition: VDTMath.h:146
void print_instructions_info()
Print the instructions used on screen.
Definition: VDTMath.cc:24
constexpr double LOG_UPPER_LIMIT
Definition: VDTMath.h:56
VDT_FORCE_INLINE double getMantExponent(double x, double &fe)
Like frexp but vectorising and the exponent is a double.
Definition: VDTMath.h:196
VDT_FORCE_INLINE double fast_asin(double x)
Definition: VDTMath.h:413
constexpr double MOREBITSO2
Definition: VDTMath.h:142
VDT_FORCE_INLINE double fast_exp(double x)
Exponential Function.
Definition: VDTMath.h:226
constexpr double QX1log
Definition: VDTMath.h:65
constexpr double QX3asin
Definition: VDTMath.h:120
constexpr double PX2tan
Definition: VDTMath.h:126
constexpr double RX1asin
Definition: VDTMath.h:100
VDT_FORCE_INLINE void std_inv_vect(double const *VDT_RESTRICT input, double *VDT_RESTRICT output, const unsigned int arr_size) CMS_VECTORIZE_VERBOSE
Definition: VDTMath.h:880
constexpr double SX1asin
Definition: VDTMath.h:106
constexpr double SX3asin
Definition: VDTMath.h:108
constexpr double DP1sc
Definition: VDTMath.h:72
T sqrt(T t)
Definition: SSEVec.h:46
VDT_FORCE_INLINE double std_inv(double x)
Definition: VDTMath.h:795
constexpr double SQRTH
Definition: VDTMath.h:36
constexpr double LOG_LOWER_LIMIT
Definition: VDTMath.h:57
constexpr double PX4atan
Definition: VDTMath.h:147
Cos< T >::type cos(const T &t)
Definition: Cos.h:22
constexpr double QX5log
Definition: VDTMath.h:69
VDT_FORCE_INLINE void std_isqrt_vect(double const *VDT_RESTRICT input, double *VDT_RESTRICT output, const unsigned int arr_size) CMS_VECTORIZE_VERBOSE
Definition: VDTMath.h:872
const double infinity
Tan< T >::type tan(const T &t)
Definition: Tan.h:22
int j
Definition: DBlmapReader.cc:9
constexpr double QX2log
Definition: VDTMath.h:66
constexpr double QX4exp
Definition: VDTMath.h:53
void fast_log_vect_46(double const *input, double *output, const unsigned int arr_size)
Some tweaks to make it vectorise with gcc46.
Definition: VDTMath.cc:150
VDT_FORCE_INLINE double fast_tan(double x)
Sin defined between -2pi and 2pi.
Definition: VDTMath.h:579
constexpr double SIN_UPPER_LIMIT
Definition: VDTMath.h:81
constexpr double PX5atan
Definition: VDTMath.h:148
constexpr double C4sin
Definition: VDTMath.h:86
constexpr double PX1tan
Definition: VDTMath.h:125
constexpr double PX1atan
Definition: VDTMath.h:144
constexpr double DP3sc
Definition: VDTMath.h:74
constexpr double QX4atan
Definition: VDTMath.h:153
constexpr double QX4tan
Definition: VDTMath.h:132
#define VDT_RESTRICT
Definition: VDTMath.h:28
constexpr double ATAN_LIMIT
Definition: VDTMath.h:156
constexpr double PX1exp
Definition: VDTMath.h:47
constexpr double RX2asin
Definition: VDTMath.h:101
constexpr double QX2exp
Definition: VDTMath.h:51
#define VECT_FUNC(NAME)
Definition: VDTMath.h:851
VDT_FORCE_INLINE double fast_isqrt_general(double x, const unsigned short ISQRT_ITERATIONS)
Definition: VDTMath.h:746
constexpr double PX5log
Definition: VDTMath.h:62
constexpr double SIN_LOWER_LIMIT
Definition: VDTMath.h:82
#define M_PI
Definition: BFit3D.cc:3
constexpr double C1cos
Definition: VDTMath.h:91
constexpr double RX3asin
Definition: VDTMath.h:102
constexpr double QX2atan
Definition: VDTMath.h:151
string const
Definition: compareJSON.py:14
constexpr double RX4asin
Definition: VDTMath.h:103
constexpr double PX2asin
Definition: VDTMath.h:112
VDT_FORCE_INLINE double fast_cos(double x)
Cos defined between -2pi and 2pi.
Definition: VDTMath.h:501
constexpr double C4cos
Definition: VDTMath.h:94
constexpr double PX2atan
Definition: VDTMath.h:145
VDT_FORCE_INLINE double fast_sin(double x)
Sin defined between -2pi and 2pi.
Definition: VDTMath.h:345
std::vector< std::vector< double > > tmp
Definition: MVATrainer.cc:100
VDT_FORCE_INLINE double fast_acos(double x)
Definition: VDTMath.h:564
constexpr double PX3log
Definition: VDTMath.h:60
constexpr double QX1exp
Definition: VDTMath.h:50
constexpr double PX4log
Definition: VDTMath.h:61
constexpr double PX3exp
Definition: VDTMath.h:49
double a
Definition: hdecay.h:121
#define CMS_VECTORIZE_VERBOSE
Definition: VDTMath.h:26
constexpr double DP1tan
Definition: VDTMath.h:134
VDT_FORCE_INLINE double fast_atan(double x)
Definition: VDTMath.h:664
constexpr double QX3atan
Definition: VDTMath.h:152
constexpr double C1sin
Definition: VDTMath.h:83
constexpr double MOREBITS
Definition: VDTMath.h:141
constexpr double TWOPI
Definition: VDTMath.h:75
constexpr double QX5atan
Definition: VDTMath.h:154
constexpr double C6sin
Definition: VDTMath.h:88
constexpr double PX6log
Definition: VDTMath.h:63
constexpr double C2cos
Definition: VDTMath.h:92
constexpr double PX5asin
Definition: VDTMath.h:115
constexpr double DP2sc
Definition: VDTMath.h:73
constexpr double T3PO8
Definition: VDTMath.h:140
Definition: DDAxes.h:10
VDT_FORCE_INLINE double fast_log(double x)
Definition: VDTMath.h:279
constexpr double PIO2
Definition: VDTMath.h:77
constexpr double PX6asin
Definition: VDTMath.h:116
VDT_FORCE_INLINE double fast_inv(double x)
Definition: VDTMath.h:776
VDT_FORCE_INLINE double fast_approx_inv(double x)
Definition: VDTMath.h:786
constexpr double LOG2E
Definition: VDTMath.h:35
#define constexpr
constexpr double QX3exp
Definition: VDTMath.h:52
constexpr double QX1atan
Definition: VDTMath.h:150
VDT_FORCE_INLINE double ll2d(unsigned long long x)
Converts an unsigned long long to a double.
Definition: VDTMath.h:179
#define VDT_FORCE_INLINE
Definition: VDTMath.h:30
Used to switch between different type of interpretations of the data (64 bits)
Definition: VDTMath.h:169