CMS 3D CMS Logo

crc32c.cc
Go to the documentation of this file.
1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2  * Copyright (C) 2013 Mark Adler
3  * Version 1.1 1 Aug 2013 Mark Adler
4  */
5 
6 /*
7  This software is provided 'as-is', without any express or implied
8  warranty. In no event will the author be held liable for any damages
9  arising from the use of this software.
10 
11  Permission is granted to anyone to use this software for any purpose,
12  including commercial applications, and to alter it and redistribute it
13  freely, subject to the following restrictions:
14 
15  1. The origin of this software must not be misrepresented; you must not
16  claim that you wrote the original software. If you use this software
17  in a product, an acknowledgment in the product documentation would be
18  appreciated but is not required.
19  2. Altered source versions must be plainly marked as such, and must not be
20  misrepresented as being the original software.
21  3. This notice may not be removed or altered from any source distribution.
22 
23  Mark Adler
24  madler@alumni.caltech.edu
25  */
26 
27 /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
28  CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
29  version is provided as a fall-back, as well as for speed comparisons. */
30 
31 /* Version history:
32  1.0 10 Feb 2013 First version
33  1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
34  */
35 
36 /* Srecko Morovic, Apr 02 2015: (crc32.cc) modified to compile in c++
37  * and ifdefs to compile and call hw version only with X86_64
38  */
39 
40 
41 
42 #include <cstdio>
43 #include <cstdlib>
44 #include <cstdint>
45 #include <unistd.h>
46 #include <pthread.h>
47 
/* CRC-32C (iSCSI) polynomial in reversed bit order. */
#define POLY 0x82f63b78

/* Tables for the eight-bytes-at-a-time software crc. crc32c_table[0] is the
   ordinary byte-at-a-time table; crc32c_table[k][n] is crc32c_table[0][n]
   advanced over k further zero bytes. The tables are filled in exactly once,
   on first use, through crc32c_once_sw. */
static pthread_once_t crc32c_once_sw = PTHREAD_ONCE_INIT;
static uint32_t crc32c_table[8][256];

/* Construct the tables for the software CRC-32C calculation. Run through
   pthread_once() only. */
static void crc32c_init_sw(void)
{
    uint32_t n, crc, k;

    /* table 0: the crc of each possible byte value, one bit at a time */
    for (n = 0; n < 256; n++) {
        crc = n;
        for (k = 0; k < 8; k++)
            crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc32c_table[0][n] = crc;
    }

    /* tables 1..7: each is the previous one run over one more zero byte */
    for (n = 0; n < 256; n++) {
        crc = crc32c_table[0][n];
        for (k = 1; k < 8; k++) {
            crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8);
            crc32c_table[k][n] = crc;
        }
    }
}

/* Table-driven software version, used as a fall-back when the hardware
   instruction is not available.

   crci is the crc of the data seen so far (0 to start), buf points to len
   bytes of new data, and the return value is the updated crc, so calls can be
   chained over consecutive pieces of a message.

   Each group of eight bytes is assembled explicitly in little-endian order
   instead of being read through a cast to uint64_t *. That keeps the result
   independent of the byte order of the machine, and avoids both the
   strict-aliasing violation and the discarded const of the pointer cast. */
static uint32_t crc32c_sw(uint32_t crci, const unsigned char *buf, size_t len)
{
    const unsigned char *next = buf;
    uint64_t crc;
    uint64_t word;

    pthread_once(&crc32c_once_sw, crc32c_init_sw);
    crc = crci ^ 0xffffffff;

    /* go byte by byte up to an eight-byte boundary -- no longer needed for
       correctness, but it keeps the main loop on aligned data */
    while (len && ((uintptr_t)next & 7) != 0) {
        crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }

    /* eight bytes per iteration */
    while (len >= 8) {
        word = (uint64_t)next[0] |
               ((uint64_t)next[1] << 8) |
               ((uint64_t)next[2] << 16) |
               ((uint64_t)next[3] << 24) |
               ((uint64_t)next[4] << 32) |
               ((uint64_t)next[5] << 40) |
               ((uint64_t)next[6] << 48) |
               ((uint64_t)next[7] << 56);
        crc ^= word;
        crc = crc32c_table[7][crc & 0xff] ^
              crc32c_table[6][(crc >> 8) & 0xff] ^
              crc32c_table[5][(crc >> 16) & 0xff] ^
              crc32c_table[4][(crc >> 24) & 0xff] ^
              crc32c_table[3][(crc >> 32) & 0xff] ^
              crc32c_table[2][(crc >> 40) & 0xff] ^
              crc32c_table[1][(crc >> 48) & 0xff] ^
              crc32c_table[0][crc >> 56];
        next += 8;
        len -= 8;
    }

    /* up to seven trailing bytes */
    while (len) {
        crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        len--;
    }
    return (uint32_t)crc ^ 0xffffffff;
}
114 
115 #if defined(__x86_64__)
/* Multiply the matrix mat by the vector vec over GF(2), the field of two
   elements. Each bit of an unsigned integer is one element. Entry i of mat is
   added (exclusive-or'ed) into the result whenever bit i of vec is set, so mat
   needs entries only up to the position of the highest one bit in vec. */
static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec)
{
    uint32_t acc = 0;
    const uint32_t *col;

    for (col = mat; vec != 0; vec >>= 1, col++) {
        if (vec & 1)
            acc ^= *col;
    }
    return acc;
}
133 
134 /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
135  rows. */
136 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat)
137 {
138  int n;
139 
140  for (n = 0; n < 32; n++)
141  square[n] = gf2_matrix_times(mat, mat[n]);
142 }
143 
144 /* Construct an operator to apply len zeros to a crc. len must be a power of
145  two. If len is not a power of two, then the result is the same as for the
146  largest power of two less than len. The result for len == 0 is the same as
147  for len == 1. A version of this routine could be easily written for any
148  len, but that is not needed for this application. */
149 static void crc32c_zeros_op(uint32_t *even, size_t len)
150 {
151  int n;
152  uint32_t row;
153  uint32_t odd[32]; /* odd-power-of-two zeros operator */
154 
155  /* put operator for one zero bit in odd */
156  odd[0] = POLY; /* CRC-32C polynomial */
157  row = 1;
158  for (n = 1; n < 32; n++) {
159  odd[n] = row;
160  row <<= 1;
161  }
162 
163  /* put operator for two zero bits in even */
164  gf2_matrix_square(even, odd);
165 
166  /* put operator for four zero bits in odd */
167  gf2_matrix_square(odd, even);
168 
169  /* first square will put the operator for one zero byte (eight zero bits),
170  in even -- next square puts operator for two zero bytes in odd, and so
171  on, until len has been rotated down to zero */
172  do {
173  gf2_matrix_square(even, odd);
174  len >>= 1;
175  if (len == 0)
176  return;
177  gf2_matrix_square(odd, even);
178  len >>= 1;
179  } while (len);
180 
181  /* answer ended up in odd -- copy to even */
182  for (n = 0; n < 32; n++)
183  even[n] = odd[n];
184 }
185 
186 /* Take a length and build four lookup tables for applying the zeros operator
187  for that length, byte-by-byte on the operand. */
188 static void crc32c_zeros(uint32_t zeros[][256], size_t len)
189 {
190  uint32_t n;
191  uint32_t op[32];
192 
193  crc32c_zeros_op(op, len);
194  for (n = 0; n < 256; n++) {
195  zeros[0][n] = gf2_matrix_times(op, n);
196  zeros[1][n] = gf2_matrix_times(op, n << 8);
197  zeros[2][n] = gf2_matrix_times(op, n << 16);
198  zeros[3][n] = gf2_matrix_times(op, n << 24);
199  }
200 }
201 
/* Apply a zeros operator, given as four per-byte lookup tables, to crc: look
   up each of the four bytes of crc in its own table and combine the results
   with exclusive-or. */
static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc)
{
    const uint32_t b0 = crc & 0xff;
    const uint32_t b1 = (crc >> 8) & 0xff;
    const uint32_t b2 = (crc >> 16) & 0xff;
    const uint32_t b3 = crc >> 24;

    return zeros[0][b0] ^ zeros[1][b1] ^ zeros[2][b2] ^ zeros[3][b3];
}
208 
209 /* Block sizes for three-way parallel crc computation. LONG and SHORT must
210  both be powers of two. The associated string constants must be set
211  accordingly, for use in constructing the assembler instructions. */
212 #define LONG 8192
213 #define LONGx1 "8192"
214 #define LONGx2 "16384"
215 #define SHORT 256
216 #define SHORTx1 "256"
217 #define SHORTx2 "512"
218 
219 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
220 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
221 static uint32_t crc32c_long[4][256];
222 static uint32_t crc32c_short[4][256];
223 
224 /* Initialize tables for shifting crcs. */
225 static void crc32c_init_hw(void)
226 {
227  crc32c_zeros(crc32c_long, LONG);
228  crc32c_zeros(crc32c_short, SHORT);
229 }
230 
231 /* Compute CRC-32C using the Intel hardware instruction. */
232 static uint32_t crc32c_hw(uint32_t crc, const unsigned char *buf, size_t len)
233 {
234  const unsigned char *next = buf;
235  const unsigned char *end;
236  uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
237 
238  /* populate shift tables the first time through */
239  pthread_once(&crc32c_once_hw, crc32c_init_hw);
240 
241  /* pre-process the crc */
242  crc0 = crc ^ 0xffffffff;
243 
244  /* compute the crc for up to seven leading bytes to bring the data pointer
245  to an eight-byte boundary */
246  while (len && ((const uintptr_t)next & 7) != 0) {
247  __asm__("crc32b\t" "(%1), %0"
248  : "=r"(crc0)
249  : "r"(next), "0"(crc0));
250  next++;
251  len--;
252  }
253 
254  /* compute the crc on sets of LONG*3 bytes, executing three independent crc
255  instructions, each on LONG bytes -- this is optimized for the Nehalem,
256  Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
257  throughput of one crc per cycle, but a latency of three cycles */
258  while (len >= LONG*3) {
259  crc1 = 0;
260  crc2 = 0;
261  end = next + LONG;
262  do {
263  __asm__("crc32q\t" "(%3), %0\n\t"
264  "crc32q\t" LONGx1 "(%3), %1\n\t"
265  "crc32q\t" LONGx2 "(%3), %2"
266  : "=r"(crc0), "=r"(crc1), "=r"(crc2)
267  : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
268  next += 8;
269  } while (next < end);
270  crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
271  crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
272  next += LONG*2;
273  len -= LONG*3;
274  }
275 
276  /* do the same thing, but now on SHORT*3 blocks for the remaining data less
277  than a LONG*3 block */
278  while (len >= SHORT*3) {
279  crc1 = 0;
280  crc2 = 0;
281  end = next + SHORT;
282  do {
283  __asm__("crc32q\t" "(%3), %0\n\t"
284  "crc32q\t" SHORTx1 "(%3), %1\n\t"
285  "crc32q\t" SHORTx2 "(%3), %2"
286  : "=r"(crc0), "=r"(crc1), "=r"(crc2)
287  : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
288  next += 8;
289  } while (next < end);
290  crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
291  crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
292  next += SHORT*2;
293  len -= SHORT*3;
294  }
295 
296  /* compute the crc on the remaining eight-byte units less than a SHORT*3
297  block */
298  end = next + (len - (len & 7));
299  while (next < end) {
300  __asm__("crc32q\t" "(%1), %0"
301  : "=r"(crc0)
302  : "r"(next), "0"(crc0));
303  next += 8;
304  }
305  len &= 7;
306 
307  /* compute the crc for up to seven trailing bytes */
308  while (len) {
309  __asm__("crc32b\t" "(%1), %0"
310  : "=r"(crc0)
311  : "r"(next), "0"(crc0));
312  next++;
313  len--;
314  }
315 
316  /* return a post-processed crc */
317  return (uint32_t)crc0 ^ 0xffffffff;
318 }
319 
/* Check for SSE 4.2, which provides the crc32 instruction: sets have to 1 if
   the processor reports it (cpuid function 1, bit 20 of ecx) and to 0
   otherwise. SSE 4.2 was first supported in Nehalem processors introduced in
   November, 2008. This does not check for the existence of the cpuid
   instruction itself, which was introduced on the 486SL in 1992, so this will
   fail on earlier x86 processors. cpuid works on all Pentium and later
   processors.

   cpuid writes eax, ebx, ecx and edx. eax carries the function number in and
   is overwritten on the way out, so it is declared as a read-write operand
   rather than as an input only. */
#define SSE42(have) \
    do { \
        uint32_t eax, ecx; \
        eax = 1; \
        __asm__("cpuid" \
                : "+a"(eax), "=c"(ecx) \
                : \
                : "%ebx", "%edx"); \
        (have) = (ecx >> 20) & 1; \
    } while (0)
335 
336 #endif //defined(__x86_64__)
337 
338 /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
339  version. Otherwise, use the software version. */
340 uint32_t crc32c(uint32_t crc, const unsigned char *buf, size_t len)
341 {
342 #if defined(__x86_64__)
343  int sse42;
344 
345  SSE42(sse42);
346  return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
347 #else
348  return crc32c_sw(crc, buf, len);
349 #endif
350 }
351 
352 
353 
355 {
356 
357 #if defined(__x86_64__)
358  int sse42;
359 
360  SSE42(sse42);
361  return sse42;
362 #else
363  return 0;
364 #endif
365 }
366 
dqmiodumpmetadata.n
n
Definition: dqmiodumpmetadata.py:28
crc32c_hw_test
bool crc32c_hw_test()
Definition: crc32c.cc:354
crc32c_once_sw
static pthread_once_t crc32c_once_sw
Definition: crc32c.cc:52
mps_fire.end
end
Definition: mps_fire.py:242
dqmdumpme.k
k
Definition: dqmdumpme.py:60
crc32c_init_sw
static void crc32c_init_sw(void)
Definition: crc32c.cc:56
square
static double square(double x)
Definition: DDCutTubsFromPoints.cc:100
crc32c_table
static uint32_t crc32c_table[8][256]
Definition: crc32c.cc:53
visDQMUpload.buf
buf
Definition: visDQMUpload.py:154
POLY
#define POLY
Definition: crc32c.cc:49
crc32c
uint32_t crc32c(uint32_t crc, const unsigned char *buf, size_t len)
Definition: crc32c.cc:340
crc32c_sw
static uint32_t crc32c_sw(uint32_t crci, const unsigned char *buf, size_t len)
Definition: crc32c.cc:84
cond::uint64_t
unsigned long long uint64_t
Definition: Time.h:13
GetRecoTauVFromDQM_MC_cff.next
next
Definition: GetRecoTauVFromDQM_MC_cff.py:31