CMS 3D CMS Logo

crc32c.cc
Go to the documentation of this file.
1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2  * Copyright (C) 2013 Mark Adler
3  * Version 1.1 1 Aug 2013 Mark Adler
4  */
5 
6 /*
7  This software is provided 'as-is', without any express or implied
8  warranty. In no event will the author be held liable for any damages
9  arising from the use of this software.
10 
11  Permission is granted to anyone to use this software for any purpose,
12  including commercial applications, and to alter it and redistribute it
13  freely, subject to the following restrictions:
14 
15  1. The origin of this software must not be misrepresented; you must not
16  claim that you wrote the original software. If you use this software
17  in a product, an acknowledgment in the product documentation would be
18  appreciated but is not required.
19  2. Altered source versions must be plainly marked as such, and must not be
20  misrepresented as being the original software.
21  3. This notice may not be removed or altered from any source distribution.
22 
23  Mark Adler
24  madler@alumni.caltech.edu
25  */
26 
27 /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
28  CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
29  version is provided as a fall-back, as well as for speed comparisons. */
30 
31 /* Version history:
32  1.0 10 Feb 2013 First version
33  1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
34  */
35 
36 /* Srecko Morovic, Apr 02 2015: (crc32.cc) modified to compile in c++
37  * and ifdefs to compile and call hw version only with X86_64
38  */
39 
40 #include <cstdio>
41 #include <cstdlib>
42 #include <cstdint>
43 #include <unistd.h>
44 #include <pthread.h>
45 
46 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
47 #define POLY 0x82f63b78
48 
49 /* Table for a quadword-at-a-time software crc. */
50 static pthread_once_t crc32c_once_sw = PTHREAD_ONCE_INIT;
51 static uint32_t crc32c_table[8][256];
52 
53 /* Construct table for software CRC-32C calculation. */
54 static void crc32c_init_sw(void) {
55  uint32_t n, crc, k;
56 
57  for (n = 0; n < 256; n++) {
58  crc = n;
59  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
60  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
61  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
62  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
63  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
64  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
65  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
66  crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
67  crc32c_table[0][n] = crc;
68  }
69  for (n = 0; n < 256; n++) {
70  crc = crc32c_table[0][n];
71  for (k = 1; k < 8; k++) {
72  crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8);
73  crc32c_table[k][n] = crc;
74  }
75  }
76 }
77 
78 /* Table-driven software version as a fall-back. This is about 15 times slower
79  than using the hardware instructions. This assumes little-endian integers,
80  as is the case on Intel processors that the assembler code here is for. */
81 static uint32_t crc32c_sw(uint32_t crci, const unsigned char *buf, size_t len) {
82  const unsigned char *next = buf;
83  uint64_t crc;
84 
85  pthread_once(&crc32c_once_sw, crc32c_init_sw);
86  crc = crci ^ 0xffffffff;
87  while (len && ((const uintptr_t)next & 7) != 0) {
88  crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
89  len--;
90  }
91  while (len >= 8) {
92  crc ^= *(uint64_t *)next;
93  crc = crc32c_table[7][crc & 0xff] ^ crc32c_table[6][(crc >> 8) & 0xff] ^ crc32c_table[5][(crc >> 16) & 0xff] ^
94  crc32c_table[4][(crc >> 24) & 0xff] ^ crc32c_table[3][(crc >> 32) & 0xff] ^
95  crc32c_table[2][(crc >> 40) & 0xff] ^ crc32c_table[1][(crc >> 48) & 0xff] ^ crc32c_table[0][crc >> 56];
96  next += 8;
97  len -= 8;
98  }
99  while (len) {
100  crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
101  len--;
102  }
103  return (uint32_t)crc ^ 0xffffffff;
104 }
105 
106 #if defined(__x86_64__)
107 /* Multiply a matrix times a vector over the Galois field of two elements,
108  GF(2). Each element is a bit in an unsigned integer. mat must have at
109  least as many entries as the power of two for most significant one bit in
110  vec. */
111 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
112  uint32_t sum;
113 
114  sum = 0;
115  while (vec) {
116  if (vec & 1)
117  sum ^= *mat;
118  vec >>= 1;
119  mat++;
120  }
121  return sum;
122 }
123 
124 /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
125  rows. */
126 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
127  int n;
128 
129  for (n = 0; n < 32; n++)
130  square[n] = gf2_matrix_times(mat, mat[n]);
131 }
132 
133 /* Construct an operator to apply len zeros to a crc. len must be a power of
134  two. If len is not a power of two, then the result is the same as for the
135  largest power of two less than len. The result for len == 0 is the same as
136  for len == 1. A version of this routine could be easily written for any
137  len, but that is not needed for this application. */
138 static void crc32c_zeros_op(uint32_t *even, size_t len) {
139  int n;
140  uint32_t row;
141  uint32_t odd[32]; /* odd-power-of-two zeros operator */
142 
143  /* put operator for one zero bit in odd */
144  odd[0] = POLY; /* CRC-32C polynomial */
145  row = 1;
146  for (n = 1; n < 32; n++) {
147  odd[n] = row;
148  row <<= 1;
149  }
150 
151  /* put operator for two zero bits in even */
152  gf2_matrix_square(even, odd);
153 
154  /* put operator for four zero bits in odd */
155  gf2_matrix_square(odd, even);
156 
157  /* first square will put the operator for one zero byte (eight zero bits),
158  in even -- next square puts operator for two zero bytes in odd, and so
159  on, until len has been rotated down to zero */
160  do {
161  gf2_matrix_square(even, odd);
162  len >>= 1;
163  if (len == 0)
164  return;
165  gf2_matrix_square(odd, even);
166  len >>= 1;
167  } while (len);
168 
169  /* answer ended up in odd -- copy to even */
170  for (n = 0; n < 32; n++)
171  even[n] = odd[n];
172 }
173 
174 /* Take a length and build four lookup tables for applying the zeros operator
175  for that length, byte-by-byte on the operand. */
176 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
177  uint32_t n;
178  uint32_t op[32];
179 
180  crc32c_zeros_op(op, len);
181  for (n = 0; n < 256; n++) {
182  zeros[0][n] = gf2_matrix_times(op, n);
183  zeros[1][n] = gf2_matrix_times(op, n << 8);
184  zeros[2][n] = gf2_matrix_times(op, n << 16);
185  zeros[3][n] = gf2_matrix_times(op, n << 24);
186  }
187 }
188 
189 /* Apply the zeros operator table to crc. */
190 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
191  return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^ zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
192 }
193 
194 /* Block sizes for three-way parallel crc computation. LONG and SHORT must
195  both be powers of two. The associated string constants must be set
196  accordingly, for use in constructing the assembler instructions. */
197 #define LONG 8192
198 #define LONGx1 "8192"
199 #define LONGx2 "16384"
200 #define SHORT 256
201 #define SHORTx1 "256"
202 #define SHORTx2 "512"
203 
204 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
205 static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
206 static uint32_t crc32c_long[4][256];
207 static uint32_t crc32c_short[4][256];
208 
209 /* Initialize tables for shifting crcs. */
210 static void crc32c_init_hw(void) {
211  crc32c_zeros(crc32c_long, LONG);
212  crc32c_zeros(crc32c_short, SHORT);
213 }
214 
215 /* Compute CRC-32C using the Intel hardware instruction. */
216 static uint32_t crc32c_hw(uint32_t crc, const unsigned char *buf, size_t len) {
217  const unsigned char *next = buf;
218  const unsigned char *end;
219  uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
220 
221  /* populate shift tables the first time through */
222  pthread_once(&crc32c_once_hw, crc32c_init_hw);
223 
224  /* pre-process the crc */
225  crc0 = crc ^ 0xffffffff;
226 
227  /* compute the crc for up to seven leading bytes to bring the data pointer
228  to an eight-byte boundary */
229  while (len && ((const uintptr_t)next & 7) != 0) {
230  __asm__(
231  "crc32b\t"
232  "(%1), %0"
233  : "=r"(crc0)
234  : "r"(next), "0"(crc0));
235  next++;
236  len--;
237  }
238 
239  /* compute the crc on sets of LONG*3 bytes, executing three independent crc
240  instructions, each on LONG bytes -- this is optimized for the Nehalem,
241  Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
242  throughput of one crc per cycle, but a latency of three cycles */
243  while (len >= LONG * 3) {
244  crc1 = 0;
245  crc2 = 0;
246  end = next + LONG;
247  do {
248  __asm__(
249  "crc32q\t"
250  "(%3), %0\n\t"
251  "crc32q\t" LONGx1
252  "(%3), %1\n\t"
253  "crc32q\t" LONGx2 "(%3), %2"
254  : "=r"(crc0), "=r"(crc1), "=r"(crc2)
255  : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
256  next += 8;
257  } while (next < end);
258  crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
259  crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
260  next += LONG * 2;
261  len -= LONG * 3;
262  }
263 
264  /* do the same thing, but now on SHORT*3 blocks for the remaining data less
265  than a LONG*3 block */
266  while (len >= SHORT * 3) {
267  crc1 = 0;
268  crc2 = 0;
269  end = next + SHORT;
270  do {
271  __asm__(
272  "crc32q\t"
273  "(%3), %0\n\t"
274  "crc32q\t" SHORTx1
275  "(%3), %1\n\t"
276  "crc32q\t" SHORTx2 "(%3), %2"
277  : "=r"(crc0), "=r"(crc1), "=r"(crc2)
278  : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
279  next += 8;
280  } while (next < end);
281  crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
282  crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
283  next += SHORT * 2;
284  len -= SHORT * 3;
285  }
286 
287  /* compute the crc on the remaining eight-byte units less than a SHORT*3
288  block */
289  end = next + (len - (len & 7));
290  while (next < end) {
291  __asm__(
292  "crc32q\t"
293  "(%1), %0"
294  : "=r"(crc0)
295  : "r"(next), "0"(crc0));
296  next += 8;
297  }
298  len &= 7;
299 
300  /* compute the crc for up to seven trailing bytes */
301  while (len) {
302  __asm__(
303  "crc32b\t"
304  "(%1), %0"
305  : "=r"(crc0)
306  : "r"(next), "0"(crc0));
307  next++;
308  len--;
309  }
310 
311  /* return a post-processed crc */
312  return (uint32_t)crc0 ^ 0xffffffff;
313 }
314 
315 /* Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors
316  introduced in November, 2008. This does not check for the existence of the
317  cpuid instruction itself, which was introduced on the 486SL in 1992, so this
318  will fail on earlier x86 processors. cpuid works on all Pentium and later
319  processors. */
320 #define SSE42(have) \
321  do { \
322  uint32_t eax, ecx; \
323  eax = 1; \
324  __asm__("cpuid" : "=c"(ecx) : "a"(eax) : "%ebx", "%edx"); \
325  (have) = (ecx >> 20) & 1; \
326  } while (0)
327 
328 #endif //defined(__x86_64__)
329 
330 /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
331  version. Otherwise, use the software version. */
332 uint32_t crc32c(uint32_t crc, const unsigned char *buf, size_t len) {
333 #if defined(__x86_64__)
334  int sse42;
335 
336  SSE42(sse42);
337  return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
338 #else
339  return crc32c_sw(crc, buf, len);
340 #endif
341 }
342 
344 #if defined(__x86_64__)
345  int sse42;
346 
347  SSE42(sse42);
348  return sse42;
349 #else
350  return 0;
351 #endif
352 }
static pthread_once_t crc32c_once_sw
Definition: crc32c.cc:50
bool crc32c_hw_test()
Definition: crc32c.cc:343
uint32_t crc32c(uint32_t crc, const unsigned char *buf, size_t len)
Definition: crc32c.cc:332
static uint32_t crc32c_table[8][256]
Definition: crc32c.cc:51
unsigned long long uint64_t
Definition: Time.h:13
static void crc32c_init_sw(void)
Definition: crc32c.cc:54
static double square(double x)
#define POLY
Definition: crc32c.cc:47
static uint32_t crc32c_sw(uint32_t crci, const unsigned char *buf, size_t len)
Definition: crc32c.cc:81