CMS 3D CMS Logo

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
IceRevisitedRadix.cc
Go to the documentation of this file.
1 //----------------------------------------------------------------------
8 //----------------------------------------------------------------------
9 
10 //----------------------------------------------------------------------
39 //----------------------------------------------------------------------
40 
41 /*
42 To do:
43  - add an offset parameter between two input values (avoid some data recopy sometimes)
44  - unroll ? asm ?
45  - 11 bits trick & 3 passes as Michael did
46  - prefetch stuff the day I have a P3
47  - make a version with 16-bits indices ?
48 */
49 
50 //------------------------------------------------------------------------------
51 
52 // Snatch from Opcode.h in Gled::Var1
53 
54 #include "IceRevisitedRadix.h"
55 
56 #include "IceMemoryMacros.h"
57 
58 //------------------------------------------------------------------------------
59 
60 #define INVALIDATE_RANKS mCurrentSize |= 0x80000000
61 #define VALIDATE_RANKS mCurrentSize &= 0x7fffffff
62 #define CURRENT_SIZE (mCurrentSize & 0x7fffffff)
63 #define INVALID_RANKS (mCurrentSize & 0x80000000)
64 
65 #define CHECK_RESIZE(n) \
66  if (n != mPreviousSize) { \
67  if (n > mCurrentSize) \
68  Resize(n); \
69  else \
70  ResetRanks(); \
71  mPreviousSize = n; \
72  }
73 
74 #define CREATE_HISTOGRAMS(type, buffer) \
75  /* Clear counters/histograms */ \
76  ZeroMemory(mHistogram, 256 * 4 * sizeof(udword)); \
77  \
78  /* Prepare to count */ \
79  ubyte* p = (ubyte*)input; \
80  ubyte* pe = &p[nb * 4]; \
81  udword* h0 = &mHistogram[0]; /* Histogram for first pass (LSB) */ \
82  udword* h1 = &mHistogram[256]; /* Histogram for second pass */ \
83  udword* h2 = &mHistogram[512]; /* Histogram for third pass */ \
84  udword* h3 = &mHistogram[768]; /* Histogram for last pass (MSB) */ \
85  \
86  bool AlreadySorted = true; /* Optimism... */ \
87  \
88  if (INVALID_RANKS) { \
89  /* Prepare for temporal coherence */ \
90  type* Running = (type*)buffer; \
91  type PrevVal = *Running; \
92  \
93  while (p != pe) { \
94  /* Read input buffer in previous sorted order */ \
95  type Val = *Running++; \
96  /* Check whether already sorted or not */ \
97  if (Val < PrevVal) { \
98  AlreadySorted = false; \
99  break; \
100  } /* Early out */ \
101  /* Update for next iteration */ \
102  PrevVal = Val; \
103  \
104  /* Create histograms */ \
105  h0[*p++]++; \
106  h1[*p++]++; \
107  h2[*p++]++; \
108  h3[*p++]++; \
109  } \
110  \
111  /* If all input values are already sorted, we just have to return and leave the */ \
112  /* previous list unchanged. That way the routine may take advantage of temporal */ \
113  /* coherence, for example when used to sort transparent faces. */ \
114  if (AlreadySorted) { \
115  mNbHits++; \
116  for (udword i = 0; i < nb; i++) \
117  mRanks[i] = i; \
118  return *this; \
119  } \
120  } else { \
121  /* Prepare for temporal coherence */ \
122  udword* Indices = mRanks; \
123  type PrevVal = (type)buffer[*Indices]; \
124  \
125  while (p != pe) { \
126  /* Read input buffer in previous sorted order */ \
127  type Val = (type)buffer[*Indices++]; \
128  /* Check whether already sorted or not */ \
129  if (Val < PrevVal) { \
130  AlreadySorted = false; \
131  break; \
132  } /* Early out */ \
133  /* Update for next iteration */ \
134  PrevVal = Val; \
135  \
136  /* Create histograms */ \
137  h0[*p++]++; \
138  h1[*p++]++; \
139  h2[*p++]++; \
140  h3[*p++]++; \
141  } \
142  \
143  /* If all input values are already sorted, we just have to return and leave the */ \
144  /* previous list unchanged. That way the routine may take advantage of temporal */ \
145  /* coherence, for example when used to sort transparent faces. */ \
146  if (AlreadySorted) { \
147  mNbHits++; \
148  return *this; \
149  } \
150  } \
151  \
152  /* Else there has been an early out and we must finish computing the histograms */ \
153  while (p != pe) { \
154  /* Create histograms without the previous overhead */ \
155  h0[*p++]++; \
156  h1[*p++]++; \
157  h2[*p++]++; \
158  h3[*p++]++; \
159  }
160 
161 #define CHECK_PASS_VALIDITY(pass) \
162  /* Shortcut to current counters */ \
163  udword* CurCount = &mHistogram[pass << 8]; \
164  \
165  /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
166  bool PerformPass = true; \
167  \
168  /* Check pass validity */ \
169  \
170  /* If all values have the same byte, sorting is useless. */ \
171  /* It may happen when sorting bytes or words instead of dwords. */ \
172  /* This routine actually sorts words faster than dwords, and bytes */ \
173  /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
174  /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
175  \
176  /* Get first byte */ \
177  ubyte UniqueVal = *(((ubyte*)input) + pass); \
178  \
179  /* Check that byte's counter */ \
180  if (CurCount[UniqueVal] == nb) \
181  PerformPass = false;
182 
183 //----------------------------------------------------------------------
187 //----------------------------------------------------------------------
188 RadixSort::RadixSort() : mCurrentSize(0), mRanks(nullptr), mRanks2(nullptr), mTotalCalls(0), mNbHits(0) {
189 #ifndef RADIX_LOCAL_RAM
190  // Allocate input-independent ram
191  mHistogram = new udword[256 * 4];
192  mLink = new udword[256];
193 #endif
194  // Initialize indices
196 }
197 
198 //----------------------------------------------------------------------
202 //----------------------------------------------------------------------
204  // Release everything
205 #ifndef RADIX_LOCAL_RAM
206  DELETEARRAY(mLink);
207  DELETEARRAY(mHistogram);
208 #endif
211 }
212 
213 //----------------------------------------------------------------------
218 //----------------------------------------------------------------------
220  udword* ranks = mRanks;
221  mRanks = nullptr;
223  mCurrentSize = 0;
224  return ranks;
225 }
226 
227 //----------------------------------------------------------------------
233 //----------------------------------------------------------------------
235  // Free previously used ram
238 
239  // Get some fresh one
240  mRanks = new udword[nb];
242  mRanks2 = new udword[nb];
244 
245  return true;
246 }
247 
249  udword CurSize = CURRENT_SIZE;
250  if (nb != CurSize) {
251  if (nb > CurSize)
252  Resize(nb);
253  mCurrentSize = nb;
255  }
256 }
257 
258 //----------------------------------------------------------------------
270 //----------------------------------------------------------------------
272  // Checkings
273  if (!input || !nb || nb & 0x80000000)
274  return *this;
275 
276  // Stats
277  mTotalCalls++;
278 
279  // Resize lists if needed
280  CheckResize(nb);
281 
282 #ifdef RADIX_LOCAL_RAM
283  // Allocate histograms & offsets on the stack
284  udword mHistogram[256 * 4];
285  udword* mLink[256];
286 #endif
287 
288  // Create histograms (counters). Counters for all passes are created in one run.
289  // Pros: read input buffer once instead of four times
290  // Cons: mHistogram is 4Kb instead of 1Kb
291  // We must take care of signed/unsigned values for temporal
292  // coherence.... I just have 2 code paths even if just a single
293  // opcode changes. Self-modifying code, someone?
294  if (hint == RADIX_UNSIGNED) {
295  CREATE_HISTOGRAMS(udword, input);
296  } else {
297  CREATE_HISTOGRAMS(sdword, input);
298  }
299 
300  // Compute #negative values involved if needed
301  udword NbNegativeValues = 0;
302  if (hint == RADIX_SIGNED) {
303  // An efficient way to compute the number of negatives values
304  // we'll have to deal with is simply to sum the 128 last values
305  // of the last histogram. Last histogram because that's the one
306  // for the Most Significant Byte, responsible for the sign. 128
307  // last values because the 128 first ones are related to
308  // positive numbers.
309  udword* h3 = &mHistogram[768];
310  for (udword i = 128; i < 256; i++)
311  NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
312  }
313 
314  // Radix sort, j is the pass number (0=LSB, 3=MSB)
315  for (udword j = 0; j < 4; j++) {
317 
318  // Sometimes the fourth (negative) pass is skipped because all
319  // numbers are negative and the MSB is 0xFF (for example). This
320  // is not a problem, numbers are correctly sorted anyway.
321  if (PerformPass) {
322  // Should we care about negative values?
323  if (j != 3 || hint == RADIX_UNSIGNED) {
324  // Here we deal with positive values only
325 
326  // Create offsets
327  mLink[0] = mRanks2;
328  for (udword i = 1; i < 256; i++)
329  mLink[i] = mLink[i - 1] + CurCount[i - 1];
330  } else {
331  // This is a special case to correctly handle negative
332  // integers. They're sorted in the right order but at
333  // the wrong place.
334 
335  // Create biased offsets, in order for negative numbers to be sorted as well
336  mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
337  for (udword i = 1; i < 128; i++)
338  mLink[i] = mLink[i - 1] + CurCount[i - 1]; // 1 to 128 for positive numbers
339 
340  // Fixing the wrong place for negative values
341  mLink[128] = mRanks2;
342  for (udword i = 129; i < 256; i++)
343  mLink[i] = mLink[i - 1] + CurCount[i - 1];
344  }
345 
346  // Perform Radix Sort
347  ubyte* InputBytes = (ubyte*)input;
348  InputBytes += j;
349  if (INVALID_RANKS) {
350  for (udword i = 0; i < nb; i++)
351  *mLink[InputBytes[i << 2]]++ = i;
353  } else {
354  udword* Indices = mRanks;
355  udword* IndicesEnd = &mRanks[nb];
356  while (Indices != IndicesEnd) {
357  udword id = *Indices++;
358  *mLink[InputBytes[id << 2]]++ = id;
359  }
360  }
361 
362  // Swap pointers for next pass. Valid indices - the most
363  // recent ones - are in mRanks after the swap.
364  udword* Tmp = mRanks;
365  mRanks = mRanks2;
366  mRanks2 = Tmp;
367  }
368  }
369  return *this;
370 }
371 
372 //----------------------------------------------------------------------
383 //----------------------------------------------------------------------
385  // Checkings
386  if (!input2 || !nb || nb & 0x80000000)
387  return *this;
388 
389  // Stats
390  mTotalCalls++;
391 
392  udword* input = (udword*)input2;
393 
394  // Resize lists if needed
395  CheckResize(nb);
396 
397 #ifdef RADIX_LOCAL_RAM
398  // Allocate histograms & offsets on the stack
399  udword mHistogram[256 * 4];
400  udword* mLink[256];
401 #endif
402 
403  // Create histograms (counters). Counters for all passes are created
404  // in one run.
405  // Pros: read input buffer once instead of four times
406  // Cons: mHistogram is 4Kb instead of 1Kb
407  //
408  // Floating-point values are always supposed to be signed values, so
409  // there's only one code path there.
410  // Please note the floating point comparison needed for temporal
411  // coherence! Although the resulting asm code is dreadful, this is
412  // surprisingly not such a performance hit - well, I suppose that's
413  // a big one on first generation Pentiums....We can't make
414  // comparison on integer representations because, as Chris said, it
415  // just wouldn't work with mixed positive/negative values....
416  { CREATE_HISTOGRAMS(float, input2); }
417 
418  // Compute #negative values involved if needed
419  udword NbNegativeValues = 0;
420  // An efficient way to compute the number of negatives values we'll
421  // have to deal with is simply to sum the 128 last values of the
422  // last histogram. Last histogram because that's the one for the
423  // Most Significant Byte, responsible for the sign. 128 last values
424  // because the 128 first ones are related to positive numbers.
425  udword* h3 = &mHistogram[768];
426  for (udword i = 128; i < 256; i++)
427  NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
428 
429  // Radix sort, j is the pass number (0=LSB, 3=MSB)
430  for (udword j = 0; j < 4; j++) {
431  // Should we care about negative values?
432  if (j != 3) {
433  // Here we deal with positive values only
435 
436  if (PerformPass) {
437  // Create offsets
438  mLink[0] = mRanks2;
439  for (udword i = 1; i < 256; i++)
440  mLink[i] = mLink[i - 1] + CurCount[i - 1];
441 
442  // Perform Radix Sort
443  ubyte* InputBytes = (ubyte*)input;
444  InputBytes += j;
445  if (INVALID_RANKS) {
446  for (udword i = 0; i < nb; i++)
447  *mLink[InputBytes[i << 2]]++ = i;
449  } else {
450  udword* Indices = mRanks;
451  udword* IndicesEnd = &mRanks[nb];
452  while (Indices != IndicesEnd) {
453  udword id = *Indices++;
454  *mLink[InputBytes[id << 2]]++ = id;
455  }
456  }
457 
458  // Swap pointers for next pass. Valid indices - the most
459  // recent ones - are in mRanks after the swap.
460  udword* Tmp = mRanks;
461  mRanks = mRanks2;
462  mRanks2 = Tmp;
463  }
464  } else {
465  // This is a special case to correctly handle negative values
467 
468  if (PerformPass) {
469  // Create biased offsets, in order for negative numbers
470  // to be sorted as well
471  mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
472  for (udword i = 1; i < 128; i++)
473  mLink[i] = mLink[i - 1] + CurCount[i - 1]; // 1 to 128 for positive numbers
474 
475  // We must reverse the sorting order for negative numbers!
476  mLink[255] = mRanks2;
477  for (udword i = 0; i < 127; i++)
478  mLink[254 - i] = mLink[255 - i] + CurCount[255 - i]; // Fixing the wrong order for negative values
479  for (udword i = 128; i < 256; i++)
480  mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
481 
482  // Perform Radix Sort
483  if (INVALID_RANKS) {
484  for (udword i = 0; i < nb; i++) {
485  udword Radix = input[i] >> 24; // Radix byte, same as above. AND is useless here (udword).
486  // ### cmp to be killed. Not good. Later.
487  if (Radix < 128)
488  *mLink[Radix]++ = i; // Number is positive, same as above
489  else
490  *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
491  }
493  } else {
494  for (udword i = 0; i < nb; i++) {
495  udword Radix = input[mRanks[i]] >> 24; // Radix byte, same as above. AND is useless here (udword).
496  // ### cmp to be killed. Not good. Later.
497  if (Radix < 128)
498  *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
499  else
500  *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
501  }
502  }
503  // Swap pointers for next pass. Valid indices - the most
504  // recent ones - are in mRanks after the swap.
505  udword* Tmp = mRanks;
506  mRanks = mRanks2;
507  mRanks2 = Tmp;
508  } else {
509  // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
510  if (UniqueVal >= 128) {
511  if (INVALID_RANKS) {
512  // ###Possible?
513  for (udword i = 0; i < nb; i++)
514  mRanks2[i] = nb - i - 1;
516  } else {
517  for (udword i = 0; i < nb; i++)
518  mRanks2[i] = mRanks[nb - i - 1];
519  }
520 
521  // Swap pointers for next pass. Valid indices - the
522  // most recent ones - are in mRanks after the swap.
523  udword* Tmp = mRanks;
524  mRanks = mRanks2;
525  mRanks2 = Tmp;
526  }
527  }
528  }
529  }
530  return *this;
531 }
532 
533 //----------------------------------------------------------------------
538 //----------------------------------------------------------------------
540  udword UsedRam = sizeof(RadixSort);
541 #ifndef RADIX_LOCAL_RAM
542  UsedRam += 256 * 4 * sizeof(udword); // Histograms
543  UsedRam += 256 * sizeof(udword); // Link
544 #endif
545  UsedRam += 2 * CURRENT_SIZE * sizeof(udword); // 2 lists of indices
546  return UsedRam;
547 }
bool Resize(udword nb)
#define INVALIDATE_RANKS
#define VALIDATE_RANKS
#define inline_
Definition: IceTypes.h:18
uint16_t *__restrict__ id
signed int sdword
sizeof(sdword) must be 4
Definition: IceTypes.h:51
udword mTotalCalls
Total number of calls to the sort routine.
udword * RelinquishRanks()
udword mCurrentSize
Current size of the indices list.
udword * mRanks2
udword GetUsedRam() const
#define DELETEARRAY(x)
Deletes an array.
Input values are signed.
RadixSort & Sort(const udword *input, udword nb, RadixHint hint=RADIX_SIGNED)
#define INVALID_RANKS
#define input2
Definition: AMPTWrapper.h:159
static std::string const input
Definition: EdmProvDump.cc:47
#define CHECK_PASS_VALIDITY(pass)
unsigned int udword
sizeof(udword) must be 4
Definition: IceTypes.h:52
Indices
Definition: EdmEventSize.cc:28
#define CREATE_HISTOGRAMS(type, buffer)
#define CHECKALLOC(x)
unsigned char ubyte
sizeof(ubyte) must be 1
Definition: IceTypes.h:48
void CheckResize(udword nb)
RadixHint
#define CURRENT_SIZE
Input values are unsigned.
udword * mRanks
Two lists, swapped each pass.