d8/d5c/IceRevisitedRadix_8cc_source.html

 //----------------------------------------------------------------------

 //----------------------------------------------------------------------


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------


 /*

 To do:

     - add an offset parameter between two input values (avoid some data recopy sometimes)

     - unroll ? asm ?

     - 11 bits trick & 3 passes as Michael did

     - prefetch stuff the day I have a P3

     - make a version with 16-bits indices ?

 */


 //------------------------------------------------------------------------------


 // Snatched from Opcode.h in Gled::Var1


 #include "IceRevisitedRadix.h"


 #include "IceMemoryMacros.h"


 //------------------------------------------------------------------------------


 // mCurrentSize packs two things: the low 31 bits hold a size and bit 31 is
 // used as a "ranks are invalid" flag. The four helpers below set, clear and
 // read that packing; they expand in member functions where mCurrentSize is
 // in scope.

 // Sets bit 31 of mCurrentSize (ranks flagged invalid).
 #define INVALIDATE_RANKS mCurrentSize |= 0x80000000

 // Clears bit 31 of mCurrentSize, leaving the low 31 bits untouched.
 #define VALIDATE_RANKS mCurrentSize &= 0x7fffffff

 // mCurrentSize with the flag bit masked off.
 #define CURRENT_SIZE (mCurrentSize & 0x7fffffff)

 // Non-zero when bit 31 of mCurrentSize is set.
 #define INVALID_RANKS (mCurrentSize & 0x80000000)


 // Macro form of a resize check: when n differs from mPreviousSize, either
 // grow the buffers through Resize(n) (n larger than mCurrentSize) or call
 // ResetRanks(), then remember n in mPreviousSize.
 //
 // The continuation lines must be contiguous: a blank line after a trailing
 // backslash ends the macro definition.
 //
 // NOTE(review): nothing in the visible part of this file expands this macro
 // (the Sort routines call CheckResize() instead), and neither mPreviousSize
 // nor ResetRanks() is defined in the visible part of this file - confirm
 // they exist before using it.
 #define CHECK_RESIZE(n)       \
   if ((n) != mPreviousSize) { \
     if ((n) > mCurrentSize)   \
       Resize(n);              \
     else                      \
       ResetRanks();           \
     mPreviousSize = (n);      \
   }


 // Builds the four per-byte histograms for one Sort() call and runs the
 // "already sorted?" check used for temporal coherence.
 //
 //   type   - type the keys are compared as (udword, sdword or float in this file)
 //   buffer - the keys, indexable as an array
 //
 // The macro expands inside a RadixSort::Sort overload and relies on names
 // from that scope: `input` (start of the 4-byte keys), `nb` (number of keys),
 // mHistogram, mRanks, mNbHits and mCurrentSize (through INVALID_RANKS).
 // It declares p, pe, h0..h3 and AlreadySorted in the expanding scope, so
 // callers wrap it in braces, and it executes `return *this;` when the keys
 // are found to be in order.
 //
 // The continuation lines must be contiguous: a blank line after a trailing
 // backslash ends the macro definition.
 #define CREATE_HISTOGRAMS(type, buffer)                                                \
   /* Clear counters/histograms */                                                      \
   ZeroMemory(mHistogram, 256 * 4 * sizeof(udword));                                    \
                                                                                        \
   /* Prepare to count: the keys are walked one byte at a time */                       \
   const ubyte* p = (const ubyte*)input;                                                \
   const ubyte* pe = &p[nb * 4];                                                        \
   udword* h0 = &mHistogram[0];   /* Histogram for first pass (LSB) */                  \
   udword* h1 = &mHistogram[256]; /* Histogram for second pass */                       \
   udword* h2 = &mHistogram[512]; /* Histogram for third pass */                        \
   udword* h3 = &mHistogram[768]; /* Histogram for last pass (MSB) */                   \
                                                                                        \
   bool AlreadySorted = true; /* Optimism... */                                         \
                                                                                        \
   if (INVALID_RANKS) {                                                                 \
     /* No usable ranks: check the keys in their natural order */                       \
     const type* Running = (const type*)(buffer);                                       \
     type PrevVal = *Running;                                                           \
                                                                                        \
     while (p != pe) {                                                                  \
       /* Read input buffer in natural order */                                         \
       type Val = *Running++;                                                           \
       /* Check whether already sorted or not */                                        \
       if (Val < PrevVal) {                                                             \
         AlreadySorted = false;                                                         \
         break;                                                                         \
       } /* Early out */                                                                \
       /* Update for next iteration */                                                  \
       PrevVal = Val;                                                                   \
                                                                                        \
       /* Create histograms: one counter per byte of the current key */                 \
       h0[*p++]++;                                                                      \
       h1[*p++]++;                                                                      \
       h2[*p++]++;                                                                      \
       h3[*p++]++;                                                                      \
     }                                                                                  \
                                                                                        \
     /* Keys already in order: the result is the identity permutation. The ranks */     \
     /* flag is left as it was (still invalid) on this path.                     */     \
     if (AlreadySorted) {                                                               \
       mNbHits++;                                                                       \
       for (udword i = 0; i < nb; i++)                                                  \
         mRanks[i] = i;                                                                 \
       return *this;                                                                    \
     }                                                                                  \
   } else {                                                                             \
     /* Ranks from the previous call exist: check the keys in that order */             \
     udword* Indices = mRanks;                                                          \
     type PrevVal = (type)(buffer)[*Indices];                                           \
                                                                                        \
     while (p != pe) {                                                                  \
       /* Read input buffer in previous sorted order */                                 \
       type Val = (type)(buffer)[*Indices++];                                           \
       /* Check whether already sorted or not */                                        \
       if (Val < PrevVal) {                                                             \
         AlreadySorted = false;                                                         \
         break;                                                                         \
       } /* Early out */                                                                \
       /* Update for next iteration */                                                  \
       PrevVal = Val;                                                                   \
                                                                                        \
       /* Create histograms: one counter per byte of the current key */                 \
       h0[*p++]++;                                                                      \
       h1[*p++]++;                                                                      \
       h2[*p++]++;                                                                      \
       h3[*p++]++;                                                                      \
     }                                                                                  \
                                                                                        \
     /* If all input values are already sorted, we just have to return and leave the */ \
     /* previous list unchanged. That way the routine may take advantage of temporal */ \
     /* coherence, for example when used to sort transparent faces.                  */ \
     if (AlreadySorted) {                                                               \
       mNbHits++;                                                                       \
       return *this;                                                                    \
     }                                                                                  \
   }                                                                                    \
                                                                                        \
   /* Else there has been an early out and we must finish computing the histograms */   \
   while (p != pe) {                                                                    \
     /* Create histograms without the previous overhead */                              \
     h0[*p++]++;                                                                        \
     h1[*p++]++;                                                                        \
     h2[*p++]++;                                                                        \
     h3[*p++]++;                                                                        \
   }


 // Decides whether radix pass number `pass` (0..3) has any work to do.
 //
 // Expands inside a Sort overload and relies on `input`, `nb` and mHistogram
 // from that scope. It declares three locals in the expanding scope:
 //   CurCount    - the 256 counters belonging to this pass
 //   PerformPass - false when every key has the same byte at this position
 //   UniqueVal   - byte of the first key at this position
 //
 // The continuation lines must be contiguous: a blank line after a trailing
 // backslash ends the macro definition.
 #define CHECK_PASS_VALIDITY(pass)                                                         \
   /* Shortcut to current counters */                                                      \
   udword* CurCount = &mHistogram[(pass) << 8];                                            \
                                                                                           \
   /* Reset flag. The sorting pass is supposed to be performed. (default) */               \
   bool PerformPass = true;                                                                \
                                                                                           \
   /* Check pass validity */                                                               \
                                                                                           \
   /* If all values have the same byte, sorting is useless. */                             \
   /* It may happen when sorting bytes or words instead of dwords. */                      \
   /* This routine actually sorts words faster than dwords, and bytes */                   \
   /* faster than words. Standard running time (O(4*n)) is reduced to O(2*n) */            \
   /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
                                                                                           \
   /* Get first byte */                                                                    \
   ubyte UniqueVal = *(((const ubyte*)input) + (pass));                                    \
                                                                                           \
   /* If that byte's counter equals nb, all keys share it and the pass is a no-op */       \
   if (CurCount[UniqueVal] == nb)                                                          \
     PerformPass = false;


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------

 /// Creates an empty sorter: no rank buffers, statistics at zero, and the
 /// ranks flagged as invalid.
 RadixSort::RadixSort()
     : mCurrentSize(0),
       mRanks(nullptr),
       mRanks2(nullptr),
       mTotalCalls(0),
       mNbHits(0) {
   // Nothing has been sorted yet, so there are no ranks to reuse
   INVALIDATE_RANKS;

 #ifndef RADIX_LOCAL_RAM
   // Input-independent scratch tables, allocated once here and released by
   // the destructor.
   // NOTE(review): Sort() stores udword* values into mLink, while this
   // allocates udword elements - confirm the member type in the header.
   mHistogram = new udword[256 * 4];
   mLink = new udword[256];
 #endif
 }


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------

 /// Releases the two rank buffers and, when RADIX_LOCAL_RAM is not defined,
 /// the histogram and link tables allocated by the constructor.
 RadixSort::~RadixSort() {
   // Per-input buffers
   DELETEARRAY(mRanks);
   DELETEARRAY(mRanks2);

 #ifndef RADIX_LOCAL_RAM
   // Input-independent tables
   DELETEARRAY(mHistogram);
   DELETEARRAY(mLink);
 #endif
 }


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------

 /// Detaches the current rank buffer from the sorter and returns it.
 /// The secondary buffer is deleted and the recorded size drops to zero, so
 /// the next Sort() call allocates fresh buffers.
 /// The sorter no longer refers to the returned array; presumably the caller
 /// becomes responsible for delete[]-ing it - confirm against callers.
 udword* RadixSort::RelinquishRanks() {
   // The secondary list is only scratch space for the sorter itself
   DELETEARRAY(mRanks2);
   mCurrentSize = 0;

   // Forget the primary list without freeing it
   udword* const released = mRanks;
   mRanks = nullptr;
   return released;
 }


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------

 /// Replaces both rank buffers with freshly allocated arrays of nb entries.
 /// The previous contents are discarded, not copied.
 /// Returns true when the end of the function is reached; CHECKALLOC is
 /// defined in IceMemoryMacros.h and presumably bails out earlier on a null
 /// pointer - confirm there.
 bool RadixSort::Resize(udword nb) {
   // Drop whatever was allocated before
   DELETEARRAY(mRanks);
   DELETEARRAY(mRanks2);

   // Primary list
   mRanks = new udword[nb];
   CHECKALLOC(mRanks);

   // Secondary list, swapped with the primary one after each pass
   mRanks2 = new udword[nb];
   CHECKALLOC(mRanks2);

   return true;
 }


 /// Prepares the sorter for an input of nb entries.
 /// When nb equals the recorded size nothing changes, so ranks from the
 /// previous call stay usable. Otherwise the buffers are reallocated if nb
 /// is larger, nb becomes the recorded size, and the ranks are flagged
 /// invalid. The result of Resize() is not checked.
 inline_ void RadixSort::CheckResize(udword nb) {
   const udword recorded = CURRENT_SIZE;
   if (nb == recorded)
     return;  // same size as last time: keep buffers and rank state

   // Only ever reallocate to grow; a smaller input reuses the existing buffers
   if (recorded < nb)
     Resize(nb);

   mCurrentSize = nb;
   INVALIDATE_RANKS;
 }


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------

 // Sorts nb 32-bit integer keys by computing, in mRanks, the list of indices
 // into `input` that visits the keys in increasing order. The keys themselves
 // are not moved.
 //
 //   input - keys to sort, read four bytes at a time
 //   nb    - number of keys; values with bit 31 set are rejected because that
 //           bit is used as the invalid-ranks flag in mCurrentSize
 //   hint  - RADIX_UNSIGNED compares the keys as udword; any other value
 //           compares them as sdword, and the negative-number fix-up of the
 //           last pass is applied when hint == RADIX_SIGNED
 //
 // Returns *this. The call is a no-op (not even counted in mTotalCalls) when
 // input is null, nb is 0 or nb has bit 31 set. CREATE_HISTOGRAMS may also
 // return early when the keys are found to be already in order.
 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint) {

   // Checkings

   if (!input || !nb || nb & 0x80000000)

     return *this;


   // Stats

   mTotalCalls++;


   // Resize lists if needed

   CheckResize(nb);


 #ifdef RADIX_LOCAL_RAM

   // Allocate histograms & offsets on the stack

   udword mHistogram[256 * 4];

   udword* mLink[256];

 #endif


   // Create histograms (counters). Counters for all passes are created in one run.

   // Pros:  read input buffer once instead of four times

   // Cons:  mHistogram is 4Kb instead of 1Kb

   // We must take care of signed/unsigned values for temporal

   // coherence.... I just have 2 code paths even if just a single

   // opcode changes. Self-modifying code, someone?

   // The braces around each expansion keep the locals declared by the macro
   // (p, pe, h0..h3, AlreadySorted) out of the rest of the function.
   if (hint == RADIX_UNSIGNED) {

     CREATE_HISTOGRAMS(udword, input);

   } else {

     CREATE_HISTOGRAMS(sdword, input);

   }


   // Compute #negative values involved if needed

   udword NbNegativeValues = 0;

   if (hint == RADIX_SIGNED) {

     // An efficient way to compute the number of negatives values

     // we'll have to deal with is simply to sum the 128 last values

     // of the last histogram. Last histogram because that's the one

     // for the Most Significant Byte, responsible for the sign. 128

     // last values because the 128 first ones are related to

     // positive numbers.

     udword* h3 = &mHistogram[768];

     for (udword i = 128; i < 256; i++)

       NbNegativeValues += h3[i];  // 768 for last histogram, 128 for negative part

   }


   // Radix sort, j is the pass number (0=LSB, 3=MSB)

   for (udword j = 0; j < 4; j++) {

     // Declares CurCount, PerformPass and UniqueVal for this iteration
     CHECK_PASS_VALIDITY(j);


     // Sometimes the fourth (negative) pass is skipped because all

     // numbers are negative and the MSB is 0xFF (for example). This

     // is not a problem, numbers are correctly sorted anyway.

     if (PerformPass) {

       // Should we care about negative values?

       if (j != 3 || hint == RADIX_UNSIGNED) {

         // Here we deal with positive values only


         // Create offsets

         // mLink[b] is where the next index whose current byte is b gets written
         mLink[0] = mRanks2;

         for (udword i = 1; i < 256; i++)

           mLink[i] = mLink[i - 1] + CurCount[i - 1];

       } else {

         // This is a special case to correctly handle negative

         // integers. They're sorted in the right order but at

         // the wrong place.


         // Create biased offsets, in order for negative numbers to be sorted as well

         mLink[0] = &mRanks2[NbNegativeValues];  // First positive number takes place after the negative ones

         for (udword i = 1; i < 128; i++)

           mLink[i] = mLink[i - 1] + CurCount[i - 1];  // 1 to 128 for positive numbers


         // Fixing the wrong place for negative values

         mLink[128] = mRanks2;

         for (udword i = 129; i < 256; i++)

           mLink[i] = mLink[i - 1] + CurCount[i - 1];

       }


       // Perform Radix Sort

       // InputBytes points at byte j of the first key; indexing with
       // (index << 2) then reads byte j of any other key.
       ubyte* InputBytes = (ubyte*)input;

       InputBytes += j;

       if (INVALID_RANKS) {

         // No previous ranks: distribute the keys in their natural order
         for (udword i = 0; i < nb; i++)

           *mLink[InputBytes[i << 2]]++ = i;

         VALIDATE_RANKS;

       } else {

         // Distribute the keys in the order left by the previous pass/call
         udword* Indices = mRanks;

         udword* IndicesEnd = &mRanks[nb];

         while (Indices != IndicesEnd) {

           udword id = *Indices++;

           *mLink[InputBytes[id << 2]]++ = id;

         }

       }


       // Swap pointers for next pass. Valid indices - the most

       // recent ones - are in mRanks after the swap.

       udword* Tmp = mRanks;

       mRanks = mRanks2;

       mRanks2 = Tmp;

     }

   }

   return *this;

 }


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------

 // Sorts nb float keys by computing, in mRanks, the list of indices into
 // `input2` that visits the keys in increasing order. The keys themselves are
 // not moved.
 //
 //   input2 - keys to sort; the radix passes read them through `input`, the
 //            same memory viewed as 32-bit words
 //   nb     - number of keys; values with bit 31 set are rejected because that
 //            bit is used as the invalid-ranks flag in mCurrentSize
 //
 // The first three passes bucket on the raw bytes. The last pass treats keys
 // whose top byte is >= 128 as negative: they are placed before the others
 // and written in reverse order.
 //
 // Returns *this. The call is a no-op (not even counted in mTotalCalls) when
 // input2 is null, nb is 0 or nb has bit 31 set. CREATE_HISTOGRAMS may also
 // return early when the keys are found to be already in order.
 RadixSort& RadixSort::Sort(const float* input2, udword nb) {

   // Checkings

   if (!input2 || !nb || nb & 0x80000000)

     return *this;


   // Stats

   mTotalCalls++;


   // Same buffer, viewed as 32-bit words for the byte-wise passes below
   udword* input = (udword*)input2;


   // Resize lists if needed

   CheckResize(nb);


 #ifdef RADIX_LOCAL_RAM

   // Allocate histograms & offsets on the stack

   udword mHistogram[256 * 4];

   udword* mLink[256];

 #endif


   // Create histograms (counters). Counters for all passes are created

   // in one run.

   // Pros:  read input buffer once instead of four times

   // Cons:  mHistogram is 4Kb instead of 1Kb

   //

   // Floating-point values are always supposed to be signed values, so

   // there's only one code path there.

   // Please note the floating point comparison needed for temporal

   // coherence! Although the resulting asm code is dreadful, this is

   // surprisingly not such a performance hit - well, I suppose that's

   // a big one on first generation Pentiums....We can't make

   // comparison on integer representations because, as Chris said, it

   // just wouldn't work with mixed positive/negative values....

   // The braces keep the locals declared by the macro (notably h3) from
   // clashing with the h3 declared below.
   { CREATE_HISTOGRAMS(float, input2); }


   // Compute #negative values involved if needed

   udword NbNegativeValues = 0;

   // An efficient way to compute the number of negatives values we'll

   // have to deal with is simply to sum the 128 last values of the

   // last histogram. Last histogram because that's the one for the

   // Most Significant Byte, responsible for the sign. 128 last values

   // because the 128 first ones are related to positive numbers.

   udword* h3 = &mHistogram[768];

   for (udword i = 128; i < 256; i++)

     NbNegativeValues += h3[i];  // 768 for last histogram, 128 for negative part


   // Radix sort, j is the pass number (0=LSB, 3=MSB)

   for (udword j = 0; j < 4; j++) {

     // Should we care about negative values?

     if (j != 3) {

       // Here we deal with positive values only

       // Declares CurCount, PerformPass and UniqueVal for this branch
       CHECK_PASS_VALIDITY(j);


       if (PerformPass) {

         // Create offsets

         // mLink[b] is where the next index whose current byte is b gets written
         mLink[0] = mRanks2;

         for (udword i = 1; i < 256; i++)

           mLink[i] = mLink[i - 1] + CurCount[i - 1];


         // Perform Radix Sort

         // InputBytes points at byte j of the first key; indexing with
         // (index << 2) then reads byte j of any other key.
         ubyte* InputBytes = (ubyte*)input;

         InputBytes += j;

         if (INVALID_RANKS) {

           for (udword i = 0; i < nb; i++)

             *mLink[InputBytes[i << 2]]++ = i;

           VALIDATE_RANKS;

         } else {

           udword* Indices = mRanks;

           udword* IndicesEnd = &mRanks[nb];

           while (Indices != IndicesEnd) {

             udword id = *Indices++;

             *mLink[InputBytes[id << 2]]++ = id;

           }

         }


         // Swap pointers for next pass. Valid indices - the most

         // recent ones - are in mRanks after the swap.

         udword* Tmp = mRanks;

         mRanks = mRanks2;

         mRanks2 = Tmp;

       }

     } else {

       // This is a special case to correctly handle negative values

       CHECK_PASS_VALIDITY(j);


       if (PerformPass) {

         // Create biased offsets, in order for negative numbers

         // to be sorted as well

         mLink[0] = &mRanks2[NbNegativeValues];  // First positive number takes place after the negative ones

         for (udword i = 1; i < 128; i++)

           mLink[i] = mLink[i - 1] + CurCount[i - 1];  // 1 to 128 for positive numbers


         // We must reverse the sorting order for negative numbers!

         mLink[255] = mRanks2;

         for (udword i = 0; i < 127; i++)

           mLink[254 - i] = mLink[255 - i] + CurCount[255 - i];  // Fixing the wrong order for negative values

         // Move each negative bucket's pointer to the end of its range: the
         // loops below fill these buckets backwards with a pre-decrement.
         for (udword i = 128; i < 256; i++)

           mLink[i] += CurCount[i];  // Fixing the wrong place for negative values


         // Perform Radix Sort

         if (INVALID_RANKS) {

           for (udword i = 0; i < nb; i++) {

             udword Radix = input[i] >> 24;  // Radix byte, same as above. AND is useless here (udword).

             // ### cmp to be killed. Not good. Later.

             if (Radix < 128)

               *mLink[Radix]++ = i;  // Number is positive, same as above

             else

               *(--mLink[Radix]) = i;  // Number is negative, flip the sorting order

           }

           VALIDATE_RANKS;

         } else {

           for (udword i = 0; i < nb; i++) {

             udword Radix = input[mRanks[i]] >> 24;  // Radix byte, same as above. AND is useless here (udword).

             // ### cmp to be killed. Not good. Later.

             if (Radix < 128)

               *mLink[Radix]++ = mRanks[i];  // Number is positive, same as above

             else

               *(--mLink[Radix]) = mRanks[i];  // Number is negative, flip the sorting order

           }

         }

         // Swap pointers for next pass. Valid indices - the most

         // recent ones - are in mRanks after the swap.

         udword* Tmp = mRanks;

         mRanks = mRanks2;

         mRanks2 = Tmp;

       } else {

         // The pass is useless, yet we still have to reverse the order of current list if all values are negative.

         // UniqueVal is the top byte shared by every key (that is why the pass was skipped)
         if (UniqueVal >= 128) {

           if (INVALID_RANKS) {

             // ###Possible?

             for (udword i = 0; i < nb; i++)

               mRanks2[i] = nb - i - 1;

             VALIDATE_RANKS;

           } else {

             for (udword i = 0; i < nb; i++)

               mRanks2[i] = mRanks[nb - i - 1];

           }


           // Swap pointers for next pass. Valid indices - the

           // most recent ones - are in mRanks after the swap.

           udword* Tmp = mRanks;

           mRanks = mRanks2;

           mRanks2 = Tmp;

         }

       }

     }

   }

   return *this;

 }


 //----------------------------------------------------------------------

 //----------------------------------------------------------------------

 /// Returns the number of bytes attributed to this sorter: the object itself,
 /// the two rank lists at their currently recorded size and, when
 /// RADIX_LOCAL_RAM is not defined, the histogram and link tables.
 udword RadixSort::GetUsedRam() const {
 #ifndef RADIX_LOCAL_RAM
   // 4 histograms of 256 counters, plus the 256-entry link table
   const udword tables = (256 * 4 + 256) * sizeof(udword);
 #else
   // Tables live on the stack of Sort() and are not counted here
   const udword tables = 0;
 #endif

   // 2 lists of indices
   const udword rankLists = 2 * CURRENT_SIZE * sizeof(udword);

   return sizeof(RadixSort) + tables + rankLists;
 }

RadixSort::Resize
bool Resize(udword nb)
Definition: IceRevisitedRadix.cc:234

dqmiolumiharvest.j
tuple j
Definition: dqmiolumiharvest.py:66

INVALIDATE_RANKS
#define INVALIDATE_RANKS
Definition: IceRevisitedRadix.cc:60

mps_fire.i
i
Definition: mps_fire.py:428

VALIDATE_RANKS
#define VALIDATE_RANKS
Definition: IceRevisitedRadix.cc:61

inline_
#define inline_
Definition: IceTypes.h:18

gpuClustering::id
uint16_t *__restrict__ id
Definition: gpuClusterChargeCut.h:21

sdword
signed int sdword
sizeof(sdword) must be 4
Definition: IceTypes.h:51

RadixSort::mTotalCalls
udword mTotalCalls
Total number of calls to the sort routine.
Definition: IceRevisitedRadix.h:65

RadixSort::RelinquishRanks
udword * RelinquishRanks()
Definition: IceRevisitedRadix.cc:219

IceMemoryMacros.h

RadixSort::mCurrentSize
udword mCurrentSize
Current size of the indices list.
Definition: IceRevisitedRadix.h:61

RadixSort::mRanks2
udword * mRanks2
Definition: IceRevisitedRadix.h:63

RadixSort::~RadixSort
~RadixSort()
Definition: IceRevisitedRadix.cc:203

RadixSort::GetUsedRam
udword GetUsedRam() const
Definition: IceRevisitedRadix.cc:539

DELETEARRAY
#define DELETEARRAY(x)
Deletes an array.
Definition: IceMemoryMacros.h:95

RADIX_SIGNED
Input values are signed.
Definition: IceRevisitedRadix.h:22

RadixSort::Sort
RadixSort & Sort(const udword *input, udword nb, RadixHint hint=RADIX_SIGNED)
Definition: IceRevisitedRadix.cc:271

INVALID_RANKS
#define INVALID_RANKS
Definition: IceRevisitedRadix.cc:63

input2
#define input2
Definition: AMPTWrapper.h:159

input
static std::string const input
Definition: EdmProvDump.cc:47

RadixSort::RadixSort
RadixSort()
Definition: IceRevisitedRadix.cc:188

CHECK_PASS_VALIDITY
#define CHECK_PASS_VALIDITY(pass)
Definition: IceRevisitedRadix.cc:161

udword
unsigned int udword
sizeof(udword) must be 4
Definition: IceTypes.h:52

Indices
Indices
Definition: EdmEventSize.cc:28

CREATE_HISTOGRAMS
#define CREATE_HISTOGRAMS(type, buffer)
Definition: IceRevisitedRadix.cc:74

CHECKALLOC
#define CHECKALLOC(x)
Definition: IceMemoryMacros.h:128

ubyte
unsigned char ubyte
sizeof(ubyte) must be 1
Definition: IceTypes.h:48

RadixSort
Definition: IceRevisitedRadix.h:28

RadixSort::CheckResize
void CheckResize(udword nb)
Definition: IceRevisitedRadix.cc:248

RadixHint
RadixHint
Definition: IceRevisitedRadix.h:21

CURRENT_SIZE
#define CURRENT_SIZE
Definition: IceRevisitedRadix.cc:62

RADIX_UNSIGNED
Input values are unsigned.
Definition: IceRevisitedRadix.h:23

IceRevisitedRadix.h

RadixSort::mRanks
udword * mRanks
Two lists, swapped each pass.
Definition: IceRevisitedRadix.h:62