/* This file contains several programs for computing the number of 1-bits in an array of fullwords. Most of these programs are variations of the Harley/Seal method, which uses a carry-save adder. This is quite superior to the method given in Hacker's Delight, first edition, pp 72-73. Max line length is 57, to fit in hacker.book. */ #include #include #include // ------------------------------ pop ---------------------------------- int pop(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); x = (x + (x >> 4)) & 0x0F0F0F0F; x = x + (x >> 8); x = x + (x >> 16); return x & 0x0000003F; } /* Note: an alternative to the last three executable lines above is: return x*0x01010101 >> 24; if your machine has a fast multiplier (suggested by Jari Kirma). */ // --------------------------- popArray1 ------------------------------- /* This is the naive method, simply evaluating the population count for each word of the array, and adding. Ignoring loop control and loads, this is 1 + p ops per word of the array. For p = 15 (code of Fig. 5-2, inlined and with loads of constants moved out of the loop) this is 16 ops per word. */ int popArray1(unsigned A[], int n) { int i, tot; tot = 0; for (i = 0; i < n; i++) tot = tot + pop(A[i]); return tot; } // --------------------------- popArray2 ------------------------------- /* This is Harley's basic method. It combines groups of three array elements into two words to which pop(x) is applied. The running time, ignoring loop control and loads, is 7 elementary ops plus 2 pop counts for each 3 array elements, i.e., (7 + 2p)/3 ops per word, where p is the number of ops for one population count. For p = 15 (code of Fig. 5-2, inlined) this is 37/3 = 12.33 ops per word. */ #define CSA(h,l, a,b,c) \ {unsigned u = a ^ b; unsigned v = c; \ h = (a & b) | (u & v); l = u ^ v;} int popArray2(unsigned A[], int n) { int tot1, tot2, i; unsigned ones, twos; tot1 = 0; tot2 = 0; for (i = 0; i <= n - 3; i = i + 3) { CSA(twos, ones, A[i], A[i+1], A[i+2]) tot1 = tot1 + pop(ones); tot2 = tot2 + pop(twos); } for (i = i; i < n; i++) // Add in the last tot1 = tot1 + pop(A[i]); // 0, 1, or 2 elements. return 2*tot2 + tot1; } // --------------------------- popArray3 ------------------------------- /* This is Harley's basic method but used in a different way (at Seal's suggestion) than that of the above function. It brings in values from the array two at a time and combines them with "ones" and "twos". The array is assumed to have at least one element. The running time, ignoring loop control and loads, is 6 elementary ops plus 1 pop count for each 2 array elements, i.e., (6 + p)/2 ops per word, where p is the number of ops for one population count. For p = 15 (code of Fig. 5-2, inlined) this is 21/2 = 10.5 ops per word. */ int popArray3(unsigned A[], int n) { int tot, i; unsigned ones, twos; tot = 0; // Initialize. ones = 0; for (i = 0; i <= n - 2; i = i + 2) { CSA(twos, ones, ones, A[i], A[i+1]) tot = tot + pop(twos); } tot = 2*tot + pop(ones); if (n & 1) // If there's a last one, tot = tot + pop(A[i]); // add it in. return tot; } // --------------------------- popArray4 ------------------------------- /* This is similar to the above but it brings in array elements 4 at a time and combines them with "ones", "twos", and "fours". Harley gave this algorithm. The array is assumed to have at least three elements. The running time, ignoring loop control and loads, is 16 elementary ops plus 1 pop count for each 4 array elements, i.e., (16 + p)/4 ops per word, where p is the number of ops for one population count. For p = 15 (code of Fig. 5-2, inlined) this is 31/4 = 7.75 ops per word. */ int popArray4(unsigned A[], int n) { int tot, i; unsigned ones, twos, twosA, twosB, fours; tot = 0; // Initialize. twos = ones = 0; for (i = 0; i <= n - 4; i = i + 4) { CSA(twosA, ones, ones, A[i], A[i+1]) CSA(twosB, ones, ones, A[i+2], A[i+3]) CSA(fours, twos, twos, twosA, twosB) tot = tot + pop(fours); } tot = 4*tot + 2*pop(twos) + pop(ones); for (i = i; i < n; i++) // Simply add in the last tot = tot + pop(A[i]); // 0, 1, 2, or 3 elements. return tot; } // --------------------------- popArray5 ------------------------------- /* At the risk of being a bore, the function below is similar to that above, but it brings in array elements 8 at a time and combines them with "ones", "twos", "fours", and "eights". The array is assumed to have at least seven elements. The running time, ignoring loop control and loads, is 36 elementary ops plus 1 pop count for each 8 array elements, i.e., (36 + p)/8 ops per word, where p is the number of ops for one population count. For p = 15 (code of Fig. 5-2, inlined) this is 51/8 = 6.375 ops per word. */ int popArray5(unsigned A[], int n) { int tot, i; unsigned ones, twos, twosA, twosB, fours, foursA, foursB, eights; tot = 0; // Initialize. fours = twos = ones = 0; for (i = 0; i <= n - 8; i = i + 8) { CSA(twosA, ones, ones, A[i], A[i+1]) CSA(twosB, ones, ones, A[i+2], A[i+3]) CSA(foursA, twos, twos, twosA, twosB) CSA(twosA, ones, ones, A[i+4], A[i+5]) CSA(twosB, ones, ones, A[i+6], A[i+7]) CSA(foursB, twos, twos, twosA, twosB) CSA(eights, fours, fours, foursA, foursB) tot = tot + pop(eights); } tot = 8*tot + 4*pop(fours) + 2*pop(twos) + pop(ones); for (i = i; i < n; i++) // Simply add in the last tot = tot + pop(A[i]); // 0 to 7 elements. return tot; } // --------------------------- popArray6 ------------------------------- /* This function generalizes the pattern illustrated by the function above, with the result that the bits in an n-word array can be counted with ceil(log2(n+3)) evaluations of population count. The inner loop (with the CSA) is done very close to 2 times for each outer loop iteration. This is based on both a mathematical calculation (which shows something less than 9/4 times) and instrumentation (e.g., for n = 10,000 the inner loop is executed 9983 times). The inner loop compiles into 19 instructions, mostly housekeeping (shifts, adds loads, stores). This results in a time of 2*19 + 19 = 57 instructions for each outer loop iteration, or 28.5 instructions per word of the array. This is worse than the naive method, and MUCH worse than the above program, which compiles into 8.0 instructions/word. So this routine is a bad idea unless the time to do a population count on one word is very large (greater than 30 instructions anyway). Haven't timed the other loop, but it is executed only about log2(n) times and so isn't so important. Therefore, if this method is to be useful, the housekeeping steps must be greatly reduced. It may be possible to do this by "unwinding" the first few inner loop iterations. Not sure how good the result would be. */ int popArray6(unsigned A[], int n) { int tot, i, k; unsigned z, hi, lo; char nrow[30]; unsigned sum[30][2]; memset(nrow, 0, sizeof(nrow)); // Clear "nrow". sum[0][0] = 0; // Init. by putting in nrow[0] = 1; // a fake 0-element. for (i = 0; i <= n-2; i = i + 2){ sum[0][1] = A[i]; z = A[i+1]; k = 0; do { CSA(z, sum[k][0], sum[k][0], sum[k][1], z) nrow[k] = 1; k = k + 1; } while (nrow[k] == 2); sum[k][nrow[k]] = z; nrow[k] = nrow[k] + 1; } if (i == n - 1) { // If there's one more in sum[0][1] = A[i]; // the array, put it in nrow[0] = 2; // sum[0][1]. } /* Make a pass over the "sum" array compressing all rows that have two entries to the same row but having only one entry, while adding an entry to the subsequent row. This can make the subsequent row have, in effect, three entries, which we similarly compress. Compute the total during this pass. When an empty row is encountered, we're done. */ tot = 0; hi = 0; for (k = 0; nrow[k] != 0; k++) { if (nrow[k] == 1) z = 0; else z = sum[k][1]; // (Is 2.) CSA(hi, lo, sum[k][0], z, hi) tot = tot + (pop(lo) << k); } tot = tot + (pop(hi) << k); return tot; } // ------------------------------ main --------------------------------- int main(void) { unsigned A[101]; int i, n, s1, s; n = sizeof(A)/sizeof(A[0]); A[0] = 0xFFFFFFFF; // Fill the array A[1] = 5; // with somewhat // printf("%08x\n", A[0]); // random numbers. // printf("%08x\n", A[1]); for (i = 2; i < n; i++) { A[i] = rand(); // printf("%08x\n", A[i]); } s1 = popArray1(A, n); printf("Array size = %d, pop count = %d\n", n, s1); s = popArray2(A, n); if (s == s1) printf("popArray2 is ok.\n"); else printf("popArray2 = %d, ERROR.\n", s); s = popArray3(A, n); if (s == s1) printf("popArray3 is ok.\n"); else printf("popArray3 = %d, ERROR.\n", s); s = popArray4(A, n); if (s == s1) printf("popArray4 is ok.\n"); else printf("popArray4 = %d, ERROR.\n", s); s = popArray5(A, n); if (s == s1) printf("popArray5 is ok.\n"); else printf("popArray5 = %d, ERROR.\n", s); s = popArray6(A, n); if (s == s1) printf("popArray6 is ok.\n"); else printf("popArray6 = %d, ERROR.\n", s); return s - s1; }