// This has the programs for computing the number of leading zeros // in a word. // Max line length is 57, to fit in hacker.book. // Compile with g++, not gcc. #include #include // To define "exit", req'd by XLC. #define LE 1 // 1 for little-endian, 0 for big-endian. int pop(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); x = (x + (x >> 4)) & 0x0F0F0F0F; x = x + (x << 8); x = x + (x << 16); return x >> 24; } int nlz1(unsigned x) { int n; if (x == 0) return(32); n = 0; if (x <= 0x0000FFFF) {n = n +16; x = x <<16;} if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;} if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;} if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;} if (x <= 0x7FFFFFFF) {n = n + 1;} return n; } int nlz1a(unsigned x) { int n; /* if (x == 0) return(32); */ if ((int)x <= 0) return (~x >> 26) & 32; n = 1; if ((x >> 16) == 0) {n = n +16; x = x <<16;} if ((x >> 24) == 0) {n = n + 8; x = x << 8;} if ((x >> 28) == 0) {n = n + 4; x = x << 4;} if ((x >> 30) == 0) {n = n + 2; x = x << 2;} n = n - (x >> 31); return n; } // On basic Risc, 12 to 20 instructions. int nlz2(unsigned x) { unsigned y; int n; n = 32; y = x >>16; if (y != 0) {n = n -16; x = y;} y = x >> 8; if (y != 0) {n = n - 8; x = y;} y = x >> 4; if (y != 0) {n = n - 4; x = y;} y = x >> 2; if (y != 0) {n = n - 2; x = y;} y = x >> 1; if (y != 0) return n - 2; return n - x; } // As above but coded as a loop for compactness: // 23 to 33 basic Risc instructions. int nlz2a(unsigned x) { unsigned y; int n, c; n = 32; c = 16; do { y = x >> c; if (y != 0) {n = n - c; x = y;} c = c >> 1; } while (c != 0); return n - x; } int nlz3(int x) { int y, n; n = 0; y = x; L: if (x < 0) return n; if (y == 0) return 32 - n; n = n + 1; x = x << 1; y = y >> 1; goto L; } int nlz4(unsigned x) { int y, m, n; y = -(x >> 16); // If left half of x is 0, m = (y >> 16) & 16; // set n = 16. If left half n = 16 - m; // is nonzero, set n = 0 and x = x >> m; // shift x right 16. // Now x is of the form 0000xxxx. y = x - 0x100; // If positions 8-15 are 0, m = (y >> 16) & 8; // add 8 to n and shift x left 8. n = n + m; x = x << m; y = x - 0x1000; // If positions 12-15 are 0, m = (y >> 16) & 4; // add 4 to n and shift x left 4. n = n + m; x = x << m; y = x - 0x4000; // If positions 14-15 are 0, m = (y >> 16) & 2; // add 2 to n and shift x left 2. n = n + m; x = x << m; y = x >> 14; // Set y = 0, 1, 2, or 3. m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp. return n + 2 - m; } int nlz5(unsigned x) { int pop(unsigned x); x = x | (x >> 1); x = x | (x >> 2); x = x | (x >> 4); x = x | (x >> 8); x = x | (x >>16); return pop(~x); } /* The four programs below are not valid ANSI C programs. This is because they refer to the same storage locations as two different types. However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to code them more compactly by declaring a variable xx to be "double," and then using n = 1054 - (*((unsigned *)&xx + LE) >> 20); then you are violating not only the rule above, but also the ANSI C rule that pointer arithmetic can be performed only on pointers to array elements. When coded with the above statement, the program fails with xlc, gcc/AIX, and gcc/NT, at some optimization levels. BTW, these programs use the "anonymous union" feature of C++, not available in C. */ int nlz6(unsigned k) { union { unsigned asInt[2]; double asDouble; }; int n; asDouble = (double)k + 0.5; n = 1054 - (asInt[LE] >> 20); return n; } int nlz7(unsigned k) { union { unsigned asInt[2]; double asDouble; }; int n; asDouble = (double)k; n = 1054 - (asInt[LE] >> 20); n = (n & 31) + (n >> 9); return n; } /* In single precision, round-to-nearest mode, the basic method fails for: k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF, 07FFFFFC <= k <= 07FFFFFF, 0FFFFFF8 <= k <= 0FFFFFFF, ... 7FFFFFC0 <= k <= 7FFFFFFF. FFFFFF80 <= k <= FFFFFFFF. For k = 0 it gives 158, and for the other values it is too low by 1. */ int nlz8(unsigned k) { union { unsigned asInt; float asFloat; }; int n; k = k & ~(k >> 1); /* Fix problem with rounding. */ asFloat = (float)k + 0.5f; n = 158 - (asInt >> 23); return n; } /* The example below shows how to make a macro for nlz. It uses an extension to the C and C++ languages that is provided by the GNU C/C++ compiler, namely, that of allowing statements and declarations in expressions (see "Using and Porting GNU CC", by Richard M. Stallman (1998). The underscores are necessary to protect against the possibility that the macro argument will conflict with one of its local variables, e.g., NLZ(k). */ #define NLZ(kp) \ ({union {unsigned _asInt; float _asFloat;}; \ unsigned _k = (kp), _kk = _k & ~(_k >> 1); \ _asFloat = (float)_kk + 0.5f; \ 158 - (_asInt >> 23);}) int nlz8a(unsigned k) { return NLZ(k) + NLZ(-5 + 1); } int nlz9(unsigned k) { union { unsigned asInt; float asFloat; }; int n; k = k & ~(k >> 1); /* Fix problem with rounding. */ asFloat = (float)k; n = 158 - (asInt >> 23); n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */ return n; } /* Below are three nearly equivalent programs for computing the number of leading zeros in a word. This material is not in HD, but may be in a future edition. Immediately below is Robert Harley's algorithm, found at the comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert Juffa. Table entries marked "u" are unused. 14 ops including a multiply, plus an indexed load. The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all of form 2**k + 1). There are no multipliers of three terms of the form 2**k +- 1 that work, with a table size of 64 or 128. There are some, with a table size of 64, if you precede the multiplication with x = x - (x >> 1), but that seems less elegant. There are also some if you use a table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would save two instructions in the form of this algorithm with the multiplication expanded into shifts and adds, but the table size is getting a bit large). */ #define u 99 int nlz10(unsigned x) { static char table[64] = {32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; x = x | (x >> 1); // Propagate leftmost x = x | (x >> 2); // 1-bit to the right. x = x | (x >> 4); x = x | (x >> 8); x = x | (x >>16); x = x*0x06EB14F9; // Multiplier is 7*255**3. return table[x >> 26]; } /* Harley's algorithm with multiply expanded. 19 elementary ops plus an indexed load. */ int nlz10a(unsigned x) { static char table[64] = {32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u, u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u, 17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18, 5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u}; x = x | (x >> 1); // Propagate leftmost x = x | (x >> 2); // 1-bit to the right. x = x | (x >> 4); x = x | (x >> 8); x = x | (x >> 16); x = (x << 3) - x; // Multiply by 7. x = (x << 8) - x; // Multiply by 255. x = (x << 8) - x; // Again. x = (x << 8) - x; // Again. return table[x >> 26]; } /* Julius Goryavsky's version of Harley's algorithm. 17 elementary ops plus an indexed load, if the machine has "and not." */ int nlz10b(unsigned x) { static char table[64] = {32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u, u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u, u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u, 22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31}; x = x | (x >> 1); // Propagate leftmost x = x | (x >> 2); // 1-bit to the right. x = x | (x >> 4); x = x | (x >> 8); x = x & ~(x >> 16); x = x*0xFD7049FF; // Activate this line or the following 3. // x = (x << 9) - x; // Multiply by 511. // x = (x << 11) - x; // Multiply by 2047. // x = (x << 14) - x; // Multiply by 16383. return table[x >> 26]; } int errors; void error(int x, int y) { errors = errors + 1; printf("Error for x = %08x, got %d\n", x, y); } int main() { int i, n; static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29, 7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23, 512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16, 65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11, 0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6, 0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3, 0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0, 0xFFFFFFFF,0}; n = sizeof(test)/4; printf("nlz1:\n"); for (i = 0; i < n; i += 2) { if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));} printf("nlz1a:\n"); for (i = 0; i < n; i += 2) { if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));} printf("nlz2:\n"); for (i = 0; i < n; i += 2) { if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));} printf("nlz2a:\n"); for (i = 0; i < n; i += 2) { if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));} printf("nlz3:\n"); for (i = 0; i < n; i += 2) { if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));} printf("nlz4:\n"); for (i = 0; i < n; i += 2) { if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));} printf("nlz5:\n"); for (i = 0; i < n; i += 2) { if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));} printf("nlz6:\n"); for (i = 0; i < n; i += 2) { if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));} printf("nlz7:\n"); for (i = 0; i < n; i += 2) { if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));} printf("nlz8:\n"); for (i = 0; i < n; i += 2) { if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));} printf("nlz8a:\n"); for (i = 0; i < n; i += 2) { if (nlz8a(test[i]) != test[i+1]) error(test[i], nlz8a(test[i]));} printf("nlz9:\n"); for (i = 0; i < n; i += 2) { if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));} printf("nlz10:\n"); for (i = 0; i < n; i += 2) { if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));} printf("nlz10a:\n"); for (i = 0; i < n; i += 2) { if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));} printf("nlz10b:\n"); for (i = 0; i < n; i += 2) { if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));} if (errors == 0) printf("Passed all %d cases.\n", sizeof(test)/8); }