jenkinshash.C

Go to the documentation of this file.
00001 /*!@file Util/jenkinshash.C 32-bit hash functions (NOT CRYPTOGRAPHIC!) */
00002 
00003 // This code here is almost verbatim from
00004 // http://burtleburtle.net/bob/c/lookup2.c (see original public domain
00005 // notice below) with only trivial modifications to use ANSI C
00006 // prototypes instead of K&R prototypes
00007 
00008 //
00009 // Primary maintainer for this file: Rob Peters <rjpeters at usc dot edu>
00010 // $HeadURL: svn://isvn.usc.edu/software/invt/trunk/saliency/src/Util/jenkinshash.C $
00011 // $Id: jenkinshash.C 14376 2011-01-11 02:44:34Z pez $
00012 //
00013 
00014 #ifndef UTIL_JENKINSHASH_C_DEFINED
00015 #define UTIL_JENKINSHASH_C_DEFINED
00016 
00017 #include "Util/jenkinshash.H"
00018 
00019 /*
00020 --------------------------------------------------------------------
00021 lookup2.c, by Bob Jenkins, December 1996, Public Domain.
00022 hash(), hash2(), hash3, and mix() are externally useful functions.
00023 Routines to test the hash are included if SELF_TEST is defined.
00024 You can use this free for any purpose.  It has no warranty.
00025 --------------------------------------------------------------------
00026 */
00027 // #define SELF_TEST
00028 
00029 #include <stdio.h>
00030 #include <stddef.h>
00031 #include <stdlib.h>
00032 
00033 namespace
00034 {
00035 
00036 typedef  uint32  ub4;   /* unsigned 4-byte quantities */
00037 typedef  byte    ub1;
00038 
00039 #define hashsize(n) ((ub4)1<<(n))
00040 #define hashmask(n) (hashsize(n)-1)
00041 
00042 /*
00043 --------------------------------------------------------------------
00044 mix -- mix 3 32-bit values reversibly.
00045 For every delta with one or two bit set, and the deltas of all three
00046   high bits or all three low bits, whether the original value of a,b,c
00047   is almost all zero or is uniformly distributed,
00048 * If mix() is run forward or backward, at least 32 bits in a,b,c
00049   have at least 1/4 probability of changing.
00050 * If mix() is run forward, every bit of c will change between 1/3 and
00051   2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
00052 mix() was built out of 36 single-cycle latency instructions in a
00053   structure that could supported 2x parallelism, like so:
00054       a -= b;
00055       a -= c; x = (c>>13);
00056       b -= c; a ^= x;
00057       b -= a; x = (a<<8);
00058       c -= a; b ^= x;
00059       c -= b; x = (b>>13);
00060       ...
00061   Unfortunately, superscalar Pentiums and Sparcs can't take advantage
00062   of that parallelism.  They've also turned some of those single-cycle
00063   latency instructions into multi-cycle latency instructions.  Still,
00064   this is the fastest good hash I could find.  There were about 2^^68
00065   to choose from.  I only looked at a billion or so.
00066 --------------------------------------------------------------------
00067 */
00068 #define mix(a,b,c) \
00069 { \
00070   a -= b; a -= c; a ^= (c>>13); \
00071   b -= c; b -= a; b ^= (a<<8); \
00072   c -= a; c -= b; c ^= (b>>13); \
00073   a -= b; a -= c; a ^= (c>>12);  \
00074   b -= c; b -= a; b ^= (a<<16); \
00075   c -= a; c -= b; c ^= (b>>5); \
00076   a -= b; a -= c; a ^= (c>>3);  \
00077   b -= c; b -= a; b ^= (a<<10); \
00078   c -= a; c -= b; c ^= (b>>15); \
00079 }
00080 
00081 /* same, but slower, works on systems that might have 8 byte ub4's */
00082 #define mix2(a,b,c) \
00083 { \
00084   a -= b; a -= c; a ^= (c>>13); \
00085   b -= c; b -= a; b ^= (a<< 8); \
00086   c -= a; c -= b; c ^= ((b&0xffffffff)>>13); \
00087   a -= b; a -= c; a ^= ((c&0xffffffff)>>12); \
00088   b -= c; b -= a; b = (b ^ (a<<16)) & 0xffffffff; \
00089   c -= a; c -= b; c = (c ^ (b>> 5)) & 0xffffffff; \
00090   a -= b; a -= c; a = (a ^ (c>> 3)) & 0xffffffff; \
00091   b -= c; b -= a; b = (b ^ (a<<10)) & 0xffffffff; \
00092   c -= a; c -= b; c = (c ^ (b>>15)) & 0xffffffff; \
00093 }
00094 
00095 /*
00096 --------------------------------------------------------------------
00097 hash() -- hash a variable-length key into a 32-bit value
00098   k     : the key (the unaligned variable-length array of bytes)
00099   len   : the length of the key, counting by bytes
00100   level : can be any 4-byte value
00101 Returns a 32-bit value.  Every bit of the key affects every bit of
00102 the return value.  Every 1-bit and 2-bit delta achieves avalanche.
00103 About 36+6len instructions.
00104 
00105 The best hash table sizes are powers of 2.  There is no need to do
00106 mod a prime (mod is sooo slow!).  If you need less than 32 bits,
00107 use a bitmask.  For example, if you need only 10 bits, do
00108   h = (h & hashmask(10));
00109 In which case, the hash table should have hashsize(10) elements.
00110 
00111 If you are hashing n strings (ub1 **)k, do it like this:
00112   for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
00113 
00114 By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use this
00115 code any way you wish, private, educational, or commercial.  It's free.
00116 
00117 See http://burlteburtle.net/bob/hash/evahash.html
00118 Use for hash table lookup, or anything where one collision in 2^32 is
00119 acceptable.  Do NOT use for cryptographic purposes.
00120 --------------------------------------------------------------------
00121 */
00122 
00123 ub4 hash(const ub1* k,      /* the key */
00124          const ub4 length,  /* the length of the key */
00125          const ub4 initval) /* the previous hash, or an arbitrary value */
00126 {
00127    ub4 a,b,c,len;
00128 
00129    /* Set up the internal state */
00130    len = length;
00131    a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
00132    c = initval;           /* the previous hash value */
00133 
00134    /*---------------------------------------- handle most of the key */
00135    while (len >= 12)
00136    {
00137       a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
00138       b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
00139       c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
00140       mix(a,b,c);
00141       k += 12; len -= 12;
00142    }
00143 
00144    /*------------------------------------- handle the last 11 bytes */
00145    c += length;
00146    switch(len)              /* all the case statements fall through */
00147    {
00148    case 11: c+=((ub4)k[10]<<24);
00149    case 10: c+=((ub4)k[9]<<16);
00150    case 9 : c+=((ub4)k[8]<<8);
00151       /* the first byte of c is reserved for the length */
00152    case 8 : b+=((ub4)k[7]<<24);
00153    case 7 : b+=((ub4)k[6]<<16);
00154    case 6 : b+=((ub4)k[5]<<8);
00155    case 5 : b+=k[4];
00156    case 4 : a+=((ub4)k[3]<<24);
00157    case 3 : a+=((ub4)k[2]<<16);
00158    case 2 : a+=((ub4)k[1]<<8);
00159    case 1 : a+=k[0];
00160      /* case 0: nothing left to add */
00161    }
00162    mix(a,b,c);
00163    /*-------------------------------------------- report the result */
00164    return c;
00165 }
00166 
00167 
00168 /*
00169 --------------------------------------------------------------------
00170  This works on all machines.  hash2() is identical to hash() on
00171  little-endian machines, except that the length has to be measured
00172  in ub4s instead of bytes.  It is much faster than hash().  It
00173  requires
00174  -- that the key be an array of ub4's, and
00175  -- that all your machines have the same endianness, and
00176  -- that the length be the number of ub4's in the key
00177 --------------------------------------------------------------------
00178 */
00179 ub4 hash2(const ub4 *k,       /* the key */
00180           const ub4  length,  /* the length of the key, in ub4s */
00181           const ub4  initval) /* the previous hash, or an arbitrary value */
00182 {
00183    ub4 a,b,c,len;
00184 
00185    /* Set up the internal state */
00186    len = length;
00187    a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
00188    c = initval;           /* the previous hash value */
00189 
00190    /*---------------------------------------- handle most of the key */
00191    while (len >= 3)
00192    {
00193       a += k[0];
00194       b += k[1];
00195       c += k[2];
00196       mix(a,b,c);
00197       k += 3; len -= 3;
00198    }
00199 
00200    /*-------------------------------------- handle the last 2 ub4's */
00201    c += length;
00202    switch(len)              /* all the case statements fall through */
00203    {
00204      /* c is reserved for the length */
00205    case 2 : b+=k[1];
00206    case 1 : a+=k[0];
00207      /* case 0: nothing left to add */
00208    }
00209    mix(a,b,c);
00210    /*-------------------------------------------- report the result */
00211    return c;
00212 }
00213 
00214 /*
00215 --------------------------------------------------------------------
00216  This is identical to hash() on little-endian machines (like Intel
00217  x86s or VAXen).  It gives nondeterministic results on big-endian
00218  machines.  It is faster than hash(), but a little slower than
00219  hash2(), and it requires
00220  -- that all your machines be little-endian
00221 --------------------------------------------------------------------
00222 */
00223 
00224 ub4 hash3(const ub1 *k,        /* the key */
00225           const ub4  length,   /* the length of the key */
00226           const ub4  initval)  /* the previous hash, or an arbitrary value */
00227 {
00228    ub4 a,b,c,len;
00229 
00230    /* Set up the internal state */
00231    len = length;
00232    a = b = 0x9e3779b9;    /* the golden ratio; an arbitrary value */
00233    c = initval;           /* the previous hash value */
00234 
00235    /*---------------------------------------- handle most of the key */
00236    if (((unsigned long)k)&3)
00237    {
00238       while (len >= 12)    /* unaligned */
00239       {
00240          a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
00241          b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
00242          c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
00243          mix(a,b,c);
00244          k += 12; len -= 12;
00245       }
00246    }
00247    else
00248    {
00249       while (len >= 12)    /* aligned */
00250       {
00251          a += *(ub4 *)(k+0);
00252          b += *(ub4 *)(k+4);
00253          c += *(ub4 *)(k+8);
00254          mix(a,b,c);
00255          k += 12; len -= 12;
00256       }
00257    }
00258 
00259    /*------------------------------------- handle the last 11 bytes */
00260    c += length;
00261    switch(len)              /* all the case statements fall through */
00262    {
00263    case 11: c+=((ub4)k[10]<<24);
00264    case 10: c+=((ub4)k[9]<<16);
00265    case 9 : c+=((ub4)k[8]<<8);
00266       /* the first byte of c is reserved for the length */
00267    case 8 : b+=((ub4)k[7]<<24);
00268    case 7 : b+=((ub4)k[6]<<16);
00269    case 6 : b+=((ub4)k[5]<<8);
00270    case 5 : b+=k[4];
00271    case 4 : a+=((ub4)k[3]<<24);
00272    case 3 : a+=((ub4)k[2]<<16);
00273    case 2 : a+=((ub4)k[1]<<8);
00274    case 1 : a+=k[0];
00275      /* case 0: nothing left to add */
00276    }
00277    mix(a,b,c);
00278    /*-------------------------------------------- report the result */
00279    return c;
00280 }
00281 
00282 
00283 
00284 /* used for timings */
00285 void driver1()
00286 {
00287   ub1 buf[1024];
00288   ub4 i;
00289   ub4 h=0;
00290 
00291   for (i=0; i<256; ++i)
00292   {
00293     h = hash(buf,i,h);
00294   }
00295 }
00296 
00297 /* check that every input bit changes every output bit half the time */
00298 #define HASHSTATE 1
00299 #define HASHLEN   1
00300 #define MAXPAIR 80
00301 #define MAXLEN 70
00302 void driver2()
00303 {
00304   ub1 qa[MAXLEN+1], qb[MAXLEN+2], *a = &qa[0], *b = &qb[1];
00305   ub4 c[HASHSTATE], d[HASHSTATE], i, j=0, k, l, m=1, z;
00306   ub4 e[HASHSTATE],f[HASHSTATE],g[HASHSTATE],h[HASHSTATE];
00307   ub4 x[HASHSTATE],y[HASHSTATE];
00308   ub4 hlen;
00309 
00310   printf("No more than %d trials should ever be needed \n",MAXPAIR/2);
00311   for (hlen=0; hlen < MAXLEN; ++hlen)
00312   {
00313     z=0;
00314     for (i=0; i<hlen; ++i)  /*----------------------- for each input byte, */
00315     {
00316       for (j=0; j<8; ++j)   /*------------------------ for each input bit, */
00317       {
00318         for (m=1; m<8; ++m) /*------------ for serveral possible initvals, */
00319         {
00320           for (l=0; l<HASHSTATE; ++l) e[l]=f[l]=g[l]=h[l]=x[l]=y[l]=~((ub4)0);
00321 
00322           /*---- check that every output bit is affected by that input bit */
00323           for (k=0; k<MAXPAIR; k+=2)
00324           {
00325             ub4 finished=1;
00326             /* keys have one bit different */
00327             for (l=0; l<hlen+1; ++l) {a[l] = b[l] = (ub1)0;}
00328             /* have a and b be two keys differing in only one bit */
00329             a[i] ^= (k<<j);
00330             a[i] ^= (k>>(8-j));
00331              c[0] = hash(a, hlen, m);
00332             b[i] ^= ((k+1)<<j);
00333             b[i] ^= ((k+1)>>(8-j));
00334              d[0] = hash(b, hlen, m);
00335             /* check every bit is 1, 0, set, and not set at least once */
00336             for (l=0; l<HASHSTATE; ++l)
00337             {
00338               e[l] &= (c[l]^d[l]);
00339               f[l] &= ~(c[l]^d[l]);
00340               g[l] &= c[l];
00341               h[l] &= ~c[l];
00342               x[l] &= d[l];
00343               y[l] &= ~d[l];
00344               if (e[l]|f[l]|g[l]|h[l]|x[l]|y[l]) finished=0;
00345             }
00346             if (finished) break;
00347           }
00348           if (k>z) z=k;
00349           if (k==MAXPAIR)
00350           {
00351              printf("Some bit didn't change: ");
00352              printf("%x %x %x %x %x %x  ",
00353                     e[0],f[0],g[0],h[0],x[0],y[0]);
00354              printf("i %d j %d m %d len %d\n",i,j,m,hlen);
00355           }
00356           if (z==MAXPAIR) goto done;
00357         }
00358       }
00359     }
00360    done:
00361     if (z < MAXPAIR)
00362     {
00363       printf("Mix success  %2d bytes  %2d initvals  ",i,m);
00364       printf("required  %d  trials\n",z/2);
00365     }
00366   }
00367   printf("\n");
00368 }
00369 
00370 /* Check for reading beyond the end of the buffer and alignment problems */
00371 void driver3()
00372 {
00373   ub1 buf[MAXLEN+20], *b;
00374   ub4 len;
00375   ub1 q[] = "This is the time for all good men to come to the aid of their country";
00376   ub1 qq[] = "xThis is the time for all good men to come to the aid of their country";
00377   ub1 qqq[] = "xxThis is the time for all good men to come to the aid of their country";
00378   ub1 qqqq[] = "xxxThis is the time for all good men to come to the aid of their country";
00379   ub4 h,i,j,ref,x,y;
00380 
00381   printf("Endianness.  These should all be the same:\n");
00382   printf("%x\n", hash(q, sizeof(q)-1, (ub4)0));
00383   printf("%x\n", hash(qq+1, sizeof(q)-1, (ub4)0));
00384   printf("%x\n", hash(qqq+2, sizeof(q)-1, (ub4)0));
00385   printf("%x\n", hash(qqqq+3, sizeof(q)-1, (ub4)0));
00386   printf("\n");
00387   for (h=0, b=buf+1; h<8; ++h, ++b)
00388   {
00389     for (i=0; i<MAXLEN; ++i)
00390     {
00391       len = i;
00392       for (j=0; j<i; ++j) *(b+j)=0;
00393 
00394       /* these should all be equal */
00395       ref = hash(b, len, (ub4)1);
00396       *(b+i)=(ub1)~0;
00397       *(b-1)=(ub1)~0;
00398       x = hash(b, len, (ub4)1);
00399       y = hash(b, len, (ub4)1);
00400       if ((ref != x) || (ref != y))
00401       {
00402         printf("alignment error: %x %x %x %d %d\n",ref,x,y,h,i);
00403       }
00404     }
00405   }
00406 }
00407 
00408 /* check for problems with nulls */
00409 void driver4()
00410 {
00411   ub1 buf[1];
00412   ub4 h,i;
00413 
00414 
00415   buf[0] = ~0;
00416   printf("These should all be different\n");
00417   for (i=0, h=0; i<8; ++i)
00418   {
00419     h = hash(buf, (ub4)0, h);
00420     printf("%2d  0-byte strings, hash is  %x\n", i, h);
00421   }
00422 }
00423 
00424 }
00425 
00426 #ifdef SELF_TEST
00427 
00428 int main()
00429 {
00430   driver1();   /* test that the key is hashed: used for timings */
00431   driver2();   /* test that whole key is hashed thoroughly */
00432   driver3();   /* test that nothing but the key is hashed */
00433   driver4();   /* test hashing multiple buffers (all buffers are null) */
00434   return 1;
00435 }
00436 
00437 #endif  /* SELF_TEST */
00438 
00439 #include "rutz/trace.h"
00440 
00441 uint32 jenkinshash(const byte* key, uint32 length, uint32 initval)
00442 {
00443 GVX_TRACE(__PRETTY_FUNCTION__);
00444   return hash(key, length, initval);
00445 }
00446 
00447 uint32 jenkinshash2(const uint32* key, uint32 length, uint32 initval)
00448 {
00449 GVX_TRACE(__PRETTY_FUNCTION__);
00450   return hash2(key, length, initval);
00451 }
00452 
00453 // ######################################################################
00454 /* So things look consistent in everyone's emacs... */
00455 /* Local Variables: */
00456 /* indent-tabs-mode: nil */
00457 /* End: */
00458 
00459 #endif // UTIL_JENKINSHASH_C_DEFINED
Generated on Sun May 8 08:42:26 2011 for iLab Neuromorphic Vision Toolkit by  doxygen 1.6.3