سؤال

I was building a Bloom filter and looked into what hashes to use and the Bob Jenkins' hash seemed like a good choice because of the evenness of the distribution. I adapted the given C++ code to Go (possibly making a mistake but it seems to work).

I got around to benchmarking the cost of the hash and found that the SHA1 hash in the Go std library was much faster.

PASS
BenchmarkJenkins     1000000          2649 ns/op
BenchmarkSHA256  1000000          1218 ns/op
BenchmarkSHA1    5000000           462 ns/op

Was I misled when I read that you shouldn't use cryptographic hashes in this use case? Or is the standard library code much more optimized than mine?

package jenkins

import (
    "bytes"
    "encoding/gob"
)

// adapted from http://bretmulvey.com/hash/7.html
func ComputeHash(key interface{}) (uint64, error) {
    var buf bytes.Buffer
    enc := gob.NewEncoder(&buf)
    err := enc.Encode(key)
    if err != nil {
        return 0, err
    }
    data := buf.Bytes()

    var a, b, c uint64
    a, b = 0x9e3779b9, 0x9e3779b9
    c = 0
    i := 0

    for i = 0; i < len(data)-12; {
        a += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        b += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        c += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)

        a, b, c = mix(a, b, c)
    }

    c += uint64(len(data))

    if i < len(data) {
        a += uint64(data[i])
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        b += uint64(data[i])
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        c += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 24
        i++
    }

    a, b, c = mix(a, b, c)
    return c, nil
}

func mix(a, b, c uint64) (uint64, uint64, uint64) {
    a -= b
    a -= c
    a ^= (c >> 13)
    b -= c
    b -= a
    b ^= (a << 8)
    c -= a
    c -= b
    c ^= (b >> 13)
    a -= b
    a -= c
    a ^= (c >> 12)
    b -= c
    b -= a
    b ^= (a << 16)
    c -= a
    c -= b
    c ^= (b >> 5)
    a -= b
    a -= c
    a ^= (c >> 3)
    b -= c
    b -= a
    b ^= (a << 10)
    c -= a
    c -= b
    c ^= (b >> 15)
    return a, b, c
}

EDIT:

Benchmarking code:

package bloom

import (
    "testing"

    "crypto/sha1"
    "crypto/sha256"
)

func BenchmarkJenkins(b *testing.B) {
    j := jenkinsHash{}

    for i := 0; i < b.N; i++ {
        j.ComputeHash(i)
    }
}

func BenchmarkSHA1(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha1.Sum([]byte{byte(i)})
    }
}


func BenchmarkSHA256(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha256.Sum256([]byte{byte(i)})
    }
}
هل كانت مفيدة؟

المحلول 3

@JensG was on the right track. The calls to gob to encode the key in binary made up the vast majority of the cost. When I transitioned to passing in byte arrays the benchmark started getting the results I was expecting. Thanks for the help!

BenchmarkJenkins    100000000           20.4 ns/op
BenchmarkSHA1    5000000           463 ns/op
BenchmarkSHA256  1000000          1223 ns/op

Benchmark code:

package bloom

import (
    "testing"

    "crypto/sha1"
    "crypto/sha256"
)

func BenchmarkJenkins(b *testing.B) {
    j := jenkinsHash{}

    for i := 0; i < b.N; i++ {
        j.ComputeHash([]byte{byte(i)})
    }
}

func BenchmarkSHA1(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha1.Sum([]byte{byte(i)})
    }
}


func BenchmarkSHA256(b *testing.B) {
    for i := 0; i < b.N; i++ {
        sha256.Sum256([]byte{byte(i)})
    }
}

Altered code:

package bloom

type jenkinsHash struct {
}

// adapted from http://bretmulvey.com/hash/7.html
func (_ jenkinsHash) ComputeHash(data []byte) (uint64, error) {    
    var a, b, c uint64
    a, b = 0x9e3779b9, 0x9e3779b9
    c = 0
    i := 0

    for i = 0; i < len(data)-12; {
        a += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        b += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)
        i += 4
        c += uint64(data[i]) | uint64(data[i+1]<<8) | uint64(data[i+2]<<16) | uint64(data[i+3]<<24)

        a, b, c = mix(a, b, c)
    }

    c += uint64(len(data))

    if i < len(data) {
        a += uint64(data[i])
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        a += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        b += uint64(data[i])
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        b += uint64(data[i]) << 24
        i++
    }

    if i < len(data) {
        c += uint64(data[i]) << 8
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 16
        i++
    }
    if i < len(data) {
        c += uint64(data[i]) << 24
        i++
    }

    a, b, c = mix(a, b, c)
    return c, nil
}

func mix(a, b, c uint64) (uint64, uint64, uint64) {
    a -= b
    a -= c
    a ^= (c >> 13)
    b -= c
    b -= a
    b ^= (a << 8)
    c -= a
    c -= b
    c ^= (b >> 13)
    a -= b
    a -= c
    a ^= (c >> 12)
    b -= c
    b -= a
    b ^= (a << 16)
    c -= a
    c -= b
    c ^= (b >> 5)
    a -= b
    a -= c
    a ^= (c >> 3)
    b -= c
    b -= a
    b ^= (a << 10)
    c -= a
    c -= b
    c ^= (b >> 15)
    return a, b, c
}

نصائح أخرى

I'm going to lay bets on optimization; Bob Jenkin's hash should be substantially faster than any crypto-style hash like SHA. I would bet that the standard library is calling out into heavily-optimized C (or even assembly) for that, which is why it beats your unoptimized Go.

There appears to be an efficient Murmur3 available for Go at https://github.com/reusee/mmh3 (I haven't tried it). You might have better luck with that, or by calling out into C/C++ for your Bob Jenkins implementation.

The go sha1 hash is written in assembler and has been heavily optimised (I contributed the ARM version of the code).

Your hash function looks of about equivalent complexity to sha1 to me so I'm not surprised by your run times.

You could try the md5 hash which should do for your purpose and may be faster still (it is also in assembler).

If you only need a short hash result (int64) you could try one of Go's CRC functions.

مرخصة بموجب: CC-BY-SA مع الإسناد
لا تنتمي إلى StackOverflow
scroll top