Pregunta

¿Dónde puedo encontrar una implementación válida de Algoritmo de registro de registro?Intenté implementarlo yo mismo, pero mi borrador de implementación arroja resultados extraños.

Aquí es:

function LogLog(max_error, max_count)
{
    function log2(x)
    {
         return Math.log(x) / Math.LN2;
    }

    var m = 1.30 / max_error;
    var k = Math.ceil(log2(m * m));
    m = Math.pow(2, k);

    var k_comp = 32 - k;

    var l = log2(log2(max_count / m));
    if (isNaN(l)) l = 1; else l = Math.ceil(l);
    var l_mask = ((1 << l) - 1) >>> 0;

    var M = [];
    for (var i = 0; i < m; ++i) M[i] = 0;

    function count(hash)
    {
          if (hash !== undefined)
          {
                var j = hash >>> k_comp;

                var rank = 0;
                for (var i = 0; i < k_comp; ++i)
                {
                     if ((hash >>> i) & 1)
                     {
                          rank = i + 1;
                          break;
                     }
                }

                M[j] = Math.max(M[j], rank & l_mask);
          }
          else
          {
                var c = 0;
                for (var i = 0; i < m; ++i) c += M[i];
                return 0.79402 * m * Math.pow(2, c / m);
          }
    }

    return {count: count};
}

function fnv1a(text)
{
     var hash = 2166136261;
     for (var i = 0; i < text.length; ++i)
     {
          hash ^= text.charCodeAt(i);
          hash += (hash << 1) + (hash << 4) + (hash << 7) +
            (hash << 8) + (hash << 24);
     }
    return hash >>> 0;
}

var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words

var log_log = LogLog(0.01, 100000);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]));
alert(log_log.count());

Por razones desconocidas, la implementación es muy sensible a max_error parámetro, es el factor principal que determina la magnitud del resultado.Estoy seguro de que hay algún error estúpido :)

ACTUALIZAR: Este problema se resuelve en el Versión más nueva de algoritmo.Publicaré su implementación más tarde.

¿Fue útil?

Solución

Aquí es la versión actualizada del algoritmo basado en el papel más nuevo:

var pow_2_32 = 0xFFFFFFFF + 1;

function HyperLogLog(std_error)
{
     function log2(x)
     {
          return Math.log(x) / Math.LN2;
     }

     function rank(hash, max)
     {
          var r = 1;
          while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
          return r;
     }

     var m = 1.04 / std_error;
     var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
     m = Math.pow(2, k);

     var alpha_m = m == 16 ? 0.673
          : m == 32 ? 0.697
          : m == 64 ? 0.709
          : 0.7213 / (1 + 1.079 / m);

     var M = []; for (var i = 0; i < m; ++i) M[i] = 0;

     function count(hash)
     {
          if (hash !== undefined)
          {
                var j = hash >>> k_comp;
                M[j] = Math.max(M[j], rank(hash, k_comp));
          }
          else
          {
                var c = 0.0;
                for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
                var E = alpha_m * m * m / c;

                // -- make corrections

                if (E <= 5/2 * m)
                {
                     var V = 0;
                     for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
                     if (V > 0) E = m * Math.log(m / V);
                }
                else if (E > 1/30 * pow_2_32)
                     E = -pow_2_32 * Math.log(1 - E / pow_2_32);

                // --

                return E;
          }
    }

    return {count: count};
}

function fnv1a(text)
{
     var hash = 2166136261;
     for (var i = 0; i < text.length; ++i)
     {
          hash ^= text.charCodeAt(i);
          hash += (hash << 1) + (hash << 4) + (hash << 7) +
            (hash << 8) + (hash << 24);
     }
     return hash >>> 0;
}

var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words

var seed = Math.floor(Math.random() * pow_2_32); // make more fun

var log_log = HyperLogLog(0.065);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed);
var count = log_log.count();
alert(count + ', error ' +
    (count - words.length) / (words.length / 100.0) + '%');

Otros consejos

Aquí hay una versión ligeramente modificada que agrega la operación de fusión.

Merge le permite tomar los contadores de varios casos de Hyperloglog, y determinar los contadores únicos en general.

Por ejemplo, si tiene visitantes únicos recolectados los lunes, martes y miércoles, entonces puede fusionar los cubos y contar la cantidad de visitantes únicos durante el lapso de tres días:

var pow_2_32 = 0xFFFFFFFF + 1; 
function HyperLogLog(std_error)
{
    function log2(x)
    {
        return Math.log(x) / Math.LN2;
    }

    function rank(hash, max)
    {
        var r = 1;
        while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
        return r;
    }

    var m = 1.04 / std_error;
    var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
    m = Math.pow(2, k);

    var alpha_m = m == 16 ? 0.673
         : m == 32 ? 0.697
         : m == 64 ? 0.709
         : 0.7213 / (1 + 1.079 / m);

    var M = []; for (var i = 0; i < m; ++i) M[i] = 0;

    function merge(other)
    {
        for (var i = 0; i < m; i++)
        M[i] = Math.max(M[i], other.buckets[i]);
    }

    function count(hash)
    {
        if (hash !== undefined)
        {
            var j = hash >>> k_comp;
            M[j] = Math.max(M[j], rank(hash, k_comp));
        }
        else
        {
            var c = 0.0;
            for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
            var E = alpha_m * m * m / c;

            // -- make corrections

            if (E <= 5/2 * m)
            {
                 var V = 0;
                 for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
                 if (V > 0) E = m * Math.log(m / V);
            }
            else if (E > 1/30 * pow_2_32)
                 E = -pow_2_32 * Math.log(1 - E / pow_2_32);

            // --

            return E;
        }
    }

    return {count: count, merge: merge, buckets: M};
}

function fnv1a(text)
{
    var hash = 2166136261;
    for (var i = 0; i < text.length; ++i)
    {
        hash ^= text.charCodeAt(i);
        hash += (hash << 1) + (hash << 4) + (hash << 7) +
          (hash << 8) + (hash << 24);
    }
    return hash >>> 0;
}

Entonces puedes hacer algo como esto:

// initialize one counter per day
var ll_monday = HyperLogLog(0.01);
var ll_tuesday = HyperLogLog(0.01);
var ll_wednesday = HyperLogLog(0.01);


// add 5000 unique values in each day
for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random()));
for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random()));

// add 5000 values which appear every day
for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i));   ll_wednesday.count(fnv1a('' + i));}


// merge three days together
together = HyperLogLog(0.01);
together.merge(ll_monday);
together.merge(ll_tuesday);
together.merge(ll_wednesday);

// report
console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count()));
console.log('unique numbers overall: ' + Math.round(together.count()));

Hemos abierto un proyecto llamado Stream-Lib que tiene un Implementación de registro de registro.El trabajo se basó en este papel.

Usando la versión js proporcionada por @actual, intenté implementar lo mismo en C#, lo que parece bastante parecido.Simplemente cambié un poco la función fnv1a y le cambié el nombre a getHashCode.(El crédito va a la función hash de Jenkins, http://en.wikipedia.org/wiki/Jenkins_hash_function)

public class HyperLogLog
{
    private double mapSize, alpha_m, k;
    private int kComplement;
    private Dictionary<int, int> Lookup = new Dictionary<int, int>();
    private const double pow_2_32 = 4294967297;

    public HyperLogLog(double stdError)
    {
        mapSize = (double)1.04 / stdError;
        k = (long)Math.Ceiling(log2(mapSize * mapSize));

        kComplement = 32 - (int)k;
        mapSize = (long)Math.Pow(2, k);

        alpha_m = mapSize == 16 ? (double)0.673
              : mapSize == 32 ? (double)0.697
              : mapSize == 64 ? (double)0.709
              : (double)0.7213 / (double)(1 + 1.079 / mapSize);
        for (int i = 0; i < mapSize; i++)
            Lookup[i] = 0;
    }

    private static double log2(double x)
    {
        return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2
    }
    private static int getRank(uint hash, int max)
    {
        int r = 1;
        uint one = 1;
        while ((hash & one) == 0 && r <= max)
        {
            ++r;
            hash >>= 1;
        }
        return r;
    }
    public static uint getHashCode(string text)
    {
        uint hash = 0;

        for (int i = 0, l = text.Length; i < l; i++)
        {
            hash += (uint)text[i];
            hash += hash << 10;
            hash ^= hash >> 6;
        }
        hash += hash << 3;
        hash ^= hash >> 6;
        hash += hash << 16;

        return hash;
    }

    public int Count()
    {
        double c = 0, E;

        for (var i = 0; i < mapSize; i++)
            c += 1d / Math.Pow(2, (double)Lookup[i]);

        E = alpha_m * mapSize * mapSize / c;

        // Make corrections & smoothen things.
        if (E <= (5 / 2) * mapSize)
        {
            double V = 0;
            for (var i = 0; i < mapSize; i++)
                if (Lookup[i] == 0) V++;
            if (V > 0)
                E = mapSize * Math.Log(mapSize / V);
        }
        else
            if (E > (1 / 30) * pow_2_32)
                E = -pow_2_32 * Math.Log(1 - E / pow_2_32);
        // Made corrections & smoothen things, or not.

        return (int)E;
    }

    public void Add(object val)
    {
        uint hashCode = getHashCode(val.ToString());
        int j = (int)(hashCode >> kComplement);

        Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement));
    }
} 

Sé que esta es una publicación antigua, pero la implementación de @buryat se ha movido y, en cualquier caso, está incompleta y un poco lenta (lo siento o_o).

Tomé la implementación utilizada por la nueva versión de Redis que se puede encontrar aquí y lo transfirió a PHP.El repositorio está aquí. https://github.com/joegreen0991/HyperLogLog

<?php

class HyperLogLog {

    private $HLL_P_MASK;

    private $HLL_REGISTERS;

    private $ALPHA;

    private $registers;

    public function __construct($HLL_P = 14)
    {
        $this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */

        $this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */

        $this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS);

        $this->registers = new SplFixedArray($this->HLL_REGISTERS);

        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $this->registers[$i] = 0;
        }
    }

    public function add($v)
    {
        $h = crc32(md5($v));

        $h |= 1 << 63; /* Make sure the loop terminates. */
        $bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */
        $count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
        while(($h & $bit) == 0) {
            $count++;
            $bit <<= 1;
        }

        /* Update the register if this element produced a longer run of zeroes. */
        $index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */

        if ($this->registers[$index] < $count) {
            $this->registers[$index] = $count;
        }
    }

    public function export()
    {
        $str = '';
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $str .= chr($this->registers[$i]);
        }
        return $str;
    }

    public function import($str)
    {
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            $this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0;
        }
    }

    public function merge($str)
    {
        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            if(isset($str[$i]))
            {
                $ord = ord($str[$i]);
                if ($this->registers[$i] < $ord) {
                    $this->registers[$i] = $ord;
                }
            }

        }
    }

    /**
     * @static
     * @param $arr
     * @return int Number of unique items in $arr
     */
    public function count() {
        $E = 0;

        $ez = 0;

        for ($i = 0; $i < $this->HLL_REGISTERS; $i++) {
            if ($this->registers[$i] !== 0) {
                $E += (1.0 / pow(2, $this->registers[$i]));
            } else {
                $ez++;
                $E += 1.0;
            }
        }

        $E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS;

        /* Use the LINEARCOUNTING algorithm for small cardinalities.
         * For larger values but up to 72000 HyperLogLog raw approximation is
         * used since linear counting error starts to increase. However HyperLogLog
         * shows a strong bias in the range 2.5*16384 - 72000, so we try to
         * compensate for it. */
        if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) {
            $E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez);
        }

        else if ($this->HLL_REGISTERS == 16384 && $E < 72000) {
            // We did polynomial regression of the bias for this range, this
            // way we can compute the bias for a given cardinality and correct
            // according to it. Only apply the correction for P=14 that's what
            // we use and the value the correction was verified with.
            $bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E)
                -1.4253 * 1.0e-12 * ($E*$E*$E)+
                1.2940 * 1.0e-7 * ($E*$E)
                -5.2921 * 1.0e-3 * $E+
                83.3216;
            $E -= $E * ($bias/100);
        }

        return floor($E);
    }
}

Implementé loglog e hyperloglog en JS y PHP y código bien comentado. https://github.com/buryat/loglog

Licenciado bajo: CC-BY-SA con atribución
No afiliado a StackOverflow
scroll top