Skip to content

Commit

Permalink
Polish
Browse files Browse the repository at this point in the history
  • Loading branch information
clarkduvall committed Oct 6, 2014
1 parent 06db8b0 commit be99898
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 325 deletions.
71 changes: 46 additions & 25 deletions utils.go → common.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,32 @@ package main

import "math"

// Bias-correction constants for register counts of 16, 32 and 64.
const a16 = 0.673
const a32 = 0.697
const a64 = 0.709
// two32 is 2^32.
const two32 = 1 << 32
// sortableSlice is a []uint32 with the Len/Less/Swap methods required by
// sort.Interface, ordering its elements in ascending numeric order.
type sortableSlice []uint32

// Len reports the number of elements in the slice.
func (p sortableSlice) Len() int {
	return len(p)
}

// Less reports whether the element at index i is smaller than the one at j.
func (p sortableSlice) Less(i, j int) bool {
	return p[j] > p[i]
}

// Swap exchanges the elements at indices i and j.
func (p sortableSlice) Swap(i, j int) {
	tmp := p[i]
	p[i] = p[j]
	p[j] = tmp
}

// set is a collection of distinct uint32 values backed by a map; every member
// is stored as a key mapped to true.
type set map[uint32]bool

// Add records i as a member of the set. Adding a value that is already present
// leaves the set unchanged.
func (s set) Add(i uint32) {
	s[i] = true
}

// a returns the bias-correction constant that calculateEstimate multiplies
// into the raw estimate for a sketch with m registers.
//
// The three smallest supported register counts use fixed values; every other
// m uses the closed-form expression 0.7213 / (1 + 1.079/m).
func a(m uint32) float64 {
	switch m {
	case 16:
		return 0.673
	case 32:
		return 0.697
	case 64:
		return 0.709
	}
	return 0.7213 / (1 + 1.079/float64(m))
}

// clzLookup[v] is the number of leading zero bits of v when v (0 through 15)
// is viewed as a 32-bit value: 32 for 0, 31 for 1, 30 for 2-3, 29 for 4-7 and
// 28 for 8-15.
var clzLookup = [...]uint8{
	32, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28,
}

// http://embeddedgurus.com/state-space/2014/09/fast-deterministic-and-portable-counting-leading-zeros/
func clz(x uint32) byte {
n := byte(0)
func clz32(x uint32) uint8 {
var n uint8

if x >= (1 << 16) {
if x >= (1 << 24) {
Expand All @@ -42,30 +45,22 @@ func clz(x uint32) byte {
return clzLookup[x >> n] - n;
}

// countZeroBits2 scans num downward from bit start-1 (LSB 0 numbering) and
// returns one plus the number of consecutive zero bits seen before the first
// set bit. If no set bit is found at or below the starting position, the
// result is start+1.
func countZeroBits2(num uint32, start uint) byte {
	var zeros byte = 1
	mask := uint32(1 << (start - 1))
	for mask != 0 && num&mask == 0 {
		zeros++
		mask >>= 1
	}
	return zeros
}

// countZeroBits returns one plus the number of leading zero bits of num, so
// the result is 1 when the top bit is set and 65 when num is zero.
func countZeroBits(num uint64) byte {
	count := byte(1)
	for x := uint64(1 << 63); (x&num) == 0 && x != 0; x >>= 1 {
		count++
	}
	return count
}

// clz64 returns the number of leading zero bits of x: 0 when the top bit is
// set, 64 when x is zero.
func clz64(x uint64) uint8 {
	var c uint8
	// Walk a single-bit mask down from bit 63 until it hits a set bit of x
	// or is shifted out entirely.
	for m := uint64(1 << 63); m&x == 0 && m != 0; m >>= 1 {
		c++
	}
	return c
}

// eb32 extracts the bit field [lo, hi) from bits using LSB 0 numbering: bit lo
// is included, bit hi is not. The field is returned shifted down so that bit
// lo lands at position 0.
//
// hi-lo may be the full width (32): the shift then yields 0 and the
// subtraction wraps around to an all-ones mask.
func eb32(bits uint32, hi uint8, lo uint8) uint32 {
	m := uint32(((1 << (hi - lo)) - 1) << lo)
	return (bits & m) >> lo
}

// eb64 extracts the bit field [lo, hi) from bits using LSB 0 numbering: bit lo
// is included, bit hi is not. The field is returned shifted down so that bit
// lo lands at position 0.
//
// hi-lo may be the full width (64): the shift then yields 0 and the
// subtraction wraps around to an all-ones mask.
func eb64(bits uint64, hi uint8, lo uint8) uint64 {
	m := uint64(((1 << (hi - lo)) - 1) << lo)
	return (bits & m) >> lo
}
Expand All @@ -74,3 +69,29 @@ func linearCounting(m uint32, v uint32) float64 {
fm := float64(m)
return fm * math.Log(fm / float64(v))
}

// countZeros returns how many elements of s are equal to zero.
func countZeros(s []uint8) uint32 {
	var zeros uint32
	for i := 0; i < len(s); i++ {
		if s[i] != 0 {
			continue
		}
		zeros++
	}
	return zeros
}

// calculateEstimate returns the raw cardinality estimate for the register
// values in s: a(m) * m^2 / sum(2^-s[i]), where m is len(s).
func calculateEstimate(s []uint8) float64 {
	sum := 0.0
	for _, val := range s {
		// math.Ldexp(1, -val) is exactly 2^-val for every uint8 value. A
		// 32-bit integer shift would become 0 once val reaches 32, turning
		// the term into +Inf and the whole estimate into 0.
		sum += math.Ldexp(1, -int(val))
	}

	m := uint32(len(s))
	fm := float64(m)
	return a(m) * fm * fm / sum
}

// insert places item at index i of s, shifting the elements at i and beyond
// one position to the right, and returns the resulting slice. The returned
// slice may share its backing array with s.
func insert(s []uint32, i int, item uint32) []uint32 {
	grown := append(s, item)
	// Shift the old tail right by one; when i is the old length both ranges
	// are empty and the appended item is already in place.
	copy(grown[i+1:], grown[i:len(grown)-1])
	grown[i] = item
	return grown
}
55 changes: 55 additions & 0 deletions hll.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package main

import (
"errors"
"hash"
"math"
)

// two32 is 2^32 as a float64; Estimate uses it in its final, large-estimate
// branch.
var two32 float64 = 1 << 32

// hyperLogLog is a cardinality sketch made of m one-byte registers.
type hyperLogLog struct {
	reg []uint8 // register values; length m
	m   uint32  // number of registers, 1 << p
	p   uint8   // precision the sketch was created with
}

// NewHyperLogLog returns an empty sketch with 2^precision registers.
// It returns an error when precision is outside the range 4 through 16.
func NewHyperLogLog(precision uint8) (*hyperLogLog, error) {
	if precision < 4 || precision > 16 {
		return nil, errors.New("precision must be between 4 and 16")
	}

	registers := uint32(1) << precision
	return &hyperLogLog{
		reg: make([]uint8, registers),
		m:   registers,
		p:   precision,
	}, nil
}

// Clear resets the sketch to its empty state by replacing the register slice
// with a freshly zeroed one of the same length.
func (h *hyperLogLog) Clear() {
	fresh := make([]uint8, h.m)
	h.reg = fresh
}

// Add folds the 32-bit hash of item into the sketch. The top p bits of the
// hash select a register; the register is raised to one plus the number of
// leading zeros of the remaining bits when that exceeds its current value.
func (h *hyperLogLog) Add(item hash.Hash32) {
	sum := item.Sum32()

	// Register index: bits {x31,...,x32-p}.
	idx := eb32(sum, 32, 32-h.p)

	// Remaining bits {x32-p,...,x0} moved to the top, followed by a single
	// set bit that bounds the leading-zero count.
	rest := sum<<h.p | 1<<(h.p-1)

	if rank := clz32(rest) + 1; rank > h.reg[idx] {
		h.reg[idx] = rank
	}
}

// Estimate returns the approximate number of distinct items added so far.
//
// It starts from the raw estimate of calculateEstimate and then picks one of
// three results:
//   - raw <= 2.5*m: linear counting over the empty registers, or the raw
//     estimate itself when no register is empty;
//   - raw < 2^32/30: the raw estimate;
//   - otherwise: the large-range correction -2^32 * ln(1 - raw/2^32).
func (h *hyperLogLog) Estimate() uint64 {
	est := calculateEstimate(h.reg)
	if est <= float64(h.m)*2.5 {
		if v := countZeros(h.reg); v != 0 {
			return uint64(linearCounting(h.m, v))
		}
		return uint64(est)
	}
	if est < two32/30 {
		return uint64(est)
	}
	// The logarithm is negative here, so negate while still in floating
	// point. Converting the negative product to uint64 first and negating
	// afterwards relies on an out-of-range conversion whose result is
	// implementation-dependent.
	return uint64(-two32 * math.Log(1-est/two32))
}
File renamed without changes.
186 changes: 186 additions & 0 deletions hllpp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package main

import (
"errors"
"hash"
"sort"
)

// pPrime is the number of top hash bits that encodeHash keeps as the index of
// a sparse entry.
const pPrime uint8 = 25
// mPrime is the register count passed to linearCounting while the sketch is
// still sparse.
// NOTE(review): this is 2^(pPrime-1) rather than 2^pPrime — confirm that the
// halving is intended.
const mPrime uint32 = 1 << (uint32(pPrime) - 1)

// threshold is indexed by precision minus 4 (15 entries, precisions 4 through
// 18). In dense mode Estimate returns the linear-counting result whenever it
// does not exceed the entry for the sketch's precision.
var threshold = []uint {
10, 20, 40, 80, 220, 400, 900, 1800, 3100,
6500, 11500, 20000, 50000, 120000, 350000,
}

// hyperLogLogPP is a cardinality sketch that starts in a sparse representation
// (tmp_set plus sparse_list) and is converted by toNormal into a dense register
// array.
type hyperLogLogPP struct {
// reg holds one register per bucket in dense mode; it is nil while sparse.
reg []uint8
// p is the precision the sketch was created with.
p uint8
// m is the number of dense registers, 1 << p.
m uint32
// sparse is true until toNormal switches the sketch to dense mode.
sparse bool
// tmp_set collects encoded hashes added since the last merge.
tmp_set set
// sparse_list holds the encoded hashes that merge has folded in so far.
sparse_list []uint32
}

// encodeHash packs the 64-bit hash x into the 32-bit form used by the sparse
// representation.
//
// Bits 31..7 of the result always hold the top pPrime bits of x. The low seven
// bits depend on the hash bits that lie between precision p and pPrime:
//   - if they are all zero, bits 6..1 hold one plus the number of leading
//     zeros of the remaining 64-pPrime hash bits, and bit 0 is 1;
//   - otherwise bit 6 is set, bits 5..0 are zero, and the rank can later be
//     recovered from the index bits themselves.
func (h *hyperLogLogPP) encodeHash(x uint64) uint32 {
	idx := uint32(eb64(x, 64, 64-pPrime) << 7)

	if eb64(x, 64-h.p, 64-pPrime) == 0 {
		// Move the low 64-pPrime bits to the top and put a set bit right
		// below them. Without it an all-zero tail would produce a count of
		// 64, and the stored value 65 would not fit in the six-bit field,
		// spilling into the index.
		tail := eb64(x, 64-pPrime, 0)<<pPrime | 1<<(pPrime-1)
		zeros := clz64(tail) + 1
		return idx | uint32(zeros)<<1 | 1
	}
	return idx | 1<<6
}

// getIndex returns the precision-p register index of the encoded hash k.
//
// encodeHash stores the top pPrime hash bits in bits 31..7 of k, so the top p
// hash bits are the top p bits of k. Dense-mode Add indexes its registers by
// those same p hash bits, and encodeHash treats the bits below them as the
// remainder.
func (h *hyperLogLogPP) getIndex(k uint32) uint32 {
	return eb32(k, 32, 32-h.p)
}

// decodeHash turns an encoded hash k, as produced by encodeHash, into the
// precision-p register index and the rank to store in that register.
func (h *hyperLogLogPP) decodeHash(k uint32) (uint32, uint8) {
	var r uint8
	if k&1 == 1 {
		// Flagged entry: the hash bits between precision p and pPrime were
		// all zero, so the rank is the stored count plus those pPrime-p
		// zeros.
		r = uint8(eb32(k, 7, 1)) + pPrime - h.p
	} else {
		// Unflagged entry: the rank lies inside the stored index bits.
		// Shifting out the top p bits (the register index) leaves the bits
		// that follow it; the set bit 6 written by encodeHash bounds the
		// leading-zero count.
		r = clz32(k<<h.p) + 1
	}
	return h.getIndex(k), r
}

// merge folds the encoded hashes collected in tmp_set into sparse_list and
// then empties tmp_set.
//
// sparse_list is kept sorted in ascending order with at most one entry per
// sparse index (bits 31..7 of an encoded hash). That ordering holds because
// the list starts empty and is only ever replaced by the output of this
// function. When two entries share an index, the numerically larger one is
// kept: with equal index bits, a larger encoded value carries a larger stored
// rank.
func (h *hyperLogLogPP) merge() {
	if len(h.tmp_set) == 0 {
		h.tmp_set = set{}
		return
	}

	keys := make(sortableSlice, 0, len(h.tmp_set))
	for k := range h.tmp_set {
		keys = append(keys, k)
	}
	sort.Sort(keys)

	// Number of low bits of an encoded hash that are not part of the index.
	const payloadBits = 7

	merged := make([]uint32, 0, len(h.sparse_list)+len(keys))
	i, j := 0, 0
	for i < len(h.sparse_list) || j < len(keys) {
		// Take the smaller head of the two sorted inputs.
		var next uint32
		if j >= len(keys) || (i < len(h.sparse_list) && h.sparse_list[i] <= keys[j]) {
			next = h.sparse_list[i]
			i++
		} else {
			next = keys[j]
			j++
		}

		// Values arrive in ascending order, so an entry with the same index
		// as the previous one is at least as large and replaces it.
		if n := len(merged); n > 0 && merged[n-1]>>payloadBits == next>>payloadBits {
			merged[n-1] = next
			continue
		}
		merged = append(merged, next)
	}

	h.sparse_list = merged
	h.tmp_set = set{}
}

// NewHyperLogLogPP returns an empty sketch in sparse mode whose dense form
// will have 2^precision registers. It returns an error when precision is
// outside the range 4 through 18.
func NewHyperLogLogPP(precision uint8) (*hyperLogLogPP, error) {
	if precision > 18 || precision < 4 {
		return nil, errors.New("precision must be between 4 and 18")
	}

	h := new(hyperLogLogPP)
	h.p = precision
	h.m = 1 << precision
	h.sparse = true
	h.tmp_set = set{}
	h.sparse_list = make([]uint32, 0, h.m/4)
	return h, nil
}

// Clear returns the sketch to the empty sparse state it had right after
// construction: the dense registers are dropped and the sparse containers are
// replaced with fresh empty ones.
func (h *hyperLogLogPP) Clear() {
	h.reg = nil
	h.sparse_list = make([]uint32, 0, h.m/4)
	h.tmp_set = set{}
	h.sparse = true
}

// toNormal converts the sketch from the sparse to the dense representation.
// Every entry of sparse_list is decoded into a register index and rank, each
// register keeps the largest rank seen for it, and the sparse containers are
// released afterwards.
func (h *hyperLogLogPP) toNormal() {
	registers := make([]uint8, h.m)
	for _, encoded := range h.sparse_list {
		idx, rank := h.decodeHash(encoded)
		if rank > registers[idx] {
			registers[idx] = rank
		}
	}

	h.reg = registers
	h.sparse_list = nil
	h.tmp_set = nil
	h.sparse = false
}

// Add folds the 64-bit hash of item into the sketch.
//
// In dense mode the top p hash bits select a register, which is raised to one
// plus the number of leading zeros of the remaining bits when that exceeds its
// current value. In sparse mode the encoded hash goes into tmp_set; once
// tmp_set grows past its size budget it is merged into sparse_list, and once
// sparse_list grows past its own budget the sketch switches to dense mode.
func (h *hyperLogLogPP) Add(item hash.Hash64) {
	x := item.Sum64()

	if !h.sparse {
		idx := eb64(x, 64, 64-h.p)  // {x63,...,x64-p}
		w := x<<h.p | 1<<(h.p-1) // {x63-p,...,x0}

		if rank := clz64(w) + 1; rank > h.reg[idx] {
			h.reg[idx] = rank
		}
		return
	}

	h.tmp_set.Add(h.encodeHash(x))

	// The hash map is budgeted at roughly (4 + 4 + 1) * 2 * 4 * n bytes,
	// hence the factor of 72.
	if uint32(len(h.tmp_set))*72 <= h.m {
		return
	}
	h.merge()

	// The sparse list is budgeted at roughly 4 * n bytes, plus 2 extra to
	// account for the memory used by tmp_set, hence the factor of 6.
	if uint32(len(h.sparse_list))*6 > h.m {
		h.toNormal()
	}
}

// estimateBias returns the bias to subtract from the raw estimate est, taken
// from the rawEstimateData/biasData tables for the sketch's precision.
//
// Estimates at or below the first table entry get the first bias and
// estimates above the last entry get the last bias. Anything in between is
// linearly interpolated between the two neighbouring table points.
// NOTE(review): the lookup assumes each estimate table is sorted in ascending
// order — confirm against the table definitions.
func (h *hyperLogLogPP) estimateBias(est float64) float64 {
	estTable, biasTable := rawEstimateData[h.p-4], biasData[h.p-4]

	// "<=" rather than "<": an estimate equal to the first entry would
	// otherwise reach the interpolation with i == 0 and index estTable[-1].
	if est <= estTable[0] {
		return biasTable[0]
	}

	lastEstimate := estTable[len(estTable)-1]
	if lastEstimate < est {
		return biasTable[len(biasTable)-1]
	}

	// Find the first table entry that is not below est. The guards above
	// ensure 1 <= i <= len(estTable)-1 when the loop stops.
	i := 1
	for i < len(estTable) && estTable[i] < est {
		i++
	}

	e1, b1 := estTable[i-1], biasTable[i-1]
	e2, b2 := estTable[i], biasTable[i]

	// c is 0 at e1 and 1 at e2, so the weight of b1 is (1 - c).
	c := (est - e1) / (e2 - e1)
	return b1*(1-c) + b2*c
}

// Estimate returns the approximate number of distinct items added so far.
//
// In sparse mode pending entries are merged first and the result is linear
// counting over mPrime slots, of which len(sparse_list) are occupied. In dense
// mode the raw estimate is bias-corrected when it is at most 5*m; if any
// register is still zero and linear counting over the registers does not
// exceed the threshold for this precision, the linear-counting value is
// returned instead.
func (h *hyperLogLogPP) Estimate() uint64 {
	if h.sparse {
		h.merge()
		unused := mPrime - uint32(len(h.sparse_list))
		return uint64(linearCounting(mPrime, unused))
	}

	raw := calculateEstimate(h.reg)
	corrected := raw
	if raw <= 5.0*float64(h.m) {
		corrected = raw - h.estimateBias(raw)
	}

	zeros := countZeros(h.reg)
	if zeros == 0 {
		return uint64(corrected)
	}
	if lc := linearCounting(h.m, zeros); lc <= float64(threshold[h.p-4]) {
		return uint64(lc)
	}
	return uint64(corrected)
}
Loading

0 comments on commit be99898

Please sign in to comment.