Skip to content

Commit

Permalink
Initial release
Browse files Browse the repository at this point in the history
  • Loading branch information
ddofborg committed Jan 23, 2014
0 parents commit 73ec563
Show file tree
Hide file tree
Showing 5 changed files with 311 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
71 changes: 71 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
WHAT IS THIS?
=============

DiskDict is a replacement for the Python `dict` object with one big difference.
Once the `dict` holds `cache_size` items in memory, newly added data is
stored on disk instead. This makes it possible for the `dict` to contain more
data than fits in RAM. The drawback is that the dict becomes much slower,
but it is still usable.



USAGE EXAMPLE
=============

DiskDict is meant to be used like a regular `dict` object.


d = DiskDict(cache_size=2)
d[1] = 1
d[2] = 4
d[3] = 9
d[4] = 16
d[5] = 25

# d == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}

d[10] = 100
d[20] = 400
d[30] = 900
d[40] = 1600
d[50] = 2500

# d == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}

d[1] = -1
d[2] = -4
d[3] = -9
d[4] = -16
d[5] = -25

# d == {1: -1, 2: -4, 3: -9, 4: -16, 5: -25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}

# d.counters == {'mem_hits': 6, 'misses': 0, 'set_ops': 15, 'get_ops': 25, 'mem_items': 2,
# 'disk_items': 8, 'disk_hits': 19, 'del_ops': 0}



SPEED TESTS
===========


SSD Laptop cache_size=0
-----------------------

Write : n=1000000, s='a'*1024, ops/s=85607
Read SEQ: n=1000000, s='a'*1024, ops/s=131875
Read RND: n=1000000, s='a'*1024, ops/s=106608

Write : n=1000000, s=range(100), ops/s=28614
Read SEQ: n=1000000, s=range(100), ops/s=27398
Read RND: n=1000000, s=range(100), ops/s=26072


SSD Laptop cache_size=1000000
-----------------------------

Write : n=1000000, s='a'*1024, ops/s=316754
Read SEQ: n=1000000, s='a'*1024, ops/s=704680
Read RND: n=1000000, s='a'*1024, ops/s=409498


6 changes: 6 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
TODO
----

- TODO Update cache_size if there is no more free memory.
- TODO Close/delete tmp file when object is destroyed or garbage-collected.

96 changes: 96 additions & 0 deletions diskdict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# -*- encoding: utf-8 -*-

import collections
import tempfile
import marshal as pickle
import os
import sys


class DiskDict(object):
    """Dict-like container that spills to a temporary file once it is full.

    Up to ``cache_size`` keys are kept in an ordinary in-memory dict
    (``self.cache``).  Values for any further keys are serialized with
    ``marshal`` (imported in this module under the name ``pickle``) and
    appended to a temporary file; only their file offsets are kept in memory
    (``self.disk_index``).

    Notes:
        * Values that end up on disk must be serializable by ``marshal``.
        * Overwriting a key that lives on disk appends a new record; the old
          record is not reclaimed, so the file only ever grows.
        * ``self.counters`` holds usage statistics.  Reads made through
          ``str()`` count as get operations because ``__str__`` goes through
          ``__getitem__``.
    """

    def __init__(self, init=None, cache_size=1000000):
        """Create the container.

        Args:
            init: optional mapping whose items are copied into the new
                object.  ``None`` (the default) or an empty mapping means
                "start empty".
            cache_size: maximum number of keys held in memory before new
                keys are written to disk.
        """
        self.counters = { 'mem_hits' : 0, 'disk_hits' : 0, 'misses' : 0, 'get_ops' : 0, 'set_ops' : 0,
                          'del_ops' : 0, 'mem_items' : 0, 'disk_items' : 0 }
        self.cache_size = cache_size
        self.cache = {}
        self.disk_index = {}
        # True while the file position is known to be at EOF, which lets
        # consecutive disk writes skip the seek.
        self.storage_seek_at_end = False
        self.storage_fd = tempfile.TemporaryFile(mode='w+b', prefix='diskdict-')

        if init:
            for k in init:
                self[k] = init[k]

    def __setitem__(self, k, v):
        """Store ``v`` under ``k``, in memory if there is room, else on disk."""
        if k in self.cache:
            # Existing in-memory key: overwrite in place.
            self.cache[k] = v
        elif self.counters['mem_items'] < self.cache_size:
            if k in self.disk_index:
                # A cache slot was freed by a delete and this key currently
                # lives on disk: move it into memory instead of keeping a
                # second, stale copy in the disk index.
                del self.disk_index[k]
                self.counters['disk_items'] -= 1
            self.cache[k] = v
            self.counters['mem_items'] += 1
        else:
            if not self.storage_seek_at_end:
                self.storage_fd.seek(0, 2)  # Seek to EOF
                self.storage_seek_at_end = True

            offset = self.storage_fd.tell()
            pickle.dump(v, self.storage_fd)

            # Publish the offset only after the value has been written, so a
            # failed dump cannot leave the index pointing at a bad record.
            if k not in self.disk_index:
                self.counters['disk_items'] += 1
            self.disk_index[k] = offset

        self.counters['set_ops'] += 1

    def __getitem__(self, k):
        """Return the value stored under ``k``.

        Raises:
            KeyError: if ``k`` is neither in memory nor on disk.
        """
        self.counters['get_ops'] += 1

        if k in self.cache:
            self.counters['mem_hits'] += 1
            return self.cache[k]

        if k in self.disk_index:
            self.counters['disk_hits'] += 1
            self.storage_fd.seek(self.disk_index[k])
            # The read moves the file position away from EOF.
            self.storage_seek_at_end = False
            return pickle.load(self.storage_fd)

        self.counters['misses'] += 1
        raise KeyError(k)

    def __delitem__(self, k):
        """Remove ``k``.

        Data already written to disk is not erased; only the index entry is
        dropped.

        Raises:
            KeyError: if ``k`` is not present (same contract as ``dict``).
        """
        if k in self.cache:
            del self.cache[k]
            self.counters['mem_items'] -= 1
        elif k in self.disk_index:
            del self.disk_index[k]
            self.counters['disk_items'] -= 1
        else:
            raise KeyError(k)

        self.counters['del_ops'] += 1

    def __contains__(self, k):
        """Return True if ``k`` is stored; does not touch the counters."""
        return k in self.cache or k in self.disk_index

    def __str__(self):
        """Render like a plain dict; every value is read via ``__getitem__``."""
        return str({el: self[el] for el in self})

    def __iter__(self):
        """Yield the in-memory keys first, then the keys stored on disk."""
        for store in (self.cache, self.disk_index):
            for el in store:
                yield el

    def __len__(self):
        """Return the total number of keys (memory + disk)."""
        return len(self.cache) + len(self.disk_index)

    def close(self):
        """Close the temporary backing file.

        Disk-resident items can no longer be read or written afterwards.
        """
        self.storage_fd.close()

137 changes: 137 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/usr/bin/python -u
# -*- encoding: utf-8 -*-

import random
import sys
import os
from diskdict import DiskDict
# Rebind sys.stdout to a file object opened on the same descriptor with a
# buffer size of 0 (third argument), i.e. unbuffered; presumably so that the
# progress output of the tests below shows up immediately.
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)


def test_basics():
print "Starting basis test, cache_size=2:"
print " ",
d = DiskDict(cache_size=2)
d[1] = 1
d[2] = 4
d[3] = 9
d[4] = 16
d[5] = 25
assert( str(d) == '{1: 1, 2: 4, 3: 9, 4: 16, 5: 25}' )
print ".",
assert( len(d) == 5 )
print ".",
d[10] = 100
d[20] = 400
d[30] = 900
d[40] = 1600
d[50] = 2500
assert( str(d) == '{1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}' )
print ".",
assert( len(d) == 10 )
print ".",
d[1] = -1
d[2] = -4
d[3] = -9
d[4] = -16
d[5] = -25

assert( str(d) == '{1: -1, 2: -4, 3: -9, 4: -16, 5: -25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}' )
print ".",
assert( len(d) == 10 )
print ".",

del d[1]
assert( len(d) == 9 )
print ".",

assert( str(d.counters) == "{'mem_hits': 6, 'misses': 0, 'set_ops': 15, 'get_ops': 25, 'mem_items': 1, 'disk_items': 8, 'disk_hits': 19, 'del_ops': 1}" )
print ".",
print
print " Counters: " + str(d.counters)

print "all rests passed."

def speed_test_w(n,s):
print " ",
c=0
p = min(100000,int(n/10))
for i in range(n):
if c % p == 0:
print str(int(round(float(c) / n * 100))) + '%',
d['a'+str(i)] = ( i*i, s )
c += 1
print 'done',

def speed_test_r(n,s):
print " ",
c=0
p = min(100000,int(n/10))
for i in range(n):
if c % p == 0:
print str(int(round(float(c) / n * 100))) + '%',
x = d['a'+str(i)]
c += 1
print 'done',

def speed_test_rnd(n,s):
print " ",
r = range(n)
random.shuffle(r)
c=0
p = min(100000,int(n/10))
for i in r:
if c % p == 0:
print str(int(round(float(c) / n * 100))) + '%',
x = d['a'+str(i)]
c += 1
print 'done',


if __name__ == '__main__':

test_basics()
print


import timeit

n = 100000

print "Starting speed test with {} 1k strings, cache_size=0:".format( str(n) )

s = "'" + 'a' * 1024 + "'"
d = DiskDict(cache_size=0)

t = int( n/timeit.timeit('speed_test_w({},{})'.format(n,s), setup='from __main__ import speed_test_w,speed_test_r,speed_test_rnd', number=1) )
print " >> Write : n={}, s={}/{}, ops/s={}".format( n, type(s), len(s), t )

t = int( n/timeit.timeit('speed_test_r({},{})'.format(n,s), setup='from __main__ import speed_test_w,speed_test_r,speed_test_rnd', number=1) )
print " >> Read SEQ: n={}, s={}/{}, ops/s={}".format( n, type(s), len(s), t )

t = int( n/timeit.timeit('speed_test_rnd({},{})'.format(n,s), setup='from __main__ import speed_test_w,speed_test_r,speed_test_rnd', number=1) )
print " >> Read RND: n={}, s={}/{}, ops/s={}".format( n, type(s), len(s), t )

print " Counters: " + str(d.counters)
print "done."
print

n = 100000

print "Starting speed test with {} 100 item lists, cache_size=0:".format( str(n) )

s = range(100)
d = DiskDict(cache_size=0)

t = int( n/timeit.timeit('speed_test_w({},{})'.format(n,s), setup='from __main__ import speed_test_w,speed_test_r,speed_test_rnd', number=1) )
print " >> Write : n={}, s={}/{}, ops/s={}".format( n, type(s), len(s), t )

t = int( n/timeit.timeit('speed_test_r({},{})'.format(n,s), setup='from __main__ import speed_test_w,speed_test_r,speed_test_rnd', number=1) )
print " >> Read SEQ: n={}, s={}/{}, ops/s={}".format( n, type(s), len(s), t )

t = int( n/timeit.timeit('speed_test_rnd({},{})'.format(n,s), setup='from __main__ import speed_test_w,speed_test_r,speed_test_rnd', number=1) )
print " >> Read RND: n={}, s={}/{}, ops/s={}".format( n, type(s), len(s), t )

print " Counters: " + str(d.counters)
print "done."
print

0 comments on commit 73ec563

Please sign in to comment.