-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 73ec563
Showing
5 changed files
with
311 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
WHAT IS THIS? | ||
============= | ||
|
||
DiskDict is a replacement for the Python `dict` object with one big difference. | ||
When the number of items in the `dict` exceeds `cache_size`, newly added | ||
items are stored on disk instead of in memory. This makes it possible for | ||
the `dict` to hold more data than fits in RAM. The drawback is that | ||
the dict becomes much slower, but it remains usable. | ||
|
||
|
||
|
||
USAGE EXAMPLE | ||
============= | ||
|
||
DiskDict can be used in place of a regular `dict` object. | ||
|
||
|
||
d = DiskDict(cache_size=2) | ||
d[1] = 1 | ||
d[2] = 4 | ||
d[3] = 9 | ||
d[4] = 16 | ||
d[5] = 25 | ||
|
||
# d == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25} | ||
|
||
d[10] = 100 | ||
d[20] = 400 | ||
d[30] = 900 | ||
d[40] = 1600 | ||
d[50] = 2500 | ||
|
||
# d = {1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900} | ||
|
||
d[1] = -1 | ||
d[2] = -4 | ||
d[3] = -9 | ||
d[4] = -16 | ||
d[5] = -25 | ||
|
||
# d = {1: -1, 2: -4, 3: -9, 4: -16, 5: -25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900} | ||
|
||
# d.counters == {'mem_hits': 6, 'misses': 0, 'set_ops': 15, 'get_ops': 25, 'mem_items': 2, | ||
# 'disk_items': 8, 'disk_hits': 19, 'del_ops': 0} | ||
|
||
|
||
|
||
SPEED TESTS | ||
=========== | ||
|
||
|
||
SSD Laptop cache_size=0 | ||
----------------------- | ||
|
||
Write : n=1000000, s='a'*1024, ops/s=85607 | ||
Read SEQ: n=1000000, s='a'*1024, ops/s=131875 | ||
Read RND: n=1000000, s='a'*1024, ops/s=106608 | ||
|
||
Write : n=1000000, s=range(100), ops/s=28614 | ||
Read SEQ: n=1000000, s=range(100), ops/s=27398 | ||
Read RND: n=1000000, s=range(100), ops/s=26072 | ||
|
||
|
||
SSD Laptop cache_size=1000000 | ||
----------------------------- | ||
|
||
Write : n=1000000, s='a'*1024, ops/s=316754 | ||
Read SEQ: n=1000000, s='a'*1024, ops/s=704680 | ||
Read RND: n=1000000, s='a'*1024, ops/s=409498 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
TODO | ||
---- | ||
|
||
- TODO Update cache_size if there is no more free memory. | ||
- TODO Close/delete tmp file when object is destroyed or garbage collected. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
import collections | ||
import tempfile | ||
import marshal as pickle | ||
import os | ||
import sys | ||
|
||
|
||
class DiskDict(object):
    """Dict-like mapping that stores overflow items in a temporary file.

    The first `cache_size` distinct keys are kept in an in-memory dict.
    Every further key has its value serialized and appended to an anonymous
    temporary file; only the key and the file offset of its latest value
    stay in memory (`disk_index`).  Serialization uses the `marshal` module,
    which this file imports under the name `pickle`, so spilled values must
    be of a type `marshal` can handle.

    Overwriting a key that lives on disk appends a new record and repoints
    the index; the old record is not reclaimed.

    Attributes:
        counters: running statistics -- `mem_hits`, `disk_hits`, `misses`,
            `get_ops`, `set_ops`, `del_ops`, `mem_items`, `disk_items`.
        cache_size: maximum number of keys kept in memory.
        cache: the in-memory part of the mapping.
        disk_index: key -> byte offset of the key's current record.
        storage_seek_at_end: True while the file position is known to be at
            EOF, which lets consecutive writes skip a seek.
        storage_fd: the temporary file backing the spilled values.
    """

    def __init__(self, init=None, cache_size=1000000):
        """Create the mapping, optionally pre-filled from `init`.

        Args:
            init: optional mapping whose items are copied in, in iteration
                order.  Defaults to None (empty) rather than a shared
                mutable `{}` default.
            cache_size: number of keys allowed in memory before new keys
                are written to disk.
        """
        self.counters = {
            'mem_hits': 0, 'disk_hits': 0, 'misses': 0, 'get_ops': 0,
            'set_ops': 0, 'del_ops': 0, 'mem_items': 0, 'disk_items': 0,
        }
        self.cache_size = cache_size
        self.cache = {}
        self.disk_index = {}
        self.storage_seek_at_end = False
        self.storage_fd = tempfile.TemporaryFile(mode='w+b', prefix='diskdict-')

        if init:
            for k in init:
                self[k] = init[k]

    def __setitem__(self, k, v):
        """Store `v` under `k`, in memory if possible, otherwise on disk.

        A key that already lives on disk stays on disk even when a cache
        slot has become free; moving it would leave a stale index entry and
        make the key show up twice in iteration and in `len()`.
        """
        if k in self.cache:
            # Overwrite in place; the item count does not change.
            self.cache[k] = v
        elif (k not in self.disk_index
              and self.counters['mem_items'] < self.cache_size):
            self.cache[k] = v
            self.counters['mem_items'] += 1
        else:
            if not self.storage_seek_at_end:
                self.storage_fd.seek(0, 2)  # Seek to EOF
                self.storage_seek_at_end = True

            offset = self.storage_fd.tell()
            # Serialize first: if this raises, the index and the counters
            # are still untouched and consistent.
            pickle.dump(v, self.storage_fd)

            if k not in self.disk_index:
                self.counters['disk_items'] += 1
            self.disk_index[k] = offset

        self.counters['set_ops'] += 1

    def __getitem__(self, k):
        """Return the value for `k`, updating the hit/miss counters.

        Raises:
            KeyError: if `k` is neither in memory nor on disk.
        """
        self.counters['get_ops'] += 1

        if k in self.cache:
            self.counters['mem_hits'] += 1
            return self.cache[k]

        if k in self.disk_index:
            self.counters['disk_hits'] += 1
            self.storage_fd.seek(self.disk_index[k])
            # The file position is no longer at EOF; the next write must
            # seek back before appending.
            self.storage_seek_at_end = False
            return pickle.load(self.storage_fd)

        self.counters['misses'] += 1
        raise KeyError(k)

    def __delitem__(self, k):
        """Remove `k` if present and count the operation.

        Unlike the built-in `dict`, deleting a missing key is a silent
        no-op (it still increments `del_ops`).  Deleting a disk item only
        drops its index entry; the record stays in the file.
        """
        if k in self.cache:
            del self.cache[k]
            self.counters['mem_items'] -= 1
        elif k in self.disk_index:
            del self.disk_index[k]
            self.counters['disk_items'] -= 1

        self.counters['del_ops'] += 1

    def __str__(self):
        """Return the `str()` of an ordinary dict with the same items.

        Every value is fetched through `__getitem__`, so calling this
        updates the get/hit counters.
        """
        return str({el: self[el] for el in self})

    def __iter__(self):
        """Yield the in-memory keys first, then the on-disk keys."""
        for source in (self.cache, self.disk_index):
            for el in source:
                yield el

    def __len__(self):
        """Return the number of keys held in memory plus those on disk."""
        return self.counters['mem_items'] + self.counters['disk_items']
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
#!/usr/bin/python -u | ||
# -*- encoding: utf-8 -*- | ||
|
||
import random | ||
import sys | ||
import os | ||
from diskdict import DiskDict | ||
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) | ||
|
||
|
||
def test_basics():
    """Run a functional smoke test of DiskDict with cache_size=2.

    Inserts ten keys (two fit in memory, eight go to disk), overwrites five
    of them, deletes one, and checks contents, length and the operation
    counters after each phase.  Contents and counters are compared as dicts
    rather than as `str()` output, so the checks do not depend on dict
    iteration order.  `str(d)` is still what gets evaluated, because the
    expected counter values include the reads it performs.

    Raises:
        AssertionError: if any check fails.
    """
    import ast

    def passed():
        # One dot and a separating space per successful check.
        sys.stdout.write(". ")

    print("Starting basis test, cache_size=2:")
    sys.stdout.write("  ")

    d = DiskDict(cache_size=2)
    d[1] = 1
    d[2] = 4
    d[3] = 9
    d[4] = 16
    d[5] = 25
    assert ast.literal_eval(str(d)) == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}
    passed()
    assert len(d) == 5
    passed()

    d[10] = 100
    d[20] = 400
    d[30] = 900
    d[40] = 1600
    d[50] = 2500
    assert ast.literal_eval(str(d)) == {
        1: 1, 2: 4, 3: 9, 4: 16, 5: 25,
        40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900,
    }
    passed()
    assert len(d) == 10
    passed()

    d[1] = -1
    d[2] = -4
    d[3] = -9
    d[4] = -16
    d[5] = -25
    assert ast.literal_eval(str(d)) == {
        1: -1, 2: -4, 3: -9, 4: -16, 5: -25,
        40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900,
    }
    passed()
    assert len(d) == 10
    passed()

    del d[1]
    assert len(d) == 9
    passed()

    # Three str(d) calls read 5 + 10 + 10 = 25 items: the two cached keys
    # three times each (6 memory hits), the rest from disk (19 disk hits).
    assert d.counters == {
        'mem_hits': 6, 'misses': 0, 'set_ops': 15, 'get_ops': 25,
        'mem_items': 1, 'disk_items': 8, 'disk_hits': 19, 'del_ops': 1,
    }
    passed()
    print(" Counters: " + str(d.counters))

    print("all rests passed.")
|
||
def speed_test_w(n, s):
    """Write `n` items into the module-level mapping `d`, printing progress.

    Item `i` is stored under the key 'a' + str(i) with the value (i*i, s).

    Args:
        n: number of items to write.
        s: payload stored as the second element of every value.
    """
    sys.stdout.write("  ")
    # Report roughly every 10%, at most every 100000 items.  Clamp to 1 so
    # that n < 10 does not produce a modulo-by-zero.
    step = max(1, min(100000, n // 10))
    for i in range(n):
        if i % step == 0:
            sys.stdout.write(str(int(round(float(i) / n * 100))) + '% ')
        d['a' + str(i)] = (i * i, s)
    sys.stdout.write('done ')
|
||
def speed_test_r(n, s):
    """Read `n` items from the module-level mapping `d` in key order.

    Looks up 'a' + str(i) for i = 0 .. n-1, printing progress, and discards
    the values.

    Args:
        n: number of items to read.
        s: unused; present so the signature matches the other speed tests.

    Raises:
        KeyError: if one of the keys is not in `d`.
    """
    sys.stdout.write("  ")
    # Clamp to 1 so that n < 10 does not produce a modulo-by-zero.
    step = max(1, min(100000, n // 10))
    for i in range(n):
        if i % step == 0:
            sys.stdout.write(str(int(round(float(i) / n * 100))) + '% ')
        d['a' + str(i)]  # lookup only; the value is intentionally discarded
    sys.stdout.write('done ')
|
||
def speed_test_rnd(n, s):
    """Read `n` items from the module-level mapping `d` in random order.

    Looks up 'a' + str(i) for a shuffled permutation of 0 .. n-1, printing
    progress, and discards the values.

    Args:
        n: number of items to read.
        s: unused; present so the signature matches the other speed tests.

    Raises:
        KeyError: if one of the keys is not in `d`.
    """
    sys.stdout.write("  ")
    # Materialize the indices: shuffle() needs a mutable sequence.
    order = list(range(n))
    random.shuffle(order)
    # Clamp to 1 so that n < 10 does not produce a modulo-by-zero.
    step = max(1, min(100000, n // 10))
    for done, i in enumerate(order):
        if done % step == 0:
            sys.stdout.write(str(int(round(float(done) / n * 100))) + '% ')
        d['a' + str(i)]  # lookup only; the value is intentionally discarded
    sys.stdout.write('done ')
|
||
|
||
if __name__ == '__main__':
    # Script entry point: run the functional test, then time write,
    # sequential-read and random-read passes against a DiskDict with
    # cache_size=0, once per payload type.

    test_basics()

    import timeit

    timeit_setup = 'from __main__ import speed_test_w,speed_test_r,speed_test_rnd'

    # (headline, payload).  The string payload carries its own quotes
    # because it is pasted into the source text handed to timeit.
    benchmark_runs = (
        ("Starting speed test with {} 1k strings, cache_size=0:",
         "'" + 'a' * 1024 + "'"),
        ("Starting speed test with {} 100 item lists, cache_size=0:",
         range(100)),
    )

    # (timed statement, report line) for the three passes of each run.
    benchmark_passes = (
        ('speed_test_w({},{})', " >> Write : n={}, s={}/{}, ops/s={}"),
        ('speed_test_r({},{})', " >> Read SEQ: n={}, s={}/{}, ops/s={}"),
        ('speed_test_rnd({},{})', " >> Read RND: n={}, s={}/{}, ops/s={}"),
    )

    for headline, s in benchmark_runs:
        n = 100000

        print(headline.format(str(n)))

        # The speed_test_* functions use this module-level `d`.
        d = DiskDict(cache_size=0)

        for statement, report in benchmark_passes:
            elapsed = timeit.timeit(statement.format(n, s), setup=timeit_setup, number=1)
            t = int(n / elapsed)
            print(report.format(n, type(s), len(s), t))

        print(" Counters: " + str(d.counters))
        print("done.")