Initial release

ddofborg · Jan 23, 2014 · 73ec563 · 73ec563
commit 73ec563
Show file tree

Hide file tree

Showing 5 changed files with 311 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README.md b/README.md
@@ -0,0 +1,71 @@
+WHAT IS THIS?
+=============
+
+DiskDict is a replacement for Python's `dict` object with one big difference.
+Once the number of items in the `dict` has reached `cache_size`, newly added
+data is stored on disk instead of in memory. This makes it possible for
+the `dict` to contain more data than fits in RAM. The drawback is that
+the dict becomes much slower, but it is still usable.
+
+
+
+USAGE EXAMPLE
+=============
+
+DiskDict can be used in place of a regular `dict` object.
+
+
+    d = DiskDict(cache_size=2)
+    d[1] = 1
+    d[2] = 4
+    d[3] = 9
+    d[4] = 16
+    d[5] = 25
+
+    # d == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}
+
+    d[10] = 100
+    d[20] = 400
+    d[30] = 900
+    d[40] = 1600
+    d[50] = 2500
+
+    # d == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}
+
+    d[1] = -1
+    d[2] = -4
+    d[3] = -9
+    d[4] = -16
+    d[5] = -25
+
+    # d == {1: -1, 2: -4, 3: -9, 4: -16, 5: -25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}
+
+    # d.counters == {'mem_hits': 6, 'misses': 0, 'set_ops': 15, 'get_ops': 25, 'mem_items': 2,
+    #                'disk_items': 8, 'disk_hits': 19, 'del_ops': 0}
+
+
+
+SPEED TESTS
+===========
+
+
+SSD Laptop cache_size=0
+-----------------------
+
+    Write   : n=1000000, s='a'*1024, ops/s=85607
+    Read SEQ: n=1000000, s='a'*1024, ops/s=131875
+    Read RND: n=1000000, s='a'*1024, ops/s=106608
+
+    Write   : n=1000000, s=range(100), ops/s=28614
+    Read SEQ: n=1000000, s=range(100), ops/s=27398
+    Read RND: n=1000000, s=range(100), ops/s=26072
+
+
+SSD Laptop cache_size=1000000
+-----------------------------
+
+    Write   : n=1000000, s='a'*1024, ops/s=316754
+    Read SEQ: n=1000000, s='a'*1024, ops/s=704680
+    Read RND: n=1000000, s='a'*1024, ops/s=409498
+
+
diff --git a/TODO.md b/TODO.md
@@ -0,0 +1,6 @@
+TODO
+----
+
+-	TODO Update cache_size if there is no more free memory.
+-	TODO Close/delete tmp file when the object is destroyed or garbage-collected.
+
diff --git a/diskdict.py b/diskdict.py
@@ -0,0 +1,96 @@
+# -*- encoding: utf-8 -*-
+
+import collections
+import tempfile
+import marshal as pickle
+import os
+import sys
+
+
+class DiskDict(object):
+	"""Dict-like container that spills to a temporary file once its cache is full.
+
+	Up to `cache_size` keys are held in an ordinary in-memory dict. Values for
+	any further keys are serialised with `marshal` (imported in this module
+	under the name `pickle`) and appended to an anonymous temporary file; only
+	the file offset of each such value stays in memory. Overwriting a
+	disk-backed key appends a new record and repoints its offset, so the
+	file only ever grows.
+
+	Because `marshal` is used, disk-backed values must be of types `marshal`
+	supports; other values raise ValueError when they are written.
+
+	`counters` records hit/miss statistics, operation counts and the number
+	of items held in memory and on disk.
+	"""
+
+	def __init__(self, init=None, cache_size=1000000):
+		"""Create the container.
+
+		init       -- optional mapping whose items are copied in.
+		cache_size -- maximum number of items kept in memory.
+		"""
+		self.counters = {
+			'mem_hits' : 0, 'disk_hits' : 0, 'misses' : 0,
+			'get_ops' : 0, 'set_ops' : 0, 'del_ops' : 0,
+			'mem_items' : 0, 'disk_items' : 0,
+		}
+		self.cache_size = cache_size
+		self.cache = {}
+		# key -> offset of the most recent record written for that key
+		self.disk_index = {}
+		# True while the file position is known to be at EOF, which lets
+		# consecutive writes skip the seek.
+		self.storage_seek_at_end = False
+		self.storage_fd = tempfile.TemporaryFile(mode='w+b',prefix='diskdict-')
+
+		# `None` instead of a shared mutable `{}` default.
+		if init is not None:
+			for k in init:
+				self[k] = init[k]
+
+
+	def __setitem__(self, k, v):
+		"""Store `v` under `k`: in memory while there is room, otherwise on disk."""
+		if k in self.cache: # Overwrite the existing in-memory entry
+			self.cache[k] = v
+		elif k not in self.disk_index and self.counters['mem_items'] < self.cache_size:
+			# Only keys that are not already on disk may enter the cache;
+			# otherwise the key would live in both stores and be counted
+			# and iterated twice.
+			self.cache[k] = v
+			self.counters['mem_items'] += 1
+		else:
+			if not self.storage_seek_at_end:
+				self.storage_fd.seek(0,2) # Seek to EOF
+				self.storage_seek_at_end = True
+
+			offset = self.storage_fd.tell()
+			pickle.dump(v, self.storage_fd)
+
+			# Touch the index only after the record was written, so a
+			# failed dump cannot leave an offset pointing at a broken record.
+			if k not in self.disk_index:
+				self.counters['disk_items'] += 1
+			self.disk_index[k] = offset
+
+		self.counters['set_ops'] += 1
+
+
+	def __getitem__(self, k):
+		"""Return the value stored under `k`; raise KeyError if it is absent."""
+		self.counters['get_ops'] += 1
+
+		if k in self.cache:
+			self.counters['mem_hits'] += 1
+			return self.cache[k]
+
+		if k in self.disk_index:
+			self.counters['disk_hits'] += 1
+			self.storage_fd.seek(self.disk_index[k])
+			# The position is no longer at EOF; the next write must seek.
+			self.storage_seek_at_end = False
+			return pickle.load(self.storage_fd)
+
+		self.counters['misses'] += 1
+		raise KeyError( k )
+
+
+	def __delitem__(self, k):
+		"""Remove `k`; raise KeyError if it is absent, as a `dict` does.
+
+		A disk-backed record is only dropped from the index; the bytes
+		already written to the temporary file are not reclaimed.
+		"""
+		self.counters['del_ops'] += 1
+
+		if k in self.cache:
+			del self.cache[k]
+			self.counters['mem_items'] -= 1
+		elif k in self.disk_index:
+			del self.disk_index[k]
+			self.counters['disk_items'] -= 1
+		else:
+			raise KeyError( k )
+
+
+	def __contains__(self, k):
+		"""Return True if `k` is stored, without reading any value from disk."""
+		return k in self.cache or k in self.disk_index
+
+
+	def __str__(self):
+		"""Return the `str()` of a plain dict holding every item.
+
+		Every value is fetched through `__getitem__`, so the call updates
+		the get/hit counters.
+		"""
+		return str( { el : self[el] for el in self.__iter__() } )
+
+
+	def __iter__(self):
+		"""Yield the in-memory keys first, then the disk-backed keys."""
+		for it in (self.cache, self.disk_index):
+			for el in it:
+				yield el
+
+
+	def __len__(self):
+		"""Return the total number of items held in memory and on disk."""
+		return self.counters['mem_items'] + self.counters['disk_items']
+
diff --git a/tests.py b/tests.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python -u
+# -*- encoding: utf-8 -*-
+
+import random
+import sys
+import os
+from diskdict import DiskDict
+sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+
+
+def test_basics():
+	"""Exercise set, get, delete, len and the counters with a two-item cache.
+
+	Contents and counters are compared as dicts rather than through their
+	str() form, so the checks do not depend on dict ordering.
+	"""
+	def snapshot(dd):
+		# One dd[k] lookup per key - the same access pattern as str(dd) - so
+		# the counter totals asserted at the end stay unchanged.
+		return { k : dd[k] for k in dd }
+
+	print "Starting basic test, cache_size=2:"
+	print "  ",
+	d = DiskDict(cache_size=2)
+	d[1] = 1
+	d[2] = 4
+	d[3] = 9
+	d[4] = 16
+	d[5] = 25
+	assert snapshot(d) == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}
+	print ".",
+	assert len(d) == 5
+	print ".",
+	d[10] = 100
+	d[20] = 400
+	d[30] = 900
+	d[40] = 1600
+	d[50] = 2500
+	assert snapshot(d) == {1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}
+	print ".",
+	assert len(d) == 10
+	print ".",
+	# Overwrite both cached keys (1, 2) and disk-backed keys (3, 4, 5).
+	d[1] = -1
+	d[2] = -4
+	d[3] = -9
+	d[4] = -16
+	d[5] = -25
+
+	assert snapshot(d) == {1: -1, 2: -4, 3: -9, 4: -16, 5: -25, 40: 1600, 10: 100, 50: 2500, 20: 400, 30: 900}
+	print ".",
+	assert len(d) == 10
+	print ".",
+
+	del d[1]
+	assert len(d) == 9
+	print ".",
+
+	assert d.counters == {'mem_hits': 6, 'misses': 0, 'set_ops': 15, 'get_ops': 25, 'mem_items': 1, 'disk_items': 8, 'disk_hits': 19, 'del_ops': 1}
+	print ".",
+	print
+	print "  Counters: " + str(d.counters)
+
+	# Smoke-check __str__ last, because it performs lookups and would
+	# otherwise change the counters asserted above.
+	assert str(d).startswith('{')
+
+	print "all tests passed."
+
+def speed_test_w(n,s):
+	"""Write `n` items into the module-level dict `d`, printing progress.
+
+	Item `i` is stored under the key 'a' + str(i) with the value (i*i, s).
+	"""
+	print " ",
+	# Progress interval; at least 1 so that n < 10 does not end in a
+	# modulo-by-zero.
+	p = max(1, min(100000, n // 10))
+	for i in range(n):
+		if i % p == 0:
+			print str(int(round(float(i) / n * 100))) + '%',
+		d['a'+str(i)] = ( i*i, s )
+	print 'done',
+
+def speed_test_r(n,s):
+	"""Read keys 'a0' .. 'a<n-1>' from the module-level dict `d` in order, printing progress.
+
+	`s` is unused; it is accepted so that all speed tests share one signature.
+	"""
+	print " ",
+	# Progress interval; at least 1 so that n < 10 does not end in a
+	# modulo-by-zero.
+	p = max(1, min(100000, n // 10))
+	for i in range(n):
+		if i % p == 0:
+			print str(int(round(float(i) / n * 100))) + '%',
+		d['a'+str(i)]
+	print 'done',
+
+def speed_test_rnd(n,s):
+	"""Read keys 'a0' .. 'a<n-1>' from the module-level dict `d` in random order, printing progress.
+
+	`s` is unused; it is accepted so that all speed tests share one signature.
+	"""
+	print " ",
+	# An explicit list, so shuffling does not rely on range() returning one.
+	r = list(range(n))
+	random.shuffle(r)
+	# Progress interval; at least 1 so that n < 10 does not end in a
+	# modulo-by-zero.
+	p = max(1, min(100000, n // 10))
+	for c, i in enumerate(r):
+		if c % p == 0:
+			print str(int(round(float(c) / n * 100))) + '%',
+		d['a'+str(i)]
+	print 'done',
+
+
+if __name__ == '__main__':
+
+	# Functional checks first, then the benchmarks.
+	test_basics()
+	print
+
+
+	import timeit
+
+	# The timed statements are given as source text, so the benchmark
+	# functions are imported from this script in the timeit setup.
+	timeit_setup = 'from __main__ import speed_test_w,speed_test_r,speed_test_rnd'
+
+	# (header template, payload) for each benchmark round. The payload is
+	# spliced into the timed statement as text, which is why the string
+	# payload carries its own quotes.
+	rounds = (
+		( "Starting speed test with {} 1k strings, cache_size=0:", "'" + 'a' * 1024 + "'" ),
+		( "Starting speed test with {} 100 item lists, cache_size=0:", range(100) ),
+	)
+
+	# (timed statement template, report template) for each phase of a round.
+	phases = (
+		( 'speed_test_w({},{})', " >> Write   : n={}, s={}/{}, ops/s={}" ),
+		( 'speed_test_r({},{})', " >> Read SEQ: n={}, s={}/{}, ops/s={}" ),
+		( 'speed_test_rnd({},{})', " >> Read RND: n={}, s={}/{}, ops/s={}" ),
+	)
+
+	for header, s in rounds:
+		n = 100000
+
+		print header.format( str(n) )
+
+		# The speed_test_* functions work on the module-level name `d`.
+		d = DiskDict(cache_size=0)
+
+		for statement, report in phases:
+			elapsed = timeit.timeit(statement.format(n,s), setup=timeit_setup, number=1)
+			t = int( n/elapsed )
+			print report.format( n, type(s), len(s), t )
+
+		print "  Counters: " + str(d.counters)
+		print "done."
+		print