holdout likelihood added
jyscardioid committed Jan 28, 2015
1 parent 718e3f0 commit 2a95c0a
Showing 2 changed files with 194 additions and 0 deletions.
37 changes: 37 additions & 0 deletions ptm/link_prediction.py
@@ -0,0 +1,37 @@
import cPickle

import numpy as np

from rtm_heldout import rtm

# run this from the data directory (e.g. Desktop/git/author_citation/cora)
# so the pickled inputs below resolve

doc_cnt = cPickle.load(open('doc_cnt.pkl'))
doc_ids = cPickle.load(open('doc_ids.pkl'))
doc_links = cPickle.load(open('doc_links_sym.pkl'))
bow = cPickle.load(open('bow.pkl'))

'''
test_docs = random.choice(range(len(doc_cnt)), size = 100, replace = False)
test_list = list(test_docs)
for v in test_docs:
test_list += doc_links[v]
'''

# hold out 100 random documents; their links are removed before training
test_docs = np.random.choice(range(len(doc_cnt)), size = 100, replace = False)

# drop every link incident to a held-out document from the training graph
# (doc_links is symmetric, so remove both directions)
for v in test_docs:
    for v2 in doc_links[v]:
        try:
            doc_links[v2].remove(v)
        except ValueError:
            # v2 is itself a held-out document whose list was already emptied
            continue
    doc_links[v] = []

# reload the full, unmodified link structure for evaluating the held-out links
doc_links_unremoved = cPickle.load(open('doc_links_sym.pkl'))

#model = rtm(30, len(doc_cnt), len(bow), doc_ids, doc_cnt, doc_links, rho = 3000)
#model.posterior_inference(50)

# sweep the regularization parameter rho (repeated values rerun a setting)
for rho in [0, 10, 10, 100, 100, 1000, 1000, 10000, 10000]:
#for rho in [10000, 100000, 1000000]:
    model = rtm(30, len(doc_cnt), len(bow), doc_ids, doc_cnt, doc_links, rho = rho)
    model.posterior_inference(30, test_docs, doc_links_unremoved)
    print # blank line between runs
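# A held-out link likelihood could also be reported: a sketch (an assumption,
# not something this script currently does) is the mean log link probability
# of the removed edges under the fitted model's exponential link function:
#
#   mean_ll = np.mean([model.eta.dot(model.pi[v] * model.pi[v2]) + model.nu
#                      for v in test_docs for v2 in doc_links_unremoved[v]])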

157 changes: 157 additions & 0 deletions ptm/rtm_heldout.py
@@ -0,0 +1,157 @@
import numpy as np
from scipy.special import gammaln, psi

eps = 1e-10

class rtm:
    """ Implementation of the relational topic model of Chang and Blei (2009),
    using the exponential link probability function.
    """

def __init__(self, num_topic, num_doc, num_voca, doc_ids, doc_cnt, doc_links, rho):
self.D = num_doc
self.K = num_topic
self.V = num_voca

self.alpha = 1.

        self.gamma = np.random.gamma(100., 1./100, [self.D, self.K]) # variational Dirichlet parameters over topic proportions
        self.beta = np.random.dirichlet([5]*self.V, self.K) # topic-word distributions (K x V)

        self.nu = 0
        self.eta = np.random.normal(0.,1, self.K) # exponential link function parameters

        self.phi = list() # per-document variational topic assignments
        self.pi = np.zeros([self.D, self.K])

for di in xrange(self.D):
unique_word = len(doc_ids[di])
cnt = doc_cnt[di]
self.phi.append(np.random.dirichlet([10]*self.K, unique_word).T) # list of KxW
            self.pi[di,:] = np.sum(cnt*self.phi[di],1)/np.sum(cnt*self.phi[di]) # count-weighted mean topic assignment of document di

self.doc_ids = doc_ids
self.doc_cnt = doc_cnt
self.doc_links = doc_links
self.rho = rho #regularization parameter

    def posterior_inference(self, max_iter, test_docs=None, doc_links_unremoved=None):
        for iter in xrange(max_iter):
            self.variation_update()
            self.parameter_estimation()
            #print self.compute_elbo()
            if iter == max_iter-1 and test_docs is not None:
                # evaluate held-out link prediction after the final iteration
                print self.link_prediction(test_docs, doc_links_unremoved)

def compute_elbo(self):
""" compute evidence lower bound for trained model
"""
elbo = 0

        e_log_theta = psi(self.gamma) - psi(np.sum(self.gamma, 1))[:,np.newaxis] # D x K, E_q[log theta_dk] for theta_d ~ Dirichlet(gamma_d)
log_beta = np.log(self.beta)

for di in xrange(self.D):
words = self.doc_ids[di]
cnt = self.doc_cnt[di]

elbo += np.sum(cnt * (self.phi[di] * log_beta[:,words])) # E_q[log p(w_{d,n}|\beta,z_{d,n})]
elbo += np.sum((self.alpha - 1.)*e_log_theta[di,:]) # E_q[log p(\theta_d | alpha)]
elbo += np.sum(self.phi[di].T * e_log_theta[di,:]) # E_q[log p(z_{d,n}|\theta_d)]

elbo += -gammaln(np.sum(self.gamma[di,:])) + np.sum(gammaln(self.gamma[di,:])) \
- np.sum((self.gamma[di,:] - 1.)*(e_log_theta[di,:])) # - E_q[log q(theta|gamma)]
elbo += - np.sum(cnt * self.phi[di] * np.log(self.phi[di])) # - E_q[log q(z|phi)]

            for adi in self.doc_links[di]:
                elbo += np.dot(self.eta, self.pi[di]*self.pi[adi]) + self.nu # E_q[log p(y_{d1,d2}|z_{d1},z_{d2},\eta,\nu)]

return elbo

def variation_update(self):
#update phi, gamma
e_log_theta = psi(self.gamma) - psi(np.sum(self.gamma, 1))[:,np.newaxis]

new_beta = np.zeros([self.K, self.V])

for di in xrange(self.D):
words = self.doc_ids[di]
cnt = self.doc_cnt[di]
doc_len = np.sum(cnt)

            new_phi = np.log(self.beta[:,words] + eps) + e_log_theta[di,:][:,np.newaxis] # eps guards against log(0)

gradient = np.zeros(self.K)
for adi in self.doc_links[di]:
gradient += self.eta * self.pi[adi,:] / doc_len

new_phi += gradient[:,np.newaxis]
new_phi = np.exp(new_phi)
new_phi = new_phi/np.sum(new_phi,0)

self.phi[di] = new_phi

self.pi[di,:] = np.sum(cnt * self.phi[di],1)/np.sum(cnt * self.phi[di])
self.gamma[di,:] = np.sum(cnt * self.phi[di], 1) + self.alpha
new_beta[:, words] += (cnt * self.phi[di])

self.beta = new_beta / np.sum(new_beta, 1)[:,np.newaxis]

def parameter_estimation(self):
#update eta, nu
pi_sum = np.zeros(self.K)

num_links = 0.

for di in xrange(self.D):
for adi in self.doc_links[di]:
pi_sum += self.pi[di,:]*self.pi[adi,:]
num_links += 1

num_links /= 2. # divide by 2 for bidirectional edge
pi_sum /= 2.

        # expected value of pi_d[k] * pi_d'[k] under the prior: (alpha/(alpha*K))^2 = 1/K^2
        pi_alpha = np.zeros(self.K) + self.alpha/(self.alpha*self.K)*self.alpha/(self.alpha*self.K)

self.nu = np.log(num_links-np.sum(pi_sum)) - np.log(self.rho*(self.K-1)/self.K + num_links - np.sum(pi_sum))
self.eta = np.log(pi_sum) - np.log(pi_sum + self.rho * pi_alpha) - self.nu
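        # Interpretation (an assumption, not stated in the code): rho acts as a
        # pseudo-count of non-link observations whose expected pi product is
        # pi_alpha, so larger rho shrinks eta and nu and keeps the logarithms
        # finite on sparse graphs (cf. parameter estimation in Chang and Blei, 2009).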

    def link_prediction(self, test_docs, doc_links_unremoved):
        """ mean rank of the held-out links: for each test document, rank all
        documents by predicted link probability and average the ranks of its
        true neighbors from the full graph (lower is better)
        """
        prediction = []
'''
for v in test_docs:
prediction_sub = dict()
for v2 in range(self.D):
prediction_sub[v2] = np.exp(self.eta.dot(self.pi[v]*self.pi[v2]) + self.nu)
sorted_prediction_sub = sorted(prediction_sub.items(), key = operator.itemgetter(1))[::-1]
sorted_prediction_sub_dict = {k[0] : idx for idx, k in enumerate(sorted_prediction_sub)}
linked_rankings = []
for v3 in doc_links_unremoved[v]:
linked_rankings.append(sorted_prediction_sub_dict[v3])
prediction.append(np.mean(linked_rankings))
'''

        for v in test_docs:
            # exponential link probability of v against every document at once
            predicted_likelihood = np.exp(self.eta.dot((self.pi[v]*self.pi).T) + self.nu)
            sorted_p = predicted_likelihood.argsort()[::-1] # document indices, highest score first
            linked_rankings = []
            for v2 in doc_links_unremoved[v]:
                linked_rankings.append(list(sorted_p).index(v2))
            prediction.append(np.mean(linked_rankings))

return np.mean(prediction)

def main():
rho = 1
num_topic = 2
num_voca = 6
num_doc = 2
doc_ids = [[0,1,4],[2,3,5]]
doc_cnt = [[2,2,3],[1,3,1]]
doc_links = [[1],[0]] #bidirectional link
model = rtm(num_topic, num_doc, num_voca, doc_ids, doc_cnt, doc_links, rho)
    model.posterior_inference(10) # no test set supplied, so only inference runs

if __name__ == '__main__':
main()
