-
Notifications
You must be signed in to change notification settings - Fork 172
/
at_model.py
115 lines (95 loc) · 4.05 KB
/
at_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import numpy as n
import os
class at_model:
    """Author-topic model whose assignments are resampled token by token.

    Every token of every document carries one (author, topic) assignment.
    The model keeps count matrices of those assignments and repeatedly
    resamples each token's pair from a distribution built from the counts
    of all other tokens.

    Public state:
        c_wt            -- (W, K) array, count of word w assigned to topic k
        c_at            -- (A, K) array, count of author a assigned to topic k
        topic_sum       -- (K,) array, number of tokens assigned to each topic
        author_sum      -- (A,) array, number of tokens assigned to each author
        topic_assigned  -- list (per document) of per-token topic ids
        author_assigned -- list (per document) of per-token author ids
    """

    def __init__(self, vocab, K, A, docList, authorList, alpha=0.1, eta=0.01):
        """
        Initialize at_model with a random (author, topic) pair for every token.

        vocab = vocabulary list
        K = number of topics
        A = number of authors
        alpha = author-topic distribution dirichlet parameter
        eta = word-topic distribution dirichlet parameter
        docList
            list of documents, constructed based on the vocab
            format = list(list(words))
            ex) [[0,2,2,3],[1,3,3,4]]
            tokens of 1st document = 0,2,2,3 (note that 2 appears twice
            because word 2 is used twice in the first document)
        authorList
            format = list(list(authors))
            at least one author must exist for each non-empty document
            ex) [[0,1],[1,2]]
            authors of 1st doc = 0, 1

        Raises ValueError when a document has tokens but no authors.
        """
        self._vocab = vocab
        self._W = len(vocab)
        self._K = K
        self._A = A
        self._D = len(docList)
        self._docList = docList
        self._authorList = authorList
        self._alpha = alpha
        self._eta = eta

        # Sufficient statistics of the current assignment.
        self.c_wt = n.zeros([self._W, self._K])
        self.c_at = n.zeros([self._A, self._K])
        self.topic_assigned = list()
        self.author_assigned = list()
        self.topic_sum = n.zeros(self._K)
        self.author_sum = n.zeros(self._A)

        # Random initialization: each token gets a uniformly drawn topic and
        # a uniformly drawn author among the authors of its own document.
        for di, doc in enumerate(self._docList):
            authors = self._authorList[di]
            if len(doc) > 0 and len(authors) == 0:
                raise ValueError(
                    'document %d has tokens but no authors' % di)
            doc_topics = list()
            doc_authors = list()
            for w in doc:
                z = n.random.choice(self._K, 1)[0]
                a = authors[n.random.choice(len(authors), 1)[0]]
                self._update_counts(w, a, z, 1)
                # keep the sampled values for future resampling
                doc_topics.append(z)
                doc_authors.append(a)
            self.topic_assigned.append(doc_topics)
            self.author_assigned.append(doc_authors)

    def _update_counts(self, w, a, z, delta):
        """Add delta to every count touched by word w, author a, topic z."""
        self.c_wt[w, z] += delta
        self.c_at[a, z] += delta
        self.topic_sum[z] += delta
        self.author_sum[a] += delta

    def sampling_topics(self, max_iter):
        """
        Run max_iter sweeps over all tokens, resampling (author, topic) pairs.

        For each token the current assignment is removed from the counts, a
        joint distribution over (author of this document, topic) is computed
        from the remaining counts, one pair is drawn from it, and the counts
        and stored assignments are updated with the new pair.

        max_iter = number of full passes over the corpus
        """
        # Smoothing terms of the two denominators do not change while sampling.
        w_eta = self._W * self._eta
        k_alpha = self._K * self._alpha

        for _ in range(max_iter):
            for di, doc in enumerate(self._docList):
                authors = self._authorList[di]
                doc_topics = self.topic_assigned[di]
                doc_authors = self.author_assigned[di]
                for wi, w in enumerate(doc):
                    # Take the token's current assignment out of the counts.
                    self._update_counts(w, doc_authors[wi], doc_topics[wi], -1)

                    # Shape (K,): smoothed weight of word w under each topic.
                    wt = (self.c_wt[w, :] + self._eta) / (self.topic_sum + w_eta)
                    # Shape (len(authors), K): smoothed weight of each topic
                    # under each author of this document; the per-author
                    # totals are broadcast along the topic axis.
                    at = ((self.c_at[authors, :] + self._alpha)
                          / (self.author_sum[authors][:, None] + k_alpha))

                    # Flatten row-major, so flat index = author_pos * K + topic.
                    pdf = (at * wt).ravel()
                    pdf = pdf / pdf.sum()

                    # Sample author and topic jointly. Floor division (via
                    # divmod) keeps the author position an integer on both
                    # Python 2 and Python 3.
                    idx = n.random.multinomial(1, pdf).argmax()
                    new_ai, new_z = divmod(idx, self._K)
                    new_a = authors[new_ai]

                    self._update_counts(w, new_a, new_z, 1)
                    doc_topics[wi] = new_z
                    doc_authors[wi] = new_a
if __name__ == '__main__':
    # Smoke test: fit a tiny two-document corpus with 5 words, 2 topics and
    # 3 authors, then dump both count matrices as text files.
    model = at_model(
        [0, 1, 2, 3, 4],
        2,
        3,
        [[0, 0, 2, 2, 3], [1, 3, 3, 4, 4]],
        [[0, 1], [1, 2]],
    )
    model.sampling_topics(10)

    out_dir = 'at-result'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Each entry pairs an output file (relative to out_dir) with its matrix.
    for suffix, counts in (('/word-topic.dat', model.c_wt),
                           ('/author-topic.dat', model.c_at)):
        n.savetxt(out_dir + suffix, counts)