-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
202 lines (171 loc) · 7.16 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import requests, zipfile, io
import numpy as np
import torch
import networkx as nx
from sklearn.metrics import confusion_matrix
from sklearn.utils.linear_assignment_ import linear_assignment
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph
from scipy import sparse
from scipy import linalg
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def download_datasets():
dataset_links = ['http://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip', 'https://www.chrsmrrs.com/graphkerneldatasets/deezer_ego_nets.zip', 'https://www.chrsmrrs.com/graphkerneldatasets/facebook_ct1.zip',
'https://www.chrsmrrs.com/graphkerneldatasets/github_stargazers.zip', 'https://www.chrsmrrs.com/graphkerneldatasets/REDDIT-BINARY.zip','https://www.chrsmrrs.com/graphkerneldatasets/OHSU.zip',
'https://www.chrsmrrs.com/graphkerneldatasets/Peking_1.zip','https://www.chrsmrrs.com/graphkerneldatasets/KKI.zip', 'https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip']
for l in dataset_links:
r = requests.get(l)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
# taken from https://github.com/JiaxuanYou/graph-generation/blob/3444b8ad2fd7ecb6ade45086b4c75f8e2e9f29d1/data.py#L24
def load_graph(min_num_nodes=10, name='ENZYMES'):
print('Loading graph dataset: ' + str(name))
G = nx.Graph()
# load data
path = name + '/'
data_adj = np.loadtxt(path + name + '_A.txt', delimiter=',').astype(int)
data_graph_indicator = np.loadtxt(path + name + '_graph_indicator.txt', delimiter=',').astype(int)
data_tuple = list(map(tuple, data_adj))
# add edges
G.add_edges_from(data_tuple)
G.remove_nodes_from(list(nx.isolates(G)))
# split into graphs
graph_num = data_graph_indicator.max()
node_list = np.arange(data_graph_indicator.shape[0]) + 1
graphs = []
# nx_graphs = []
max_nodes = 0
all_nodes = []
for i in range(graph_num):
# find the nodes for each graph
nodes = node_list[data_graph_indicator == i + 1]
G_sub = G.subgraph(nodes)
if G_sub.number_of_nodes() >= min_num_nodes:
adj = nx.adjacency_matrix(G_sub)
adj = adj.todense()
adj = torch.Tensor(np.asarray(adj)).to(device=device)
graphs.append(adj)
if G_sub.number_of_nodes() > max_nodes:
max_nodes = G_sub.number_of_nodes()
all_nodes.append(G_sub.number_of_nodes())
print('Loaded and the total number of graphs are ', len(graphs))
print('max num of nodes is ', max_nodes)
print('total graphs ', len(graphs))
print('histogram of number of nodes in ', name)
#print(all_nodes)
plt.hist(all_nodes)
plt.show()
return graphs
def kmeans_dist(dist, num_clusters=2):
w,v = torch.eig(dist,eigenvectors=True)
w_real = w[:,0] #symmetric matrix so no need to bother about the complex part
sorted_w = torch.argsort(-torch.abs(w_real))
to_pick_idx = sorted_w[:num_clusters]
eig_vec = v[:,to_pick_idx]
eig_vec = eig_vec.cpu().detach().numpy()
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(eig_vec)
return kmeans.labels_
#hungarian algorithm
def _make_cost_m(cm):
s = np.max(cm)
return (- cm + s)
def error(gt_real, labels):
cm = confusion_matrix(gt_real, labels)
indexes = linear_assignment(_make_cost_m(cm)) #Hungarian algorithm
js = [e[1] for e in sorted(indexes, key=lambda x: x[0])]
cm2 = cm[:, js]
err = 1 - np.trace(cm2) / np.sum(cm2)
return err
#spectral clustering
def generate_graph_laplacian(df, nn):
"""Generate graph Laplacian from data."""
# Adjacency Matrix.
connectivity = kneighbors_graph(X=df, n_neighbors=nn, mode='connectivity')
adjacency_matrix_s = (1 / 2) * (connectivity + connectivity.T)
# Graph Laplacian.
graph_laplacian_s = sparse.csgraph.laplacian(csgraph=adjacency_matrix_s, normed=False)
graph_laplacian = graph_laplacian_s.toarray()
return graph_laplacian
# We project onto the real numbers.
def compute_spectrum_graph_laplacian(graph_laplacian):
"""Compute eigenvalues and eigenvectors and project
them onto the real numbers.
"""
eigenvals, eigenvcts = linalg.eig(graph_laplacian)
eigenvals = np.real(eigenvals)
eigenvcts = np.real(eigenvcts)
return eigenvals, eigenvcts
def spectral_clustering(affinity_mat, num_clusters=3):
graph_laplacian = generate_graph_laplacian(df=affinity_mat, nn=8)
eigenvals, eigenvcts = compute_spectrum_graph_laplacian(graph_laplacian)
eigenvals_sorted_indices = np.argsort(eigenvals)
eigenvals_sorted = eigenvals[eigenvals_sorted_indices]
zero_eigenvals_index = np.argwhere(abs(eigenvals) < 1e+0)
# eigenvals[zero_eigenvals_index]
proj_df = pd.DataFrame(eigenvcts[:, eigenvals_sorted_indices[:num_clusters]]) # zero_eigenvals_index.squeeze()])
k_means = KMeans(random_state=25, n_clusters=num_clusters)
k_means.fit(proj_df)
labels = k_means.predict(proj_df)
return labels
#graphons for simulated data
def graphon_1(x):
p = torch.zeros((x.shape[0],x.shape[0]), dtype=torch.float64).to(device=device)
u = p + x.reshape(1, -1)
v = p + x.reshape(-1, 1)
graphon = u * v
return graphon
def graphon_2(x):
p = torch.zeros((x.shape[0],x.shape[0]), dtype=torch.float64).to(device=device)
u = p + x.reshape(1, -1)
v = p + x.reshape(-1, 1)
graphon = torch.exp(-torch.pow(torch.max(u,v),0.75))
return graphon
def graphon_3(x):
p = torch.zeros((x.shape[0],x.shape[0]), dtype=torch.float64).to(device=device)
u = p + x.reshape(1, -1)
v = p + x.reshape(-1, 1)
graphon = torch.exp(-0.5* (torch.min(u,v) + torch.pow(u,0.5) + torch.pow(v,0.5)))
return graphon
def graphon_4(x):
p = torch.zeros((x.shape[0],x.shape[0]), dtype=torch.float64).to(device=device)
u = p + x.reshape(1, -1)
v = p + x.reshape(-1, 1)
graphon = torch.abs(u-v)
return graphon
def generate_graphs(graphon_key, n):
graph_gen = []
for nn in n:
x = torch.distributions.uniform.Uniform(0, 1).sample([nn]).to(device=device)
if graphon_key == 1:
graph_prob = graphon_1(x)
elif graphon_key == 2:
graph_prob = graphon_2(x)
elif graphon_key == 3:
graph_prob = graphon_3(x)
elif graphon_key == 4:
graph_prob = graphon_4(x)
else:
print('Wrong key')
exit()
graph = torch.distributions.binomial.Binomial(1, graph_prob).sample()
graph = torch.triu(graph, diagonal=1)
graph = graph + graph.t()
graph_gen.append(graph)
return graph_gen
def data_simulation(graphons, number_of_graphs=10, start=100, stop=1000):
graphs = []
labels = []
for graphon in graphons:
p = torch.randperm(stop)
n = p[p > start][:number_of_graphs]
print('nodes ', n)
g = generate_graphs(graphon, n)
graphs = graphs + g
for i in range(len(graphons)):
l = i * np.ones(number_of_graphs)
labels = labels + l.tolist()
print('graphs generated', len(graphs))
print('true labels ', labels)
return graphs, labels