-
Notifications
You must be signed in to change notification settings - Fork 0
/
lea_streamlit_example.py
355 lines (306 loc) · 12.4 KB
/
lea_streamlit_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Streamlit example
"""
import pandas as pd
import copy
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import sklearn.cluster as cluster
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Page header: title and tagline.
st.title("🪄 The Muggle Sorting Hat")
st.subheader("Helping teachers form diverse classes.")

# Collapsible help text describing the app and the expected .csv columns.
with st.expander("About this app", expanded=False):
    st.write("""Use this app to form diverse classes while making sure that friends stay together.
The app first forms groups of friends and then organises them into classes.""")
    st.write("Upload a .csv file with the following columns:")
    for column_note in (
        "- id (id for every kid)",
        "- previous_class_int (number for kids' previous class)",
        "- color_int (number according to result of psychological test)",
        "- gender_int (number for kids' gender)",
        "- reading_int (number for level of reading skill)",
        "- hobby_int (number for hobby)",
        "- language_int (number for primary language spoken at home)",
        "- introverted_int (number for introverted yes/no)",
        "- siblings (number of siblings)",
        "- friend1 (id of first friend the kid wants to be matched with)",
        "- friend2 (id of second friend the kid wants to be matched with)",
    ):
        st.write(column_note)

# Student data: either show the uploaded table or point to the sample file.
# NOTE: df_uploaded is only defined once a file has been uploaded; the
# functions below read it as a module-level name.
csv_uploaded = st.file_uploader("Upload student data")
if csv_uploaded is None:
    st.info(
        """
Upload a .csv file first. Sample to try: [new_df_friends_small.csv](https://github.com/lbeiermann/sorting_app/blob/main/new_df_friends_small.csv)
"""
    )
else:
    # The first csv column is used as the DataFrame index.
    df_uploaded = pd.read_csv(csv_uploaded, index_col=[0])
    st.write(df_uploaded)

# Number of classes to form (slider from 2 to 10).
choice = st.slider("Number of classes", 2, 10)
# get groups of students
def extract_comm(res_number):
    """Detect groups of friends in the uploaded student table.

    An undirected graph is built from the module-level ``df_uploaded``: every
    row contributes an edge from its ``id`` to its ``friend1`` and one from
    its ``id`` to its ``friend2``. The graph is then partitioned with
    networkx's ``greedy_modularity_communities``.

    Args:
        res_number: value forwarded as the ``resolution`` argument of
            ``greedy_modularity_communities``.

    Returns:
        list: the communities found, one collection of student ids per
        community.
    """
    friendship_graph = nx.Graph()
    for _, student in df_uploaded.iterrows():
        student_id = student["id"]
        # Same insertion order as before: friend1 edge first, then friend2.
        friendship_graph.add_edges_from(
            [(student_id, student["friend1"]), (student_id, student["friend2"])]
        )
    communities = greedy_modularity_communities(
        friendship_graph, resolution=res_number
    )
    return list(communities)
# turn groups of students into one row in dataframe
def merge_students(res):
    """Collapse every friend community into a single summary row.

    Communities come from ``extract_comm(res)``; student attributes are read
    from the module-level ``df_uploaded``.

    Args:
        res: resolution value handed to ``extract_comm``.

    Returns:
        pandas.DataFrame: one row per community with the columns ``id`` (the
        community's collection of student ids), ``number_students`` (its
        size) and, for each attribute column, the mean over the community's
        students rounded to one decimal.
    """
    attribute_columns = (
        "gender_int",
        "color_int",
        "reading_int",
        "hobby_int",
        "language_int",
        "introverted_int",
        "siblings",
        "previous_class_int",
    )
    students = df_uploaded
    community_rows = []
    for members in extract_comm(res):  # resolution set here
        community = students.loc[students["id"].isin(members)]
        # Single-row frame; the member collection is wrapped in a list so it
        # is stored as one cell instead of being expanded into rows.
        row = {"id": [members], "number_students": [len(members)]}
        for column in attribute_columns:
            row[column] = [round(community[column].mean(), 1)]
        community_rows.append(pd.DataFrame(row))
    return pd.concat(community_rows, ignore_index=True)
# make sklearn pipeline and cluster students
def cluster(df, n_clusters=4, random_state=0):
    """Standardise the community attributes and assign k-means cluster labels.

    The labels are written to a new ``clusters`` column of ``df`` (the frame
    is modified in place) and the same frame is returned.

    Args:
        df: DataFrame with the columns ``gender_int``, ``color_int``,
            ``reading_int``, ``hobby_int``, ``language_int``,
            ``introverted_int``, ``siblings`` and ``previous_class_int``.
        n_clusters: requested number of clusters (default 4, the value that
            was previously hard-coded).
        random_state: seed passed to ``KMeans``. A fixed seed is required
            because ``final_df_merged`` clusters the same data a second time
            and has to obtain the same labels that were used when the classes
            were formed.

    Returns:
        pandas.DataFrame: ``df`` with the added ``clusters`` column.
    """
    feature_columns = [
        "gender_int",
        "color_int",
        "reading_int",
        "hobby_int",
        "language_int",
        "introverted_int",
        "siblings",
        "previous_class_int",
    ]
    features = df[feature_columns]
    # KMeans refuses to build more clusters than there are rows, which
    # happens when the community detection returns only a few large groups.
    effective_clusters = min(n_clusters, len(df))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=random_state)
    pipeline = make_pipeline(StandardScaler(), kmeans)
    pipeline.fit(features)
    # labels_ already holds the assignment of the fitted rows, so no extra
    # predict() pass over the same data is needed.
    df["clusters"] = kmeans.labels_
    return df
# make one dataframe for each cluster and add to list
def list_clustered_dataframes(df):
    """Split ``df`` into one DataFrame per value of its ``clusters`` column.

    Args:
        df: DataFrame containing a ``clusters`` column.

    Returns:
        list: the per-cluster frames, each with a fresh 0..n-1 index, ordered
        from the largest to the smallest; frames of equal size keep the order
        in which their cluster label first appears in ``df``.
    """
    per_cluster = [
        df[df["clusters"] == label].reset_index(drop=True)
        for label in df.clusters.unique()
    ]
    # list.sort is stable, also with reverse=True, exactly like sorted().
    per_cluster.sort(key=len, reverse=True)
    return per_cluster
# turn clustered ids into two-dimensional list
def convert_ids(lst):
    """Extract the ``id`` column of every DataFrame as a plain list.

    Args:
        lst: iterable of DataFrames that each have an ``id`` column.

    Returns:
        list: one list of ids per input frame, in input order.
    """
    return [frame["id"].tolist() for frame in lst]
# form classes
def form_classes(lst_ids_clustered, number_classes, total_students=None):
    """Fill the classes with friend groups, taking turns between clusters.

    Each class is filled by repeatedly walking over the clusters and taking
    the first remaining friend group of every cluster while it still fits
    into the class. As soon as a group does not fit, the class is treated as
    full. Groups that are never placed are returned separately.

    Args:
        lst_ids_clustered: list of clusters, each cluster being a list of
            friend groups (sized collections of student ids). The argument is
            deep-copied and never modified.
        number_classes: number of classes to create.
        total_students: total number of students used to derive the class
            capacity (``total_students // number_classes``). Defaults to
            ``len(df_uploaded)``, the value that was used before this
            parameter existed.

    Returns:
        tuple: ``(ids_classes, remaining)`` where ``ids_classes`` is a list
        with one list of friend groups per class and ``remaining`` is the
        list of clusters that still contain unplaced groups.
    """
    if total_students is None:
        total_students = len(df_uploaded)
    max_size = total_students // number_classes

    # Work on a copy and drop clusters that are empty from the start, so that
    # looking at the first group of a cluster can never fail.
    remaining = [
        groups for groups in copy.deepcopy(lst_ids_clustered) if groups
    ]
    ids_classes = [[] for _ in range(number_classes)]

    for every_class in ids_classes:
        size = 0
        # Stop as well when nothing is left: an empty pass could never change
        # ``size`` and the loop would not terminate.
        while size < max_size and remaining:
            # Iterate over a snapshot: exhausted clusters are removed from
            # ``remaining`` during the pass, and removing from the list being
            # iterated would silently skip the cluster that follows.
            for cluster_groups in list(remaining):
                group = cluster_groups[0]
                if len(group) <= max_size - size:  # group still fits
                    every_class.append(group)
                    size += len(group)
                    del cluster_groups[0]
                    if not cluster_groups:
                        remaining.remove(cluster_groups)  # cluster used up
                else:
                    # First group that does not fit closes the class.
                    size = max_size
    return ids_classes, remaining
# convert frozenset1 to 2d list
def convert_set_to_list1(st):
    """Flatten every class from a list of friend groups into a list of ids.

    Args:
        st: iterable of classes, each class being an iterable of friend
            groups (iterables of ids).

    Returns:
        list: one flat list of ids per class, in input order.
    """
    return [
        [member for group in one_class for member in group]
        for one_class in st
    ]
# convert frozenset2 to 2d list
def convert_set_to_list2(st):
    """Turn every friend group of every cluster into its own list of ids.

    Unlike ``convert_set_to_list1`` the cluster level is dropped: the result
    has one entry per friend group.

    Args:
        st: iterable of clusters, each cluster being an iterable of friend
            groups (iterables of ids).

    Returns:
        list: one list of ids per friend group, in input order.
    """
    return [
        [member for member in group]
        for one_cluster in st
        for group in one_cluster
    ]
# convert frozensets to lists
def convert_frozensets(a, b):
    """Convert the two results of ``form_classes`` into plain lists of ids.

    Args:
        a: the formed classes; converted with ``convert_set_to_list1``.
        b: the clusters with unplaced groups; converted with
            ``convert_set_to_list2``.

    Returns:
        tuple: ``(classes_as_lists, leftover_groups_as_lists)``.
    """
    return convert_set_to_list1(a), convert_set_to_list2(b)
# adding remaining students #1
def add_rest1(lst_classes, lst_rest):
    """Pair the smallest classes with the largest leftover groups.

    Classes are ordered from smallest to largest and leftover groups from
    largest to smallest; the two sequences are then combined position by
    position. The shorter sequence is padded with empty lists, so the result
    has ``max(len(lst_classes), len(lst_rest))`` entries.

    Args:
        lst_classes: list of classes (lists of ids).
        lst_rest: list of leftover friend groups (lists of ids).

    Returns:
        list: for every position, the ids of the class followed by the ids
        of the leftover group paired with it.
    """
    classes_small_first = sorted(lst_classes, key=len)
    rest_large_first = sorted(lst_rest, key=len, reverse=True)

    # Pad whichever side is shorter; range() of a negative number is empty.
    missing = len(classes_small_first) - len(rest_large_first)
    rest_large_first.extend([] for _ in range(missing))
    classes_small_first.extend([] for _ in range(-missing))

    combined = []
    for class_ids, extra_ids in zip(classes_small_first, rest_large_first):
        flat = []
        flat.extend(class_ids)
        flat.extend(extra_ids)
        combined.append(flat)
    return combined
# adding remaining students #2
def add_rest2(cleaned_lst_classes1, number_classes):
    """Fold surplus groups into the first ``number_classes`` classes.

    The first ``number_classes`` entries are treated as the classes and all
    further entries as surplus groups. The smallest classes are paired with
    the largest surplus groups. If one round still leaves more than
    ``number_classes`` entries (more surplus groups than classes), the
    folding is repeated, so the result never contains more classes than
    requested. A single round produces the same result as before whenever
    the number of surplus groups does not exceed ``number_classes``.

    Args:
        cleaned_lst_classes1: list of lists of ids, as returned by
            ``add_rest1``.
        number_classes: number of classes that should remain.

    Returns:
        list: the final classes as lists of ids.
    """
    merged = _fold_surplus(list(cleaned_lst_classes1), number_classes)
    # Each round shrinks the list while it is longer than number_classes,
    # so this terminates; the guard avoids looping for number_classes < 1.
    while number_classes >= 1 and len(merged) > number_classes:
        merged = _fold_surplus(merged, number_classes)
    return merged


def _fold_surplus(groups, number_classes):
    """Run one pairing round: smallest classes get the largest surplus groups."""
    classes = sorted(groups[:number_classes], key=len)
    surplus = sorted(groups[number_classes:], key=len, reverse=True)
    # Pad the shorter side with empty lists so zip() covers every entry.
    missing = len(classes) - len(surplus)
    surplus.extend([] for _ in range(missing))
    classes.extend([] for _ in range(-missing))
    return [list(ids) + list(extra) for ids, extra in zip(classes, surplus)]
# get list of final dfs
def make_final_dfs(first_df, list_final_classes):
    """Select the rows of ``first_df`` that belong to each class.

    Args:
        first_df: DataFrame with an ``id`` column.
        list_final_classes: iterable of classes, each a collection of ids.

    Returns:
        list: one DataFrame per class holding the rows of ``first_df`` whose
        ``id`` is in that class (original index kept).
    """
    return [
        first_df[first_df["id"].isin(class_ids)]
        for class_ids in list_final_classes
    ]
# get merged df
def final_df_merged(list_final_dfs, i):
    """Combine the class frames and attach every student's cluster label.

    Args:
        list_final_dfs: list of DataFrames, one per class, each with an
            ``id`` column.
        i: resolution value; ``merge_students(i)`` and ``cluster`` are run
            again to obtain the cluster label of every community.

    Returns:
        pandas.DataFrame: all students with a 1-based ``class`` column and a
        ``clusters`` column (left join on ``id``).
    """
    # Number the classes starting at 1; assign() works on a copy, so the
    # input frames stay untouched.
    labelled = [
        class_frame.assign(**{"class": class_number})
        for class_number, class_frame in enumerate(list_final_dfs, start=1)
    ]
    all_classes = pd.concat(labelled).reset_index(drop=True)

    # One row per (student id, cluster): explode the per-community id
    # collections into individual rows.
    per_student = cluster(merge_students(i)).explode("id")
    cluster_lookup = per_student[["id", "clusters"]]
    return all_classes.merge(cluster_lookup, how="left", on="id")
# get wish score
def measure_wishs_granted(df):
    """Return the share of friend wishes that are fulfilled within a class.

    A wish counts as granted when the id in ``friend1`` (respectively
    ``friend2``) also appears in the ``id`` column of the same frame.

    Args:
        df: DataFrame of one class with the columns ``id``, ``friend1`` and
            ``friend2``.

    Returns:
        float: mean of the two granted shares (each rounded to two
        decimals), itself rounded to two decimals; ``0.0`` for an empty
        class.
    """
    if len(df) == 0:
        return 0.0
    class_ids = df["id"]
    # The mean of a boolean column is the share of True values. Unlike
    # picking the True row out of value_counts(), this also works when no
    # wish at all is granted (there is no True row in that case).
    wish_score1 = round(df["friend1"].isin(class_ids).mean(), 2)
    wish_score2 = round(df["friend2"].isin(class_ids).mean(), 2)
    return round((wish_score1 + wish_score2) / 2, 2)
# get diversity score
def measure_diversity(df, nb_clusters):
    """Score how evenly the clusters are represented within one class.

    The score is the median share of the clusters that occur in ``df``
    (rounded to two decimals) multiplied by ``nb_clusters``; a class in which
    all ``nb_clusters`` clusters are equally frequent scores 1.0. Clusters
    that do not occur in ``df`` are not part of the median.

    Args:
        df: DataFrame of one class with a ``clusters`` column.
        nb_clusters: total number of clusters used for the clustering.

    Returns:
        float: the diversity score.
    """
    cluster_shares = df["clusters"].value_counts(normalize=True)
    # perfect split will turn to 1.0
    return round(cluster_shares.median(), 2) * nb_clusters
# overall quality check (wish score, diversity, class size)
def quality_check(final_merged_df, nb_clusters=4):
    """Rate a class assignment; a LOWER score means a better assignment.

    ``main`` keeps the candidate with the smallest score, so every component
    is expressed as a penalty:

    * class size: difference between the largest and the smallest class,
    * diversity: distance of each class's ``measure_diversity`` score from
      1.0 (the value of a perfectly even split), averaged over the classes,
    * wishes: ``1 - share of granted wishes``, averaged over the classes.

    Previously the raw wish and diversity scores were added to the size
    difference, which made assignments that grant fewer wishes look better.

    Args:
        final_merged_df: DataFrame with the columns ``class``, ``clusters``,
            ``id``, ``friend1`` and ``friend2``.
        nb_clusters: number of clusters used for the clustering, forwarded
            to ``measure_diversity`` (default 4, the previously hard-coded
            value).

    Returns:
        float: size difference plus the mean of the diversity and wish
        penalties, rounded to two decimals.
    """
    size_lst = []
    div_penalty_lst = []
    wish_penalty_lst = []
    for current_class in pd.unique(final_merged_df["class"]):
        temp_df = final_merged_df[final_merged_df["class"] == current_class]
        size_lst.append(len(temp_df))
        div_penalty_lst.append(abs(measure_diversity(temp_df, nb_clusters) - 1))
        wish_penalty_lst.append(1 - measure_wishs_granted(temp_df))
    size_difference = max(size_lst) - min(size_lst)
    overall_div_penalty = round(sum(div_penalty_lst) / len(div_penalty_lst), 2)
    overall_wish_penalty = round(sum(wish_penalty_lst) / len(wish_penalty_lst), 2)
    return round(
        (overall_div_penalty + overall_wish_penalty) / 2 + size_difference, 2
    )
def main(number_classes, max_resolution=14):
    """Build class assignments for several resolutions and keep the best one.

    For every resolution ``0 .. max_resolution - 1`` the full pipeline is run
    (friend communities, clustering, class formation, distribution of the
    leftover groups) and the result is rated with ``quality_check``. The
    candidate with the smallest score is returned; on ties the lowest
    resolution wins.

    Args:
        number_classes: number of classes to form.
        max_resolution: number of resolution values to try (default 14, the
            previously hard-coded value).

    Returns:
        pandas.DataFrame: the best-rated assignment as produced by
        ``final_df_merged``.

    Raises:
        ValueError: if ``max_resolution`` is smaller than 1.
    """
    if max_resolution < 1:
        raise ValueError("max_resolution must be at least 1")

    candidates = []
    scores = []
    for resolution in range(max_resolution):
        # network analysis and merging students
        df_merged_students = merge_students(resolution)
        # clustering
        clustered_df = cluster(df_merged_students)
        ids_clustered = convert_ids(list_clustered_dataframes(clustered_df))
        # class formation
        classes, leftovers = form_classes(ids_clustered, number_classes)
        class_lists, leftover_lists = convert_frozensets(classes, leftovers)
        final_classes1 = add_rest1(class_lists, leftover_lists)
        final_classes2 = add_rest2(final_classes1, number_classes)
        list_final_dfs = make_final_dfs(df_uploaded, final_classes2)
        # test quality
        candidate = final_df_merged(list_final_dfs, resolution)
        candidates.append(candidate)
        scores.append(quality_check(candidate))
    # choose best result (smallest score, first occurrence on ties)
    best_position = scores.index(min(scores))
    return candidates[best_position]
# button
if st.button("Form classes"):
    if csv_uploaded is None:
        # main() reads the module-level df_uploaded, which only exists after
        # a file has been uploaded; without this check the click ends in a
        # NameError.
        st.warning("Please upload a .csv file before forming classes.")
    else:
        with st.spinner("Accio classes!"):
            result = main(choice)
        st.dataframe(result)

        # download
        def convert_df(df):
            """Serialise ``df`` to UTF-8 encoded CSV bytes."""
            return df.to_csv().encode('utf-8')

        csv = convert_df(result)
        st.download_button(
            "Download classes",
            csv,
            "classes_sorted.csv",
            "text/csv",
            key='download-csv'
        )