-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrixes.py
165 lines (148 loc) · 7.35 KB
/
matrixes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import math
import pandas as pd
from relate import metrics
'''
Functions that build symmetric similarity and distance matrices from string similarity
metrics, either with the raw metric values or with standardized values.
'''
# Rebind the name `metrics` from the imported module to that module's `select` attribute.
# NOTE(review): presumably a lookup table of the string similarity metric functions -
# confirm in relate.metrics.
metrics = metrics.select
# Default names of the analyzed strings
names = ('text1', 'text2', 'text3', 'text4')
def similarity_matrix(metrics, texts1, texts2, labels=None):
    '''
    Function returns a similarity matrix holding one similarity value for every pair
    of strings taken from texts1 and texts2. When both tuples hold the same n strings
    and the metric is symmetric, the result is a symmetric n x n matrix.

    Parameters:
        metrics: callable that takes two strings and returns their similarity;
            it is called as metrics(a, b).
            NOTE(review): presumably one of the metric functions of relate.metrics
            ('jaccard', 'sorensen_dice', 'overlap', 'levenshtein', 'hamming') - confirm.
        texts1: first tuple (strings stored in the items of the tuple), one column each
        texts2: second tuple (strings stored in the items of the tuple), one row each
        labels: optional sequence of names used for both the rows and the columns.
            By default the names 'text1', 'text2', ... are generated to match the
            number of strings, so any number of strings can be compared.
    Returns:
        matrix: pandas DataFrame in which the row of texts2[j] and the column of
            texts1[i] holds metrics(texts1[i], texts2[j])
    Side effects:
        Sets the pandas option 'display.width' to None.
    '''
    # One sublist (matrix row) per string in texts2, one value per string in texts1
    rows = [[metrics(a, b) for a in texts1] for b in texts2]
    if labels is None:
        # Generate as many default names as there are strings on each axis
        column_labels = ['text{}'.format(i) for i in range(1, len(texts1) + 1)]
        row_labels = ['text{}'.format(i) for i in range(1, len(texts2) + 1)]
    else:
        column_labels = row_labels = labels
    # Convert sublists into a matrix using pandas Dataframe
    matrix = pd.DataFrame(rows, columns=column_labels, index=row_labels)
    # show the whole matrix
    pd.set_option('display.width', None)
    return matrix
def distance_matrix(metrics, texts1, texts2, labels=None):
    '''
    Function returns a distance matrix holding one distance value for every pair of
    strings taken from texts1 and texts2. When both tuples hold the same n strings
    and the metric is symmetric, the result is a symmetric n x n matrix.

    Parameters:
        metrics: callable that takes two strings and returns their similarity;
            it is called as metrics(a, b) and the result is converted with float().
            The distance is computed as 100 - similarity.
            NOTE(review): presumably one of the metric functions of relate.metrics
            ('jaccard', 'sorensen_dice', 'overlap', 'levenshtein', 'hamming'),
            returning a percentage - confirm.
        texts1: first tuple (strings stored in the items of the tuple), one column each
        texts2: second tuple (strings stored in the items of the tuple), one row each
        labels: optional sequence of names used for both the rows and the columns.
            By default the names 'text1', 'text2', ... are generated to match the
            number of strings, so any number of strings can be compared.
    Returns:
        matrix: pandas DataFrame in which the row of texts2[j] and the column of
            texts1[i] holds 100 - metrics(texts1[i], texts2[j])
    Side effects:
        Sets the pandas option 'display.width' to None.
    '''
    # One sublist (matrix row) per string in texts2, one distance per string in texts1
    rows = [[100 - float(metrics(a, b)) for a in texts1] for b in texts2]
    if labels is None:
        # Generate as many default names as there are strings on each axis
        column_labels = ['text{}'.format(i) for i in range(1, len(texts1) + 1)]
        row_labels = ['text{}'.format(i) for i in range(1, len(texts2) + 1)]
    else:
        column_labels = row_labels = labels
    # Convert sublists into a matrix using pandas Dataframe
    matrix = pd.DataFrame(rows, columns=column_labels, index=row_labels)
    # show the whole matrix
    pd.set_option('display.width', None)
    return matrix
def _default_labels(count):
    '''Return the default names 'text1', 'text2', ... for `count` strings.'''
    return ['text{}'.format(i) for i in range(1, count + 1)]


def _standardized_matrix(rows, column_count, self_value, labels):
    '''
    Standardize the values in `rows` and return them as a pandas DataFrame.

    Every value equal to `self_value` is treated as a string compared to itself:
    it is left out of the mean and the standard deviation and is reported as 0.0.
    Note that this is decided by value, so two different but identical strings
    are treated the same way.

    The standard deviation is the sample standard deviation (divisor n - 1). When
    it cannot be computed (fewer than two remaining values) or is zero (all
    remaining values are equal), every standardized value is 0.0 instead of
    raising ZeroDivisionError.
    '''
    # Values taking part in the statistics (self-comparisons skipped)
    sample = [x for row in rows for x in row if x != self_value]
    count = len(sample)
    mean = sum(sample) / count if count else 0.0
    standard_deviation = 0.0
    if count > 1:
        # Sample variance: squared deviations from the mean divided by n - 1
        variance = sum((x - mean) ** 2 for x in sample) / (count - 1)
        standard_deviation = math.sqrt(variance)

    if standard_deviation:
        standardized = [
            [0.0 if x == self_value else (x - mean) / standard_deviation for x in row]
            for row in rows
        ]
    else:
        # No spread in the data: every value sits exactly on the mean
        standardized = [[0.0 for _ in row] for row in rows]

    if labels is None:
        row_labels = _default_labels(len(rows))
        column_labels = _default_labels(column_count)
    else:
        row_labels = column_labels = labels
    # Convert sublists into a matrix using pandas Dataframe
    matrix = pd.DataFrame(standardized, columns=column_labels, index=row_labels)
    # show the whole matrix
    pd.set_option('display.width', None)
    return matrix


def standardized_similarity_matrix(metrics, texts1, texts2, labels=None):
    '''
    Function returns a similarity matrix with standardized values, one value for
    every pair of strings taken from texts1 and texts2. When both tuples hold the
    same n strings and the metric is symmetric, the result is a symmetric n x n matrix.
    The used standardizing function:
    (original value - mean of the values) / standard deviation of the values.

    Similarity values equal to 100 are treated as a string compared to itself:
    they are excluded from the mean and the standard deviation and reported as 0.0.
    If the standard deviation is zero or cannot be computed, all values are 0.0.

    Parameters:
        metrics: callable that takes two strings and returns their similarity;
            it is called as metrics(a, b) and the result is converted with float().
            NOTE(review): presumably one of the metric functions of relate.metrics
            ('jaccard', 'sorensen_dice', 'overlap', 'levenshtein', 'hamming'),
            returning a percentage - confirm.
        texts1: first tuple (strings stored in the items of the tuple), one row each
        texts2: second tuple (strings stored in the items of the tuple), one column each
        labels: optional sequence of names used for both the rows and the columns.
            By default the names 'text1', 'text2', ... are generated to match the
            number of strings.
    Returns:
        matrix: pandas DataFrame of standardized similarity values
    Side effects:
        Sets the pandas option 'display.width' to None.
    '''
    # One sublist (matrix row) per string in texts1, one value per string in texts2
    rows = [[float(metrics(a, b)) for b in texts2] for a in texts1]
    return _standardized_matrix(rows, len(texts2), 100, labels)


def standardized_distance_matrix(metrics, texts1, texts2, labels=None):
    '''
    Function returns a distance matrix with standardized values, one value for
    every pair of strings taken from texts1 and texts2. When both tuples hold the
    same n strings and the metric is symmetric, the result is a symmetric n x n matrix.
    The distance is computed as 100 - similarity. The used standardizing function:
    (original value - mean of the values) / standard deviation of the values.

    Distance values equal to 0 are treated as a string compared to itself:
    they are excluded from the mean and the standard deviation and reported as 0.0.
    If the standard deviation is zero or cannot be computed, all values are 0.0.

    Parameters:
        metrics: callable that takes two strings and returns their similarity;
            it is called as metrics(a, b) and the result is converted with float().
            NOTE(review): presumably one of the metric functions of relate.metrics
            ('jaccard', 'sorensen_dice', 'overlap', 'levenshtein', 'hamming'),
            returning a percentage - confirm.
        texts1: first tuple (strings stored in the items of the tuple), one row each
        texts2: second tuple (strings stored in the items of the tuple), one column each
        labels: optional sequence of names used for both the rows and the columns.
            By default the names 'text1', 'text2', ... are generated to match the
            number of strings.
    Returns:
        matrix: pandas DataFrame of standardized distance values
    Side effects:
        Sets the pandas option 'display.width' to None.
    '''
    # One sublist (matrix row) per string in texts1, one distance per string in texts2
    rows = [[100 - float(metrics(a, b)) for b in texts2] for a in texts1]
    return _standardized_matrix(rows, len(texts2), 0, labels)
# Dispatch table: look up a matrix-building function by its short name
select = {
    'similarity': similarity_matrix,
    'distance': distance_matrix,
    'st_similarity': standardized_similarity_matrix,
    'st_distance': standardized_distance_matrix,
}