forked from bernhardcl/coot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
smoothProb.py
151 lines (121 loc) · 6.31 KB
/
smoothProb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
"""Calculate the probabilities of various models for a given distance measure using linear interpolation or extrapolation"""
# Copyright 2010, 2011 Kevin Keating
#
# Licensed under the Educational Community License, Version 2.0 (the
# "License"); you may not use this file except in compliance with the
# License. You may obtain a copy of the License at
#
# http://www.osedu.org/licenses/ECL-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
from math import floor
import sys
#from pprint import pprint
#from linearInterp import LinearInterp
#the delimiter that separates the columns of each line of the smoothed data file
DELIM = ","
class SmoothProb:
    """Calculate the probabilities of various models for a given distance measure using linear interpolation or extrapolation"""
    
    def __init__(self, fileName):
        """Create and initialize a SmoothProb object
        
        ARGUMENTS:
            fileName - the name of a file containing smoothed training data
                       The first line is a header containing the model labels (the header of the first column is ignored)
                       Every following line contains an x value followed by one y value per label
                       The x values must be evenly spaced
        RETURNS:
            an initialized SmoothProb object
        RAISES:
            ValueError if the x values of the file are not evenly spaced or if a value can not be converted to a float
        """
        
        self.__xStart   = None      #the starting value for the x-axis
        self.__xStep    = None      #the size of the increment along the x-axis
        self.__fileName = fileName  #the input filename
        self.__yData    = {}        #the y values read from fileName, stored as a list of floats for each label
        self.__yLen     = None      #the number of Y values
        self.__labels   = None      #the names of the columns
        
        #use a with block so that the file gets closed even if the data can not be parsed
        with open(fileName, 'r') as dataFile:
            
            #read in the header line, which contains the labels for our probabilities
            labels = dataFile.readline().split(DELIM)
            labels.pop(0) #we don't care about the header for the first column, since that's just distance
            labels = [x.strip() for x in labels] #get rid of any leading and trailing spaces (at the very least, get rid of the \n at the end of the line)
            self.__labels = labels
            
            #read in the first line, which determines xStart
            curdata = dataFile.readline().split(DELIM)
            self.__xStart = float(curdata.pop(0))
            
            #create arrays for all y values
            for curindex, curvalue in enumerate(curdata):
                self.__yData[labels[curindex]] = [float(curvalue)]
            
            #read in the second line, which determines xStep
            curdata = dataFile.readline().split(DELIM)
            self.__xStep = float(curdata.pop(0)) - self.__xStart
            for curindex, curvalue in enumerate(curdata):
                self.__yData[labels[curindex]].append(float(curvalue))
            
            #since the step size is a float, we can't test that each step size is exactly
            #right without running into rounding issues.
            #Instead, we make sure that each x value is within .01% of a step of the expected value
            #using this epsilon value (abs is used so that a decreasing x-axis still gives a positive tolerance)
            ep = abs(self.__xStep) / 10000
            
            #the number of data lines read so far, which is also the index of the next data line
            rowsRead = 2
            
            #go through the rest of the input file
            for curline in dataFile:
                #skip blank lines
                if not curline.strip():
                    continue
                
                curdata = curline.split(DELIM)
                
                #make sure that step size is correct
                expectedX = self.__xStart + self.__xStep * rowsRead
                curdist = float(curdata.pop(0))
                if abs(curdist - expectedX) > ep:
                    raise ValueError("Incorrect smoothProb step size in file " + self.__fileName)
                
                for curindex, curvalue in enumerate(curdata):
                    self.__yData[labels[curindex]].append(float(curvalue))
                rowsRead += 1
        
        self.__yLen = len(self.__yData[labels[0]])
    
    
    def calcProb(self, xVal, label = None):
        """Calculate the probabilities for a given distance
        
        ARGUMENTS:
            xVal  - the distance to calculate the probabilities for
        OPTIONAL ARGUMENTS:
            label - what model to calculate the probability for
                    must match a label in the first line of the input file
        RETURNS:
            if label is given, the probability that the model label is correct
            if label is not given, a dictionary containing probabilities for all models
        RAISES:
            SmoothProbError if the raw probabilities of all models add up to zero
            KeyError if label is given but does not match any label of the input file
        """
        
        #the index of the last data point that is at or below xVal
        xPos = int(floor( (xVal-self.__xStart) / self.__xStep ))
        
        #go through each label and determine a raw probability
        yDict = {}
        if xPos < 0:
            #if we're extrapolating down
            #numSteps is negative here, so subtracting numSteps * (y[0] - y[1]) continues the line through the first two points
            numSteps = (xVal-self.__xStart) / self.__xStep
            for curlabel in self.__labels:
                yValues = self.__yData[curlabel]
                yStep = yValues[0] - yValues[1]
                yDict[curlabel] = yValues[0] - numSteps * yStep
        
        elif xPos >= self.__yLen-1:
            #if we're at the last data point or extrapolating up
            #the last data point must be handled here, since there is no data point at xPos+1 to interpolate with
            #(numSteps is 0 when xVal is exactly at the last data point, which gives the last y value unchanged)
            numSteps = (xVal-self.__xStart) / self.__xStep - (self.__yLen - 1)
            for curlabel in self.__labels:
                yValues = self.__yData[curlabel]
                yStep = yValues[-1] - yValues[-2]
                yDict[curlabel] = yValues[-1] + numSteps * yStep
        
        else:
            #if we're interpolating
            #fraction is how far xVal is between the data points at xPos and xPos+1 (from 0 to 1)
            remainder = xVal - self.__xStart - self.__xStep * xPos
            fraction = remainder/self.__xStep
            for curlabel in self.__labels:
                yValues = self.__yData[curlabel]
                yDict[curlabel] = yValues[xPos] * (1 - fraction) + yValues[xPos+1] * fraction
        
        #scale the raw probabilities so they add up to 1
        totalProb = sum(yDict.values())
        try:
            yDict = dict((curlabel, curval/totalProb) for (curlabel, curval) in yDict.items())
        except ZeroDivisionError:
            #if totalProb = 0, then we're trying to make predictions with *very* bad data
            raise SmoothProbError("Underflow error: all probabilities equal to zero.")
        
        #return either the desired probability or the hash containing all probabilities
        if label is None:
            return yDict
        else:
            return yDict[label]
class SmoothProbError(Exception):
    """The error that gets raised when all probabilities are 0
    
    ARGUMENTS:
        description - a description of the error, which is stored in the value attribute
    """
    
    def __init__(self, description):
        #store the description so that __str__ can report it
        self.value = description
    
    def __str__(self):
        #report the repr of the description (so a string description is shown in quotes)
        message = self.value
        return repr(message)