-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathann.py
More file actions
107 lines (93 loc) · 3.39 KB
/
ann.py
File metadata and controls
107 lines (93 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import math
import random
import matplotlib.pyplot as plt
class ANN:
def __init__(self, train_set, k, num_buckets):
self.train_set = train_set
self.k = k
self.num_data = len(train_set)
self.dim = len(train_set[0])-1
self.num_buckets = num_buckets
self.buckets = [{} for x in xrange(num_buckets)]
self.projections = [[] for x in xrange(num_buckets)]
# Preprocessing of training input
for l in xrange(num_buckets):
# Generate random projection
self.num_proj = 10
proj = []
for i in xrange(self.num_proj):
cur_proj = []
for j in xrange(self.dim):
cur_proj += [random.random()*2-1]
proj += [cur_proj]
self.projections[l] = proj
for x in train_set:
key = self.lsh(x, self.projections[l])
if key not in self.buckets[l].keys():
self.buckets[l][key] = [x]
else:
self.buckets[l][key] += [x]
# Calculate dot product between two vectors
# Used as component for hash function for cosine distance
def dot_product(self, a, b):
temp_sum = 0
for i in xrange(self.dim):
temp_sum += a[i]*b[i]
return temp_sum
# Compute lsh for cosine distance measure
def lsh(self, x, proj):
key = ''
for p in proj:
dot = self.dot_product(x, p)
if dot > 0:
key += '1'
else:
key += '0'
return key
# Given an instance, get the label from near nearest neighbor in the training set
def predict(self, a):
neighbors = self.get_approximate_neighbors(a)
vote = [0]*2
for n in neighbors:
vote[int(n[-1])] += 1
return vote.index(max(vote))
# Calculate euclidean distance between two points
def calc_dist(self, a, b):
temp_sum = 0
for i in xrange(self.dim):
temp_sum += math.pow(a[i] - b[i], 2)
return math.sqrt(temp_sum)
# Get k approximately nearest neighbor
def get_approximate_neighbors(self, a):
candidates = []
for l in xrange(self.num_buckets):
key = self.lsh(a, self.projections[l])
if key in self.buckets[l]:
candidates += self.buckets[l][key]
neighbors = []
distances = []
for c in candidates:
distances += [(c, self.calc_dist(a, c))]
distances.sort(key= lambda tup: tup[1])
if len(distances) < self.k:
num_neighbors = len(distances)
else:
num_neighbors = self.k
for i in xrange(num_neighbors):
neighbors += [distances[i][0]]
return neighbors
# Plot the first two dimension of training dataset
def plot(self, dataset):
plt.axis((-1, 1, -1, 1))
plt.plot([x[0] for x in dataset if x[2] == 0], [x[1] for x in dataset if x[2] == 0], 'ro')
plt.plot([x[0] for x in dataset if x[2] == 1], [x[1] for x in dataset if x[2] == 1], 'b^')
# Plot the data inside each bucket
def plot_buckets(self, b):
n = len(self.buckets[b].keys())
index = 1
for k in self.buckets[b].keys():
data = self.buckets[b][k]
plt.subplot(n/2, 2, index)
index += 1
self.plot(data)
plt.show()