-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path.project
More file actions
251 lines (232 loc) · 10.2 KB
/
.project
File metadata and controls
251 lines (232 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import random
class Genomabin:
    """Cluster fixed-width loci read from binary files of doubles.

    The data is handled as one flat sequence laid out row after row, each
    row ("locus") holding ``CELL_LINES`` consecutive values.  A row is
    identified by the flat index of its first value (``row * CELL_LINES``).

    Pipeline (see ``run``): read raw doubles -> z-score each row -> map all
    values onto 0..255 in a ``bytearray`` -> pick seeds with D^2-weighted
    sampling -> assign rows to the nearest centre (distance is
    ``1 - Pearson correlation``) and refine the centres until stable.

    All methods are static; call them as ``Genomabin.method(...)``.
    """

    # Values per locus/row.  Override (e.g. ``Genomabin.CELL_LINES = 4``)
    # to work with a smaller test data set.
    CELL_LINES = 193

    # Upper bound on assignment passes in ``clusterData``; a mean-based
    # centre update is not guaranteed to converge under correlation distance.
    MAX_ITERATIONS = 100

    @staticmethod
    def run(fileNameArray=None, clustersToMake=5):
        """Run the whole pipeline and print the resulting clusters.

        Args:
            fileNameArray: names of the binary input files.  Defaults to the
                three ``gid_203169_*.bfv`` files.
            clustersToMake: number of clusters to produce.
        """
        if fileNameArray is None:
            fileNameArray = ['gid_203169_0.bfv', 'gid_203169_1.bfv', 'gid_203169_2.bfv']
        rawDataArray = Genomabin.compileRawData(fileNameArray)
        # Only complete rows are used; a trailing partial row is ignored.
        arrayRows = len(rawDataArray) // Genomabin.CELL_LINES
        normalArray = Genomabin.normalizeArray(rawDataArray, arrayRows)
        byteArray = Genomabin.compileToByte(normalArray, arrayRows)
        seedLocationArray = Genomabin.generateSeeds(byteArray, arrayRows, clustersToMake)
        clusterArray = Genomabin.clusterData(seedLocationArray, byteArray, arrayRows, clustersToMake)
        Genomabin.printClusters(clusterArray, byteArray)

    @staticmethod
    def length(array):
        """Return the total number of values in a nested (2-level) iterable."""
        return sum(1 for line in array for _ in line)

    @staticmethod
    def compileRawData(fileNameArray):
        """Read every file in ``fileNameArray`` and return all values as one list.

        Values are concatenated in file order, so every file contributes
        its rows to the result.
        """
        print("Compiling Raw Data")
        rawDataArray = []
        for fileName in fileNameArray:
            rawDataArray.extend(Genomabin.readBinFile(fileName))
        return rawDataArray

    @staticmethod
    def readBinFile(fName, valueCount=19300):
        """Read ``valueCount`` native doubles from the binary file ``fName``.

        Returns:
            An ``array.array('d')`` holding the values read.

        Raises:
            OSError: if the file cannot be opened.
            EOFError: if the file holds fewer than ``valueCount`` doubles.
        """
        print("Reading binary file")
        import array
        binValues = array.array('d')  # 'd' = C double
        # The context manager closes the file even when fromfile() raises.
        with open(fName, 'rb') as f:
            binValues.fromfile(f, valueCount)
        return binValues

    @staticmethod
    def compileToByte(normalArray, arrayRows):
        """Scale ``normalArray`` onto 0..255 and return it as a ``bytearray``."""
        print("Compiling to byte Array")
        maximum, minimum = Genomabin.maxMin(normalArray)
        return Genomabin.convertToByte(normalArray, arrayRows, maximum, minimum)

    @staticmethod
    def maxMin(normalArray):
        """Return ``[maximum, minimum]`` of ``normalArray``.

        Raises:
            IndexError: if ``normalArray`` is empty.
        """
        print("Determining max/min values")
        # Seed both extremes from the data itself so that all-negative input
        # does not report a spurious maximum of 0.
        maximum = minimum = normalArray[0]
        for value in normalArray:
            if value > maximum:
                maximum = value
            elif value < minimum:
                minimum = value
        return [maximum, minimum]

    @staticmethod
    def convertToByte(normalArray, arrayRows, maximum, minimum):
        """Map the first ``arrayRows`` rows linearly from [minimum, maximum] to 0..255.

        Values are truncated (not rounded) to integers.  When ``maximum``
        equals ``minimum`` there is no range to scale, and an all-zero
        ``bytearray`` is returned.

        Raises:
            IndexError: if ``normalArray`` holds fewer than ``arrayRows`` rows.
            ValueError: if a value lies outside [minimum, maximum].
        """
        print("Converting array to Bytearray")
        totalArrayLength = arrayRows * Genomabin.CELL_LINES
        byteArray = bytearray(totalArrayLength)
        mapRange = maximum - minimum
        if mapRange == 0:
            return byteArray
        mapScalar = 255 / mapRange
        for x in range(totalArrayLength):
            byteArray[x] = int((normalArray[x] - minimum) * mapScalar)
        return byteArray

    @staticmethod
    def normalizeArray(rawData, arrayRows):
        """Z-score each of the first ``arrayRows`` rows of ``rawData``.

        Returns:
            A flat list of ``arrayRows * CELL_LINES`` normalized values.
        """
        print("Begin normalizing data in the array")
        stride = Genomabin.CELL_LINES
        normalizedArray = []
        for rowIndex in range(arrayRows):
            start = rowIndex * stride
            normalizedArray += Genomabin.normalizeData(rawData[start:start + stride])
        return normalizedArray

    @staticmethod
    def normalizeData(locus):
        """Return the z-scores of the first ``CELL_LINES`` values of ``locus``.

        Uses the population standard deviation.  A constant locus has no
        spread to scale by, so it normalizes to all zeros.

        Raises:
            IndexError: if ``locus`` holds fewer than ``CELL_LINES`` values.
        """
        cellLines = Genomabin.CELL_LINES
        values = [locus[index] for index in range(cellLines)]
        locusMean = sum(values) / cellLines
        locusVariance = sum((value - locusMean) ** 2 for value in values) / cellLines
        locusStandardDev = locusVariance ** 0.5
        if locusStandardDev == 0:
            return [0.0] * cellLines
        return [(value - locusMean) / locusStandardDev for value in values]

    @staticmethod
    def generateSeeds(byteArray, arrayRows, clustersToMake):
        """Pick ``clustersToMake`` seed rows using D^2-weighted sampling.

        The first seed is a uniformly random row.  Each further seed is
        drawn with probability proportional to the squared distance of a row
        to its nearest already-chosen seed.

        Returns:
            A list of ``clustersToMake`` flat start indices, each a multiple
            of ``CELL_LINES``.  Duplicates are possible only when every row
            is at distance zero from an existing seed.

        Raises:
            ValueError: if ``arrayRows`` or ``clustersToMake`` is below 1.
        """
        print("Begin generating seeds")
        if arrayRows < 1:
            raise ValueError("generateSeeds() needs at least one row of data")
        if clustersToMake < 1:
            raise ValueError("clustersToMake must be at least 1")
        # Choose a whole row, so the seed vector never straddles two loci.
        seedsLocationArray = [random.randrange(arrayRows) * Genomabin.CELL_LINES]
        while len(seedsLocationArray) < clustersToMake:
            distancesArray = Genomabin.calcDistances(seedsLocationArray, byteArray, arrayRows)
            totalDistance = distancesArray[-1]  # last cumulative value = grand total
            seedsLocationArray.append(
                Genomabin.generateRandomSeed(byteArray, distancesArray, totalDistance)
            )
        return seedsLocationArray

    @staticmethod
    def calcDistances(seedLocationArray, byteArray, arrayRows):
        """Return cumulative squared distances of each row to its nearest seed.

        Entry ``i`` is the sum, over rows ``0..i``, of the squared distance
        between the row and its closest seed, so the list has one entry per
        row and its last entry is the grand total.
        """
        stride = Genomabin.CELL_LINES
        dataDistanceArray = []
        runningTotal = 0
        for arrayRow in range(arrayRows):
            start = arrayRow * stride
            distance = Genomabin.findNearestDistance(
                byteArray[start:start + stride], seedLocationArray, byteArray
            )
            runningTotal += distance ** 2
            dataDistanceArray.append(runningTotal)
        return dataDistanceArray

    @staticmethod
    def findNearestDistance(locus, seedLocationArray, byteArray):
        """Return the smallest ``1 - correlation`` between ``locus`` and any seed.

        Raises:
            ValueError: if ``seedLocationArray`` is empty.
        """
        stride = Genomabin.CELL_LINES
        return min(
            1 - Genomabin.calcCorrelationCoefficient(locus, byteArray[seedID:seedID + stride])
            for seedID in seedLocationArray
        )

    @staticmethod
    def calcCorrelationCoefficient(locus1, locus2):
        """Return the Pearson correlation of two loci over ``CELL_LINES`` values.

        If either locus is constant the correlation is undefined; 0.0 is
        returned so that such a locus is treated as uncorrelated.

        Raises:
            IndexError: if either locus holds fewer than ``CELL_LINES`` values.
        """
        cellLines = Genomabin.CELL_LINES
        indices = range(cellLines)
        locus1Mean = sum(locus1[index] for index in indices) / cellLines
        locus2Mean = sum(locus2[index] for index in indices) / cellLines
        locus1Variance = 0
        locus2Variance = 0
        covariance = 0
        for index in indices:
            deviation1 = locus1[index] - locus1Mean
            deviation2 = locus2[index] - locus2Mean
            locus1Variance += deviation1 ** 2
            locus2Variance += deviation2 ** 2
            covariance += deviation1 * deviation2
        locus1Variance /= cellLines
        locus2Variance /= cellLines
        covariance /= cellLines
        denominator = (locus1Variance ** 0.5) * (locus2Variance ** 0.5)
        if denominator == 0:
            return 0.0
        return covariance / denominator

    @staticmethod
    def generateRandomSeed(byteArray, distanceArray, totalDistance):
        """Draw a row from the cumulative weights in ``distanceArray``.

        A number is drawn uniformly from ``[0, totalDistance]`` and the first
        position whose cumulative value reaches it is selected.

        Returns:
            The selected position multiplied by ``CELL_LINES``.

        Raises:
            IndexError: if ``distanceArray`` is empty.
        """
        if len(distanceArray) == 0:
            raise IndexError("distanceArray is empty")
        randomNumber = random.uniform(0, totalDistance)
        lastIndex = len(distanceArray) - 1
        newSeedIndex = 0
        # The bound keeps the scan inside the list even if totalDistance
        # exceeds the final cumulative value.
        while newSeedIndex < lastIndex and distanceArray[newSeedIndex] < randomNumber:
            newSeedIndex += 1
        return newSeedIndex * Genomabin.CELL_LINES

    @staticmethod
    def clusterData(seedLocationArray, byteArray, arrayRows, clustersToMake):
        """Assign rows to clusters, refining the centres until stable.

        Starts from the seed rows, then alternates between assigning every
        row to its nearest centre and replacing each non-empty cluster's
        centre with the mean of its members.  Stops when an assignment pass
        repeats the previous result or after ``MAX_ITERATIONS`` passes.

        Returns:
            A list of ``clustersToMake`` lists of flat row start indices.
        """
        print("Clustering data")
        centers = Genomabin._centersFromSeeds(seedLocationArray, clustersToMake, byteArray)
        clusterArray = None
        for _ in range(Genomabin.MAX_ITERATIONS):
            print("Clustering data in loop")
            newClusterArray = Genomabin._assignToCenters(centers, byteArray, arrayRows)
            if newClusterArray == clusterArray:
                break
            clusterArray = newClusterArray
            for clusterID, cluster in enumerate(clusterArray):
                # An empty cluster keeps its previous centre.
                if cluster:
                    centers[clusterID] = Genomabin.updateCenter(cluster, byteArray)
        return clusterArray

    @staticmethod
    def clusterDataLoop(seedLocationArray, byteArray, arrayRows, clustersToMake):
        """Run one assignment pass against the seed rows.

        Returns:
            A list of ``clustersToMake`` lists; list ``i`` holds the flat
            start index of every row whose nearest seed is seed ``i``.
        """
        print("Clustering data in loop")
        centers = Genomabin._centersFromSeeds(seedLocationArray, clustersToMake, byteArray)
        return Genomabin._assignToCenters(centers, byteArray, arrayRows)

    @staticmethod
    def findNearestSeed(locus, seedLocationArray, clustersToMake, byteArray):
        """Return the position in ``seedLocationArray`` of the seed nearest ``locus``.

        Only the first ``clustersToMake`` seeds are considered; ties go to
        the earliest seed.
        """
        centers = Genomabin._centersFromSeeds(seedLocationArray, clustersToMake, byteArray)
        return Genomabin._nearestCenterIndex(locus, centers)

    @staticmethod
    def updateCenter(cluster, byteArray):
        """Return the element-wise mean of the rows listed in ``cluster``.

        Returns:
            A list of ``CELL_LINES`` floats, or ``-1`` when ``cluster`` is empty.
        """
        locusLength = Genomabin.CELL_LINES
        totals = [0] * locusLength
        lociInCluster = 0
        for vectorID in cluster:
            for x in range(locusLength):
                totals[x] += byteArray[vectorID + x]
            lociInCluster += 1
        if lociInCluster == 0:
            return -1
        return [total / lociInCluster for total in totals]

    @staticmethod
    def printClusters(clusterArray, byteArray):
        """Print each non-empty cluster as a header line and its row indices."""
        for clusterNumber, cluster in enumerate(clusterArray, start=1):
            if cluster:
                print("Cluster", clusterNumber)
                for vectorID in cluster:
                    print(vectorID, ' ', end='')  # flat start index of the row
                print()

    @staticmethod
    def _centersFromSeeds(seedLocationArray, clustersToMake, byteArray):
        """Return the first ``clustersToMake`` seed rows as lists of values."""
        stride = Genomabin.CELL_LINES
        centers = []
        for x in range(clustersToMake):
            seedID = seedLocationArray[x]
            centers.append(list(byteArray[seedID:seedID + stride]))
        return centers

    @staticmethod
    def _nearestCenterIndex(locus, centers):
        """Return the index of the centre nearest ``locus`` (0 if there are none)."""
        closestIndex = 0
        closestDistance = None
        for index, center in enumerate(centers):
            distance = abs(1 - Genomabin.calcCorrelationCoefficient(locus, center))
            if closestDistance is None or distance < closestDistance:
                closestDistance = distance
                closestIndex = index
        return closestIndex

    @staticmethod
    def _assignToCenters(centers, byteArray, arrayRows):
        """Group every row's flat start index under its nearest centre."""
        stride = Genomabin.CELL_LINES
        clustersArray = [[] for _ in centers]
        for row in range(arrayRows):
            start = row * stride
            locus = byteArray[start:start + stride]
            clustersArray[Genomabin._nearestCenterIndex(locus, centers)].append(start)
        return clustersArray
# Run the pipeline only when this file is executed as a script, so that
# importing it does not trigger file reads and clustering as a side effect.
if __name__ == "__main__":
    Genomabin.run()