JHUIGMProject/.project at master · steffen12/JHUIGMProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import random
class Genomabin:
    """Cluster fixed-width rows of numeric data loaded from binary files.

    The pipeline driven by ``run`` is: read raw doubles, z-score each row,
    rescale everything to bytes (0-255), pick seed rows by distance-weighted
    random sampling, then group rows around centers using ``1 - r``
    (``r`` = Pearson correlation) as the distance.

    Throughout the class a "locus" is one row of ``CELL_LINES`` consecutive
    values, and a row is identified by the index of its first value in the
    flat array (a multiple of ``CELL_LINES``).
    """

    # Number of values in one locus (row). Change this single constant to
    # run the pipeline on a narrower test data set.
    CELL_LINES = 193
    # Number of doubles read from each input file unless the caller overrides it.
    VALUES_PER_FILE = 19300
    # Upper bound on center-update passes in clusterData, so a run that keeps
    # oscillating between assignments still terminates.
    MAX_CLUSTER_ITERATIONS = 100

    @staticmethod
    def run(fileNameArray=None, clustersToMake=5):
        """Run the whole pipeline and print the resulting clusters.

        Args:
            fileNameArray: Paths of the binary input files. Defaults to the
                three ``gid_203169_*.bfv`` files the script was written for.
            clustersToMake: Number of clusters to build.
        """
        if fileNameArray is None:
            fileNameArray = ['gid_203169_0.bfv', 'gid_203169_1.bfv', 'gid_203169_2.bfv']
        rawDataArray = Genomabin.compileRawData(fileNameArray)
        # Only whole rows are processed; a trailing partial row is ignored.
        arrayRows = len(rawDataArray) // Genomabin.CELL_LINES
        normalArray = Genomabin.normalizeArray(rawDataArray, arrayRows)
        byteArray = Genomabin.compileToByte(normalArray, arrayRows)
        seedLocationArray = Genomabin.generateSeeds(byteArray, arrayRows, clustersToMake)
        clusterArray = Genomabin.clusterData(seedLocationArray, byteArray, arrayRows, clustersToMake)
        Genomabin.printClusters(clusterArray, byteArray)

    @staticmethod
    def length(array):
        """Return the total number of values in a two-level nested iterable."""
        return sum(1 for line in array for _ in line)

    @staticmethod
    def compileRawData(fileNameArray):
        """Read every file in ``fileNameArray`` and concatenate the values.

        Returns:
            A list with the values of all files, in file order.
        """
        print("Compiling Raw Data")
        rawDataArray = []
        for fileName in fileNameArray:
            # Accumulate; plain assignment here kept only the last file.
            rawDataArray.extend(Genomabin.readBinFile(fileName))
        return rawDataArray

    @staticmethod
    def readBinFile(fName, valueCount=None):
        """Read ``valueCount`` doubles from the start of a binary file.

        Args:
            fName: Path of the file to read.
            valueCount: How many doubles to read; defaults to
                ``VALUES_PER_FILE``.

        Returns:
            An ``array.array('d')`` holding the values.

        Raises:
            EOFError: If the file holds fewer than ``valueCount`` doubles.
        """
        import array
        print("Reading binary file")
        if valueCount is None:
            valueCount = Genomabin.VALUES_PER_FILE
        binValues = array.array('d')  # 'd' = C double
        # The context manager closes the file even when fromfile raises.
        with open(fName, 'rb') as binFile:
            binValues.fromfile(binFile, valueCount)
        return binValues

    @staticmethod
    def compileToByte(normalArray, arrayRows):
        """Rescale ``normalArray`` to a bytearray spanning its own min..max."""
        print("Compiling to byte Array")
        maximum, minimum = Genomabin.maxMin(normalArray)
        return Genomabin.convertToByte(normalArray, arrayRows, maximum, minimum)

    @staticmethod
    def maxMin(normalArray):
        """Return ``[maximum, minimum]`` of ``normalArray``.

        Raises:
            ValueError: If ``normalArray`` is empty.
        """
        print("Determining max/min values")
        if len(normalArray) == 0:
            raise ValueError("normalArray must contain at least one value")
        # Both extremes come from the data itself; seeding the maximum with 0
        # gave a wrong answer whenever every value was negative.
        return [max(normalArray), min(normalArray)]

    @staticmethod
    def convertToByte(normalArray, arrayRows, maximum, minimum):
        """Map the first ``arrayRows`` rows linearly from min..max onto 0..255.

        Each value is truncated (not rounded) to an integer byte. When
        ``maximum == minimum`` there is no range to scale, so every byte is 0.

        Raises:
            ValueError: If a value lies outside ``minimum..maximum`` and so
                does not fit in a byte.
        """
        print("Converting array to Bytearray")
        totalArrayLength = arrayRows * Genomabin.CELL_LINES
        byteArray = bytearray(totalArrayLength)
        mapRange = maximum - minimum
        if mapRange == 0:
            return byteArray
        mapScalar = 255 / mapRange
        for x in range(totalArrayLength):
            byteArray[x] = int((normalArray[x] - minimum) * mapScalar)
        return byteArray

    @staticmethod
    def normalizeArray(rawData, arrayRows):
        """Z-score each of the first ``arrayRows`` rows of ``rawData``.

        Returns:
            A flat list of ``arrayRows * CELL_LINES`` normalized values.
        """
        print("Begin normalizing data in the array")
        width = Genomabin.CELL_LINES
        normalizedArray = []
        for rowStart in range(0, arrayRows * width, width):
            normalizedArray += Genomabin.normalizeData(rawData[rowStart:rowStart + width])
        return normalizedArray

    @staticmethod
    def normalizeData(locus):
        """Return the z-scores of the first ``CELL_LINES`` values of ``locus``.

        Uses the population standard deviation (divides by n). A row with no
        spread has no defined z-score, so it is returned as all zeros instead
        of dividing by zero.

        Raises:
            IndexError: If ``locus`` holds fewer than ``CELL_LINES`` values.
        """
        cellLines = Genomabin.CELL_LINES
        values = [locus[index] for index in range(cellLines)]
        locusMean = sum(values) / cellLines
        locusVariance = sum((value - locusMean) ** 2 for value in values) / cellLines
        locusStandardDev = locusVariance ** 0.5
        if locusStandardDev == 0:
            return [0.0] * cellLines
        return [(value - locusMean) / locusStandardDev for value in values]

    @staticmethod
    def generateSeeds(byteArray, arrayRows, clustersToMake):
        """Choose ``clustersToMake`` seed rows, k-means++ style.

        The first seed is a uniformly random row. Every further seed is drawn
        with probability proportional to the squared distance between a row
        and its nearest already-chosen seed.

        Returns:
            A list of ``clustersToMake`` row start indices into ``byteArray``.

        Raises:
            ValueError: If ``arrayRows`` or ``clustersToMake`` is below 1.
        """
        print("Begin generating seeds")
        if arrayRows < 1:
            raise ValueError("arrayRows must be at least 1")
        if clustersToMake < 1:
            raise ValueError("clustersToMake must be at least 1")
        width = Genomabin.CELL_LINES
        # Pick a whole row: a raw random offset would straddle two rows.
        seedsLocationArray = [random.randrange(arrayRows) * width]
        while len(seedsLocationArray) < clustersToMake:
            distancesArray = Genomabin.calcDistances(seedsLocationArray, byteArray, arrayRows)
            totalDistance = distancesArray[-1]
            if totalDistance > 0:
                newSeedIndex = Genomabin.generateRandomSeed(byteArray, distancesArray, totalDistance)
            else:
                # Every row coincides with a seed already, so the weights
                # carry no information; fall back to a uniform pick.
                newSeedIndex = random.randrange(arrayRows) * width
            seedsLocationArray.append(newSeedIndex)
        return seedsLocationArray

    @staticmethod
    def calcDistances(seedLocationArray, byteArray, arrayRows):
        """Return running totals of squared nearest-seed distances.

        Entry ``i`` is the sum, over rows ``0..i``, of the squared distance
        between the row and its closest seed. The list has exactly
        ``arrayRows`` entries so that position ``i`` always refers to row
        ``i``; its last entry is the grand total.
        """
        width = Genomabin.CELL_LINES
        dataDistanceArray = []
        runningTotal = 0
        for rowStart in range(0, arrayRows * width, width):
            arrayRowVector = byteArray[rowStart:rowStart + width]
            distance = Genomabin.findNearestDistance(arrayRowVector, seedLocationArray, byteArray)
            runningTotal += distance ** 2
            dataDistanceArray.append(runningTotal)
        return dataDistanceArray

    @staticmethod
    def findNearestDistance(locus, seedLocationArray, byteArray):
        """Return the smallest ``1 - r`` between ``locus`` and any seed row.

        Raises:
            ValueError: If ``seedLocationArray`` is empty.
        """
        width = Genomabin.CELL_LINES
        return min(
            1 - Genomabin.calcCorrelationCoefficient(locus, byteArray[seedID:seedID + width])
            for seedID in seedLocationArray
        )

    @staticmethod
    def calcCorrelationCoefficient(locus1, locus2):
        """Return Pearson's r over the first ``CELL_LINES`` values of each input.

        The result is clamped to ``[-1, 1]`` so rounding error can never make
        the derived distance ``1 - r`` negative. If either input has no
        spread the correlation is undefined and ``0.0`` is returned.

        Raises:
            IndexError: If either input holds fewer than ``CELL_LINES`` values.
        """
        cellLines = Genomabin.CELL_LINES
        first = [locus1[index] for index in range(cellLines)]
        second = [locus2[index] for index in range(cellLines)]
        firstMean = sum(first) / cellLines
        secondMean = sum(second) / cellLines
        # Sums of squares / cross products; the common 1/n factor cancels in r.
        firstSpread = sum((value - firstMean) ** 2 for value in first)
        secondSpread = sum((value - secondMean) ** 2 for value in second)
        coSpread = sum((a - firstMean) * (b - secondMean) for a, b in zip(first, second))
        denominator = (firstSpread ** 0.5) * (secondSpread ** 0.5)
        if denominator == 0:
            return 0.0
        return max(-1.0, min(1.0, coSpread / denominator))

    @staticmethod
    def generateRandomSeed(byteArray, distanceArray, totalDistance):
        """Draw one row with probability proportional to its distance weight.

        Args:
            byteArray: Unused; kept so existing callers keep working.
            distanceArray: Running totals as produced by ``calcDistances``.
            totalDistance: The last entry of ``distanceArray``.

        Returns:
            The start index in the byte array of the selected row.
        """
        import bisect
        randomNumber = random.uniform(0, totalDistance)
        # First row whose running total reaches the drawn number.
        rowIndex = bisect.bisect_left(distanceArray, randomNumber)
        # Guard against a draw that lands past the final total.
        rowIndex = min(rowIndex, len(distanceArray) - 1)
        return rowIndex * Genomabin.CELL_LINES

    @staticmethod
    def clusterData(seedLocationArray, byteArray, arrayRows, clustersToMake):
        """Group rows around centers, re-centering until assignments settle.

        Centers start as the seed rows. After every assignment pass each
        non-empty cluster's center becomes the mean of its rows (an empty
        cluster keeps its previous center), and the rows are assigned again.
        Iteration stops when an assignment repeats the previous one or after
        ``MAX_CLUSTER_ITERATIONS`` updates.

        Returns:
            ``clustersToMake`` lists; list ``i`` holds the start indices of
            the rows assigned to center ``i``.
        """
        print("Clustering data")
        width = Genomabin.CELL_LINES
        centers = [
            byteArray[seedLocationArray[x]:seedLocationArray[x] + width]
            for x in range(clustersToMake)
        ]
        print("Clustering data in loop")
        clusterArray = Genomabin._assignRows(centers, byteArray, arrayRows)
        for _ in range(Genomabin.MAX_CLUSTER_ITERATIONS):
            for clusterID, cluster in enumerate(clusterArray):
                if cluster:
                    centers[clusterID] = Genomabin.updateCenter(cluster, byteArray)
            print("Clustering data in loop")
            reassigned = Genomabin._assignRows(centers, byteArray, arrayRows)
            if reassigned == clusterArray:
                break
            clusterArray = reassigned
        return clusterArray

    @staticmethod
    def clusterDataLoop(seedLocationArray, byteArray, arrayRows, clustersToMake):
        """Perform one assignment pass against the given seed rows.

        Returns:
            ``clustersToMake`` lists; list ``i`` holds the start indices of
            the rows closest to seed ``i``.
        """
        print("Clustering data in loop")
        width = Genomabin.CELL_LINES
        seedVectors = [
            byteArray[seedLocationArray[x]:seedLocationArray[x] + width]
            for x in range(clustersToMake)
        ]
        return Genomabin._assignRows(seedVectors, byteArray, arrayRows)

    @staticmethod
    def findNearestSeed(locus, seedLocationArray, clustersToMake, byteArray):
        """Return the position in ``seedLocationArray`` of the seed closest to ``locus``.

        Only the first ``clustersToMake`` seeds are considered; ties go to the
        earliest seed.
        """
        width = Genomabin.CELL_LINES
        seedVectors = [
            byteArray[seedLocationArray[x]:seedLocationArray[x] + width]
            for x in range(clustersToMake)
        ]
        return Genomabin._nearestCenterIndex(locus, seedVectors)

    @staticmethod
    def updateCenter(cluster, byteArray):
        """Return the element-wise mean of the rows listed in ``cluster``.

        Returns:
            A list of ``CELL_LINES`` floats, or ``-1`` when ``cluster`` is empty.
        """
        if not cluster:
            return -1
        locusLength = Genomabin.CELL_LINES
        totals = [0] * locusLength
        for vectorID in cluster:
            for x in range(locusLength):
                totals[x] += byteArray[vectorID + x]
        lociInCluster = len(cluster)
        return [total / lociInCluster for total in totals]

    @staticmethod
    def printClusters(clusterArray, byteArray):
        """Print every non-empty cluster as a 1-based header plus its row indices.

        ``byteArray`` is unused; it is kept so existing callers keep working.
        """
        for clusterNumber, cluster in enumerate(clusterArray, start=1):
            if cluster:
                print("Cluster", clusterNumber)
                for vectorID in cluster:
                    print(vectorID, ' ', end='')
                print()

    @staticmethod
    def _nearestCenterIndex(locus, centerVectors):
        """Return the index of the vector with the smallest ``1 - r`` to ``locus``."""
        closestIndex = 0
        closestDistance = None
        for index, centerVector in enumerate(centerVectors):
            distance = 1 - Genomabin.calcCorrelationCoefficient(locus, centerVector)
            # Strict comparison keeps the earliest center on ties.
            if closestDistance is None or distance < closestDistance:
                closestDistance = distance
                closestIndex = index
        return closestIndex

    @staticmethod
    def _assignRows(centerVectors, byteArray, arrayRows):
        """Assign each row's start index to the list of its nearest center."""
        width = Genomabin.CELL_LINES
        clustersArray = [[] for _ in centerVectors]
        for rowStart in range(0, arrayRows * width, width):
            locus = byteArray[rowStart:rowStart + width]
            nearest = Genomabin._nearestCenterIndex(locus, centerVectors)
            clustersArray[nearest].append(rowStart)
        return clustersArray

if __name__ == "__main__":
    # Run the pipeline only when executed as a script, so that importing
    # this module does not start reading the input files.
    Genomabin.run()