I have tried to use k-means clustering to select the most diverse markers in my population. For example, if we want to select 100 lines, I cluster the whole population into 100 clusters and then select the marker closest to the centroid of each cluster.
The problem with my solution is that it takes too much time (my function probably needs optimization), especially when the number of markers exceeds 100,000.
So, I would appreciate it very much if anyone could show me a new way to select markers that maximizes diversity in my population and/or help me optimize my function to make it run faster.
Thank you
# Example: pick one representative marker from each of 100 clusters.
library(BLR)
# the wheat data set is expected to provide the matrix X used below
data(wheat)
dim(X)
# mostdiff() clusters the rows of its first argument, hence the transpose
mdf <- mostdiff(t(X), nClust = 100, nMrkPerClust = 1, nstart = 1000)
Here is the mostdiff function that I used:
# Select a diverse subset of markers by k-means clustering.
#
# The rows of `markers` are partitioned into `nClust` clusters with kmeans().
# From every cluster, the `nMrkPerClust` rows with the smallest squared
# Euclidean distance to that cluster's centroid are selected.
#
# Arguments:
#   markers       numeric matrix (or object coercible with as.matrix) holding
#                 one marker per row
#   nClust        number of clusters requested from kmeans()
#   nMrkPerClust  maximum number of markers to pick from each cluster; a
#                 cluster with fewer members contributes all of them
#   nstart        number of random starts handed to kmeans(). Every start is
#                 a full k-means run, so large values dominate the runtime on
#                 big marker sets.
#
# Returns:
#   Integer vector of row indices into `markers`, grouped by cluster number
#   and, within a cluster, ordered from closest to farthest from the centroid.
#
# Side effect:
#   The kmeans() fit is saved as object `mrkClust` in "markerCluster.Rdata"
#   in the current working directory.
mostdiff <- function(markers, nClust, nMrkPerClust, nstart=1000) {
  stopifnot(is.numeric(nMrkPerClust), length(nMrkPerClust) == 1,
            nMrkPerClust >= 0)

  # as.matrix() is the same coercion kmeans() applies internally
  transposedMarkers <- as.matrix(markers)
  mrkClust <- kmeans(transposedMarkers, nClust, nstart=nstart)
  save(mrkClust, file="markerCluster.Rdata")

  # Turn the vector of cluster assignments into a list nClust long; each
  # element holds the row indices of the markers in that cluster. split()
  # always yields a list, even when every cluster has the same size.
  clustersToList <- function(nClust, clusters) {
    members <- split(seq_along(clusters),
                     factor(clusters, levels=seq_len(nClust)))
    return(unname(members))
  }

  # Within one cluster, pick the markers that are closest to its centroid.
  pickCloseToCenter <- function(vecOfCluster, whichClust, transposedMarkers,
                                centers, pickHowMany) {
    clustSize <- length(vecOfCluster)
    # seq_len() keeps this correct when nothing is to be picked
    keep <- seq_len(min(pickHowMany, clustSize))
    # with fewer than three markers the center is equally distant from all
    # of them, so don't bother computing distances
    if (clustSize < 3) return(vecOfCluster[keep])
    # squared distance between each marker in the cluster and the centroid,
    # computed for the whole cluster at once instead of row by row
    offsets <- sweep(transposedMarkers[vecOfCluster, , drop=FALSE], 2,
                     centers[whichClust, ])
    dists <- rowSums(offsets * offsets)
    return(vecOfCluster[order(dists)[keep]])
  }

  # Apply the per-cluster selection to every cluster and flatten the result.
  picked <- Map(pickCloseToCenter,
                clustersToList(nClust, mrkClust$cluster),
                seq_len(nClust),
                MoreArgs=list(transposedMarkers=transposedMarkers,
                              centers=mrkClust$centers,
                              pickHowMany=nMrkPerClust))
  return(unlist(picked, use.names=FALSE))
}