library(stats) library(cluster) RemoveRedundant<- function(datafilename){#genotypes_ENr112.2p16.3_CEU_.phased data <- read.table(datafilename); numrow <-dim(data)[1] numcol <-dim(data)[2] mylist <- list(0) removed <- rep(FALSE, times= numcol) for(i in 1:numcol ){ if(!removed[i]){ class <- i; remove<-FALSE; for(j in (i+1):numcol){ if(!removed[j]){ if(sum(data[,i]==data[,j]) == numrow){# SNP i is the same as SNP j remove<-TRUE removed[j]<-TRUE class <-c(class, j) } } } mylist[i] <- list(class) } } left <-numeric(0) for(i in 1:length(mylist)){ if(length(mylist[[i]]) !=0){ left<- c(left, mylist[[i]][1]) # here we simply take the first SNP to represent all the SNPs that is the same as it. May use other method } } #write.table(data[,left],"genotypes_ENr112.2p16.3_CEU_.phased.NR", row.names = FALSE, col.names = FALSE);# if the row.names is true, include the order of orignial SNPs, should have this, so have position info write.table(data[,left],"genotypes_ENr112.2p16.3_CEU_.phased.NR1", row.names = FALSE); } RankingSNPs <- function(pilotdata, weights, type){ numcol <-dim(pilotdata)[2] if(type == 1){#\chi^2, if a cell is less than 7, use fish exact, still use p value without simulation for( i in 3:numcol){ if(weights[i-2] != 1){ #weights[i-2] <-chisq.test(pilotdata[,2], pilotdata[,i], simulate.p.value = TRUE, B = 10000)$p.value weights[i-2] <-chisq.test(pilotdata[,2], pilotdata[,i])$p.value #weights[i-2] <-chisq.test(pilotdata[,2], pilotdata[,i])$statistic } } } weights } Dprimef <- function(x, y) { n<-length(x) xy<-factor(paste(x, y)) s<-summary(xy); nn <-rep(0, times=4) for(i in 1:length(s)){ nn[i]<-s[[i]] } # n11<-s[[1]] # n12<-s[[2]] # n21<-s[[3]] # n22<-s[[4]] dprim <-nn[1]*nn[4]-nn[2]*nn[3] if(dprim > 0){ dprim <- dprim/min((nn[1]+nn[2])*(nn[2]+nn[4]), (nn[1]+nn[3])*(nn[3]+nn[4])) } if(dprim < 0){ dprim <- dprim/min((nn[1]+nn[2])*(nn[1]+nn[3]), (nn[2]+nn[4])*(nn[3]+nn[4])) } abs(dprim) } #this function need to speedup LDDprime <- function(pilotdata, type){#type = 1, correlation coe, 2, D' numcol <-dim(pilotdata)[2] Dprime <-matrix(nrow=numcol-2, ncol=numcol-2) for(i in 1:numcol-2){ for(j in 1:numcol-2){ #pilotdata[,i+2 ] and pilotdata[,j+2 ] if(type ==1){ Dprime[i,j]<- abs(cor(pilotdata[, i+2],pilotdata[,j+2])) } if(type ==2){ Dprime[i,j]<- Dprimef(pilotdata[,i+2],pilotdata[,j+2 ]) } } } Dprime } ClusterSNPs <- function(Dprime, weights, positions, distThreshold, valueThreshold){ curSNP <- which.min(weights) total <- length(weights) class <- curSNP mylist <- list(0) j<-0 while(weights[curSNP] != 1){ j <- j+1 #print(j) class <- curSNP #print(curSNP) i<-curSNP+1 while (i <=total){ #print(i) #if(positions[i] - positions[curSNP] > distThreshold){ # break #} if(Dprime[curSNP, i] > valueThreshold && weights[i] <1){ class <-c(class, i) weights[i]<-1 } i<-i+1 } i<-curSNP-1 while(i >=1){ #print(i) #if(positions[curSNP] - positions[i] > distThreshold) { # break #} if(Dprime[curSNP, i] > valueThreshold && weights[i]<1){ class <-c(class, i) weights[i]<-1 } i<-i-1 } weights[curSNP]<-1 mylist[j] <- list(class) curSNP <- which.min(weights) } mylist } Entropy <- function(x){ #H(x) = -sum(p*log(p)) xf<-factor(x) xfr<-table(xf) xfre<-as.vector(xfr) entropy<-0 i<-1 xfre<-xfre/sum(xfre) while(i<=length(xfre)){ if(xfre[i] != 0){ entropy <- entropy - xfre[i]*log2(xfre[i]) } i <-i+1 } entropy } JointEntropy <-function(x,y){ #H(x,y) xy<-table(x,y) xy<-as.vector(xy) entropy<-0 i<-1 xy<-xy/sum(xy) while(i<=length(xy)){ if(xy[i] != 0){ entropy <- entropy - xy[i]*log2(xy[i]) } i <-i+1 } entropy } JointEntropy3Var <-function(x,y,z){ #H(x,y) xy<-table(x,y,z) xy<-as.vector(xy) entropy<-0 i<-1 xy<-xy/sum(xy) while(i<=length(xy)){ if(xy[i] != 0){ entropy <- entropy - xy[i]*log2(xy[i]) } i <-i+1 } entropy } ConditionalEntropy<-function(x,y){#H(x|y) = H(x,y)-H(y) hxy<-JointEntropy(x,y) - Entropy(y) hxy } MultalInfo<-function(x,y){#I(x|y) =H(x)- H(X|Y)= H(x)+H(y)-H(x,y) hxy<-Entropy(x) + Entropy(y) - JointEntropy(x,y) hxy } MultalInfo3Var <- function(z,x,y){#I(z|xy) = H(z)+H(x,y)-H(z,x,y) hzxy<-Entropy(z)+JointEntropy(x,y)-JointEntropy3Var(z,x,y) hzxy } #this one need to speedup EntropySubsetSelection <- function(data, snps,threshold){ #first one is the one with max mutual information i<-1 miniindex<-0 maxI<-0 all<-length(snps) while(i<=length(snps)){ entropyi<-MultalInfo(data[,2],data[,snps[i]]) #print(entropyi) if(maxI < entropyi){ maxI <- entropyi miniindex<-i } i<-i+1 } newclass<-numeric(0) snpsleft<-numeric(0) newclass[1]<-snps[miniindex] if(miniindex-1>0){ snpsleft<-c(snpsleft, snps[1:miniindex-1]) } if(miniindex+1 <= length(snps)){ snpsleft<-c(snpsleft, snps[(miniindex+1):length(snps)]) } while(1){ #for each snp to be examined x<-numeric(0) for(i in 1:length(snpsleft)){ y<-numeric(0) for (j in 1:length(newclass)){ y[j]<-MultalInfo3Var(data[,2], data[,newclass[j]],data[,snpsleft[i]])-MultalInfo(data[,2], data[,newclass[j]]) } x[i] <- min(y) } if(threshold>0 && max(x)< threshold){ break } miniindex<-which.max(x) snps<-snpsleft total <- length(newclass)+1 newclass[total]<-snpsleft[miniindex] snpsleft<-numeric(0) if(miniindex-1>0){ snpsleft<-c(snpsleft, snps[1:miniindex-1]) } if(miniindex+1 <= length(snps)){ snpsleft<-c(snpsleft, snps[(miniindex+1):length(snps)]) } if(total == all){ break } } newclass <-newclass-2 newclass } Preprocessing <- function(data, weights, type){ ## 1.1)preprocess, minor allele freq >0.01, using D', r^2, or discrete method, based on haplotype or genotype, minor allele freq >0.05, numrow <- dim(data)[1] numcol <-dim(data)[2] MAF <-rep(0, times=numcol-2) #allele 1 frequency for( i in 3:numcol){ MAF[i-2] <-table(data[,i])[[1]] if( MAF[i-2] > numrow*0.01 && MAF[i-2] < numrow*0.99){ weights[i-2] <- 0; } } for( i in 3:(numcol-1)){ for( j in (i+1):numcol){ #if i is the same as j or if i is eactly the opposite of j, then remove i same <- (data[,i] == data[,j])#every pair is the same if (sum(same) == length(same)){# every element is TRUE weights[i-2] <- 1; } same <- (data[,i] != data[,j])#every pair is not the same if (sum(same) == length(same)){# every element is TRUE weights[i-2] <- 1; } } } weights } data <- read.table("test.data"); numHap <-dim(data)[1] numSNP <-dim(data)[2]-2 #need preprocess weights <-rep(1, times=numSNP-2) #here weights is actually p-value, to remove some SNPs in the preprocessing, #we only need to set its value to be 1. weights <- Preprocessing(data, weights, 1) weights <- RankingSNPs(data,weights, 1) Dprime <- LDDprime(data, type=2) positions <- numeric(numSNP-1); distThreshold <- 100000 valueThreshold <-0.8 classlst <- ClusterSNPs(Dprime, weights, positions, distThreshold, valueThreshold) class <- numeric(0) for(i in 1:length(classlst)){ class[i] <-classlst[[i]][1] } classlst #subset selection here. threshold<-0.001 newclass<- EntropySubsetSelection(pilotdata, class+2, threshold) newclass