library(stats)
library(cluster)

RemoveRedundant<- function(datafilename){#genotypes_ENr112.2p16.3_CEU_.phased
	data <- read.table(datafilename);
	numrow <-dim(data)[1]
	numcol <-dim(data)[2]
	mylist <- list(0)
	removed <- rep(FALSE, times= numcol)
	for(i in 1:numcol ){
		if(!removed[i]){
		class <- i;
		remove<-FALSE;
		for(j in (i+1):numcol){
			if(!removed[j]){
			if(sum(data[,i]==data[,j]) == numrow){# SNP i is the same as SNP j
				remove<-TRUE	
				removed[j]<-TRUE
				class <-c(class, j)
			}
			}
		}		
		mylist[i] <- list(class)
		}
	}
	left <-numeric(0)
	for(i in 1:length(mylist)){
		if(length(mylist[[i]]) !=0){
			left<- c(left, mylist[[i]][1]) # here we simply take the first SNP to represent all the SNPs that is the same as it. May use other method
		}
	}
	#write.table(data[,left],"genotypes_ENr112.2p16.3_CEU_.phased.NR", row.names = FALSE, col.names = FALSE);# if the row.names is true, include the order of orignial SNPs, should have this, so have position info
	write.table(data[,left],"genotypes_ENr112.2p16.3_CEU_.phased.NR1", row.names = FALSE);
}


RankingSNPs <- function(pilotdata, weights, type){
	numcol <-dim(pilotdata)[2]	
	if(type == 1){#\chi^2, if a cell is less than 7, use fish exact, still use p value without simulation 
		for( i in 3:numcol){
			if(weights[i-2] != 1){
				#weights[i-2] <-chisq.test(pilotdata[,2], pilotdata[,i], simulate.p.value = TRUE, B = 10000)$p.value				
				weights[i-2] <-chisq.test(pilotdata[,2], pilotdata[,i])$p.value				
				#weights[i-2] <-chisq.test(pilotdata[,2], pilotdata[,i])$statistic
			}
		}
	}
	weights
}

Dprimef <- function(x, y)
{
	n<-length(x)
	xy<-factor(paste(x, y))
	s<-summary(xy);
	nn <-rep(0, times=4)
	for(i in 1:length(s)){
		nn[i]<-s[[i]]
	}
	
#	n11<-s[[1]]
#	n12<-s[[2]]
#	n21<-s[[3]]
#	n22<-s[[4]]	
	dprim <-nn[1]*nn[4]-nn[2]*nn[3]
	if(dprim > 0){
		dprim <- dprim/min((nn[1]+nn[2])*(nn[2]+nn[4]), (nn[1]+nn[3])*(nn[3]+nn[4]))
	}
	if(dprim < 0){
		dprim <- dprim/min((nn[1]+nn[2])*(nn[1]+nn[3]), (nn[2]+nn[4])*(nn[3]+nn[4]))
	}
	abs(dprim)
} 

#this function need to speedup
LDDprime <- function(pilotdata, type){#type = 1, correlation coe, 2, D'
		numcol <-dim(pilotdata)[2]
		Dprime <-matrix(nrow=numcol-2, ncol=numcol-2)
		for(i in 1:numcol-2){
			for(j in 1:numcol-2){
				#pilotdata[,i+2 ] and pilotdata[,j+2 ]
				if(type ==1){  
					Dprime[i,j]<- abs(cor(pilotdata[, i+2],pilotdata[,j+2]))
				}
				if(type ==2){  
					Dprime[i,j]<- Dprimef(pilotdata[,i+2],pilotdata[,j+2 ])
				}
				
			}
		}
	Dprime
}

ClusterSNPs <- function(Dprime, weights, positions, distThreshold, valueThreshold){
	curSNP <- which.min(weights)
	total <- length(weights)
	class <- curSNP
	mylist <- list(0)
	j<-0
	while(weights[curSNP] != 1){
		j <- j+1
		#print(j)
		class <- curSNP
		#print(curSNP)
		i<-curSNP+1
		while (i <=total){
			#print(i)
			#if(positions[i] - positions[curSNP] > distThreshold){
			#	break
			#}
			if(Dprime[curSNP, i] > valueThreshold && weights[i] <1){
				class <-c(class, i)
				weights[i]<-1
			}
			i<-i+1				
		}
		i<-curSNP-1
		while(i >=1){
			#print(i)
			#if(positions[curSNP] - positions[i] > distThreshold) {
			#	break
			#}
			if(Dprime[curSNP, i] > valueThreshold && weights[i]<1){
				class <-c(class, i)
				weights[i]<-1
			}	
			i<-i-1			
		}
		weights[curSNP]<-1
		mylist[j] <- list(class)
		curSNP <- which.min(weights)
	}
	mylist
}

Entropy <- function(x){ #H(x) = -sum(p*log(p)) 
	xf<-factor(x)
	xfr<-table(xf)
	xfre<-as.vector(xfr)
	entropy<-0
	i<-1
	xfre<-xfre/sum(xfre)
	while(i<=length(xfre)){
		if(xfre[i] != 0){
			entropy <- entropy - xfre[i]*log2(xfre[i]) 
		}
		i <-i+1
	}
	entropy	
}

JointEntropy <-function(x,y){ #H(x,y)
	xy<-table(x,y)
	xy<-as.vector(xy)
	entropy<-0
	i<-1
	xy<-xy/sum(xy)
	while(i<=length(xy)){
		if(xy[i] != 0){
			entropy <- entropy - xy[i]*log2(xy[i]) 
		}
		i <-i+1
	}
	entropy
}

JointEntropy3Var <-function(x,y,z){ #H(x,y)
	xy<-table(x,y,z)
	xy<-as.vector(xy)
	entropy<-0
	i<-1
	xy<-xy/sum(xy)
	while(i<=length(xy)){
		if(xy[i] != 0){
			entropy <- entropy - xy[i]*log2(xy[i]) 
		}
		i <-i+1
	}
	entropy
}

ConditionalEntropy<-function(x,y){#H(x|y) = H(x,y)-H(y) 
	hxy<-JointEntropy(x,y) - Entropy(y)
	hxy
}

MultalInfo<-function(x,y){#I(x|y) =H(x)- H(X|Y)= H(x)+H(y)-H(x,y) 
	hxy<-Entropy(x) + Entropy(y) - JointEntropy(x,y) 
	hxy
}

MultalInfo3Var <- function(z,x,y){#I(z|xy) = H(z)+H(x,y)-H(z,x,y)
	hzxy<-Entropy(z)+JointEntropy(x,y)-JointEntropy3Var(z,x,y)
	hzxy
}

#this one need to speedup
EntropySubsetSelection <- function(data, snps,threshold){
	#first one is the one with max mutual information
	i<-1
	miniindex<-0
	maxI<-0
	all<-length(snps)
	while(i<=length(snps)){		
		entropyi<-MultalInfo(data[,2],data[,snps[i]])		
		#print(entropyi)
		if(maxI < entropyi){
			maxI <- entropyi
			miniindex<-i
		}
		i<-i+1
	}
	newclass<-numeric(0)
	snpsleft<-numeric(0)
	newclass[1]<-snps[miniindex]
	if(miniindex-1>0){
		snpsleft<-c(snpsleft, snps[1:miniindex-1])
	}
	if(miniindex+1 <= length(snps)){
		snpsleft<-c(snpsleft, snps[(miniindex+1):length(snps)])
	}
	
	while(1){
	#for each snp to be examined
	x<-numeric(0)
	for(i in 1:length(snpsleft)){
		y<-numeric(0)
		for (j in 1:length(newclass)){
			y[j]<-MultalInfo3Var(data[,2], data[,newclass[j]],data[,snpsleft[i]])-MultalInfo(data[,2], data[,newclass[j]])
		}
		x[i] <- min(y)
	}
	if(threshold>0 && max(x)< threshold){
		break
	}
	miniindex<-which.max(x)
	snps<-snpsleft
	total <- length(newclass)+1
	newclass[total]<-snpsleft[miniindex]
	snpsleft<-numeric(0)
	if(miniindex-1>0){
		snpsleft<-c(snpsleft, snps[1:miniindex-1])
	}
	if(miniindex+1 <= length(snps)){
		snpsleft<-c(snpsleft, snps[(miniindex+1):length(snps)])
	}
	if(total == all){
		break
	}
	}
	
	newclass <-newclass-2
	newclass
}

Preprocessing <- function(data, weights, type){
## 1.1)preprocess, minor allele freq >0.01, using D', r^2, or discrete method, based on haplotype or genotype, minor allele freq >0.05, 
	numrow <- dim(data)[1] 
	numcol <-dim(data)[2]	
	MAF <-rep(0, times=numcol-2) #allele 1 frequency
	for( i in 3:numcol){
		MAF[i-2] <-table(data[,i])[[1]]
		if(	MAF[i-2] > numrow*0.01 &&  MAF[i-2] < numrow*0.99){
			weights[i-2] <- 0;
		} 
	}
	for( i in 3:(numcol-1)){
		for( j in (i+1):numcol){
			#if i is the same as j or if i is eactly the opposite of j, then remove i		 
			same <- (data[,i] == data[,j])#every pair is the same
			if (sum(same) == length(same)){# every element is TRUE
				weights[i-2] <- 1;	
			}
			same <- (data[,i] != data[,j])#every pair is not the same
			if (sum(same) == length(same)){# every element is TRUE
				weights[i-2] <- 1;	
			}
		}
	}	
	weights
}

data <- read.table("test.data");
numHap  <-dim(data)[1]
numSNP <-dim(data)[2]-2
#need preprocess
weights <-rep(1, times=numSNP-2) 
#here weights is actually p-value, to remove some SNPs in the preprocessing, 
#we only need to set its value to be 1.
weights <- Preprocessing(data, weights, 1)
weights <- RankingSNPs(data,weights,  1) 
Dprime <- LDDprime(data, type=2)
positions <- numeric(numSNP-1);
distThreshold <- 100000
valueThreshold <-0.8
classlst <- ClusterSNPs(Dprime, weights, positions, distThreshold, valueThreshold)
class <- numeric(0)
for(i in 1:length(classlst)){
	class[i] <-classlst[[i]][1]
}
classlst
#subset selection here.
threshold<-0.001
newclass<- EntropySubsetSelection(pilotdata, class+2, threshold)
newclass