preprocess.data <- function(raw.data, n.snps.to.keep, permutation = NULL) {	
	
	original.genotypes <- raw.data$genotypes
	#original.p.values <- raw.data$p.values
	#original.maf <- raw.data$maf 
	original.n.snps <- ncol(original.genotypes)
	
	# output list
	data <- list()
	
	# from here starts the preprocessing in which training set / test set need
	# to be taken into account
	
	## process phenotypes
	
	
	
	# permute
	if (!is.null(permutation)) {
		phenotypes <- raw.data$phenotypes[permutation,]	
	} else {
		phenotypes <- raw.data$phenotypes
	}
	genotypes <- original.genotypes
	
	
	
	# compute major allele frequencies
	maf <- colMeans(genotypes)/2
	maf <- pmin(maf, 1-maf)  # elementwise minimum
	
	# compute p-values with CCA for pruning
	if (ncol(genotypes)>1) {
		
		snp.p.values <- calculate.cca.p.values(genotypes, phenotypes)
		
	} else {
		
		n.patients <- nrow(phenotypes)
		if (length(unique(genotypes))>1) {
			a <- cancor(genotypes, phenotypes)
			out <- f.test.cca.own(dim1=1, dim2=ncol(phenotypes), n=n.patients, cc.list=a$cor)
			snp.p.values <- ifelse(!is.na(out$p.value[1]), out$p.value[1], 1)
		} else {
			snp.p.values <- 1
		}
	}
	
	
	# Record the total variation before pruning of SNPs.
	# Use the unpruned value later when deciding the priors.
	
	total.unpruned.snp.variation <- sum( apply( scale(genotypes), 2, var))
	
	data$total.unpruned.snp.variation <- total.unpruned.snp.variation
	
	
	## Pruning of SNPs using the p-values
	if (n.snps.to.keep<ncol(genotypes)) {
		# Prune only if the number of SNPs is larger
		# than the threshold
		snps.to.keep <- sort(order(snp.p.values)[1:n.snps.to.keep])
		
		genotypes <- genotypes[,snps.to.keep, drop=FALSE]
		
	} else {
		
		snps.to.keep <- seq(1,original.n.snps)
	}
	
	data$maf <- maf[snps.to.keep]   # Record MAF before scaling the genotypes.
	
	# scale data with mean and sd
	data$genotypes <- scale(genotypes)  # (assume each SNP explains the same amount)	
	
	
	
	
	# fill in remainder of output list
	data$crossprod.genotypes <- crossprod(data$genotypes)
	
	data$phenotypes <- as.matrix(phenotypes)
	
	# if there are confounders in the data, the model
	# explains them away using Baysian linear regression
	if (!is.null(raw.data$confounders)) {
		data$confounders <- raw.data$confounders
	} else {
		data$confounders <- NA
	}

	return(data)
	
}


calculate.cca.p.values <- function(genotypes, phenotypes) {
	# Rao's F-approximation
	
	n.snps <- ncol(genotypes)
	n.patients <- nrow(genotypes)
	p.values <- rep(NA, n.snps)
	
	for (i in 1:ncol(genotypes)) {
		genotypes.to.test <- genotypes[,i, drop=FALSE]
		
		tester <-mean(genotypes.to.test)
		if ((tester != 0) && (tester != 1)) {
			# The SNP has more than a single value
			a <- cancor(genotypes.to.test, phenotypes)
			out <- f.test.cca.own(dim1=1, dim2=ncol(phenotypes), n=n.patients, cc.list=a$cor)
			cca.f.test.p.val <- ifelse(!is.na(out$p.value[1]), out$p.value[1], 1)
			p.values[i] <- cca.f.test.p.val
		} else {
			p.values[i] <- 1
		}
	}
	return(p.values)
}


f.test.cca.own <- function(dim1, dim2, n, cc.list) {
	# The same function can be found in gsCCA_simulation_tools.R
	s <- length(cc.list)
	p <- dim1
	q <- dim2
	N <- n
	k <- 1:s
	lambda <- sapply(k, function(i) {
		prod(1 - cc.list[i:s]^2)
	})
	r <- (N - s - 1) - ((abs(p - q) + 1)/2)
	Ndf <- (p - k + 1) * (q - k + 1)
	u <- (Ndf - 2)/4
	xx <- ((p - k + 1)^2 + (q - k + 1)^2) - 5
	t <- sqrt(((p - k + 1)^2 * (q - k + 1)^2 - 4)/xx)
	ilambda <- lambda^(1/t)
	Ddf <- (r * t) - (2 * u)
	Fstat <- ((1 - ilambda)/ilambda) * (Ddf/Ndf)
	
	pgF <- pf(Fstat, Ndf, Ddf, lower.tail = FALSE)
	out <- list()
	out$statistic <- Fstat
	out$parameter <- cbind(Ndf, Ddf)
	out$p.value <- pgF
	out$method <- "F test for significance of canonical correlations"
	out
	
}