collect.niche.distances <- function(root='/triton/ics/scratch/mi/pemartti/CCDD/PopSim', simulation.name='fast_21_4_2014', n.generations=80000, save.interval=2000, save.extension='_nic240_mig1_rec3', max.n.strains.to.consider=NULL, result.dir.name='/Results') {
	# if is.null(max.n.strains.to.consider), 
	# then all distances that have been computed will be collected.
	
	result.path <- paste(root, result.dir.name, '/', simulation.name, sep='')
	distance.path <- paste(result.path, '/distances', sep='')
	
	require(gdata)
	generations.to.consider <- seq(save.interval, n.generations, by=save.interval)
	core.dist.matrices <- list()
	
	at.least.one.file.exists <- FALSE
	for (generation.index in generations.to.consider) {
		generation.index <- as.integer(generation.index)
		dist.file.name <- paste(distance.path, '/dist_gen', generation.index, save.extension, '.RData', sep='')
		if (file.exists(dist.file.name)) {
			at.least.one.file.exists <- TRUE
			load(file=dist.file.name)  # core.dist, cog.dist=NULL
			core.dist.matrices[[as.character(generation.index)]] <- core.dist
		}
	}
	
	if (!at.least.one.file.exists) {
		core.dist.matrices <- NULL
	}
	return(list(core.dist.matrices=core.dist.matrices))
}



compute.niche.distances.over.whole.simulation <- function(n.generations, save.interval, n.strains.per.group, root, simulation.name, save.extension, result.dir.name='/Results') {

	generations.to.consider <- seq(save.interval, n.generations, by=save.interval)	
	
	for (generation.index in generations.to.consider) {
		
		temp <- compute.niche.distances(generation.index=generation.index, operation.code='', n.strains.per.group=n.strains.per.group, root=root, simulation.name=simulation.name, save.extension=save.extension, result.dir.name=result.dir.name)
	
	}
}



compute.niche.distances <- function(generation.index=20000, operation.code='', n.strains.per.group=100, root='/triton/ics/scratch/mi/pemartti/CCDD/PopSim', simulation.name='fast_most_strains_14_5_2014', save.extension='', result.dir.name='/Results') {
	# Computes strain Hamming distances of the core genomes.
	#
	# "parameters" must give parameter values in the simulation in the same
	# order as they were run in the cluster.
	require(gdata)
	require(mgcv)
	
	gene.path <- paste(root, result.dir.name, '/',simulation.name, '/saved_genes', sep='')
	dist.path <- paste(root, result.dir.name, '/',simulation.name, '/distances/', sep='')
	
	core.dist <- NULL
	groups <- c('A.in.A', 'A.in.AB', 'B.in.AB', 'B.in.B')
	
	generation.index <- as.integer(generation.index)
	gene.file.name <- paste(gene.path, '/gen', generation.index, operation.code, save.extension, '.RData', sep='')
	
	if (file.exists(gene.file.name)) {
		
		load(gene.file.name)
		# population, elapsed
		pop <- population
		rm(population)
		
		all.strains <- list()
		
		for (env in c('A','B','AB')) {
			
			for (strain.type in c('A','B')) {
				
				if (!((env=='A' & strain.type=='B') | (env=='B' & strain.type=='A'))) {
				
					group.name <- paste(strain.type, 'in', env, sep='.')
					all.strains[[group.name]] <- which(pop$strain.info$type==strain.type & pop$strain.info$environment==env)
				
				}
			}
		}
		
		n.genes <- ncol(pop$alleles)
		
		strains.to.use <- NULL
		strain.types <- NULL
		for (group in groups) {
			# The strains in the matrix will be in the order of groups:
			# 'A.in.A', 'A.in.AB', 'B.in.AB', 'B.in.B'
			n.strains.from.group <- min(n.strains.per.group, length(all.strains[[group]]))
			if (n.strains.from.group>0) {
				strains.to.use <- c(strains.to.use, all.strains[[group]][1:n.strains.from.group])
				strain.types <- c(strain.types, rep(group, n.strains.from.group))
			}
		}
		
		n.strains.to.use <- length(strains.to.use)
		if (n.strains.to.use>0) {
			
			core.dist <- matrix(NA, nrow=n.strains.to.use, ncol=n.strains.to.use)
			rownames(core.dist) <- strain.types
			colnames(core.dist) <- strain.types
			
			for (i in seq(1,n.strains.to.use-1)) {
				strain.i <- strains.to.use[i]
				strain.i.alleles <- pop$alleles[strain.i,]
				
				for (j in seq(i+1, n.strains.to.use)) {
					strain.j <- strains.to.use[j]
					strain.j.alleles <- pop$alleles[strain.j,]
					
					gene.distances <- rep(NA, n.genes)
					for (gene.index in 1:n.genes) {
						gene.distances[gene.index] <- pop$allele.distances[[gene.index]][strain.i.alleles[gene.index],strain.j.alleles[gene.index]]
					}					
					core.dist[i,j] <- sum(gene.distances) / (n.genes*pop$gene.length)			
				}
			}
		}
		
		# Save the calculated distances
		cog.dist <- NULL
		save(core.dist, cog.dist, file=paste(dist.path, '/dist_gen', generation.index, operation.code, save.extension, '.RData', sep=''))
	}
	
	return(core.dist)
}





#####################################################################
## Collect genewise within and between distance along a simulation ##
#####################################################################
collect.genewise.within.and.between.distances.in.full.simulation <- function(save.interval=2000, n.generations=100000, operation.code='', root='/triton/ics/scratch/mi/pemartti/CCDD/PopSim', simulation.name='fast_most_strains_14_5_2014', parameters=c(nic=4000, mig=0.25, rec=3)) {
	
	source('niche_summary_functions.R')
	generations.to.consider <- seq(save.interval, n.generations, by=save.interval)
	
	within.distances <- NULL
	for (generation.index in generations.to.consider) {
		
		print(generation.index)
		res <- compute.mean.genewise.distances.of.groups(generation.index=generation.index, operation.code=operation.code, root=root, simulation.name=simulation.name, parameters=parameters)
		
		if (!is.null(res)) {
			if (is.null(within.distances)) {
				# Initialize within and between distances (only once)
				within.distances <- matrix(NA, nrow=length(generations.to.consider), ncol=dim(res)[3])
				rownames(within.distances) <- generations.to.consider
				between.distances <- within.distances
			}
			within.distances[as.character(generation.index),] <- apply(res,3,function(x){sum(x[c(1,16)])/2})
				
			between.distances[as.character(generation.index),] <- apply(res,3,function(x){x[13]})	
		}
	}
	
	# Save the calculated distances
	dist <- list()
	dist$between <- between.distances
	dist$within <- within.distances
	
	par.names=names(parameters)
	save.extension <- paste(paste(par.names,parameters, sep=''), collapse='_')
	save.extension <- paste('_', save.extension, sep='')
	gene.dist.path <- paste(root, '/Results/',simulation.name, '/genewise_distances/', sep='')
	
	save(dist, file=paste(gene.dist.path, '/gene_dist', operation.code, save.extension, '.RData', sep=''))
	
	return(dist)
}



######################################################################
## Compute single generation genewise within and betweend distances ##
######################################################################
compute.mean.genewise.distances.of.groups <- function(generation.index=22000, operation.code='', root='/triton/ics/scratch/mi/pemartti/CCDD/PopSim', simulation.name='fast_most_strains_14_5_2014', parameters=c(nic=4000, mig=0.25, rec=3)) {
	# Computes strain Hamming distances of the core genomes.
	#
	# "parameters" must give parameter values in the simulation in the same
	# order as they were run in the cluster.
	require(gdata)
	require(mgcv)
	
	gene.path <- paste(root, '/Results/',simulation.name, '/saved_genes', sep='')
	
	par.names=names(parameters)
	save.extension <- paste(paste(par.names,parameters, sep=''), collapse='_')
	save.extension <- paste('_', save.extension, sep='')
	
	core.dist <- NULL
	groups <- c('A.in.A', 'A.in.AB', 'B.in.AB', 'B.in.B')
	n.groups <- length(groups)
	
	generation.index <- as.integer(generation.index)
	gene.file.name <- paste(gene.path, '/gen', generation.index, operation.code, save.extension, '.RData', sep='')
	
	if (file.exists(gene.file.name)) {
		
		load(gene.file.name)
		# population, elapsed
		pop <- population
		rm(population)
		
		all.strains <- list()
		
		for (env in c('A','B','AB')) {
			for (strain.type in c('A','B')) {	
				if (!((env=='A' & strain.type=='B') | (env=='B' & strain.type=='A'))) {
					group.name <- paste(strain.type, 'in', env, sep='.')
					all.strains[[group.name]] <- which(pop$strain.info$type==strain.type & pop$strain.info$environment==env)
				}
			}
		}
		
		n.genes <- ncol(pop$alleles)
		dim.names <- list()
		dim.names[[1]] <- groups
		dim.names[[2]] <- groups
		dim.names[[3]] <- 1:n.genes
		gene.dist <- array(NA, dim=c(n.groups,n.groups,n.genes), dimnames=dim.names)
		
		for (gene.index in 1:n.genes) {
			
			for (g1.index in seq(1,n.groups)) {
				g1.name <- groups[g1.index]
				strains1 <- all.strains[[g1.name]]
				alleles1 <- pop$alleles[strains1,gene.index]
				counts1 <- table(alleles1)
				
				# Between group distances
				if (g1.index < n.groups) {
					for (g2.index in seq(g1.index+1,n.groups)) {
						g2.name <- groups[g2.index]
						strains2 <- all.strains[[g2.name]]
						alleles2 <- pop$alleles[strains2,gene.index]
						counts2 <- table(alleles2)
						
						total.dist <- 0
						for (all1.name in names(counts1)) {
							all1.code <- as.integer(all1.name)
							for (all2.name in names(counts2)) {
								all2.code <- as.integer(all2.name)
								all.dist <- pop$allele.distances[[gene.index]][all1.code,all2.code]
								total.dist <- total.dist + all.dist * counts1[all1.name] * counts2[all2.name]
							}
						}
						
						avg.dist <- total.dist / sum(counts1) / sum(counts2)
						avg.dist <- avg.dist / pop$gene.length
						
						gene.dist[g1.name, g2.name, gene.index] <- avg.dist
					}
				}
				
				# Within group distances
				total.dist <- 0
				n.test <- 0
				for (all1.index in seq(1,length(counts1)-1)) {
					all1.code <- as.integer(names(counts1)[all1.index])
					all1.name <- as.character(all1.code)
					n.test <- n.test + choose(counts1[all1.name], 2)
					for (all2.index in seq(all1.index+1, length(counts1))) {
						all2.code <- as.integer(names(counts1)[all2.index])
						all2.name <- as.character(all2.code)
						
						all.dist <- pop$allele.distances[[gene.index]][all1.code,all2.code]
						total.dist <- total.dist + all.dist * counts1[all1.name] * counts1[all2.name]
						n.test <- n.test + counts1[all1.name] * counts1[all2.name]
					}
				}
				n.test <- n.test + choose(counts1[all2.name],2)
				avg.dist <- total.dist / choose(sum(counts1), 2)
				avg.dist <- avg.dist / pop$gene.length
				gene.dist[g1.name, g1.name, gene.index] <- avg.dist
				
				aux <- upperTriangle(gene.dist[,,gene.index])
				gene.dist[,,gene.index] <- t(gene.dist[,,gene.index])
				upperTriangle(gene.dist[,,gene.index]) <- aux
			}
		}
		
	} else {
		gene.dist <- NULL
	}
	
	return(gene.dist)
	
}



compute.genewise.modal.sequences <- function(n.generations, save.interval, root, simulation.name, parameters, result.dir.name='/Niche_Real_Data_Results') {
	
	source('niche_visualization.R')
	# for "formulate.save.extension"
	
	save.extension <- formulate.save.extension(parameters)
	
	gene.path <- paste(root, result.dir.name, '/', simulation.name, '/saved_genes', sep='')
	
	genewise.path <- paste(root, result.dir.name, '/', simulation.name, '/genewise', sep='')
	
	generations.to.consider <- seq(save.interval, n.generations, by=save.interval)
	modal.snp.list <- list()
	
	at.least.one.file.exists <- FALSE
	for (generation.index in generations.to.consider) {
		generation.index <- as.integer(generation.index)
		gene.file.name <- paste(gene.path, '/gen', generation.index, save.extension, '.RData', sep='')
		
		modal.snp.list[[as.character(generation.index)]] <- list()
		
		if (file.exists(gene.file.name)) {
			at.least.one.file.exists <- TRUE
			load(file=gene.file.name)  # population, elapsed
			pop <- population
			n.genes <- ncol(pop$alleles)
			
			for (strain.type in c('A','B')) {
				strains.now <- which(pop$strain.info$type == strain.type)
				
				modal.snp.list[[as.character(generation.index)]][[strain.type]] <- list()
				
				for (gene.index in 1:n.genes) {
					alleles.now <- pop$alleles[strains.now, gene.index]
					allele.counts <- table(alleles.now)
					
					# Compute the modal sequence in the gene
					snp.counts <- rep(0, pop$gene.length)
					
					for (allele.name in names(allele.counts)) {
						allele.index <- as.integer(allele.name)
						allele.snps <- pop$allele.snps[[gene.index]][[allele.index]]
						if (length(allele.snps)>0) {
							snp.counts[allele.snps] <- snp.counts[allele.snps] + allele.counts[allele.name]
						}
					}
					snp.proportions <- snp.counts / length(strains.now)
					modal.snps <- which(snp.proportions>0.5)
					
					modal.snp.list[[as.character(generation.index)]][[strain.type]][[gene.index]] <- modal.snps
				}
			}
		}
	}
	
	modal.snp.file <- paste(genewise.path, '/genewise_modal_seq', save.extension, '.RData', sep='')
	
	save(modal.snp.list, file=modal.snp.file)
}



identify.generation.with.given.between.dist <- function(n.generations, save.interval, root, simulation.name, save.extension, result.dir.name='/Niche_Real_Data_Results', border.dist=0.0172) {
	
	dist.res <- collect.niche.distances(root=root, simulation.name=simulation.name, n.generations=n.generations, save.interval=save.interval, save.extension=save.extension, max.n.strains.to.consider=NULL, result.dir.name=result.dir.name)
	
	if (!is.null(dist.res$core.dist.matrices)) {
		
		within.median <- unlist(lapply(dist.res$core.dist.matrices, retrieve.median.within.environment.distance))
		
		between.median <- unlist(lapply(dist.res$core.dist.matrices, retrieve.median.between.environment.distance))
		
	}
	
	over.the.border <- which(between.median>border.dist)
	if (length(over.the.border)>0) {
		border.generation <- as.integer(names(over.the.border)[1])
	} else {
		border.generation <- NA
	}
	return(border.generation)
}



identify.ecoSNPs <- function(n.generations, save.interval, root, simulation.name, parameters, result.dir.name='/Niche_Real_Data_Results', sample.sizes) {
	
	save.extension <- formulate.save.extension(parameters)
	
	gene.path <- paste(root, result.dir.name, '/', simulation.name, '/saved_genes', sep='')
	
	genewise.path <- paste(root, result.dir.name, '/', simulation.name, '/genewise', sep='')
	
	generations.to.consider <- seq(save.interval, n.generations, by=save.interval)
	ecoSNP.info.list <- list()
	
	at.least.one.file.exists <- FALSE
	for (generation.index in generations.to.consider) {
		generation.index <- as.integer(generation.index)
		gene.file.name <- paste(gene.path, '/gen', generation.index, save.extension, '.RData', sep='')
		if (file.exists(gene.file.name)) {
			at.least.one.file.exists <- TRUE
			load(file=gene.file.name)  # population, elapsed
			pop <- population
			n.genes <- ncol(pop$alleles)
			#
			ecoSNP.info <- data.frame(matrix(nrow=n.genes, ncol=2))
			colnames(ecoSNP.info) <- c('n.ecoSNP', 'n.snp.total')
			#
			strain.type.counts <- c(0,0)
			names(strain.type.counts) <- c('A','B')
			for (type.now in c('A','B')) {
				strain.type.counts[type.now] <- length(which(pop$strain.info$type == type.now))
			}
			#
			for (gene.index in 1:n.genes) {
			#	
				snp.proportions <- matrix(NA, nrow=2, ncol=gene.length) # TEST
				rownames(snp.proportions) <- c('A','B') # TEST
			#	
				snp.counts <- matrix(0, nrow=2, ncol=gene.length)
				rownames(snp.counts) <- c('A','B')
			#
				for (type.now in c('A','B')) {
					
					strains.now <- which(pop$strain.info$type == type.now)		
					alleles.now <- pop$alleles[strains.now, gene.index]
					allele.counts <- table(alleles.now)
					
					for (allele.name in names(allele.counts)) {
						allele.index <- as.integer(allele.name)
						allele.snps <- pop$allele.snps[[gene.index]][[allele.index]]
						if (length(allele.snps)>0) {
							snp.counts[type.now, allele.snps] <- snp.counts[type.now, allele.snps] + allele.counts[allele.name]
						}
					}
					snp.proportions[type.now,] <- snp.counts[type.now,] / length(strains.now) # TEST
				}
				
				is.SNP <- apply(snp.counts, 2, function(x){any(x>0)})
				snp.list <- which(is.SNP)
				
				prob.is.ecoSNP <- rep(0, gene.length)
				if (length(snp.list) > 0) {
					for (loc.index in snp.list) {
						
						# Compute the probability that this SNP is an ecoSNP
						k_A <- snp.counts['A',loc.index]
						n_A <- sample.sizes['A']
						N_A <- strain.type.counts['A']
						k_B <- snp.counts['B',loc.index]
						n_B <- sample.sizes['B']
						N_B <- strain.type.counts['B']
						
						log.prob.is.ecoSNP.in.A <- lchoose(k_A, n_A) + lchoose(N_B-k_B, n_B) - lchoose(N_A, n_A) - lchoose(N_B, n_B)
						
						log.prob.is.ecoSNP.in.B <- lchoose(N_A-k_A, n_A) + lchoose(k_B, n_B) - lchoose(N_A, n_A) - lchoose(N_B, n_B)
						
						prob.is.ecoSNP[loc.index] <- exp(log.prob.is.ecoSNP.in.A)+exp(log.prob.is.ecoSNP.in.B)
						
						#prob.is.ecoSNP[loc.index] <- (choose(k_A, n_A) * choose(N_B-k_B, n_B) + choose(N_A-k_A, n_A) * choose(k_B, n_B)) / (choose(N_A, n_a) * choose(N_B, n_B))
					}
				}
				n.ecoSNP.now <- sum(prob.is.ecoSNP) # Expected number of ecoSNPs after sampling is taken into account.
				ecoSNP.info[gene.index,'n.ecoSNP'] <- n.ecoSNP.now
				ecoSNP.info[gene.index,'n.snp.total'] <- length(which(is.SNP))
				
			} # Loop over genes
			print(generation.index)
			ecoSNP.info.list[[as.character(generation.index)]] <- ecoSNP.info

		}
	}
	ecoSNP.info.file <- paste(genewise.path, '/ecoSNP_info', save.extension, '.RData', sep='')
	save(ecoSNP.info.list, file=ecoSNP.info.file)
	
}



#########################################################################
## A small helper to construct the save.extension for given parameters ##
#########################################################################
formulate.save.extension <- function(parameters) {
	par.names=names(parameters)
	save.extension <- paste(paste(par.names,parameters, sep=''), collapse='_')
	save.extension <- paste('_', save.extension, sep='')
	return(save.extension)
}