compute.distances <- function(n.generations, save.interval, n.strains.to.use, n.features.per.gene, gene.length, print.output, gene.path, save.path=NULL, save.extension) {
	# Computes pairwise distances between sampled strains for every saved
	# generation found under gene.path:
	#   core.dist - mean per-gene Hamming distance of the core genomes
	#   all.dist  - proportion of core genes with differing alleles
	#   cog.dist  - Jaccard distance of the accessory genomes (COGs)
	#
	# Arguments:
	#   n.generations, save.interval - generations save.interval,
	#       2*save.interval, ..., n.generations are processed; generations
	#       whose file is missing are skipped silently.
	#   n.strains.to.use - number of strains between which the distances
	#       are computed. With several sub-populations the first half and
	#       the last half of the strains are taken, so the value is
	#       expected to be even in that case.
	#   n.features.per.gene - number of columns of pop$core.genes that
	#       make up one gene.
	#   gene.length - not used by this function; kept for compatibility.
	#   print.output - if TRUE, the outer strain index is printed while
	#       the core and COG distances are computed.
	#   gene.path - directory with files gen<generation><save.extension>.RData,
	#       each containing an object called 'population'.
	#   save.path - if not NULL, the three matrices of every generation are
	#       saved to dist_gen<generation><save.extension>.RData in this
	#       directory.
	#
	# Returns a list(core.dist, cog.dist, all.dist) holding the matrices of
	# the last generation whose file was found (NULL where the population
	# had no corresponding data). Only entries [i,j] with i<j are filled;
	# the rest is NA.
	require(gdata)
	require(mgcv) # uniquecombs
	
	# Fills the strict upper triangle of an n.strains.to.use x
	# n.strains.to.use matrix with pair.distance(strain.i, strain.j).
	# seq_len() keeps the outer loop empty when fewer than two strains are
	# requested; seq(1, n-1) would count downwards (1, 0) in that case.
	pairwise.upper <- function(strains, pair.distance, show.progress) {
		distances <- matrix(NA, nrow=n.strains.to.use, ncol=n.strains.to.use)
		for (i in seq_len(max(n.strains.to.use-1, 0))) {
			if (show.progress) {
				print(i)
			}
			for (j in seq(i+1, n.strains.to.use)) {
				distances[i,j] <- pair.distance(strains[i], strains[j])
			}
		}
		return(distances)
	}
	
	generations.to.consider <- seq(save.interval, n.generations, by=save.interval)	
	core.dist <- NULL
	all.dist <- NULL
	cog.dist <- NULL
	
	for (generation.index in generations.to.consider) {
		
		generation.index <- as.integer(generation.index)
		gene.file.name <- paste(gene.path, '/gen', generation.index, save.extension, '.RData', sep='')
		if (!file.exists(gene.file.name)) {
			next
		}
		
		# Load into a scratch environment so that objects stored in the
		# file cannot overwrite local variables or arguments.
		loaded <- new.env()
		load(gene.file.name, envir=loaded)
		pop <- loaded$population
		rm(loaded)
		if (is.null(pop)) {
			stop(paste('No object called population in', gene.file.name))
		}
		
		# Start every generation from scratch; otherwise a matrix from an
		# earlier generation would be saved under this generation's name
		# when the population lacks the corresponding data.
		core.dist <- NULL
		all.dist <- NULL
		cog.dist <- NULL
		
		n.strains.in.population <- sum(pop$n.strains)
		if (length(pop$n.strains)>1) {
			# First N/2 strains (A type strains) and the last N/2 strains 
			# (B type strains).
			strains.to.use <- c(seq(1,n.strains.to.use/2),seq(n.strains.in.population-n.strains.to.use/2+1, n.strains.in.population))
		} else {
			strains.to.use <- seq_len(n.strains.to.use)
		}
		
		if (!is.null(pop$core.genes)) {
			n.core.genes <- ncol(pop$core.genes)/n.features.per.gene
			
			###########################################
			## Hamming distances of the core genomes ##
			###########################################
			# The block distance of each gene is translated into a Hamming
			# distance with the lookup table pop$block.to.hamming (index 1
			# corresponds to block distance 0) and averaged over the genes.
			core.dist <- pairwise.upper(strains.to.use, function(strain.i, strain.j) {
				site.differences <- abs(pop$core.genes[strain.i,]-pop$core.genes[strain.j,])
				block.distances.per.gene <- rowSums(matrix(site.differences, ncol=n.features.per.gene, byrow=TRUE))
				mean(pop$block.to.hamming[block.distances.per.gene+1])
			}, show.progress=print.output)
			
			#########################################
			## Allelic distances of the core genes ##
			#########################################
			# Transform the "genes" sequence matrix into gene matrix (each unique 
			# gene sequence is given a unique allelic identifier)
			alleles <- matrix(NA, nrow=n.strains.in.population, ncol=n.core.genes)
			for (gene.index in seq_len(n.core.genes)) {
				sites.in.gene <- get.feature.cols(gene.indexes=gene.index, n.features.per.gene=n.features.per.gene)
				# drop=FALSE keeps a matrix even if a gene has a single feature.
				unique.sequences <- uniquecombs(pop$core.genes[,sites.in.gene, drop=FALSE])
				alleles[,gene.index] <- attr(unique.sequences, 'index')
			}
			# Compute the distances of the allele sequences
			all.dist <- pairwise.upper(strains.to.use, function(strain.i, strain.j) {
				sum( alleles[strain.i,] != alleles[strain.j,] ) / n.core.genes
			}, show.progress=FALSE)
		}
		
		if (!is.null(pop$gene.indicators)) {
			######################################
			## Jaccard distances using the COGs ##
			######################################
			# (number of COGs present in exactly one of the two strains) /
			# (number of COGs present in at least one of them). The result
			# is NaN if neither strain carries any COG.
			cog.dist <- pairwise.upper(strains.to.use, function(strain.i, strain.j) {
				cogs.i <- pop$gene.indicators[strain.i,]
				cogs.j <- pop$gene.indicators[strain.j,]
				length(which(cogs.i!=cogs.j)) / length(which(cogs.i==1 | cogs.j==1))
			}, show.progress=print.output)
		}
		
		if (!is.null(save.path)) {
			# Save the calculated distances, if save.path has been given.
			save(core.dist, all.dist, cog.dist, file=paste(save.path, '/dist_gen', generation.index, save.extension, '.RData', sep=''))
		}
	}
	
	return(list(core.dist=core.dist, cog.dist=cog.dist, all.dist=all.dist))
}





add.columns <- function(mat, new.n.col) {
	# Pads matrix mat on the right with all-zero columns so that the
	# result has new.n.col columns in total. The existing columns are
	# kept unchanged and in their original order.
	zero.padding <- matrix(0, nrow=nrow(mat), ncol=new.n.col - ncol(mat))
	cbind(mat, zero.padding)
}


compute.locus.correlations <- function(n.generations, n.strains.to.use, n.features.per.gene, gene.length, print.output, save.interval, gene.path, save.path, save.extension, n.core.genes.to.simulate) {
	# For every saved generation, computes the Spearman correlation between
	# the pairwise strain distances at each pair of core genes, and
	# summarises them by their median.
	#
	# Arguments:
	#   n.generations, save.interval - generations save.interval,
	#       2*save.interval, ..., n.generations are processed; generations
	#       whose file is missing are skipped and keep NA in the result.
	#   n.strains.to.use - the first n.strains.to.use rows of
	#       pop$core.genes are compared with each other.
	#   n.features.per.gene - number of columns of pop$core.genes that
	#       make up one gene.
	#   gene.length, print.output - not used by this function; kept for
	#       compatibility.
	#   gene.path - directory with files gen<generation><save.extension>.RData,
	#       each containing an object called 'population'.
	#   save.path - directory where corr_trace<save.extension>.RData
	#       (objects corr.trace and all.correlations) is written.
	#   n.core.genes.to.simulate - if not positive, nothing is computed or
	#       saved and NULL is returned.
	#
	# Returns corr.trace: a vector named by generation with the median
	# between-gene correlation of each generation.

	corr.trace <- NULL
	if (n.core.genes.to.simulate>0) {

		require(gdata) # upperTriangle
		
		generations.to.consider <- seq(save.interval, n.generations, by=save.interval)
		
		corr.trace <- rep(NA, length(generations.to.consider))
		names(corr.trace) <- as.integer(generations.to.consider)
		
		all.correlations <- list()
		
		for (generation.index in generations.to.consider) {
			
			generation.index <- as.integer(generation.index)
			
			gene.file.name <- paste(gene.path, '/gen', generation.index, save.extension, '.RData', sep='')
			if (file.exists(gene.file.name)) {
				# Load into a scratch environment so that objects stored in
				# the file cannot overwrite local variables or arguments.
				loaded <- new.env()
				load(gene.file.name, envir=loaded)
				pop <- loaded$population
				rm(loaded)
				if (is.null(pop)) {
					stop(paste('No object called population in', gene.file.name))
				}
				
				n.core.genes <- ncol(pop$core.genes)/n.features.per.gene
				
				# Pairwise strain distances of each gene, stored as the
				# vector of upper-triangle entries. Extracting the vector
				# once per gene avoids repeating it for every gene pair.
				gene.distances <- vector('list', n.core.genes)
				
				for (gene.index in seq_len(n.core.genes)) {
					gene.sites <- get.feature.cols(gene.indexes=gene.index, n.features.per.gene=n.features.per.gene)
					strain.dist <- matrix(NA, nrow=n.strains.to.use, ncol=n.strains.to.use)
					for (j in seq_len(max(n.strains.to.use-1, 0))) {
						for (k in seq(j+1, n.strains.to.use)) {
							block.distance <- sum(abs(pop$core.genes[j,gene.sites]-pop$core.genes[k,gene.sites]))
							# Lookup table stored with the population; index 1
							# corresponds to block distance 0.
							strain.dist[j,k] <- pop$block.to.hamming[block.distance+1]
						}
					}
					gene.distances[[gene.index]] <- upperTriangle(strain.dist)
				}
				
				# Calculate the correlation between genes i and j.
				# seq_len() keeps the loop empty when there is a single core
				# gene; seq(1, n.core.genes-1) would count downwards (1, 0)
				# and index a gene that does not exist.
				locus.correlations <- matrix(NA, nrow=n.core.genes, ncol=n.core.genes)
				for (i in seq_len(max(n.core.genes-1, 0))) {
					for (j in seq(i+1, n.core.genes)) {
						# cor() gives NA and a warning if there is no variation
						# in a locus. Such NAs are removed below, before 
						# computing the median correlation.
						locus.correlations[i, j] <- suppressWarnings(cor(gene.distances[[i]], gene.distances[[j]], method='spearman'))
					}
				}
				
				pair.correlations <- upperTriangle(locus.correlations)
				corr.trace[as.character(generation.index)] <- median(pair.correlations, na.rm=TRUE) # NA occurs if there is no variation in a locus.
				
				all.correlations[[as.character(generation.index)]] <- pair.correlations
			}
		}
		
		output.file.name <- paste(save.path, '/corr_trace', save.extension, '.RData', sep='')
	
		save(corr.trace, all.correlations, file=output.file.name)
	}
	
	return(corr.trace)
}



compute.medium.sized.cluster.mixedness <- function(gene.path, dist.path, save.path, first.generation, n.generations, save.interval, save.extension, freq.lower, freq.upper) {
	# Computes, for every COG of intermediate frequency in every saved
	# generation, a score that compares the COG distances within the two
	# groups "strains with the COG" / "strains without the COG" to the
	# distances between the groups. The score is the proportion of
	# within-group distances that do not exceed the 1% quantile of the
	# between-group distances (larger score -> less recombination).
	#
	# Arguments:
	#   gene.path - directory with files gen<generation><save.extension>.RData
	#       containing an object called 'population'.
	#   dist.path - directory with files dist_gen<generation><save.extension>.RData
	#       as written by compute.distances; cog.dist must be non-NULL.
	#   save.path - directory where hgt<save.extension>.RData (objects
	#       hgt.score = median score, hgt.score.mean = mean score,
	#       n.scores = number of scores) is written.
	#   first.generation - first generation to use; NULL means save.interval.
	#   n.generations, save.interval - generations first.generation,
	#       first.generation+save.interval, ..., n.generations are used.
	#       Unlike in compute.distances, a missing file is an error here.
	#   freq.lower, freq.upper - a COG is considered if the proportion of
	#       sampled strains carrying it lies strictly between these bounds.
	#
	# Returns the vector of all scores (length zero if no COG qualified).
	
	# The function itself only needs base R; the call is kept so that
	# gdata is still attached for callers that rely on this side effect.
	require(gdata)
	if (is.null(first.generation)) {
		first.generation <- save.interval
	}
	generations.to.consider <- seq(first.generation, n.generations, by=save.interval)
	
	# Entries strictly above the diagonal, in column-major order.
	upper.values <- function(m) {
		m[upper.tri(m)]
	}
	
	scores <- rep(NA_real_, 10000)
	score.counter <- 0
	
	for (generation.index in generations.to.consider) {
		
		generation.index <- as.integer(generation.index)	
		
		# Files are loaded into scratch environments so that their contents
		# cannot overwrite local variables, and so that a distance matrix
		# from an earlier generation is never reused by accident.
		dist.file.name <- paste(dist.path, '/dist_gen', generation.index, save.extension, '.RData', sep='')
		dist.env <- new.env()
		load(file=dist.file.name, envir=dist.env)
		# core.dist, cog.dist, all.dist (at least cog.dist must be non-null)
		cog.dist <- dist.env$cog.dist
		if (is.null(cog.dist)) {
			stop('Cog distances must be available for hgt scores.')
		}
		num.strains.to.sample <- nrow(cog.dist)
		
		gene.file.name <- paste(gene.path, '/gen', generation.index, save.extension, '.RData', sep='')
		gene.env <- new.env()
		load(file=gene.file.name, envir=gene.env)  # population, elapsed
		pop <- gene.env$population
		if (is.null(pop)) {
			stop(paste('No object called population in', gene.file.name))
		}
		
		# Row i of cog.dist must describe the same strain as row i of the
		# indicator matrix. compute.distances samples the first N/2 and the
		# last N/2 strains when there are several sub-populations, and the
		# first N strains otherwise, so the same selection is used here.
		if (length(pop$n.strains)>1) {
			n.strains.in.population <- sum(pop$n.strains)
			sampled.strains <- c(seq(1,num.strains.to.sample/2),seq(n.strains.in.population-num.strains.to.sample/2+1, n.strains.in.population))
		} else {
			sampled.strains <- seq_len(num.strains.to.sample)
		}
		indicators <- pop$gene.indicators[sampled.strains, , drop=FALSE]
		
		# Find interesting COGs
		cog.proportions <- colSums(indicators) / nrow(indicators)
		interesting.cogs <- which(cog.proportions>freq.lower & cog.proportions<freq.upper)
		
		for (cog.index in interesting.cogs) {
			
			strains.with.cog <- which(indicators[,cog.index]==1)
			strains.without.cog <- which(indicators[,cog.index]==0)
			
			# drop=FALSE keeps a matrix even for a group with one strain.
			with.cog.distances <- upper.values(cog.dist[strains.with.cog,strains.with.cog, drop=FALSE])
			without.cog.distances <- upper.values(cog.dist[strains.without.cog,strains.without.cog, drop=FALSE])
			within.distances <- c(with.cog.distances, without.cog.distances)
			
			# Blank out the within-group pairs; what remains in the upper
			# triangle are the between-group distances.
			aux <- cog.dist
			aux[strains.with.cog,strains.with.cog] <- NA
			aux[strains.without.cog, strains.without.cog] <- NA
			between.distances <- upper.values(aux)
			# Logical indexing; x[-which(is.na(x))] would return an empty
			# vector if no entry were NA.
			between.distances <- between.distances[!is.na(between.distances)]
			
			score <- ecdf(within.distances)(quantile(between.distances,0.01))
			# Larger score -> less recombination
			
			score.counter <- score.counter + 1
			scores[score.counter] <- score
		}
	}
	
	# seq_len() gives an empty result when no score was computed;
	# seq(1, 0) would select one NA and report n.scores as 1.
	scores <- scores[seq_len(score.counter)]
	n.scores <- length(scores)
	# Over the whole simulation, after discarding the initial samples.
	hgt.score <- median(scores)
	hgt.score.mean <- if (n.scores>0) mean(scores) else NA_real_
	
	save(hgt.score, hgt.score.mean, n.scores, file=paste(save.path, '/hgt', save.extension, '.RData', sep=''))
	
	return(scores)
}