# Start from an empty workspace: remove every object that ls() reports.
rm(list = ls())

## Fixed settings that define the simulation setup.
# (The always-true "if" only exists so that this section can be folded
# in the kate editor.)
if (TRUE) {

	# Analysis steps to execute. Each section further down runs only if
	# its name is listed here.
	methods.to.run <- c(
		'simulation',
		'distances'
	)
	#methods.to.run <- c('simulation', 'distances', 'locus.correlations', 'hgt.scores')
	#methods.to.run <- 'hgt.scores'

	# Name of the simulation. It names the result directory and the
	# parameter file written below.
	simulation.name <- 'my_test_simulation'
}



## Paths and files
if (TRUE) {

	# Directory that contains the sourced function files and the
	# 'Results' directory.
	root <- 'C:/Work/CCDD/codes/to_publish/cog_simulation'

}



############################################
## Code for actual operations starts here ##
############################################

#install.packages('e1071')

# Source the three function files that live directly under 'root'.
# They are expected to provide the functions called further down
# (run.evolution, compute.distances, ...), none of which is defined in
# this script.
invisible(lapply(
	c('cog_evolution_functions.R', 'cog_summary_functions.R', 'simulation_utility_functions.R'),
	function(script.name) {
		source(paste0(root, '/', script.name))
	}
))

# Make sure the result directory for simulation.name and every
# sub-directory used by the sections below exist.
#
# Each directory is checked on its own, so a result tree that was only
# partly created is completed. recursive=TRUE also creates a missing
# '<root>/Results' parent directory. The script stops right away if a
# directory cannot be created, instead of failing later when results
# are saved.
res.dir.name <- paste(root, '/Results/', simulation.name, sep='')
res.sub.dirs <- c('saved_genes', 'distances', 'locus_correlations', 'hgt_scores')
for (dir.now in c(res.dir.name, paste(res.dir.name, '/', res.sub.dirs, sep=''))) {
	if (!dir.exists(dir.now) && !dir.create(dir.now, recursive=TRUE)) {
		stop('Could not create directory: ', dir.now)
	}
}
	
if ('simulation' %in% methods.to.run) {
	## A simulation is requested: define its parameters here and store
	## them in the result directory.

	# Number of generations and number of strains.
	n.generations <- 40000
	n.strains <- 2000
	# First generation handed to the HGT score computation.
	first.generation.for.summaries <- 10000

	######################################################
	## PARAMETERS RELATED TO THE CORE GENOME SIMULATION ##
	######################################################

	# How many core genes get their evolution simulated in detail. Use
	# zero to simulate only the gene presence/absence pattern.
	n.core.genes.to.simulate <- 40

	# How many abstract sequence features represent the sequence of one
	# simulated core gene. (A precomputed mapping from this
	# low-dimensional representation to Hamming distances exists only
	# for n.features.per.gene=10 together with gene.length=500.)
	n.features.per.gene <- 10

	# Gene length in base pairs.
	gene.length <- 500

	# Mean number of strains mutating per generation per gene.
	mutation.rate <- 1.8

	# Mean number of attempted homologous recombination events per
	# generation per gene.
	recombination.rate <- 7

	# Controls acceptance of a recombination: the acceptance probability
	# is 10^(-Ax), where x is the Hamming distance between the old and
	# the new allele.
	recombination.acceptance.par <- 18


	#######################################################
	## PARAMETERS RELATED TO THE GENE CONTENT SIMULATION ##
	#######################################################

	# Number of genes a strain can carry without a fitness cost. Use
	# zero to simulate only the detailed evolution of the core genes.
	genome.size <- 60

	# Fitness cost of each gene beyond the genome size.
	fitness.cost.per.excess.gene <- 0.99

	# Mean number of deletion attempts per generation per core gene.
	# (Multiply by the current core genome size to get the total number
	# of attempts.)
	deletion.rate <- 0.066

	# Mean number of novel genes introduced to the population per
	# generation per core gene. (Multiply by the current core genome
	# size to get the total number of introductions.)
	novel.gene.introduction.rate <- 0.18

	# Mean number of attempted horizontal gene transfers per gene per
	# generation.
	hgt.rate <- 7.4

	# Number of genes whose deletion leads to zero fitness.
	fixed.core.size <- 0

	# Controls acceptance of a HGT: the acceptance probability is
	# 10^(-Ay), where y is the Jaccard distance between the strains.
	hgt.acceptance.par <- 18/12

	# Parameters related to saving etc.
	save.interval <- 1000

	# Store all parameters so that later runs without the 'simulation'
	# step can load them again.
	save(
		list=c(
			'simulation.name', 'n.generations', 'n.strains',
			'first.generation.for.summaries', 'n.core.genes.to.simulate',
			'n.features.per.gene', 'gene.length', 'mutation.rate',
			'recombination.rate', 'recombination.acceptance.par',
			'genome.size', 'fitness.cost.per.excess.gene', 'deletion.rate',
			'novel.gene.introduction.rate', 'hgt.rate', 'fixed.core.size',
			'hgt.acceptance.par', 'save.interval'
		),
		file=paste0(res.dir.name, '/', simulation.name, '_pars.RData')
	)

} else if (file.exists(paste0(res.dir.name, '/', simulation.name, '_pars.RData'))) {
	# No simulation requested, but parameters of an earlier run exist:
	# load them. When the file was written by the save() call above it
	# holds simulation.name, n.generations, n.strains,
	# first.generation.for.summaries, n.core.genes.to.simulate,
	# n.features.per.gene, gene.length, mutation.rate,
	# recombination.rate, recombination.acceptance.par, genome.size,
	# fitness.cost.per.excess.gene, deletion.rate,
	# novel.gene.introduction.rate, hgt.rate, fixed.core.size,
	# hgt.acceptance.par and save.interval.
	load(file=paste0(res.dir.name, '/', simulation.name, '_pars.RData'))

} else {
	# Nothing to simulate and nothing to load: abort.
	stop('Parameters not found')
}

# Suffix of the result file names; used to find earlier results and
# passed on to the functions below.
save.extension <- '_test1'

# Passed as print.output to run.evolution and compute.locus.correlations.
print.output <- TRUE




####################
## Run simulation ##
####################	
# Run the simulation, continuing from the latest saved generation if
# results with the current save.extension already exist in saved_genes.
if ('simulation' %in% methods.to.run) {
	save.path <- paste(root, '/Results/', simulation.name, '/saved_genes', sep='')
	population <- NULL
	start.generation <- 1

	# Check if previously run results exist for these parameter values.
	# The file name suffix is matched literally (fixed=TRUE), so that
	# the '.' and any other regular expression metacharacter in
	# save.extension cannot match unrelated files.
	all.res.files <- list.files(save.path)
	res.file.inds <- grep(pattern=paste(save.extension, '.RData', sep=''), x=all.res.files, fixed=TRUE)
	res.files <- all.res.files[res.file.inds]

	# The generation number is read from the first '_'-separated part of
	# the file name, starting at its fourth character. Files where this
	# is not a plain number are ignored; they would otherwise give an NA
	# generation and break the resume logic below.
	gen.labels <- vapply(strsplit(res.files, '_', fixed=TRUE), function(x) {
		substr(x[1], start=4, stop=nchar(x[1]))
	}, character(1))
	gen.label.ok <- grepl('^[0-9]+$', gen.labels)
	res.files <- res.files[gen.label.ok]
	res.generations <- suppressWarnings(as.integer(gen.labels[gen.label.ok]))
	# Digit strings too large for an integer still convert to NA: drop them.
	res.files <- res.files[!is.na(res.generations)]
	res.generations <- res.generations[!is.na(res.generations)]

	if (length(res.generations)>0) {
		max.existing.gen <- max(res.generations)
		max.gen.fname <- res.files[which.max(res.generations)]
		start.generation <- max.existing.gen + 1
		load(file=paste(save.path, '/', max.gen.fname, sep=''))
		# elapsed, population
	}

	# NOTE(review): with '<' nothing is run when start.generation equals
	# n.generations -- confirm against run.evolution whether '<=' is
	# intended.
	if (start.generation<n.generations) {
		# NOTE(review): variables.in.grid.names and variables.in.grid.values
		# are not assigned anywhere in this script. They must come from one
		# of the sourced files, or run.evolution must never evaluate
		# them -- confirm.
		run.evolution(
			simulation.name=simulation.name,
			variables.in.grid.names=variables.in.grid.names,
			variables.in.grid.values=variables.in.grid.values,
			n.generations=n.generations,
			n.strains=n.strains,
			n.core.genes.to.simulate=n.core.genes.to.simulate,
			n.features.per.gene=n.features.per.gene,
			gene.length=gene.length,
			mutation.rate=mutation.rate,
			recombination.rate=recombination.rate,
			recombination.acceptance.par=recombination.acceptance.par,
			genome.size=genome.size,
			fitness.cost.per.excess.gene=fitness.cost.per.excess.gene,
			deletion.rate=deletion.rate,
			novel.gene.introduction.rate=novel.gene.introduction.rate,
			hgt.rate=hgt.rate,
			fixed.core.size=fixed.core.size,
			hgt.acceptance.par=hgt.acceptance.par,
			save.interval=save.interval,
			save.path=save.path,
			save.extension=save.extension,
			print.output=print.output,
			population=population,
			start.generation=start.generation
		)
	}
}
	
	
#######################
## Compute distances ##
#######################	
# Distance computation: reads from saved_genes, writes to distances.
if ('distances' %in% methods.to.run) {
	print('Computing distances')

	# Input (saved genes) and output (distances) directories of this
	# simulation.
	gene.path <- file.path(root, 'Results', simulation.name, 'saved_genes')
	save.path <- file.path(root, 'Results', simulation.name, 'distances')

	# Distances are computed between at most 300 strains.
	n.strains.to.use <- pmin(n.strains, 300)

	dist.now <- compute.distances(
		n.generations=n.generations,
		n.strains.to.use=n.strains.to.use,
		n.features.per.gene=n.features.per.gene,
		gene.length=gene.length,
		print.output=FALSE,
		save.interval=save.interval,
		gene.path=gene.path,
		save.path=save.path,
		save.extension=save.extension
	)
}
	
	
##################################################
## Compute locus correlations ('linkage score') ##
##################################################
# Locus correlations: reads from saved_genes, writes to
# locus_correlations.
if ('locus.correlations' %in% methods.to.run) {
	# Input (saved genes) and output (locus correlations) directories of
	# this simulation.
	gene.path <- file.path(root, 'Results', simulation.name, 'saved_genes')
	save.path <- file.path(root, 'Results', simulation.name, 'locus_correlations')

	# At most 200 strains are used.
	n.strains.to.use <- pmin(n.strains, 200)

	corr.trace <- compute.locus.correlations(
		n.generations=n.generations,
		n.strains.to.use=n.strains.to.use,
		n.features.per.gene=n.features.per.gene,
		gene.length=gene.length,
		print.output=print.output,
		save.interval=save.interval,
		gene.path=gene.path,
		save.path=save.path,
		save.extension=save.extension,
		n.core.genes.to.simulate=n.core.genes.to.simulate
	)
}


#################################################################
## Compute horizontal gene transfer scores ('clonality score') ##
#################################################################
# HGT scores: reads from saved_genes and distances, writes to hgt_scores.
if ('hgt.scores' %in% methods.to.run) {
	# Directories of this simulation: saved genes and distances as
	# input, hgt_scores as output.
	gene.path <- file.path(root, 'Results', simulation.name, 'saved_genes')
	save.path <- file.path(root, 'Results', simulation.name, 'hgt_scores')
	dist.path <- file.path(root, 'Results', simulation.name, 'distances')

	# No number of strains is passed here: the statistic uses as many
	# strains as the distances were computed between, which is taken
	# from the number of rows in the Jaccard distance matrix.
	hgt.scores <- compute.medium.sized.cluster.mixedness(
		gene.path=gene.path,
		dist.path=dist.path,
		save.path=save.path,
		first.generation=first.generation.for.summaries,
		n.generations=n.generations,
		save.interval=save.interval,
		save.extension=save.extension,
		freq.lower=0.4,
		freq.upper=0.6
	)

}