rm(list=ls())
## UPDATE THIS PATH BASED ON WHERE YOU HAVE SAVED THE CODE
base.path <- 'C:/Work/rich_phenotype/reduced_rank_regression/gene_metabolome/'
setwd(base.path)


### load all codes
library(MASS)
library(fields)
source(paste(base.path, 'brrr/full_low_rank_brr.R', sep = ''))
source(paste(base.path, 'sparse_fa/sparse_fa.R', sep = ''))
source(paste(base.path, 'infinite_brr/infinite_brr.R', sep = ''))
source(paste(base.path, 'bayes_lm/bayes_lm.R', sep = ''))
source(paste(base.path, 'common/mvrnorm_own.R', sep = ''))
source(paste(base.path, 'common/PosDef.R', sep = ''))
source(paste(base.path, 'common/auxiliary_functions.R', sep = ''))
source(paste(base.path, 'common/preprocess_data.R', sep = ''))

if (file.exists('preproc_data.RData')) {
	load('preproc_data.RData')	
} else {
	load('YOUR_OWN_GENOTYPES.RData')
	# Must contain matrix "genotypes", which has N rows and S columns. The rows
	# are the individuals and the columns the SNPs. The coding of the SNPs
	# must be 0,1,2 according to the minor allele count.
	
	load('YOUR_OWN_PHENOTYPES.RData')
	# Must contain matrix "phenotypes", which has N rows and P columns. The rows
	# are the individuals and the columns the phenotypes. In the article these
	# were quantile normalized to have unit variance.
	data <- list()
	data$genotypes <- genotypes
	data$phenotypes <- phenotypes
	
	n.pheno <- ncol(phenotypes)
	n.snps.to.keep <- 200
	
	# preprocess data as in the paper
	data <- preprocess.data(raw.data=data, n.snps.to.keep=n.snps.to.keep, permutation = NULL)
	print('Data preprocessed')
	save(data, n.pheno, n.snps.to.keep, file='preproc_data.RData')	
}
n.patients <- nrow(data$genotypes)


# INFORMATIVE PRIOR for PTVE
# These values correspond to the prior expectation that the first component
# explains [0.3-0.999] of the total variation with probability 0.98.
a4.shape=4.1
a4.rate=0.31
a4.lower.bound=2.7

# Select parameters for the prior distribution of a3 such that the expected 
# total proportion of variation explained has the specified quantiles:
tpve.quantiles <- c(0.5, 0.99)
tpve.quantile.values <- c(0.000001, 0.001)


# Limits tell where a3 must lie for mean TPVE to lie in the specified interval.
limits <- compute.a3.interval(tpve=tpve.quantile.values, a4.shape=a4.shape, a4.rate=a4.rate, a4.lower.bound=a4.lower.bound, a3.lower.bound=2, local.shrinkage.nu=3, snp.total.variation=data$total.unpruned.snp.variation, phenotype.avg.variance=mean(apply(data$phenotypes,2,var)))
a3.quantiles <- 1-tpve.quantiles[c(2,1)]
ans <- determine.gamma.pars(quantiles=a3.quantiles, values=c(limits$a3.min, limits$a3.max))
a3.shape <- ans$shape
a3.rate <- ans$rate
a3.lower.bound <- 2
a3.init.value <- 3000
a4.init.value <- 4.5

brrr.rank <- 1 # This was used in the GWAS in the article.			
# Simulate from the prior distribution of TPVE
#tpve.prior.samples <- simulate.tpve.from.prior(a3.shape=a3.shape, a3.rate=a3.rate, a3.lower.bound=a3.lower.bound, a4.shape=a4.shape, a4.rate=a4.rate, a4.lower.bound=a4.lower.bound, n.simulations=100, brr.rank=brrr.rank, data)
tpve.prior.samples <- NA


# Informative initialization of the Bayesian reduced rank regression model.
# rare.maf.threshold is used for bookkeeping: the proportion of variance 
# explained by SNPs with MAF less than ...
init.brr.model <- initialize.informative.brr(data=data, a3.init.value=a3.init.value, a4.init.value=a4.init.value, local.shrinkage.nu=3, a3.shape=a3.shape, a3.rate=a3.rate, a3.lower.bound=a3.lower.bound, a4.shape=a4.shape, a4.rate=a4.rate, a4.lower.bound=a4.lower.bound, brr.factor.relevance.cutoff=0.01, alpha0=-1, alpha1=-5E-4, a.sigma=1, b.sigma=1, brr.rank=brrr.rank, input.clustering=NULL, initialize.Psi.Gamma.proposal=F, adaptation.interval=100, adaptation.intervals.total=10, rare.maf.threshold=0.01)


# Informative initialization of the FA-part (the noise model)
fa <- initialize.fa.from.prior(a1.shape=18, a1.rate=2, a1.lower.bound=2, a2.shape=18, a2.rate=2, a2.lower.bound=3, a.sigma=2.2, b.sigma=0.3, local.shrinkage.nu=3, factor.relevance.cutoff=0.001, alpha0=-1, alpha1=-0.005, rank=10, n.patients=nrow(data$genotypes), n.pheno=ncol(data$phenotypes))
# Update FA-model until convergence
Y <- data$phenotypes - data$genotypes %*% init.brr.model$context$Psi %*% init.brr.model$context$Gamma
init.fa.model <- sparse.fa.gibbs(n.iter=200, fa$context, fa$prior, Y)
init.brr.model$prior$variances <- init.fa.model$context$variances


# Initialize full Bayesian reduced rank regression model (the reduced rank 
# regression combined with the noise model) for MCMC
init.model <- list()
init.model$fa$context <- init.fa.model$context
init.model$fa$prior <- fa$prior
init.model$brr$context <- init.brr.model$context
init.model$brr$prior <- init.brr.model$prior
init.model$A <- NA
print('Model initialized')


# Run MCMC to learn the full BRRR model from data
tX <- proc.time()[3]
n.iterations <- 20000
thinning <- 50
# In the article, we used 5000 iterations for initial examination. Majority
# of genes converged to zero within these iterations. We used threshold 0.001
# for mean PTVE (estimated using the second half of the simulation) to select 
# genes which were analyzed using a longer MCMC (a couple of percent of all 
# genes). 20000 iterations were run for the selected genes which was sufficient 
# for approximate PTVE estimates. For more thorough mixing of the algorithm, 
# we recommend 40000-80000 iterations.

mcmc.output <- gibbs.full.low.rank.brr(model=init.model, data=data, n.iter=n.iterations, thin=thinning, fixed.brr.rank=brrr.rank, brr.vars.to.record=c('Gamma', 'Psi', 'a3a4', 'brr.rank', 'maf.group.tpve'), fix.gamma.iteration = 2000)

tY <- proc.time()[3]
print(paste('Gibbs run time: ', tY-tX, sep=''))

# remove burnin
mcmc.output <- remove.burnin(mcmc.output=mcmc.output, burnin=round(n.iterations/thinning/2))


# Trace of PTVE
ptve.samples <- unlist(mcmc.output$traces$tpve)

# Mean proportion of total variation explained by rare variants
ptve.rare <- mean(unlist(lapply(mcmc.output$traces$maf.group.tpve, function(x){x[['rare']]})))