# Identifying adaptive SNPs and Genetic Offset

rm(list=ls())
install.packages("gradientForest", repos="http://R-Forge.R-project.org")

library(adegenet)
library(gradientForest)
library(raster)
library(psych)
library(vegan)

### Collect WorldClim data ###

# need file with latitude and longitude co-ords for each lake/location in the dataset
lakes <- read.csv("./Lake_latitude_longitude.csv")
coords <- data.frame(x=lakes$Longitude,y=lakes$Latitude)

r <- getData("worldclim",var="bio",res=2.5)
# get climate data for our specific lakes
points <- SpatialPoints(coords, proj4string = r@crs)
cellvalues <- extract(r,points, cellnumbers= T)[,1]
values <- extract(r,points)
enviro <- cbind.data.frame( cell =cellvalues, coordinates(points),values)


### Adaptive SNPs ###

# Read in genetic data
gen <- read.PLINK("./Filtered.snps.raw",
                  map.file = "./Filtered.snps.map")

r2 <- getData('CMIP5', var='bio', res=2.5, rcp=45, model='AC', year=50)

pointsfut <- SpatialPoints(coords, proj4string = r2@crs)
cellvaluesfut <- extract(r2,pointsfut, cellnumbers= T)[,1]
valuesfut <- extract(r2,pointsfut)

envirofut <- cbind.data.frame( cell =cellvaluesfut, coordinates(pointsfut),valuesfut)

gen <- as.matrix(gen)

# Impute missing values

sum(is.na(gen))
gen.imp <- apply(gen, 2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x))))))
sum(is.na(gen.imp))

# environmental and bathymetric predictors 

env <- read.csv("./Bathymetric_environment_data.csv")

env$Population<- as.factor(env$Population)
str(env)
env[is.na(env)] <- 0
# ensure that individuals match across env and genotype data
identical(rownames(gen.imp), env[,1]) 

# remove highly correlated variables

pred <- env[,c(6,7,9,12,13,16,24, 26:28)]
pairs.panels(pred, scale=T)

# Redundancy Analysis
charr.rda <- rda(gen.imp ~ ., data=pred, scale=T)
charr.rda
RsquareAdj(charr.rda)

signif.full <- anova.cca(charr.rda, parallel=getOption("mc.cores"))
signif.full
# Full model is significant

# can check the significance of each axis but this takes a while
signif.axis <- anova.cca(charr.rda, by="axis", parallel=8)
signif.axis
memory.limit(size=100000)

# Identify outliers on each axis
outliers <- function(x,z){
  lims <- mean(x) + c(-1, 1) * z * sd(x)     # find loadings +/-z sd from mean loading     
  x[x < lims[1] | x > lims[2]]               # locus names in these tails
}

cand1 <- outliers(load.rda[,1],3) 
cand2 <- outliers(load.rda[,2],3) 
cand3 <- outliers(load.rda[,3],3) 
cand4 <- outliers(load.rda[,4],3) 
cand5 <- outliers(load.rda[,5],3) 
cand6 <- outliers(load.rda[,6],3) 
cand7 <- outliers(load.rda[,7],3) 
cand8 <- outliers(load.rda[,8],3) 
cand9 <- outliers(load.rda[,9],3) 
cand10 <- outliers(load.rda[,10],3) 


ncand <- length(cand1) + length(cand2) + length(cand3) + length(cand4) + length(cand5) + length(cand6) + length(cand7) + length(cand8) + length(cand9) + length(cand10)
ncand 

cand1 <- cbind.data.frame(rep(1,times=length(cand1)), names(cand1), unname(cand1))
cand2 <- cbind.data.frame(rep(2,times=length(cand2)), names(cand2), unname(cand2))
cand3 <- cbind.data.frame(rep(3,times=length(cand3)), names(cand3), unname(cand3))
cand4 <- cbind.data.frame(rep(4,times=length(cand4)), names(cand4), unname(cand4))
cand5 <- cbind.data.frame(rep(5,times=length(cand5)), names(cand5), unname(cand5))
cand6 <- cbind.data.frame(rep(6,times=length(cand6)), names(cand6), unname(cand6))
cand7 <- cbind.data.frame(rep(7,times=length(cand7)), names(cand7), unname(cand7))
cand8 <- cbind.data.frame(rep(8,times=length(cand8)), names(cand8), unname(cand8))
cand9 <- cbind.data.frame(rep(9,times=length(cand9)), names(cand9), unname(cand9))
cand10 <- cbind.data.frame(rep(10,times=length(cand10)), names(cand10), unname(cand10))

colnames(cand1) <- colnames(cand2) <- colnames(cand3)  <- colnames(cand4) <- colnames(cand5) <- 
  colnames(cand6) <- colnames(cand7) <- colnames(cand8) <- colnames(cand9) <- colnames(cand10) <- 
  c("axis","snp","loading")



cand <- rbind(cand1, cand2, cand3, cand4, cand5, cand6, cand7, cand8, cand9, cand10)
cand$snp <- as.character(cand$snp)

foo <- matrix(nrow=(ncand), ncol=10)
colnames(foo) <- c("bio2", "bio3", "bio5", "bio8", "bio9", "bio12", "Alt", "mean_depth", "surf_area", "lit_zone")

for (i in 1:length(cand$snp)) {
  nam <- cand[i,2]
  snp.gen <- gen.imp[,nam]
  foo[i,] <- apply(pred,2,function(x) cor(x,snp.gen))
}

cand <- cbind.data.frame(cand,foo)  
head(cand)

length(cand$snp[duplicated(cand$snp)]) # check for duplicate SNPs
cand <- cand[!duplicated(cand$snp),] 

for (i in 1:length(cand$snp)) {
  bar <- cand[i,]
  cand[i,14] <- names(which.max(abs(bar[4:13]))) # gives the variable
  cand[i,15] <- max(abs(bar[4:13]))              # gives the correlation
}

colnames(cand)[14] <- "predictor"
colnames(cand)[15] <- "correlation"

table(cand$predictor) 



#### GradientForest model ####

# need file with latitude and longitude co-ords for each lake/location in the dataset
lakes <- read.csv("D:/Documents/PhD/Genomic_vulnerability/Scripts/Lake_latitude_longitude.csv")
coords <- data.frame(x=lakes$Longitude,y=lakes$Latitude)

# Get climate data (current)
r <- getData("worldclim",var="bio",res=2.5)
# get climate data for our specific lakes
points <- SpatialPoints(coords, proj4string = r@crs)
cellvalues <- extract(r,points, cellnumbers= T)[,1]
values <- extract(r,points)
enviro <- cbind.data.frame( cell =cellvalues, coordinates(points),values)

# Now we need to collect the data for the future environmental variables
r2 <- getData('CMIP5', var='bio', res=2.5, rcp=45, model='AC', year=50)

pointsfut <- SpatialPoints(coords, proj4string = r2@crs)
cellvaluesfut <- extract(r2,pointsfut, cellnumbers= T)[,1]
valuesfut <- extract(r2,pointsfut)

envirofut <- cbind.data.frame( cell =cellvaluesfut, coordinates(pointsfut),valuesfut)


# Load in data which has minor allele frequencies for loci to be used 

gfData <- read.csv("./Data_for_GradientForest.csv")


envGF <- gfData[,5:10] #climate data  

SNPsGF1 <- gfData[,grep("NC",colnames(gfData))] 
SNPsGF2 <- gfData[,grep("NW", colnames(gfData))]#SNP data

SNPsGF <- cbind(SNPsGF1, SNPsGF2)


maxLevel <- log2(0.368*nrow(envGF)/2)

gf <- gradientForest(cbind(envGF, SNPsGF), predictor.vars=colnames(envGF),
                     response.vars=colnames(SNPsGF), ntree=250, 
                     maxLevel=maxLevel, trace=T, corr.threshold=0.7)
plot(gf, plot.type="O")
plot(gf, plot.type="C")
gf$call


# plot output, see ?plot.gradient                                                                                                                                                                                                                                                                                                                                                                                                                          orest
type = "O"
raster::plot(gf, plot.type = "O")

# compare differences between current and future values (genomic vulnerability)
enviro <- read.csv("./Lake_environmental_data.csv")
envirofut <- read.csv("./Lake_environmental_data_future_RCP_45.csv")


proj <- predict(gf, enviro[,-c(1:5, 8, 10:11, 14:15, 17:23)]) 

colnames(envirofut) <- colnames(enviro)
projfut <- predict(gf, envirofut[,-c(1:5, 8, 10:11, 14:15, 17:23)]) 

genOffset <- sqrt((projfut[,1]-proj[,1])^2 + (projfut[,2]-proj[,2])^2
                  +(projfut[,3]-proj[,3])^2 + (projfut[,4]-proj[,4])^2
                  +(projfut[,5]-proj[,5])^2 + (projfut[,6]-proj[,6])^2)

genOffset
