## EpiRADseq data analysis
## whitefish project

# Functions ----

# Consistent plot aesthetics for PCA
theme.pca <- function() {
  theme_bw() +
    theme(panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          panel.background = element_rect(colour="black",fill="white", size=1),
          axis.text = element_text(size=16, color = "black"),
          axis.ticks = element_line(size = 0.5, colour = "black"),
          axis.ticks.length = unit(3, "mm"),
          axis.title.y = element_text(size = 30),
          axis.title.x = element_text(size = 30),
          axis.text.x = element_text(size=20),
          axis.text.y = element_text(size=20),
          legend.title = element_text(size = 20),
          legend.text = element_text(size = 20))
}

cols <- c("Allt_na_lairige"="#a4dede","Carron"="#008396","Eck"="gray60","Glashan"="#9b2915","Lomond"="gray60","Shira"="#287c71",
          "Sloy"="#6c966f","Tarsan"="#fe7c73")
# library ----
library(RUVSeq);library(EDASeq);library(edgeR);library(DESeq2);library(variancePartition);library(limma)

# set working direcotry ----
setwd("~/Desktop/ddRAD_epiRAD_powan/demultiplexed_reads/EpiRAD/Intermediate_files3/08.EpiRAD/datasets/")

#### Lomond system ----

# read data ----
DEdata1 <- read.delim('lom_filtered_counts.txt',header=TRUE, stringsAsFactors=TRUE)
DEdata2 <- read.delim('lom_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1)

# read the table with the batch loci ID
library_batch_loci <- read.table("list_of_library_batch_loci_glmFit.txt", header = TRUE)
adapter_batch_loci <- read.table("list_of_adapter_batch_loci_glmFit.txt", header = TRUE)

DEdata1 <- DEdata1[!(DEdata1$Locus %in% library_batch_loci$Locus),]
DEdata1 <- DEdata1[!(DEdata1$Locus %in% adapter_batch_loci$Locus),]

write.table(DEdata1, "final_lomond_dataset_count_table.txt", sep = "\t", quote = FALSE, row.names = FALSE)

rownames(DEdata1) <- DEdata1$Locus
DEdata1 <- DEdata1[,-1]


# DESeq2 exploratory analysis ----
dds_lom <- DESeqDataSetFromMatrix(countData = DEdata1,colData = DEdata2,design = ~ Lake)
dds_lom <- estimateSizeFactors(dds_lom) # Estimate library size correction scaling factors
vst_lom <- vst(dds_lom) # compute vst transformation
rld_lom <- rlog(dds_lom) # compute vst transformation
write.table(assay(rld_lom), file="rlog_lom_reads.txt", sep="\t")
rld_lom <- read.table("../rda/rlog_lom_reads2.txt", header = TRUE)

data <- plotPCA(rld_lom, intgroup=c("Lake"), returnData=TRUE, ntop = 120897) # set ntop value to full number of loci in count table
ggplot(data, aes(PC1, PC2, colour = Lake)) + geom_point(size = 6, alpha = 0.85) + theme.pca() + scale_color_manual(values = cols) +
  geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') + labs(x="PC1 5%", y="PC2 4%")


# run model in edgeR ----
y <- DGEList(counts = DEdata1, group = DEdata2$Lake, samples = DEdata2)
y <- calcNormFactors(y)  # Calculate normalization factors to scale the raw library sizes
design <- model.matrix(~0+Lake, data = DEdata2)
y <-estimateDisp(y,design)

fit <- glmFit(y,design)

lom_car <- glmLRT(fit, contrast=c(0,-1,1,0,0))
summary(decideTests(lom_car))
topTags(lom_car, n=10, p.value = 0.05)
plotMD(lom_car)
abline(h=c(-1, 1), col="blue")
lom_car_tops <- topTags(lom_car,n=500, adjust.method = "BH",p.value = 0.05, sort.by = "logFC")
write.csv(lom_car_tops, "lom_car_top_methyl_loci.csv")

lom_sloy <- glmLRT(fit, contrast=c(0,0,1,0,-1))
summary(decideTests(lom_sloy))
lom_sloy_tops <- topTags(lom_sloy,n=500, p.value=0.05, sort.by = "logFC")
write.csv(lom_sloy_tops, "lom_sloy_top_methyl_loci.csv")
topTags(lom_sloy, n=10, p.value = 0.05)
plotMD(lom_sloy)
abline(h=c(-1, 1), col="blue")

lom_shi <- glmLRT(fit, contrast=c(0,0,1,-1,0))
summary(decideTests(lom_shi))
lom_shi_tops <- topTags(lom_shi,n=500, p.value = 0.05, sort.by = "logFC")
write.csv(lom_shi_tops, "lom_shi_top_methyl_loci.csv")
plotMD(lom_shi)
abline(h=c(-1, 1), col="blue")

lom_lai <- glmLRT(fit, contrast=c(-1,0,1,0,0))
summary(decideTests(lom_lai))
lom_lai_tops <- topTags(lom_lai,n=500, p.value=0.05, sort.by = "logFC")
write.csv(lom_lai_tops, "lom_lai_top_methyl_loci.csv")
topTags(lom_lai, n=10, p.value = 0.05)
plotMD(lom_lai)
abline(h=c(-1, 1), col="blue")


#### Eck system ----
# read data ----
DEdata1 <- read.delim('eck_filtered_counts.txt',header=TRUE, stringsAsFactors=TRUE)
DEdata2 <- read.delim('eck_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1)

DEdata1 <- DEdata1[!(DEdata1$Locus %in% library_batch_loci$Locus),]
DEdata1 <- DEdata1[!(DEdata1$Locus %in% adapter_batch_loci$Locus),]

write.table(DEdata1, "final_eck_dataset_count_table.txt", sep = "\t", quote = FALSE, row.names = FALSE)

rownames(DEdata1) <- DEdata1$Locus
DEdata1 <- DEdata1[,-1]

y <- DGEList(counts = DEdata1, group = DEdata2$Lake, samples = DEdata2)

# DESeq2 exploratory analysis
dds_eck <- DESeqDataSetFromMatrix(countData = DEdata1, colData = DEdata2,design = ~ Lake)
dds_eck <- estimateSizeFactors(dds_eck) # Estimate library size correction scaling factors
vst_eck <- vst(dds_eck) # compute vst transformation
rlog_eck <- rlog(dds_eck) # compute log transformation
write.table(assay(rlog_eck), file="rlog_eck_reads.txt", sep="\t")
rlog_eck <- read.table("../rda/rlog_eck_reads.txt", header = TRUE)

data <- plotPCA(rlog_eck, intgroup=c("Lake"), ntop = 117395, returnData=TRUE) # set ntop value to full number of loci in count table
ggplot(data, aes(PC1, PC2, colour = Lake)) + geom_point(size = 6) + theme.pca() + scale_color_manual(values = cols) +
  geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') + labs(x="PC1 4%", y="PC2 4%")


# run model in edgeR ----
y <- calcNormFactors(y)  # Calculate normalization factors to scale the raw library sizes
design <- model.matrix(~0+Lake, data = DEdata2)
disp <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTagwiseDisp(disp, design)

fit <- glmFit(y,design)


eck_tar <- glmLRT(fit, contrast=c(1,0,-1))
summary(decideTests(eck_tar))
eck_tar_tops <- topTags(eck_tar, n=200, p.value = 0.05, sort.by = "PValue")
write.csv(eck_tar_tops, "eck_tar_top_methyl_loci.csv")

eck_gla <- glmLRT(fit, contrast=c(1,-1,0))
summary(decideTests(eck_gla))
eck_gla_tops <- topTags(eck_gla, n=1500, p.value = 0.05, sort.by = "logFC")
write.csv(eck_gla_tops, "eck_gla_top_methyl_loci.csv")
plotMD(eck_gla)
abline(h=c(-1, 1), col="blue")


# combined dataset ---- 

DEdata1 <- read.delim('combined_counts.txt',header=TRUE, stringsAsFactors=TRUE)
DEdata2 <- read.delim('combined_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1)

library_batch_loci <- read.table("list_of_library_batch_loci_glmFit.txt", header = TRUE)
adapter_batch_loci <- read.table("list_of_adapter_batch_loci_glmFit.txt", header = TRUE)

write.table(DEdata1, "final_combined_dataset_count_table.txt", sep = "\t", quote = FALSE, row.names = FALSE)

rownames(DEdata1) <- DEdata1$Locus
DEdata1 <- DEdata1[,-1]


# DESeq2
dds_combined <- DESeqDataSetFromMatrix(countData = DEdata1,colData = DEdata2,design = ~ 1)
dds_combined <- estimateSizeFactors(dds) # Estimate library size correction scaling factors
vst_combined <- vst(dds_combined) # compute vst transformation
rld_combined <- rlog(dds_combined)
write.table(assay(rld_combined), file="rlog_reads_combined.txt", sep="\t", quote = FALSE)  # this is used for the epigenetic RDA 
rlog_reads <- data.frame(assay(rld_combined))