# Identify batch effect loci
library(edgeR);library(DESeq2);library(EDASeq);library(data.table);library(tidyverse)
# In this script we use only lakes with samples split across libraries to identify 
# loci that have a batch effect signal. Once identified using edgeR, these loci are
# removed from the dataset. 
setwd("~/Desktop/ddRAD_epiRAD_powan/demultiplexed_reads/EpiRAD/Intermediate_files3/08.EpiRAD/datasets/")


# Library batch -----
DEdata1 <- read.delim('batch_data_counts.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1)
DEdata2 <- read.delim('batch_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1)

set <- newSeqExpressionSet(as.matrix(DEdata1),phenoData = DEdata2)
plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=DEdata2$Library)  # plots the relative log expression of read counts
plotPCA(set, col=DEdata2$Library,k=5,cex=1.2)
plotPCA(set, col=DEdata2$Lake,k=5,cex=1.2)


x <- as.factor(DEdata2$Library)
design <- model.matrix(~ x , data=DEdata2)
y <-DGEList(counts = DEdata1, group = x)
y <- calcNormFactors(y)
y <- estimateDisp(y, design)

fit <- glmFit(y,design)

lrt <- glmLRT(fit, coef=3)
top <- topTags(lrt, n=nrow(set))$table
top_batch_loci12 <- top %>% 
  rownames_to_column("Locus") %>% 
  filter(FDR < 0.05 | logFC >= 1 | logFC <= -1)

lrt <- glmLRT(fit, coef=2)
top <- topTags(lrt, n=nrow(set))$table
top_batch_loci13 <- top %>% 
  rownames_to_column("Locus") %>% 
  filter(FDR < 0.05 | logFC >= 1 | logFC <= -1)

lrt <- glmLRT(fit, contrast=c(0,-1,1))
top <- topTags(lrt, n=nrow(set))$table
top_batch_loci23 <- top %>% 
  rownames_to_column("Locus") %>% 
  filter(FDR < 0.05 | logFC >= 1 | logFC <= -1)

batch_loci <- data.frame(unique(c(top_batch_loci12$Locus,top_batch_loci13$Locus,top_batch_loci23$Locus)))
colnames(batch_loci) <- "Locus"
fwrite(batch_loci, "list_of_library_batch_loci_glmFit.txt", sep = "\t", row.names = FALSE)


# Adapter batch ----

DEdata1 <- read.delim('combined_counts.txt',header=TRUE, stringsAsFactors=TRUE)
DEdata2 <- read.delim('combined_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1)

DEdata1 <- DEdata1[!(DEdata1$Locus %in% batch_loci$Locus),]
rownames(DEdata1) <- DEdata1$Locus
DEdata1 <- DEdata1[,-1]

set <- newSeqExpressionSet(as.matrix(DEdata1),phenoData = DEdata2)

x <- as.factor(DEdata2$factor)
design <- model.matrix(~ x , data=pData(set))
y <- DGEList(counts = DEdata1, group = DEdata2$Lake, samples = DEdata2)
y <- calcNormFactors(y)
y <- estimateDisp(y, design)

fit <- glmFit(y, design)

lrt <- glmLRT(fit, coef=2)
top <- topTags(lrt, n=nrow(set))$table
summary(decideTests(lrt))
top_adpater_batch <- top %>% 
  rownames_to_column("Locus") %>% 
  filter(FDR < 0.05)

factor <- rownames(set)[which(rownames(set)%in% rownames(top)[top$FDR < 0.1])]
combined_batch <- data.frame(factor)
colnames(combined_batch) <- "Locus"

fwrite(top_adpater_batch, "list_of_adapter_batch_loci_glmFit.txt", sep = "\t", row.names = FALSE)