# Identify batch effect loci library(edgeR);library(DESeq2);library(EDASeq);library(data.table);library(tidyverse) # In this script we use only lakes with samples split across libraries to identify # loci that have a batch effect signal. Once identified using edgeR, these loci are # removed from the dataset. setwd("~/Desktop/ddRAD_epiRAD_powan/demultiplexed_reads/EpiRAD/Intermediate_files3/08.EpiRAD/datasets/") # Library batch ----- DEdata1 <- read.delim('batch_data_counts.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1) DEdata2 <- read.delim('batch_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1) set <- newSeqExpressionSet(as.matrix(DEdata1),phenoData = DEdata2) plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=DEdata2$Library) # plots the relative log expression of read counts plotPCA(set, col=DEdata2$Library,k=5,cex=1.2) plotPCA(set, col=DEdata2$Lake,k=5,cex=1.2) x <- as.factor(DEdata2$Library) design <- model.matrix(~ x , data=DEdata2) y <-DGEList(counts = DEdata1, group = x) y <- calcNormFactors(y) y <- estimateDisp(y, design) fit <- glmFit(y,design) lrt <- glmLRT(fit, coef=3) top <- topTags(lrt, n=nrow(set))$table top_batch_loci12 <- top %>% rownames_to_column("Locus") %>% filter(FDR < 0.05 | logFC >= 1 | logFC <= -1) lrt <- glmLRT(fit, coef=2) top <- topTags(lrt, n=nrow(set))$table top_batch_loci13 <- top %>% rownames_to_column("Locus") %>% filter(FDR < 0.05 | logFC >= 1 | logFC <= -1) lrt <- glmLRT(fit, contrast=c(0,-1,1)) top <- topTags(lrt, n=nrow(set))$table top_batch_loci23 <- top %>% rownames_to_column("Locus") %>% filter(FDR < 0.05 | logFC >= 1 | logFC <= -1) batch_loci <- data.frame(unique(c(top_batch_loci12$Locus,top_batch_loci13$Locus,top_batch_loci23$Locus))) colnames(batch_loci) <- "Locus" fwrite(batch_loci, "list_of_library_batch_loci_glmFit.txt", sep = "\t", row.names = FALSE) # Adapter batch ---- DEdata1 <- read.delim('combined_counts.txt',header=TRUE, stringsAsFactors=TRUE) DEdata2 <- read.delim('combined_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1) DEdata1 <- DEdata1[!(DEdata1$Locus %in% batch_loci$Locus),] rownames(DEdata1) <- DEdata1$Locus DEdata1 <- DEdata1[,-1] set <- newSeqExpressionSet(as.matrix(DEdata1),phenoData = DEdata2) x <- as.factor(DEdata2$factor) design <- model.matrix(~ x , data=pData(set)) y <- DGEList(counts = DEdata1, group = DEdata2$Lake, samples = DEdata2) y <- calcNormFactors(y) y <- estimateDisp(y, design) fit <- glmFit(y, design) lrt <- glmLRT(fit, coef=2) top <- topTags(lrt, n=nrow(set))$table summary(decideTests(lrt)) top_adpater_batch <- top %>% rownames_to_column("Locus") %>% filter(FDR < 0.05) factor <- rownames(set)[which(rownames(set)%in% rownames(top)[top$FDR < 0.1])] combined_batch <- data.frame(factor) colnames(combined_batch) <- "Locus" fwrite(top_adpater_batch, "list_of_adapter_batch_loci_glmFit.txt", sep = "\t", row.names = FALSE)