## EpiRADseq data analysis ## whitefish project # Functions ---- # Consistent plot aesthetics for PCA theme.pca <- function() { theme_bw() + theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_rect(colour="black",fill="white", size=1), axis.text = element_text(size=16, color = "black"), axis.ticks = element_line(size = 0.5, colour = "black"), axis.ticks.length = unit(3, "mm"), axis.title.y = element_text(size = 30), axis.title.x = element_text(size = 30), axis.text.x = element_text(size=20), axis.text.y = element_text(size=20), legend.title = element_text(size = 20), legend.text = element_text(size = 20)) } cols <- c("Allt_na_lairige"="#a4dede","Carron"="#008396","Eck"="gray60","Glashan"="#9b2915","Lomond"="gray60","Shira"="#287c71", "Sloy"="#6c966f","Tarsan"="#fe7c73") # library ---- library(RUVSeq);library(EDASeq);library(edgeR);library(DESeq2);library(variancePartition);library(limma) # set working direcotry ---- setwd("~/Desktop/ddRAD_epiRAD_powan/demultiplexed_reads/EpiRAD/Intermediate_files3/08.EpiRAD/datasets/") #### Lomond system ---- # read data ---- DEdata1 <- read.delim('lom_filtered_counts.txt',header=TRUE, stringsAsFactors=TRUE) DEdata2 <- read.delim('lom_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1) # read the table with the batch loci ID library_batch_loci <- read.table("list_of_library_batch_loci_glmFit.txt", header = TRUE) adapter_batch_loci <- read.table("list_of_adapter_batch_loci_glmFit.txt", header = TRUE) DEdata1 <- DEdata1[!(DEdata1$Locus %in% library_batch_loci$Locus),] DEdata1 <- DEdata1[!(DEdata1$Locus %in% adapter_batch_loci$Locus),] write.table(DEdata1, "final_lomond_dataset_count_table.txt", sep = "\t", quote = FALSE, row.names = FALSE) rownames(DEdata1) <- DEdata1$Locus DEdata1 <- DEdata1[,-1] # DESeq2 exploratory analysis ---- dds_lom <- DESeqDataSetFromMatrix(countData = DEdata1,colData = DEdata2,design = ~ Lake) dds_lom <- estimateSizeFactors(dds_lom) # Estimate library size correction scaling factors vst_lom <- vst(dds_lom) # compute vst transformation rld_lom <- rlog(dds_lom) # compute vst transformation write.table(assay(rld_lom), file="rlog_lom_reads.txt", sep="\t") rld_lom <- read.table("../rda/rlog_lom_reads2.txt", header = TRUE) data <- plotPCA(rld_lom, intgroup=c("Lake"), returnData=TRUE, ntop = 120897) # set ntop value to full number of loci in count table ggplot(data, aes(PC1, PC2, colour = Lake)) + geom_point(size = 6, alpha = 0.85) + theme.pca() + scale_color_manual(values = cols) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') + labs(x="PC1 5%", y="PC2 4%") # run model in edgeR ---- y <- DGEList(counts = DEdata1, group = DEdata2$Lake, samples = DEdata2) y <- calcNormFactors(y) # Calculate normalization factors to scale the raw library sizes design <- model.matrix(~0+Lake, data = DEdata2) y <-estimateDisp(y,design) fit <- glmFit(y,design) lom_car <- glmLRT(fit, contrast=c(0,-1,1,0,0)) summary(decideTests(lom_car)) topTags(lom_car, n=10, p.value = 0.05) plotMD(lom_car) abline(h=c(-1, 1), col="blue") lom_car_tops <- topTags(lom_car,n=500, adjust.method = "BH",p.value = 0.05, sort.by = "logFC") write.csv(lom_car_tops, "lom_car_top_methyl_loci.csv") lom_sloy <- glmLRT(fit, contrast=c(0,0,1,0,-1)) summary(decideTests(lom_sloy)) lom_sloy_tops <- topTags(lom_sloy,n=500, p.value=0.05, sort.by = "logFC") write.csv(lom_sloy_tops, "lom_sloy_top_methyl_loci.csv") topTags(lom_sloy, n=10, p.value = 0.05) plotMD(lom_sloy) abline(h=c(-1, 1), col="blue") lom_shi <- glmLRT(fit, contrast=c(0,0,1,-1,0)) summary(decideTests(lom_shi)) lom_shi_tops <- topTags(lom_shi,n=500, p.value = 0.05, sort.by = "logFC") write.csv(lom_shi_tops, "lom_shi_top_methyl_loci.csv") plotMD(lom_shi) abline(h=c(-1, 1), col="blue") lom_lai <- glmLRT(fit, contrast=c(-1,0,1,0,0)) summary(decideTests(lom_lai)) lom_lai_tops <- topTags(lom_lai,n=500, p.value=0.05, sort.by = "logFC") write.csv(lom_lai_tops, "lom_lai_top_methyl_loci.csv") topTags(lom_lai, n=10, p.value = 0.05) plotMD(lom_lai) abline(h=c(-1, 1), col="blue") #### Eck system ---- # read data ---- DEdata1 <- read.delim('eck_filtered_counts.txt',header=TRUE, stringsAsFactors=TRUE) DEdata2 <- read.delim('eck_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1) DEdata1 <- DEdata1[!(DEdata1$Locus %in% library_batch_loci$Locus),] DEdata1 <- DEdata1[!(DEdata1$Locus %in% adapter_batch_loci$Locus),] write.table(DEdata1, "final_eck_dataset_count_table.txt", sep = "\t", quote = FALSE, row.names = FALSE) rownames(DEdata1) <- DEdata1$Locus DEdata1 <- DEdata1[,-1] y <- DGEList(counts = DEdata1, group = DEdata2$Lake, samples = DEdata2) # DESeq2 exploratory analysis dds_eck <- DESeqDataSetFromMatrix(countData = DEdata1, colData = DEdata2,design = ~ Lake) dds_eck <- estimateSizeFactors(dds_eck) # Estimate library size correction scaling factors vst_eck <- vst(dds_eck) # compute vst transformation rlog_eck <- rlog(dds_eck) # compute log transformation write.table(assay(rlog_eck), file="rlog_eck_reads.txt", sep="\t") rlog_eck <- read.table("../rda/rlog_eck_reads.txt", header = TRUE) data <- plotPCA(rlog_eck, intgroup=c("Lake"), ntop = 117395, returnData=TRUE) # set ntop value to full number of loci in count table ggplot(data, aes(PC1, PC2, colour = Lake)) + geom_point(size = 6) + theme.pca() + scale_color_manual(values = cols) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') + labs(x="PC1 4%", y="PC2 4%") # run model in edgeR ---- y <- calcNormFactors(y) # Calculate normalization factors to scale the raw library sizes design <- model.matrix(~0+Lake, data = DEdata2) disp <- estimateGLMCommonDisp(y,design) y <- estimateGLMTagwiseDisp(disp, design) fit <- glmFit(y,design) eck_tar <- glmLRT(fit, contrast=c(1,0,-1)) summary(decideTests(eck_tar)) eck_tar_tops <- topTags(eck_tar, n=200, p.value = 0.05, sort.by = "PValue") write.csv(eck_tar_tops, "eck_tar_top_methyl_loci.csv") eck_gla <- glmLRT(fit, contrast=c(1,-1,0)) summary(decideTests(eck_gla)) eck_gla_tops <- topTags(eck_gla, n=1500, p.value = 0.05, sort.by = "logFC") write.csv(eck_gla_tops, "eck_gla_top_methyl_loci.csv") plotMD(eck_gla) abline(h=c(-1, 1), col="blue") # combined dataset ---- DEdata1 <- read.delim('combined_counts.txt',header=TRUE, stringsAsFactors=TRUE) DEdata2 <- read.delim('combined_data.txt',header=TRUE, stringsAsFactors=TRUE, row.names=1) library_batch_loci <- read.table("list_of_library_batch_loci_glmFit.txt", header = TRUE) adapter_batch_loci <- read.table("list_of_adapter_batch_loci_glmFit.txt", header = TRUE) write.table(DEdata1, "final_combined_dataset_count_table.txt", sep = "\t", quote = FALSE, row.names = FALSE) rownames(DEdata1) <- DEdata1$Locus DEdata1 <- DEdata1[,-1] # DESeq2 dds_combined <- DESeqDataSetFromMatrix(countData = DEdata1,colData = DEdata2,design = ~ 1) dds_combined <- estimateSizeFactors(dds) # Estimate library size correction scaling factors vst_combined <- vst(dds_combined) # compute vst transformation rld_combined <- rlog(dds_combined) write.table(assay(rld_combined), file="rlog_reads_combined.txt", sep="\t", quote = FALSE) # this is used for the epigenetic RDA rlog_reads <- data.frame(assay(rld_combined))