### Creating PCA plots for Eck and Lomond systems separately and combined ### using SNPRelate and plotting with ggplot # Packages ---- library(SNPRelate);library(tidyverse) # Functions ---- # Consistent plot aesthetics for PCA theme.pca <- function() { theme_bw() + theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_rect(colour="black",fill="white", size=1), axis.text = element_text(size=16, color = "black"), axis.ticks = element_line(size = 0.5, colour = "black"), axis.ticks.length = unit(3, "mm"), axis.title.y = element_text(size = 30), axis.title.x = element_text(size = 30), axis.text.x = element_text(size=20), axis.text.y = element_text(size=20), legend.title = element_text(size = 20), legend.text = element_text(size = 20)) } # Lomond ---- # Set working directory setwd("~/Desktop/ddRAD_epiRAD_powan/demultiplexed_reads/EpiRAD/Intermediate_files3/06.Population_genetics/lomond/PCA/") # Import vcf file and popdata file ---- vcf.fn <- "lom.recode.vcf" snpgdsVCF2GDS(vcf.fn, "test.gds", method="biallelic.only") snpgdsSummary("test.gds") genofile <- snpgdsOpen("test.gds") pop_data <- read.table("popmap_ddRAD_epiRAD_lom_final.txt",header=FALSE) # Start analysis ---- set.seed(1000) # for reproducibility # pca with all SNPs pca <- snpgdsPCA(genofile, num.thread=2, autosome.only = FALSE) # # variance proportion (%) pc.percent <- pca$varprop*100 pc.percent <- head(round(pc.percent, 2)) # Manipulate results ---- tab <- data.frame(sample.id = pca$sample.id, EV1 = pca$eigenvect[,1], # the first eigenvector EV2 = pca$eigenvect[,2], # the second eigenvector EV3 = pca$eigenvect[,3], # the third eigenvector EV4 = pca$eigenvect[,4], EV5 = pca$eigenvect[,5], stringsAsFactors = FALSE) head(tab) # add population data tab[,7] <- pop_data$V2 colnames(tab)[7] <- "Lake" # recode lake names lakes_renamed <- recode(tab$Lake, lai = "Allt na lairige", shi = "Shira", car = "Carron", sloy = "Sloy", lom = "Lomond") tab[,7] <- lakes_renamed write.csv(tab, "gen_pca_lom.csv", row.names = FALSE) tab$Lake <- factor(tab$Lake, levels = c("Allt na lairige", "Shira", "Carron", "Sloy", "Lomond")) # reorder the levels of the lakes # Plot results ---- cols <- c("Allt na lairige" = "#a4dede", "Shira" = "#287c71", "Carron" = "#008396", "Sloy" = "#6c966f", "Lomond" = "gray60") # define colours for each Lake # PC1 and PC2 pdf(file = "~/Dropbox/Marco_Crotti/Evolutionary genomics of whitefish/Translocation project/Genomic analyses/figures/lom_pc12.pdf", width = 11.69, height = 8.27) ggplot(tab, aes(x = EV1, y = EV2, colour = Lake)) + geom_point(size=6, alpha=0.85) + scale_color_manual(values = cols) + theme.pca() + labs(x=paste("EV1",round(pc.percent, 2)[1],"%"), y = paste("EV2", round(pc.percent, 2)[2], "%")) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') dev.off() # PC1 and PC3 pdf(file = "~/Dropbox/Marco_Crotti/Evolutionary genomics of whitefish/Translocation project/Genomic analyses/figures/lom_pc13.pdf", width = 11.69, height = 8.27) ggplot(tab, aes(x = EV1, y = EV3,col = Lake)) + geom_point(size=6, alpha=0.85) + scale_color_manual(values = cols) + theme.pca() + labs(x=paste("EV1",round(pc.percent, 2)[1],"%"), y=paste("EV3", round(pc.percent, 2)[3], "%")) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') dev.off() #Eck ---- setwd("~/Desktop/ddRAD_epiRAD_powan/demultiplexed_reads/EpiRAD/Intermediate_files3/06.Population_genetics/eck/PCA/") # Import vcf vcf.fn <- "eck.recode.vcf" snpgdsVCF2GDS(vcf.fn, "test.gds", method="biallelic.only") snpgdsSummary("test.gds") genofile <- snpgdsOpen("test.gds", allow.duplicate = TRUE) pop_data <- read.table("popmap_ddRAD_epiRAD_eck_final.txt",header=FALSE) # Start analysis set.seed(1000) # for reproducibility # pca with all SNPs pca <- snpgdsPCA(genofile, num.thread=2, autosome.only = FALSE) # # variance proportion (%) pc.percent <- pca$varprop*100 pc.percent <- head(round(pc.percent, 2)) # Manipulate results tab <- data.frame(sample.id = pca$sample.id, EV1 = pca$eigenvect[,1], # the first eigenvector EV2 = pca$eigenvect[,2], # the second eigenvector EV3 = pca$eigenvect[,3], # the third eigenvector EV4 = pca$eigenvect[,4], EV5 = pca$eigenvect[,5], stringsAsFactors = FALSE) head(tab) # add population data tab[,7] <- pop_data$V2 colnames(tab)[7] <- "Lake" # recode lake names lakes_renamed <- recode(tab$Lake, gla = "Glashan", tar = "Tarsan", eck = "Eck") tab[,7] <- lakes_renamed write.csv(tab, "gen_pca_eck.csv", row.names = FALSE) tab$Lake <- factor(tab$Lake, levels = c("Glashan", "Tarsan", "Eck")) # reorder the levels of the lakes # Plot results cols <- c("Glashan" = "#9b2915", "Tarsan"="#fe7c73", "Eck"="gray60") # define colours for each Lake # PC1 and PC2 pdf(file = "~/Dropbox/Marco_Crotti/Evolutionary genomics of whitefish/Translocation project/Genomic analyses/figures/eck_pc12.pdf", width = 11.69, height = 8.27) ggplot(tab, aes(x = EV1, y = EV2, colour = Lake)) + geom_point(size=6, alpha=0.85) + scale_color_manual(values = cols) + theme.pca() + labs(x=paste("EV1",round(pc.percent, 2)[1],"%"), y=paste("EV2", round(pc.percent, 2)[2], "%")) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') dev.off() # PC1 and PC3 pdf(file = "~/Dropbox/Marco_Crotti/Evolutionary genomics of whitefish/Translocation project/Genomic analyses/figures/eck_pc13.pdf", width = 11.69, height = 8.27) ggplot(tab, aes(x = EV1, y = EV3,col = Lake)) + geom_point(size=6, alpha=0.85) + scale_color_manual(values = cols) + theme.pca() + labs(x=paste("EV1",round(pc.percent, 2)[1],"%"), y=paste("EV3", round(pc.percent, 2)[3], "%")) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') dev.off() #### combined ---- setwd("~/Desktop/ddRAD_epiRAD_powan/demultiplexed_reads/EpiRAD/Intermediate_files3/06.Population_genetics/combined/PCA/") # 1 first we do Lomond # Import vcf file and popdata file vcf.fn <- "combined.recode.vcf" snpgdsVCF2GDS(vcf.fn, "test.gds", method="biallelic.only") snpgdsSummary("test.gds") genofile <- snpgdsOpen("test.gds") pop_data <- read.table("popmap_ddRAD_epiRAD_combined_final.txt",header=FALSE) # Start analysis set.seed(1000) # for reproducibility # pca with all SNPs pca <- snpgdsPCA(genofile, num.thread=2, autosome.only = FALSE) # # variance proportion (%) pc.percent <- pca$varprop*100 pc.percent <- head(round(pc.percent, 2)) # Manipulate results tab <- data.frame(sample.id = pca$sample.id, EV1 = pca$eigenvect[,1], # the first eigenvector EV2 = pca$eigenvect[,2], # the second eigenvector EV3 = pca$eigenvect[,3], # the third eigenvector EV4 = pca$eigenvect[,4], EV5 = pca$eigenvect[,5], stringsAsFactors = FALSE) head(tab) # add population data tab[,7] <- pop_data$V2 colnames(tab)[7] <- "Lake" # recode lake names lakes_renamed <- recode(tab$Lake, gla = "Glashan", tar = "Tarsan", eck = "Eck", lai = "Allt na lairige", shi = "Shira", car = "Carron", sloy = "Sloy", lom = "Lomond") tab[,7] <- lakes_renamed write.csv(tab, "gen_pca_combined.csv", row.names = FALSE) # Plot results cols <- c("Allt na lairige"="#a4dede","Carron"="#008396","Eck"="gray60","Glashan"="#9b2915","Lomond"="gray60","Shira"="#287c71", "Sloy"="#6c966f","Tarsan"="#fe7c73") tab$Lake <- factor(tab$Lake,levels = c("Glashan", "Tarsan","Eck","Allt na lairige", "Carron","Shira","Sloy","Lomond")) tab <- tab %>% mutate(System = case_when( grepl("Lomond", Lake) ~ "Lomond", grepl("Allt na lairige", Lake) ~ "Lomond", grepl("Shira", Lake) ~ "Lomond", grepl("Carron", Lake) ~ "Lomond", grepl("Sloy", Lake) ~ "Lomond", grepl("Eck", Lake) ~ "Eck", grepl("Glashan", Lake) ~ "Eck", grepl("Tarsan", Lake) ~ "Eck" )) # PC1 and PC2 pdf(file = "~/Dropbox/Marco_Crotti/Evolutionary genomics of whitefish/Translocation project/Genomic analyses/figures/combined_12.pdf", width = 11.69, height = 8.27) ggplot(tab, aes(x = EV1, y = EV2, colour = Lake, shape = System)) + geom_point(size=6, alpha=0.75) + scale_color_manual(values = cols) + theme.pca() + labs(x=paste("EV1",round(pc.percent, 2)[1],"%"), y=paste("EV2", round(pc.percent, 2)[2], "%")) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') dev.off() # PC1 and PC3 pdf(file = "~/Dropbox/Marco_Crotti/Evolutionary genomics of whitefish/Translocation project/Genomic analyses/figures/combined_13.pdf", width = 11.69, height = 8.27) ggplot(tab, aes(x = EV1, y = EV3,col = Lake, shape = System)) + geom_point(size=6, alpha=0.85) + scale_color_manual(values = cols) + theme.pca() + labs(x=paste("EV1",round(pc.percent, 2)[1],"%"), y=paste("EV3", round(pc.percent, 2)[3], "%")) + geom_vline(xintercept = 0, linetype = 'dashed') + geom_hline(yintercept = 0, linetype = 'dashed') dev.off()