# candidate gene analysis
# R script
# by J.

####~load libraries~~~~~~~~~~####
library(ggplot2)
library(ggrepel)
library(amap)
library(reshape2)
library(rstudioapi)
library(svglite)

####~housekeeping~~~~~~~~~~~~####
rm(list = ls()) # start from an empty environment (attached packages stay loaded)
# Work from the folder that holds this script. The path comes from the RStudio
# API, so this step only works when the script is run inside RStudio.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

###~~define output~~~~~~~~~~~####
output <- "../11_summary_figures"
# showWarnings = FALSE: re-running the script must not warn that the folder
# already exists.
dir.create(output, showWarnings = FALSE)
# Every relative path used after this point is resolved against the output folder.
setwd(output)
# Legend labels for the sample groups ...
samples <- c("Vivi live", "Vivi 30-45 days", "Vivi 60+ days", "Ovi 30-45 days", "Ovi 60+ days")
# ... and the matching group codes, listed in the same order; used as factor levels.
sample_order <- c("WV.live", "WV.m1", "WV.m2", "EO.m1", "EO.m2")

###~~specify data~~~~~~~~~~~~####
# Input files. The paths are relative to the working directory in effect when
# the files are read.

# expression matrices
em_file      <- "../04_edger/cpm.csv"     # CPM
em_log2_file <- "../04_edger/log2cpm.csv" # log2(CPM)

# reference data
ss_file        <- "../02_reference_data/sample_sheet.csv"                    # sample sheet
candidate_file <- "../02_reference_data/Recknagel_et_al_2021_candidates.csv" # candidate genes
EO_v_CVII      <- "../02_reference_data/Recknagel_et_al_2021_DEGs.csv"

# differential expression results
live_v_2m <- "../06_dge_analysis/02_WV_m2vlive/sig.csv"
EO_v_WV   <- "../06_dge_analysis/05_EOvWV_m2/sig.csv"

###~~logfile~~~~~~~~~~~~~~~~~####
# Mirror console output into a dated log file in the working directory.
# sink() only honours `append` when it is given a file name; handed an unopened
# connection it opens the file for writing from scratch. The connection is
# therefore opened in append mode here, so output from earlier runs on the same
# day is kept.
# NOTE(review): the "10_variance_partition_" prefix looks copied from another
# script - confirm the intended log name before changing it.
log_file <- file(paste0("10_variance_partition_", Sys.Date(), ".log"), open = "a")
sink(log_file, append = TRUE, type = "output", split = TRUE)

####~load data~~~~~~~~~~~~~~~####
# Read every input table. The scaled matrix is not built here: it is derived
# from `em` further down, after the sample columns have been selected, so
# scaling the full matrix at this point would only be overwritten.
em <- read.csv(em_file, row.names = 1)           # CPM; first column becomes the row names
em_log2 <- read.csv(em_log2_file, row.names = 1) # log2(CPM); first column becomes the row names
ss <- read.csv(ss_file, row.names = 1)           # sample sheet; first column becomes the row names
# read without a header: the column names are taken from the second row later on
candidates <- read.csv(candidate_file, header = FALSE)
master_m2vl <- read.csv(live_v_2m)
master_EOvWV <- read.csv(EO_v_WV)
reference_EOvCVII <- read.csv(EO_v_CVII)

####~parse data~~~~~~~~~~~~~~####

###~~em~~~~~~~~~~~~~~~~~~~~~~####
# Restrict the expression matrix to the samples listed in the sample sheet
# (cell culture samples), in sample-sheet order.
em <- em[, rownames(ss)]
# Scale each gene across samples: transpose so genes are columns, scale(),
# transpose back. Rows containing NA after scaling are dropped (genes with no
# counts for the cell culture samples).
em_scaled <- na.omit(
  data.frame(t(scale(data.frame(t(em)))))
)

###~~EOvCVII~~~~~~~~~~~~~~~~~####
# Keep the reference rows flagged "YES" in the combined-significance column and
# pull out their gene symbols. Rows where the flag is NA are dropped.
# NOTE: the spelling "siginficant" has to match the column name in the
# reference table - do not correct it here.
EOvCVII_sig <- reference_EOvCVII[
  which(with(reference_EOvCVII, combined_siginficant_padj.0.05 == "YES")), ,
  drop = FALSE
]
EOvCVII_genes <- EOvCVII_sig$gene_symbol

###~~candidates~~~~~~~~~~~~~~####
# The table was read without a header: its second row supplies the column
# names, and the first two rows are then removed.
names(candidates) <- candidates[2, ]
candidates <- candidates[-(1:2), ]
# gene names are in the first column
all_candidate_genes <- candidates[[1]]
# keep only the candidates that have a row in the expression matrix
candidate_genes <- all_candidate_genes[
  match(all_candidate_genes, row.names(em), nomatch = 0L) > 0L
]

####~themes~~~~~~~~~~~~~~~~~~####
# Shared ggplot2 theme for every figure in this script.
theme_j <- theme(
  # no plot title is drawn
  plot.title = element_blank(),
  # axes: large titles, small tick labels
  axis.title.x = element_text(size = 18),
  axis.title.y = element_text(size = 18),
  axis.text.x = element_text(size = 10),
  axis.text.y = element_text(size = 10),
  # white panel with light grey border and grid
  panel.background = element_rect(fill = "white", colour = "lightgrey"),
  panel.grid.major = element_line(colour = "lightgrey", linetype = "solid", linewidth = 0.5),
  panel.grid.minor = element_line(colour = "lightgrey", linetype = "solid", linewidth = 0.25),
  # legend: no title, no key background, 1 cm keys, large labels
  legend.title = element_blank(),
  legend.key = element_blank(),
  legend.key.size = unit(1, "cm"),
  legend.text = element_text(size = 18)
)

###~~palettes~~~~~~~~~~~~~~~~####
# fill colours: red, violet, dark magenta, turquoise, teal
palette <- c("red", "#ee82ee", "#8b008b", "#40e0d0", "#008080")

####~EO-WV vs EO-CVII~~~~~~~~####
# Gene symbols from the EO vs WV table, and the reference EO vs CVII genes that
# also appear among them (order and duplicates of the reference list are kept).
EOvWV <- master_EOvWV$SYMBOL
shared <- EOvCVII_genes[match(EOvCVII_genes, EOvWV, nomatch = 0L) > 0L]

####~candidate genes~~~~~~~~~####
# Faceted boxplots of log2(CPM) for the candidate genes, one panel per gene,
# filled by sample group; saved as candidate_genes_expressed.svg.

# Samples-by-genes table. The columns of em_log2 are selected by the sample
# sheet's row names so that the rows line up with ss$Group.
gene_data <- data.frame(t(em_log2[candidate_genes, row.names(ss), drop = FALSE]))
gene_data$sample_group <- ss$Group
gene_data.m <- melt(gene_data, id.vars = "sample_group")
# Re-level the melted column itself; taking the column from the un-melted table
# only fits the longer melted table through recycling.
gene_data.m$sample_group <- factor(gene_data.m$sample_group, levels = sample_order)

faceted_boxplot_can <- ggplot(gene_data.m, aes(y = value, fill = sample_group)) +
  geom_boxplot(outlier.size = 0, show.legend = TRUE) +
  facet_wrap(~variable, ncol = 5) +
  theme_j +
  scale_fill_manual(name = "Sample Group", values = palette, labels = samples) +
  # theme_j blanks plot.title, so the title set here is not drawn
  labs(title = "Candidate genes from Recknagel et al. 2021", y = "Log2(CPM)") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(nrow = 2))
faceted_boxplot_can
# no width/height given: ggsave() uses the size of the current graphics device
ggsave("candidate_genes_expressed.svg", plot = faceted_boxplot_can)

####~boxplot function~~~~~~~~####
# Draw boxplots of expression for a set of genes, filled by sample group, and
# save them as two SVG files: <savename>.svg (all genes on one x axis) and
# <savename>_faceted.svg (one panel per gene).
#
# Arguments:
#   candidate_genes  row names of `em` to plot
#   ss               sample sheet with a `Group` column; its rows must line up
#                    with the columns of `em`. Required: the former default
#                    `ss = ss` referred to itself and could never be evaluated.
#   em               expression table, genes in rows and samples in columns
#                    (defaults to the global `em_scaled`)
#   palette          fill colours passed to scale_fill_manual(). Required: the
#                    former default `palette_cb1` is not defined in this script.
#   labels           legend labels passed to scale_fill_manual()
#                    (defaults to the global `samples`)
#   savename         file name stem for the two SVG files
#
# Also reads the globals `sample_order` and `theme_j`. The y axis is always
# labelled "log2(CPM)", whatever `em` holds.
# Returns the value of the last ggsave() call.
makeMyBoxplots <- function(candidate_genes,
                           ss,
                           em = em_scaled,
                           palette,
                           labels = samples,
                           savename) {
  #~make gene table~~~~~~~~~####
  # samples in rows, genes in columns, plus the group of each sample
  gene_data <- data.frame(t(em[candidate_genes, ]))
  gene_data$sample_group <- ss$Group
  gene_data.m <- melt(gene_data, id.vars = "sample_group")
  # Re-level the melted column itself; taking the column from the un-melted
  # table only fits the longer melted table through recycling.
  gene_data.m$sample_group <- factor(gene_data.m$sample_group, levels = sample_order)

  #~make boxplot~~~~~~~~~~~~####
  # named grouped_boxplot so that graphics::boxplot is not shadowed
  grouped_boxplot <- ggplot(gene_data.m, aes(x = variable, y = value, fill = sample_group)) +
    geom_boxplot(outlier.size = 0, show.legend = TRUE) +
    theme_j +
    xlab(element_blank()) +
    ylab("log2(CPM)") +
    scale_fill_manual(values = palette, labels = labels) +
    theme(legend.position = "right") +
    theme(axis.text.x = element_text(angle = 65,
                                     hjust = 1,
                                     size = 18))

  #~make faceted boxplot~~~~####
  faceted_boxplot <- ggplot(gene_data.m, aes(y = value, fill = sample_group)) +
    geom_boxplot(outlier.size = 0, show.legend = TRUE) +
    theme_j +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          strip.text = element_text(size = 14)) +
    xlab(element_blank()) +
    ylab("log2(CPM)") +
    scale_fill_manual(values = palette, labels = labels) +
    theme(legend.position = "bottom") +
    facet_wrap(~variable, ncol = 5)

  #~save out~~~~~~~~~~~~~~~~####
  ggsave(paste0(savename, ".svg"), grouped_boxplot, height = 5, width = 8)
  ggsave(paste0(savename, "_faceted.svg"), faceted_boxplot, height = 7, width = 10)
}

####~2m vs live~~~~~~~~~~~~~~####
# Return the rows of `df` in ascending order of column `x` (default "PValue").
# Ordering is done by order(), which places NA values last.
sortByPValue <- function(df, x = "PValue") {
  ascending_rows <- order(df[, x])
  df[ascending_rows, ]
}

###~~remake em~~~~~~~~~~~~~~~####
# WV samples without the WV.m1 group, and the matching log2(CPM) columns
ss.WV <- subset(ss, Lineage == "WV")
ss.m2vl <- subset(ss.WV, Group != "WV.m1")
em.m2vl <- em_log2[, row.names(ss.m2vl)]

###~~get top 20 up~~~~~~~~~~~####
# head() returns at most 20 symbols; indexing rows 1:20 would pad the list
# with NA when fewer genes pass the filter.
m2vl_up <- subset(master_m2vl, logFC > 0)
m2vl_up <- sortByPValue(df = m2vl_up)
m2vl_up_genes <- head(m2vl_up$SYMBOL, 20)

###~~get top 20 down~~~~~~~~~####
m2vl_down <- subset(master_m2vl, logFC < 0)
m2vl_down <- sortByPValue(df = m2vl_down)
m2vl_down_genes <- head(m2vl_down$SYMBOL, 20)

###~~make some boxplots~~~~~~####
# The ten genes with the smallest p-values in each direction are plotted.
# NOTE(review): the file names say "5" although up to ten genes are drawn.
makeMyBoxplots(candidate_genes = head(m2vl_up_genes, 10),
               em = em.m2vl,
               ss = ss.m2vl,
               labels = c("Live", "Cultured (60+ days)"),
               palette = palette[c(1, 3)],
               savename = "m2vl_5_up")

makeMyBoxplots(candidate_genes = head(m2vl_down_genes, 10),
               em = em.m2vl,
               ss = ss.m2vl,
               labels = c("Live", "Cultured (60+ days)"),
               palette = palette[c(1, 3)],
               savename = "m2vl_5_down")

####~EO vs WV~~~~~~~~~~~~~~~~####
###~~remake em~~~~~~~~~~~~~~~####
# samples with DiC above 59, excluding the WV.live group, and the matching
# log2(CPM) columns
ss.m2 <- subset(ss, DiC > 59)
ss.m2 <- subset(ss.m2, Group != "WV.live")
em.m2 <- em_log2[, row.names(ss.m2)]

###~~get top 20 up~~~~~~~~~~~####
# head() returns at most 20 symbols; indexing rows 1:20 would pad the list
# with NA when fewer genes pass the filter.
EOvWV_up <- subset(master_EOvWV, logFC > 0)
EOvWV_up <- sortByPValue(df = EOvWV_up)
EOvWV_up_genes <- head(EOvWV_up$SYMBOL, 20)

###~~get top 20 down~~~~~~~~~####
EOvWV_down <- subset(master_EOvWV, logFC < 0)
EOvWV_down <- sortByPValue(df = EOvWV_down)
EOvWV_down_genes <- head(EOvWV_down$SYMBOL, 20)

###~~make some boxplots~~~~~~####
# The ten genes with the smallest p-values in each direction are plotted.
# NOTE(review): the file names say "5" although up to ten genes are drawn.
makeMyBoxplots(candidate_genes = head(EOvWV_up_genes, 10),
               em = em.m2,
               ss = ss.m2,
               labels = samples[c(3, 5)],
               palette = palette[c(3, 5)],
               savename = "EOvWV_5_up")

makeMyBoxplots(candidate_genes = head(EOvWV_down_genes, 10),
               em = em.m2,
               ss = ss.m2,
               labels = samples[c(3, 5)],
               palette = palette[c(3, 5)],
               savename = "EOvWV_5_down")

####~citations~~~~~~~~~~~~~~~####
# Write BibTeX entries into the citations folder. writeLines() is used instead
# of sink() plus auto-printing so that the files are also filled when the
# script is run through source(), where top-level values are not printed.
# NOTE(review): AnnotationDbi and topGO are not loaded by the visible part of
# this script; citation() needs both packages to be installed.
dir.create("citations", showWarnings = FALSE) # no warning when the folder already exists
writeLines(toBibtex(citation()), "citations/R.bib")
writeLines(toBibtex(citation("AnnotationDbi")), "citations/AnnotationDbi.bib")
writeLines(toBibtex(citation("topGO")), "citations/topGO.bib")

# printed to standard output (and therefore to any active output sink) rather
# than to a file
print(toBibtex(citation("ggplot2")))

####~fin~~~~~~~~~~~~~~~~~~~~~####
closeAllConnections() # closes all open connections and removes active sinks