## Add annotations from eggnog mapper to Cellenics DGE results
## by J.

####Housekeeping~~~~~~~~~~~~~####
rm(list = ls()) # clear the environment

## Locate the Scripts folder so the relative paths used below resolve.
## - Inside RStudio: the folder of the active document.
## - Under Rscript (rstudioapi cannot be queried there): the folder of the
##   file given by the --file= command-line argument.
## - Otherwise (e.g. source() in a plain R console): the current working
##   directory is kept as it is.
findScriptDir <- function() {
  if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
    return(dirname(rstudioapi::getActiveDocumentContext()$path))
  }
  fileArg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
  if (length(fileArg) > 0) {
    ## Rscript encodes spaces in the script path as "~+~"
    scriptPath <- gsub("~+~", " ", sub("^--file=", "", fileArg[1]), fixed = TRUE)
    return(dirname(normalizePath(scriptPath)))
  }
  getwd()
}
setwd(findScriptDir()) # set wd to Scripts folder

###~Specify data~~~~~~~~~~~~~####
## All paths are relative to the working directory set above.
## dgeDir keeps its trailing slash: output file names are built by pasting
## directly onto it.
dgeDir        <- "../gene_expression/by_type/" # folder holding the DGE tables
eggnogFile    <- "../eggnog.tsv"               # tab-separated eggnog mapper table
gene2protFile <- "../gene2prot.txt"            # headerless gene symbol / protein ID table
proteinFile   <- "../protein.faa"              # protein FASTA

####Parse data~~~~~~~~~~~~~~~####
###~Gene to protein data~~~~~####
## Headerless tab-delimited table; first column is named SYMBOL, second REFSEQ
gene2prot <- setNames(
  read.delim(gene2protFile, header = FALSE),
  c("SYMBOL", "REFSEQ")
)
## Same mapping with the protein ID first, so merges "by = 1" key on REFSEQ
prot2gene <- gene2prot[, c("REFSEQ", "SYMBOL")]

###~Protein data~~~~~~~~~~~~~####
all_prot <- phylotools::read.fasta(file = proteinFile)
## Strip everything from the first space onwards, leaving only the prot ID
all_prot$seq.name <- gsub(" .*", "", all_prot$seq.name)
## New column holding the string length of each sequence
all_prot$length <- stringr::str_count(all_prot$seq.text)
## Take columns 1 and 3 (ID and length) and attach the gene symbol by joining
## the ID against prot2gene's first column; all.x = TRUE keeps proteins that
## have no entry in prot2gene (their SYMBOL is NA)
prot_lengths <- merge(all_prot[, c(1, 3)], prot2gene, by = 1, all.x = TRUE)
## Sort by SYMBOL, and within a SYMBOL by decreasing length
prot_lengths <- prot_lengths[
  order(prot_lengths$SYMBOL, -abs(prot_lengths$length)),
]
## After sorting, the first row of every SYMBOL is its longest protein
prot_longbois <- prot_lengths[!duplicated(prot_lengths$SYMBOL), ]

###~Eggnog data~~~~~~~~~~~~~~####
## Read the whole file as data (header = FALSE); the column names are taken
## from row 5 afterwards
nog <- read.csv(eggnogFile, sep = "\t", header = FALSE)
colnames(nog) <- nog[5, ]    # promote row 5 to column names
colnames(nog)[1] <- "query"  # first column gets a plain name without the hashtag
nog <- nog[-seq_len(5), ]    # drop rows 1-5 (misc header rows incl. the names row)

##~~add symbol to eggnog~~~~~####
## by = 1 joins the first columns: nog$query against prot2gene$REFSEQ.
## all.x = TRUE keeps eggnog rows whose protein has no gene symbol (SYMBOL = NA).
nog <- merge(nog, prot2gene, by = 1, all.x = TRUE)

##~~get nogs for longbois~~~~####
nog <- subset(nog, query %in% prot_longbois$seq.name)

## Rows without a gene symbol cannot be used as row names: `row.names<-`
## stops on missing values. Drop them (with a warning) instead of aborting.
noSymbol <- is.na(nog$SYMBOL)
if (any(noSymbol)) {
  warning(
    sum(noSymbol), " eggnog row(s) without a gene symbol were dropped",
    call. = FALSE
  )
  nog <- nog[!noSymbol, , drop = FALSE]
}
row.names(nog) <- nog$SYMBOL # sets gene symbols as row names

###~DGE tables~~~~~~~~~~~~~~~####
##~~read DGE tables~~~~~~~~~~####
dgeFiles <- list.files(dgeDir, full.names = TRUE)
## This script writes its results ("eggnog_<cluster>.csv") into the same
## folder. Skip those, and any subdirectories, so that a rerun reads only the
## original DGE tables.
dgeFiles <- dgeFiles[
  !dir.exists(dgeFiles) & !startsWith(basename(dgeFiles), "eggnog_")
]
dgeTables <- lapply(dgeFiles, read.csv)

##~~Filter by logFC~~~~~~~~~~####
## Keep the rows of a gene table whose logFC lies outside
## [-lfcThreshold, lfcThreshold].
##
## geneTable:    data frame with a numeric `logFC` column.
## lfcThreshold: single non-negative number; rows are kept when
##               logFC > lfcThreshold or logFC < -lfcThreshold (default 1).
##
## Returns the filtered data frame (same columns, original row names).
## Rows with a missing logFC are dropped.
filter4DEGs <- function(geneTable, lfcThreshold = 1) {
  stopifnot(
    is.numeric(lfcThreshold),
    length(lfcThreshold) == 1,
    !is.na(lfcThreshold),
    lfcThreshold >= 0
  )
  if (!"logFC" %in% names(geneTable)) {
    stop("geneTable has no 'logFC' column", call. = FALSE)
  }
  lfc <- geneTable[["logFC"]]
  isDEG <- lfc > lfcThreshold | lfc < -lfcThreshold
  ## NA comparisons count as "not a DEG"; drop = FALSE keeps a data frame
  geneTable[isDEG & !is.na(isDEG), , drop = FALSE]
}

## Run the logFC filter over every DGE table, replacing the unfiltered list
dgeTables <- lapply(X = dgeTables, FUN = filter4DEGs)

##~~Add eggnog annotation~~~~####
## Append the eggnog gene name to a gene table.
##
## df: data frame with a `gene_names` column.
## x:  data frame whose row names are the NCBI gene names and whose
##     `Preferred_name` column holds the eggnog gene names (defaults to the
##     global `nog`).
##
## Returns `df` joined with a new `eggnog` column; only rows whose
## `gene_names` value is a row name of `x` are kept.
addEggnogAnnotation <- function(df, x = nog) {
  ## Two-column lookup table: NCBI name -> eggnog name
  nameLookup <- data.frame(
    ncbi = row.names(x),
    eggnog = x$Preferred_name
  )
  merge(df, nameLookup, by.x = "gene_names", by.y = "ncbi")
}

## Annotate every filtered DGE table (the function's default x = nog is used)
dgeTablesRenamed <- lapply(X = dgeTables, FUN = addEggnogAnnotation)

##~~Tidy up gene tables~~~~~~####
## Reduce an annotated gene table to its useful columns.
##
## geneTable: data frame with a `gene_names` column and at least six columns.
##
## Returns a data frame with `gene_names` as row names, containing the
## original columns 6, 3 and 2, in that order.
tidyGeneTables <- function(geneTable) {
  keptColumns <- c(6, 3, 2) # positions of the columns to retain, in output order
  rownames(geneTable) <- geneTable$gene_names
  geneTable[, keptColumns]
}

## Tidy every annotated table
dgeTablesTidied <- lapply(X = dgeTablesRenamed, FUN = tidyGeneTables)

####Save out new tables~~~~~~####
cluster_order <- c("endothelial", "epithelial", "erythroid", "phagocytotic", "smc") # this is the order of the cluster files as read by R

## Output names are assigned purely by position, so the number of tables must
## not exceed the number of names: a missing name would become "NA" and every
## surplus table would overwrite the same eggnog_NA.csv.
if (length(dgeTablesTidied) > length(cluster_order)) {
  stop(
    "Found ", length(dgeTablesTidied), " DGE tables but only ",
    length(cluster_order), " names in cluster_order",
    call. = FALSE
  )
}
if (length(dgeTablesTidied) < length(cluster_order)) {
  warning(
    "Found ", length(dgeTablesTidied), " DGE tables for ",
    length(cluster_order), " names in cluster_order; ",
    "check that tables and names still line up",
    call. = FALSE
  )
}

## One CSV per table, written next to the input tables as eggnog_<cluster>.csv
for (cluster_index in seq_along(dgeTablesTidied)) {
  write.csv(
    dgeTablesTidied[[cluster_index]],
    file = paste0(dgeDir, "eggnog_", cluster_order[cluster_index], ".csv")
  )
}
