# Alternative splicing analysis script 
# by ~J.
# for analysis of AS in pregnant Z. vivipara oviduct (WV)

####~load libraries~~~~~~~~~~####
library(tximport)
library(readr)
library(GenomicFeatures)
library(AnnotationDbi)
library(stageR)
library(DRIMSeq)
library(DEXSeq)
library(IsoformSwitchAnalyzeR)

####~housekeeping~~~~~~~~~~~~####
rm(list=ls()) #clear the environment
# Move to the folder holding this script. The RStudio API only works inside an
# RStudio session; anywhere else (e.g. Rscript) the call would abort the script,
# so the working directory is left untouched and must already be the Scripts folder.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) #set wd to Scripts folder
}

###~~output directory~~~~~~~~####
output = "../10_dtu_post_v_pre" #specify where the output should go, relative to where the script is
# Only create the directory when it is missing, so re-running the script does
# not raise an "already exists" warning; recursive = TRUE allows nested paths.
if (!dir.exists(output)) {
  dir.create(output, recursive = TRUE) #create directory for output
}
setwd(output) #set the new output directory as the working directory

###~~specify data~~~~~~~~~~~~####
# Paths below are used after the working directory has been switched to the
# output folder, so they are resolved relative to that folder.
refdata = "../02_reference_data/" #folder with the sample sheet, annotation and transcript fasta; file names are pasted straight onto this string, so keep the trailing slash
saldata = "../03_salmon/" #specify where the salmon quant files are
test_condition = "post rep" #value of the sample sheet's Condition column marking the test group; every other retained sample is treated as control
exclude_condition = "pregnant" #samples whose Condition equals this value are dropped from the sample sheet before the analysis

###~~logfile~~~~~~~~~~~~~~~~~####
#log_file=file(paste("07_dtu_analysis_",Sys.Date(),".log",sep=""))
#sink(log_file, append = TRUE, type = "message")
#sink(log_file, append = TRUE, type = "output")

####~load data~~~~~~~~~~~~~~~####

###~~sample sheet~~~~~~~~~~~~####
# Read the sample sheet; its third column supplies the row names (sample IDs).
ss = read.csv(paste(refdata,"sample_sheet.csv",sep = ""), row.names = 3) #read in ss from file
if (!"Condition" %in% names(ss)) {
  stop("sample_sheet.csv must contain a 'Condition' column", call. = FALSE)
}
ss = subset(ss, Condition != exclude_condition) #remove samples not wanted in this analysis
ss$sample_id = row.names(ss) #sample_id column for DRIMSeq to read
ss_test = subset(ss, Condition == test_condition) #samples in the test condition
ss_con = subset(ss, Condition != test_condition) #control samples
# Fail early with a readable message: an empty group would otherwise only
# surface as a cryptic replacement-length error when sample_group is assigned.
if (nrow(ss_test) == 0 || nrow(ss_con) == 0) {
  stop("Need at least one test sample (Condition == '", test_condition,
       "') and one control sample; found ", nrow(ss_test), " test and ",
       nrow(ss_con), " control", call. = FALSE)
}
n.small = min(nrow(ss_test), nrow(ss_con)) #size of the smallest sample group

##~~~remake ss~~~~~~~~~~~~~~~####
ss_test$sample_group = 2
ss_con$sample_group = 1
ss = rbind(ss_test,ss_con) #remake sample sheet with sample_group column for DRIMSeq
ss$sample_group = as.factor(ss$sample_group) #turn test status column into a factor for DRIMSeq
n = nrow(ss) #total number of samples across all groups

###~~quant files~~~~~~~~~~~~~####
# Build one quant.sf path per sample: <saldata>/<Batch>_<Barcode>/quant.sf.
# The configured saldata folder is used instead of a second hard-coded copy of
# the path; trailing slashes are trimmed so file.path() does not emit "//".
salmonQuantFiles = file.path(sub("/+$", "", saldata),
                             paste(ss$Batch,ss$Barcode,sep = "_"),
                             "quant.sf") #makes a vector of filepaths to the quant data
names(salmonQuantFiles) = row.names(ss) #associate filepaths with sampleIDs from ss
# Name the missing files explicitly before handing the paths to tximport.
missingQuant = salmonQuantFiles[!file.exists(salmonQuantFiles)]
if (length(missingQuant) > 0) {
  stop("Salmon quant file(s) not found: ",
       paste(missingQuant, collapse = ", "), call. = FALSE)
}
txi = tximport(salmonQuantFiles, type = "salmon", txOut = TRUE, countsFromAbundance = "scaledTPM") #import salmon quant files for DTU
cts = txi$counts
cts = cts[rowSums(cts) > 0, , drop = FALSE] #get rid of rows with 0 counts (drop = FALSE keeps a matrix even if one row remains)

###~~annotation info~~~~~~~~~####
# Build a gene <-> transcript lookup table from the GFF annotation.
txdb = makeTxDbFromGFF(paste0(refdata,"annotation.gff"))
txdf = AnnotationDbi::select(txdb, keys(txdb, "GENEID"), "TXNAME", "GENEID")
tab = table(txdf$GENEID)
txdf$ntx = tab[match(txdf$GENEID, names(tab))] #number of annotated transcripts per gene
# Align txdf to the rows of the count matrix. A transcript that was quantified
# but is absent from the annotation would become an all-NA row here (and NA
# gene/feature IDs downstream), so stop with the offending names instead.
tx_idx = match(rownames(cts), txdf$TXNAME)
if (anyNA(tx_idx)) {
  stop(sum(is.na(tx_idx)), " quantified transcript(s) not found in the annotation, e.g. ",
       paste(head(rownames(cts)[is.na(tx_idx)], 5), collapse = ", "),
       call. = FALSE)
}
txdf = txdf[tx_idx,] #retain only transcripts actually present in the data, and order the txdf by the counts df

###~~combine everything~~~~~~####
# Assemble the table DRIMSeq expects: gene_id, feature_id, then one count column
# per sample. check.names = FALSE keeps the sample columns named exactly as in
# the sample sheet; by default data.frame() rewrites non-syntactic names
# (e.g. IDs starting with a digit or containing "-"), after which the columns
# would no longer match ss$sample_id.
counts = data.frame(gene_id=txdf$GENEID,
                    feature_id=txdf$TXNAME,
                    cts,
                    check.names = FALSE)
write.csv(counts, "salmon_gene_transcript_counts.csv", row.names = FALSE) #write out salmon counts

####~DRIMSeq filtering~~~~~~~####
# Wrap the counts and sample sheet in a DRIMSeq data object, then filter it.
d = dmDSdata(counts = counts, samples = ss)
d = dmFilter(
  d,
  # transcript-level thresholds: 10 counts in n.small samples
  min_samps_feature_expr = n.small,
  min_feature_expr = 10,
  # gene-level thresholds: 10 counts in n (i.e. all) samples
  min_samps_gene_expr = n,
  min_gene_expr = 10
)

####~per-gene DTU testing~~~~####
# Pull the filtered samples and counts back out of the DRIMSeq object.
sample.data = DRIMSeq::samples(d)
# Drop the first two columns (presumably the gene_id / feature_id identifiers)
# and round the remaining values to whole numbers.
count.data = round(as.matrix(counts(d)[, -(1:2)]))

# Build the DEXSeq data set; "exon" in the design formula stands for transcripts here.
dxd = DEXSeqDataSet(
  countData = count.data,
  sampleData = sample.data,
  design = ~sample + exon + sample_group:exon,
  featureID = counts(d)$feature_id,
  groupID = counts(d)$gene_id
)

# Fit and test; system.time() reports how long the three steps took together.
system.time({
  dxd = estimateSizeFactors(dxd)
  dxd = estimateDispersions(dxd, quiet = TRUE)
  dxd = testForDEU(dxd, reducedModel = ~sample + exon)
})

###~~extract results table~~~####
# Results without independent filtering (filtering was already done in DRIMSeq).
dxr = DEXSeqResults(dxd, independentFiltering = FALSE)

# Per-gene q values, also kept as a two-column data frame.
qval = perGeneQValue(dxr)
dxr.g = data.frame(gene = names(qval), qval)

# Per-transcript table restricted to the ID and p-value columns, written to disk.
columns = c("featureID", "groupID", "pvalue")
dxr.df = as.data.frame(dxr[, columns])
write.csv(dxr.df, file = "DEXSeq_DTU.csv", row.names = FALSE)

####~stageR from DEXSeq~~~~~~####
# Truncate identifiers to their first `n` characters.
#   x: vector of IDs (non-character input is coerced by substr)
#   n: number of leading characters to keep; defaults to 15, the previous hard-coded value
# Returns a character vector the same length as x.
# NOTE(review): 15 presumably targets version-less Ensembl-style IDs; IDs longer
# than n characters are cut and may then collide or stop matching other tables - confirm for this annotation.
strp = function(x, n = 15) substr(x, 1, n)

###~~screening df~~~~~~~~~~~~####
#detect genes with evidence of DTU
pScreen = setNames(qval, strp(names(qval))) #per-gene q values, named by truncated gene ID

###~~confirmation df~~~~~~~~~####
#per-transcript p values for the confirmation stage,
#held in a one-column matrix with truncated transcript IDs as row names
pConfirmation = matrix(
  dxr$pvalue,
  ncol = 1,
  dimnames = list(strp(dxr$featureID), "transcript")
)

###~~make tx2gene~~~~~~~~~~~~####
#transcript-to-gene map with both ID columns truncated the same way
tx2gene = dxr.df[, c("featureID", "groupID")]
tx2gene$featureID = strp(tx2gene$featureID)
tx2gene$groupID = strp(tx2gene$groupID)

###~~~DTU analysis~~~~~~~~~~~####
# Stage-wise testing: pScreen is flagged as already adjusted, then the "dtu"
# adjustment is applied at alpha = 0.05.
stageRObj = stageRTx(
  pScreen = pScreen,
  pConfirmation = pConfirmation,
  pScreenAdjusted = TRUE,
  tx2gene = tx2gene
)
stageRObj = stageWiseAdjustment(stageRObj, method = "dtu", alpha = 0.05)

# Adjusted p values for the significant genes only; warnings from the extraction are silenced.
dex.padj = suppressWarnings(
  getAdjustedPValues(stageRObj, order = FALSE, onlySignificantGenes = TRUE)
)
write.csv(dex.padj, "DTU_transcript_level.csv", row.names = FALSE)

###~~~gene list~~~~~~~~~~~~~~####
# Write the DTU gene and transcript IDs, one per line.
# The IDs are coerced to character first: write() on a factor column would emit
# the integer level codes (five per line) instead of the identifiers.
genes = unique(as.character(dex.padj$geneID))
writeLines(genes, "DTU_gene_list.txt")
transcripts = as.character(dex.padj$txID)
writeLines(transcripts, "DTU_transcript_list.txt")

####~data visualisation~~~~~~####

###~~make switchAnalyzeRlist~####

##~~~design~~~~~~~~~~~~~~~~~~####
# Design table: one row per sample with its Condition from the sample sheet.
myDesign = data.frame(sampleID = row.names(ss), condition = ss$Condition)

###~~quant data~~~~~~~~~~~~~~####
# Combine the tximport counts/abundances, the design, the GTF annotation and
# the transcript fasta into a switchAnalyzeRlist.
mySwitchList = importRdata(
  isoformCountMatrix = txi$counts,
  isoformRepExpression = txi$abundance,
  designMatrix = myDesign,
  isoformExonAnnoation = paste0(refdata, "annotation.gtf"),
  isoformNtFasta = paste0(refdata, "transcript.fasta"),
  showProgress = FALSE
)

###~~q values from DEXSeq~~~~####
# Copy the stageR transcript-level adjusted p values into the switch list.
iso_qvals = dex.padj[,c("txID","transcript")] #make a df with just isoform (transcript) q values
names(iso_qvals) = c("isoform_id","isoform_switch_q_value") #edit columns to match the switchYlist obj
# Look each switch-list row up in iso_qvals (rather than the reverse), so that
#  - isoforms occurring on several rows all receive their q value, and
#  - IDs that are absent from the switch list no longer produce NA subscripts,
#    which abort the assignment with "NAs are not allowed in subscripted assignments".
q_idx = match(mySwitchList$isoformFeatures$isoform_id, iso_qvals$isoform_id)
has_q = !is.na(q_idx)
mySwitchList$isoformFeatures$isoform_switch_q_value[has_q] = iso_qvals$isoform_switch_q_value[q_idx[has_q]] #add q values from DEXSeq into switchYlist obj
# Report DTU transcripts that could not be placed instead of dropping them silently.
unmatched_ids = setdiff(iso_qvals$isoform_id, mySwitchList$isoformFeatures$isoform_id)
if (length(unmatched_ids) > 0) {
  warning(length(unmatched_ids), " DTU transcript(s) not found in the switch list, e.g. ",
          paste(head(unmatched_ids, 5), collapse = ", "), call. = FALSE)
}

####~extract sequences~~~~~~~####
# Write the nucleotide / amino-acid fasta files for the external tools below.
# The working directory already IS the output folder, so its path is passed
# directly; re-using the relative `output` string here would be resolved from
# inside that folder and only lands in the right place for a single-level "../name".
mySwitchList = extractSequence(
  mySwitchList,
  pathToOutput = getwd(),
  writeToFile = TRUE,
  removeLongAAseq = TRUE,
  alsoSplitFastaFile = TRUE
) #these have to be sent to external tools:
    # CPC2 (http://cpc2.gao-lab.org/): paste nt fasta - download tab-delim result file
    # PFAM (https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan): default params/aa fasta - copy/paste result email and save as a .txt
    # SignalP (https://services.healthtech.dtu.dk/services/SignalP-5.0/): aa fasta, param "Short output(no figures)" under "Output format" - download "Prediction summary"
    # IUPred2A (https://iupred2a.elte.hu/): upload aa fasta, save resulting text file
    # analyzeDeepTMHMM (https://biolib.com/DTU/DeepTMHMM): download "gff3 format" after running
    # analyzeDeepLoc2 (https://services.healthtech.dtu.dk/service.php?DeepLoc-2.0): choose "Short output (no figures)" and choose "Download prediction results: CSV Summary"
# Create the folder for the external tool results; skipped if it already exists
# so that re-runs do not warn.
if (!dir.exists("x_tools")) {
  dir.create("x_tools")
}
# all result files should then be added to the x_tools folder with the appropriate extension and
# the name of the tool used, i.e. x_tools/CPC2.txt, x_tools/analyzeDeepTMHMM.gff3, etc.

####~save switchAnalyzeRlist~####
# Persist the switch list object (under its own name) in the working directory.
save(list = "mySwitchList", file = "mySwitchList.Rdata")

####~fin~~~~~~~~~~~~~~~~~~~~~####
# Close any connections that are still open.
closeAllConnections()