### written by Kevin Schneider ###
### last modified: 08.10.2019 ###
### Kevin.Schneider@glasgow.ac.uk ###
### KevinSchneider@gmx.at ###
### scripts (UNIX, R, and Python) used in Schneider et al. 2019 BMC Genomics ###

### --> TransDecoder script <-- ###
### get the longest ORFs in coding sequences of the transcriptomes ###
#!/bin/bash
for file in *.fsa_nt; do
  ~/apps/TransDecoder/TransDecoder.LongOrfs -t $file -S
done

### --> OrthoFinder script <-- ###
OrthoFinder-2.2.3/orthofinder -f seqfiles -t 16

### --> collapse fastas to one single line each <-- ###
#!/bin/bash
for file in *.fasta; do
  awk '/^>/ {printf("\n%s\n",$0);next;} { printf("%s",$0);} END {printf("\n");}' < $file > collapsed_$file
done

### --> change encoding from DOS to UNIX in one of the transcriptome files <-- ###
#!/bin/bash
tr -d '\015' < collapsed_O_tshawytscha.fasta > collapsed_O_tshawytscha_edited.fasta

### --> combining orthologues with custom Python script (combine_orthologues_20180723.py) <-- ###
### invoke with python3 combine_orthologues_20180723.py ###
##### 23.07.2018 #####
##### by Kevin Schneider #####
##### written for Python 3 #####

"""
--> script to combine genes from different transcriptome files into orthologue files <--
loop through csv files and fetch one salmon accession number in turn
only take orthologues with a higher or equal number (but not larger than four) of orthologous
genes in the focal species compared to the salmon genome (to avoid inclusion of collapsed genes)
then create a file named after the orthologues in the salmon genome
take the gene name of the other species belonging to the same orthologue
parse through this species's transcriptome for the above gene name
take the matched sequence and append it to the orthologue file created above
start from the beginning
"""

from glob import glob

def header_filter(line):
    return not line.startswith("Orthogroup")

def extract_ref_list(ref_match):
    return ref_match.split("\t")[2]

def calc_field_length(field):
    sep_field = field.split(", ")
    return len(sep_field)

def extract_ortho_list(ortho_match):
    return ortho_match.split("\t")[1]

def check_anomality(ref_length, ortho_length):
    if (ref_length >= ortho_length) and (ref_length <= 4):
        return True
    else:
        return False

def extract_spec_name(curr_file):
    spec_name = curr_file.split("_transdecoder")[0]
    return "collapsed_" + spec_name + ".fasta"

def assemble_outfile(ref_field, ortho_field, passed, spec_file_name):
    file_name = ref_field.replace(", ", "_")
    full_file_name = "outfiles/" + file_name + ".fasta"
    full_file_name = full_file_name.replace("\n", "")
    if passed:
        # parse through transcriptome to extract header and sequence
        with open(spec_file_name, "r") as infile:
            for line in infile:
                if line.startswith(">"):
                    line = line.replace("(", "_")
                    line = line.replace(")", "_")
                    line = line.replace(">", "")
                    for ortho_subfield in ortho_field.split(", "):
                        ortho_subfield = ortho_subfield.split("__g.")[0]
                        start_string = ortho_subfield.split("__")[0] + "__"
                        ortho_subfield = ortho_subfield.replace(start_string, "")
                        if ortho_subfield in line:
                            seq = next(infile)
                            if len(seq) >= 200:
                                # append the above header and sequence to the opened orthologue file
                                with open(full_file_name, "a+") as outfile:
                                    species = spec_file_name.replace("collapsed_", "").replace(".fasta", "")
                                    new_header = ">" + ortho_subfield + "|" + species + "\n"
                                    outfile.write(new_header)
                                    outfile.write(seq)
    return True

ortho_files = "*S_salar_genomic.csv" ### change file name as required
for file in glob(ortho_files):
    with open(file) as f:
        ortho_lines = list(filter(header_filter, f))
    ref_list = list(map(extract_ref_list, ortho_lines))
    ref_length = list(map(calc_field_length, ref_list))
    ortho_list = list(map(extract_ortho_list, ortho_lines))
    ortho_length = list(map(calc_field_length, ortho_list))
    i = 0
    check_list = []
    for ref_element in ref_length:
        check_element = check_anomality(ref_element, ortho_length[i])
        check_list.append(check_element)
        i = i + 1
    spec_file_name = extract_spec_name(file)
    j = 0
    for ref_field in ref_list:
        output_True_list = assemble_outfile(ref_field, ortho_list[j], check_list[j], spec_file_name)
        j = j + 1

### --> END: combining orthologues with custom Python script (combine_orthologues_20180723.py) <-- ###
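### note: a minimal sketch of the orthogroup screening above on a made-up Orthogroups.csv row ###
### (accession names are invented; the column layout mirrors the indexing used in the script) ###
# made-up tab-separated row: orthogroup ID, focal-species members, S. salar members
row = "OG0000042\tgene1__c1_g1__g.1, gene1__c2_g1__g.2\tXM_001.1, XM_002.1, XM_003.1"
ortho_len = len(row.split("\t")[1].split(", "))  # 2 focal-species genes
ref_len = len(row.split("\t")[2].split(", "))    # 3 S. salar genes
# the check_anomality() rule above: keep only if the reference copy number is
# at least the focal copy number but no more than four
print(ref_len >= ortho_len and ref_len <= 4)     # True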
### --> match orthologues with custom Python script (match_orthologues_20180720.py) <-- ###
### invoke with python3 match_orthologues_20180720.py ###
##### 20.07.2018 #####
##### by Kevin Schneider #####
##### written for Python 3 #####

"""
--> script to match & combine single-name orthologues which were mapped
to the same gene sets in the reference genome <--
"""

from glob import glob
from os import listdir
from os.path import isfile, basename

def find_ortho_sets(targ_dir):
    file_list = listdir(targ_dir)
    ortho_sets = []
    for file in file_list:
        if len(file.split("_")) > 2:
            ortho_sets.append(file)
    return ortho_sets

def matching_orthologues(file, orthos2match):
    ortho_list = []
    for ortho in orthos2match:
        if file in ortho:
            delstring0 = file.replace(".fasta", "")
            delstring1 = delstring0 + "_"
            ortho = ortho.replace(delstring1, "")
            delstring2 = "_" + delstring0
            ortho = ortho.replace(delstring2, "")
            ortho = ortho.replace(".fasta", "")
            ortho = ortho.split("_")
            # re-join consecutive tokens into two-token accession IDs
            ortho = [a + "_" + b for a, b in zip(ortho[::2], ortho[1::2])]
            ortho_list.append(ortho)
    flatten = lambda l: [item for sublist in l for item in sublist]
    ortho_list = flatten(ortho_list)
    return ortho_list

def combine_files(file, matched_file):
    with open(file, "a+") as f1:
        with open(matched_file, "r") as f2:
            content2 = f2.read()
            f1.write(content2)
    return True

targ_dir = "outfiles/"
orthos2match = find_ortho_sets(targ_dir)
ortho_files = "outfiles/*.fasta" ### change file name as required
for file in glob(ortho_files):
    if isfile(file):
        # match on the base name, since glob() returns paths with the directory prefix
        orthos2append = matching_orthologues(basename(file), orthos2match)
        if len(orthos2append) > 0:
            for matched_file in orthos2append:
                matched_file_name = targ_dir + matched_file + ".fasta"
                if matched_file_name in glob(ortho_files):
                    combine_files(file, matched_file_name)
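### note: a small sketch of the file-name matching above, using made-up accession tokens ###
# sketch: how a multi-orthologue file name is decomposed relative to a
# single-orthologue file (invented tokens, mirroring matching_orthologues())
multi = "XM_001_XM_002_XM_003.fasta"     # hypothetical combined orthogroup file
single = "XM_002.fasta"                  # hypothetical focal file
stem = multi.replace(single.replace(".fasta", "") + "_", "")
stem = stem.replace("_" + single.replace(".fasta", ""), "")
tokens = stem.replace(".fasta", "").split("_")
# pair consecutive tokens back into accession IDs
print([a + "_" + b for a, b in zip(tokens[::2], tokens[1::2])])
# ['XM_001', 'XM_003'] -> the contents of these files get appended to XM_002.fasta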
""" filter by the number of records in the file """ rec_num = len(records) if rec_num > rec_l_limit and rec_num < rec_u_limit: return True def check_taxon_number(records, taxa_limit): """ filter by the number of distinct taxa in the file """ IDs = set() taxa_num = 0 for record in records: if not record.id in IDs: IDs.add(record.id) taxa_num += 1 if taxa_num > taxa_limit: return True def save_seq_headers(file, records): """ save sequence headers to a .txt file """ for record in records: base = path.splitext(file)[0] new_file_name = base + ".headers.txt" with open(new_file_name, "w") as f: f.write(record.id) def simplify_headers(records, separator, ele_num): """ cut down headers depending on chosen number of elements between separators to be retained """ for record in records: record.id = record.id.split(separator)[ele_num] record.id = separator.join(record.id) return records def save_args(args): """ save all command line arguments to a .txt file including date and time """ date_time = datetime.datetime.now() date_time = date_time.strftime("%Y-%m-%d_%H-%M") arg_file_name = "filter_fastas_args_" + date_time + ".txt" with open(arg_file_name, "w") as af: af.write(dumps(vars(args))) """ handling and parsing the input parameters """ parser = argparse.ArgumentParser() parser.add_argument('-i', required=True, type=str, help='input files in fasta format (in quotes if wildcard [*] included)') parser.add_argument('-d', action='store_true', help='remove duplicates') parser.add_argument('-rl', nargs=1, type=int, help='lower limit for number of records') parser.add_argument('-ru', nargs=1, type=int, help='upper limit for number of records') parser.add_argument('-t', nargs=1, type=int, help='lower limit for number of taxa') parser.add_argument('-s', action='store_true', help='saves the original sequence headers to a .txt file') parser.add_argument('-si', action='store_true', help='simplify sequence header') parser.add_argument('-sep', nargs=1, type=str, help='separator (in between quotes) for delimiting sequence headers') parser.add_argument('-e', nargs=1, type=int, help='beginning from the start, number of elements between separators to keep') args = parser.parse_args() files = args.i remove_dup = args.d rec_l_limit = args.rl[0] rec_u_limit = args.ru[0] taxa_limit = args.t[0] save_headers = args.s simplify = args.si separator = args.sep[0] ele_num = args.e[0] """ save command line arguments to a .txt file """ save_args(args) """ loop through all the files with the specified file name (pattern) and open them in turn """ for file in glob(files): with open(file, "rU") as f: """ create a list of sequence records in the file """ records = list(SeqIO.parse(f, "fasta")) if remove_dup: new_records = get_unique(records) if not check_rec_number(records, rec_l_limit, rec_u_limit): continue if not check_taxon_number(records, taxa_limit): continue if save_headers: save_seq_headers(file, records) if simplify: new_records = simplify_headers(records, separator, ele_num) base = path.splitext(file)[0] ext = path.splitext(file)[1] new_file = base + ".filtered" + ext SeqIO.write(new_records, new_file, "fasta") ### --> END: filtering with custom script (filter_fastas_20180810.py) <-- ### ### parameters used in above script for molevol dataset ### {"i": "*.orfpy.fasta", "d": true, "rl": [7], "ru": [80], "t": [7], "s": true, "si": true, "sep": ["|"], "e": [2]} ### --> orfpy script <-- ### ### choose longest ORF again in the case of multiple ORFs per orthogroup ### #!/bin/bash for file in *.fasta; do base=$(basename $file 
### --> orfpy script <-- ###
### choose longest ORF again in the case of multiple ORFs per orthogroup ###
#!/bin/bash
for file in *.fasta; do
  base=$(basename $file .fasta)
  python get_orfs_or_cdss.py -i $file -f fasta -t CDS --on $base.orfpy.fasta --op $base.orfpyprot.fasta --ob $base.bed --og $base.gff3 --min_len 200 -m one
done

### --> prank script <-- ###
### align based on codon model ###
#!/bin/bash
for file in *.fasta; do
  prank -d=$file -o=prank_aligned_$file -codon -F
done

### --> example PBS script for parallelisation of prank script <-- ###
#PBS -N prank_alignment_1_subset_remaining3
#PBS -S /bin/bash
#PBS -m abe
#PBS -M Kevin.Schneider@glasgow.ac.uk
#PBS -l nodes=1:ppn=16,walltime=10000:00:00,cput=160000:00:00
source ~/.bash_profile
cd prank/orthogroups
./prank_orthogroups_commands4server_subset_remaining16_1.sh &
./prank_orthogroups_commands4server_subset_remaining16_2.sh &
./prank_orthogroups_commands4server_subset_remaining16_3.sh &
./prank_orthogroups_commands4server_subset_remaining16_4.sh &
./prank_orthogroups_commands4server_subset_remaining16_5.sh &
./prank_orthogroups_commands4server_subset_remaining16_6.sh &
./prank_orthogroups_commands4server_subset_remaining16_7.sh &
./prank_orthogroups_commands4server_subset_remaining16_8.sh &
./prank_orthogroups_commands4server_subset_remaining16_9.sh &
./prank_orthogroups_commands4server_subset_remaining16_10.sh &
./prank_orthogroups_commands4server_subset_remaining16_11.sh &
./prank_orthogroups_commands4server_subset_remaining16_12.sh &
./prank_orthogroups_commands4server_subset_remaining16_13.sh &
./prank_orthogroups_commands4server_subset_remaining16_14.sh &
./prank_orthogroups_commands4server_subset_remaining16_15.sh &
./prank_orthogroups_commands4server_subset_remaining16_16.sh &
wait
echo "Finished!"

### alignment trimming by nucleotide and taxon occupancy using TrimAl v1.2 software (installed and run on server) ###
# 1st run: -resoverlap 0.5 and -seqoverlap 50, without the -block option (but many overly divergent sequences remained in the alignments)
# 2nd run: as below (file names start with trimmed in the 1st run and trimmed2 in the 2nd run)
#!/bin/bash
cd codonfiles
for file in *.best.fas; do
  trimal -in $file -out ../trimmed_codonfiles/trimmed2_$file -fasta -resoverlap 1.0 -seqoverlap 0.38 -noallgaps
done

### change sequence headers with custom Python script and transform to PHYLIP file (for FastME) ###
##### 25.10.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####

"""
--> script to simplify fasta file headers and convert fasta to phylip files <--
"""

from glob import glob
from Bio import SeqIO
from os import path
import argparse

def simplify_ortho_headers(records):
    ortho_dict = {}
    """ extract last part of header and add the number of the current orthologue in the orthogroup """
    for record in records:
        old_record_id = record.id.split('|')[-1]
        new_record_id = old_record_id[0:8]
        if not new_record_id in ortho_dict.keys():
            ortho_dict[new_record_id] = 1
        else:
            ortho_dict[new_record_id] += 1
        record.id = new_record_id + str(ortho_dict[new_record_id])
    return records

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
    new_records = simplify_ortho_headers(records)
    base = path.splitext(file)[0]
    new_fasta = base + '.simplified.fasta'
    phylip = base + '.simplified.phylip'
    SeqIO.write(new_records, new_fasta, 'fasta')
    SeqIO.write(new_records, phylip, 'phylip')
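### note: the 8-character truncation plus per-name counter above keeps names unique and within ###
### the 10-character limit of strict PHYLIP; a minimal sketch of the renaming (headers invented) ###
from collections import Counter

headers = ["XM_01400012.3|C_lavaretus", "XM_01400012.3|Sa_alpinus"]  # hypothetical headers
seen = Counter()
for h in headers:
    stem = h.split("|")[-1][:8]    # e.g. 'C_lavare', 'Sa_alpin'
    seen[stem] += 1
    print(stem + str(seen[stem]))  # 'C_lavare1', 'Sa_alpin1' -> at most 10 characters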
### BioNJ phylogenetic tree construction using FastME ###
#!/bin/bash
cd orthogroup_intermediate_files/orthogroup_intermediate_files2
for file in trimmed2*.simplified.phylip; do
  base=$(basename $file .simplified.phylip)
  fastme -i $file -o ../fastme_trees/$base.nwk --method=BioNJ --dna=F84 --gamma=1.0 --nni --spr -T 16
done

### removing empty files (due to too large divergence and uncertain phylogenetic relationships) ###
#!/bin/bash
find . -name "*.codon.fasta.best.nwk" -size 0k | xargs rm

### labelling branches in the phylogenetic trees with custom R scripts ###
##### 29.10.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for R version 3.3.1 #####

require(ape)

setwd("/Volumes/LaCie/orthogroup_tree_files/")
path <- "/Volumes/LaCie/orthogroup_tree_files/"
pre_taxa <- c("C_clupea", "C_lavare", "Sa_alpin", "Sa_fonti")
treefiles <- Sys.glob(file.path(paste(path, "trimmed2_OG*.codon.fasta.best.nwk", sep = "")))

# recursively collect all tip numbers descending from the given internal node numbers
descendants <- function (tree, descendnodes, descendtips = NULL) {
  tipnum <- tree$Nnode + 1
  descendnodes_new <- tree$edge[tree$edge[, 1] %in% descendnodes, 2]
  descendtips <- append(descendtips, descendnodes[descendnodes <= tipnum])
  descendtips <- append(descendtips, descendnodes_new[descendnodes_new <= tipnum])
  descendnodes_new <- descendnodes_new[descendnodes_new > tipnum]
  if (length(descendnodes_new) == 0) {
    return(descendtips)
  }
  do.call("descendants", list(tree, descendnodes_new, descendtips))
}

for (i in 1:length(treefiles)) {
  tree <- read.tree(treefiles[i])
  if ("E_lucius1" %in% tree$tip.label) {
    tree <- root(tree, outgroup = "E_lucius1", resolve.root = TRUE)
    taxa <- c()
    for (j in 1:length(pre_taxa)) {
      taxa <- append(taxa, grep(pre_taxa[j], tree$tip.label, value = TRUE))
    }
    if (length(taxa) >= 1) {
      tree <- makeNodeLabel(tree, method = "number", prefix = "")
      tipnum <- length(tree$tip.label)
      for (k in 1:tree$Nnode) {
        # node label k corresponds to internal node number tipnum + k
        desc <- descendants(tree, tipnum + k)
        if (all(tree$tip.label[desc] %in% taxa)) {
          tree$node.label[k] <- "{Foreground}"
        }
      }
      for (l in 1:length(tree$tip.label)) {
        if (tree$tip.label[l] %in% taxa) {
          tree$tip.label[l] <- paste(tree$tip.label[l], "{Foreground}", sep = "")
        }
      }
      write.tree(tree, file = paste(path, "labelled_", basename(treefiles[i]), sep = ""))
    }
  }
}

### shorten headers of codon fasta files ###
##### 02.11.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####

"""
--> script to shorten fasta file headers <--
"""

from glob import glob
from Bio import SeqIO
from os import path
import argparse

def shorten_ortho_headers(records):
    """ shorten header to PHYLIP-style sequence names """
    for record in records:
        record.description = record.description.split(' ')[0]
    return records

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
    new_records = shorten_ortho_headers(records)
    base = path.splitext(file)[0]
    new_fasta = base + '.shortened.header.fasta'
    SeqIO.write(new_records, new_fasta, 'fasta')
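### note: the same descendant-tip collection the R function above performs, sketched in Python ###
### over a toy parent->child edge list (ape convention: tips 1..Ntip, internal nodes above that; ###
### the edge list here is made up for illustration) ###
# toy rooted tree with 3 tips (1-3), internal nodes 4 (root) and 5
edges = [(4, 5), (4, 3), (5, 1), (5, 2)]
ntip = 3

def descendant_tips(node):
    children = [c for p, c in edges if p == node]
    tips = [c for c in children if c <= ntip]
    for c in children:
        if c > ntip:
            tips += descendant_tips(c)
    return tips

print(descendant_tips(5))  # [1, 2]
print(descendant_tips(4))  # [3, 1, 2] -> if all of these are foreground taxa, label the node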
"../orthogroup_tree_files/"${fullname}) treefile=$(grep ${basepath} <(find ../orthogroup_tree_files -name "labelled_trimmed2_OG*")) if [ -f "$treefile" ]; then cat $alfile > ../orthogroup_HYPHY_input/$base.hyphy echo "" >> ../orthogroup_HYPHY_input/$base.hyphy cat $treefile >> ../orthogroup_HYPHY_input/$base.hyphy fi done ### batch file for RELAX analyses ### fileToExe = "/usr/local/lib/hyphy/TemplateBatchFiles/SelectionAnalyses/RELAX.bf"; inputRedirect = {}; inputRedirect["01"]="Universal"; inputRedirect["02"]="/Volumes/LaCie/orthogroup_HYPHY_input/$file"; inputRedirect["03"]="y"; inputRedirect["04"]="Foreground"; inputRedirect["05"]="All"; ExecuteAFile(fileToExe, inputRedirect); ### bash script for running RELAX via HYPHY ### #!/bin/bash for file in trimmed2_OG*.hyphy.bf; do sudo mv $file ../orthogroup_RELAX_output/ HYPHYMP ../orthogroup_RELAX_output/$file base=$(basename $file .bf) sudo mv $base.RELAX.json ../orthogroup_RELAX_output/ done ### create input tree file for relative rate analyses ### #!/bin/bash for file in trimmed2_OG*.codon.fasta.best.nwk; do base=$(basename $file .codon.fasta.best.nwk) base=${base:9} printf $base"\t" >> pre_all_OG_trees.nwk cat $file >> pre_all_OG_trees.nwk done sed '/^$/d' pre_all_OG_trees.nwk > all_OG_trees.nwk ### relative rate analyses not meaningful for orthogroups (since specific genes are also compared to other genes in the data) ### setwd("/Users/kevinschneider/Documents/relrates") source("helperFuncs_mod.R") source("projection_coevo_new.R") #this will take some time mamTrees=readTrees("all_OG_trees.nwk") #cutoff = exp(-7) based on the peak of the mean/variance plot #this is the basic method mamRER=getAllResiduals(mamTrees,useSpecies=mamTrees$masterTree$tip.label, transform = "none",weighted = F, cutoff=0.001) #For some datasets scaling improves results mamRERs=scale(mamRER) #this method peforms better on benchmarks mamRERlogW=getAllResiduals(mamTrees,useSpecies=mamTrees$masterTree$tip.label, transform = "log",weighted = T, cutoff=0.001) #again can be scaled, almost always improves results in our exprience mamRERlogWs=scale(mamRERlogW) #read the binary tree marineb=read.tree("MasterTree_bin_CorSalv_OG.nwk") plot(marineb) #controlb <- read.tree("control_bin.nwk") #convert it to a paths vector phenvMarine=allPathMasterRelative(marineb, mamTrees$masterTree, mamTrees$ap) #control_phenvMarine <- allPathMasterRelative(controlb, mamTrees$masterTree, mamTrees$ap) #this isn't binary anymore due to addition along branches table(phenvMarine) #we can fix that phenvMarine[phenvMarine>1]=1 #control_phenvMarine[control_phenvMarine > 1] <- 1 corMarine=getAllCor(mamRER, phenvMarine) hist(corMarine$P) #control_corMarine <- getAllCor(mamRER, control_phenvMarine) corMarineLogW=getAllCor(mamRERlogW, phenvMarine) hist(corMarineLogW$P) #control_corMarineLogW <- getAllCor(mamRERlogW, control_phenvMarine) corMarineLogWs=getAllCor(mamRERlogWs, phenvMarine) hist(corMarineLogWs$P) #control_corMarineLogWs <- getAllCor(mamRERlogWs, control_phenvMarine) corMarineLogWs_cleaned <- corMarineLogWs[!is.na(corMarineLogWs[, 3]), ] # comment out later (Kevin) # additional manipulation of pipeline output (Kevin) # coreset (10 species) #salm_fore_pos_sel_Bonf <- readLines("Bonf_sigfiles_gene_M1_BS2_salm_fore_core.txt") #CorSalv_pos_sel_Bonf <- readLines("Bonf_sigfiles_gene_M1_BS2_CorSalv_core.txt") # extended transcriptome set (20 species across teleosts) #salm_fore_pos_sel_ext <- readLines("sigfiles_gene_BS1_BS2_salm_fore_ext.txt") #CorSalv_pos_sel_ext <- 
readLines("sigfiles_gene_BS1_BS2_CorSalv_ext.txt") corMarineLogWs_cleaned <- corMarineLogWs[!is.na(corMarineLogWs[, 3]), ] control_corMarineLogWs_cleaned <- control_corMarineLogWs[!is.na(control_corMarineLogWs[, 3]), ] sig_genes <- corMarineLogWs_cleaned[corMarineLogWs_cleaned[, 3] < 0.05, ] accelerated_sig <- sig_genes[sig_genes[, 1] > 0, ] decelerated_sig <- sig_genes[sig_genes[, 1] < 0, ] sig_genes_Bonf <- corMarineLogWs_cleaned[corMarineLogWs_cleaned[, 3] < (0.05 / nrow(corMarineLogWs_cleaned)), ] accelerated_sig_Bonf <- sig_genes_Bonf[sig_genes_Bonf[, 1] > 0, ] decelerated_sig_Bonf <- sig_genes_Bonf[sig_genes_Bonf[, 1] < 0, ] p_FDR <- p.adjust(corMarineLogWs_cleaned[, 3], method = "fdr", n = length(corMarineLogWs_cleaned[, 3])) sig_genes_FDR <- corMarineLogWs_cleaned[p_FDR < 0.05, ] accelerated_sig_FDR <- sig_genes_FDR[sig_genes_FDR[, 1] > 0, ] decelerated_sig_FDR <- sig_genes_FDR[sig_genes_FDR[, 1] < 0, ] # number of significantly accelerated genes (p < 0.05) without Bonferroni correction nrow(accelerated_sig) # number of significantly accelerated genes (p < (0.05/1,446)) after Bonferroni correction nrow(accelerated_sig_Bonf) # number of significanlty accelerated genes (FDR-q < 0.05) after FDR correction nrow(accelerated_sig_FDR) # number of significantly decelerated genes (p < 0.05) without Bonferroni correction nrow(decelerated_sig) # number of significantly decelerated genes (p < (0.05/1,446)) after Bonferroni correction nrow(decelerated_sig_Bonf) # number of significantly decelerated genes (FDR-q < 0.05) after FDR correction nrow(decelerated_sig_FDR) # changing gene name prefixes write(accelerated_sig, "accelerated_genes_OG_20181105.txt", sep = "\n") write(decelerated_sig, "decelerated_genes_OG_20181105.txt", sep = "\n") ### transform multi-line fasta files to single-line sequence alignment files (for Kr/Kc analyses [Hanada]) ### ##### 07.11.2018 ##### ##### by Kevin Schneider ##### ##### KevinSchneider@gmx.at ##### ##### written for Python 3 ##### """ --> script to convert mutli-line fasta files to one-line sequence alignment files <-- """ from glob import glob from Bio import SeqIO from Bio.SeqUtils.CheckSum import seguid from os import path import argparse parser = argparse.ArgumentParser() parser.add_argument('-i', required=True, type=str, help='input files in fasta format (in quotes if wildcard [*] included)') args = parser.parse_args() files = args.i for file in glob(files): with open(file, 'rU') as f: """ create a list of sequence records in the file """ records = list(SeqIO.parse(f, 'fasta')) base = path.splitext(file)[0] new_fasta = 'krkc_input_' + base + '.oneline.seq' with open(new_fasta, 'w') as outfile: for record in records: outfile.write(record.id + '\n' + str(record.seq) + '\n') ### script for running Kr/Kc (Hanada) analyses ### #!/bin/bash for file in krkc_input_*.oneline.aln; do perl KaKs.pl $file > $file.kaks perl KrKc.pl $file > $file.classA.krkc perl KrKc_classB.pl $file > $file.classB.krkc done # --> too many OTUs for Hanada's implementation of Kr/Kc and Ka/Ks analyses <-- # ### batch file for aBSREL analyses ### fileToExe = "/usr/local/lib/hyphy/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf"; inputRedirect = {}; inputRedirect["01"]="Universal"; inputRedirect["02"]="/Volumes/LaCie/orthogroup_aBSREL_input/£"; inputRedirect["03"]="y"; inputRedirect["04"]="Foreground"; ExecuteAFile(fileToExe, inputRedirect); ### bash script for running aBSREL via HYPHY ### #!/bin/bash for file in trimmed2_OG*.hyphy.aBSREL.bf; do sudo mv $file ../orthogroup_aBSREL_output/ 
### transform multi-line fasta files to single-line sequence alignment files (for Kr/Kc analyses [Hanada]) ###
##### 07.11.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####

"""
--> script to convert multi-line fasta files to one-line sequence alignment files <--
"""

from glob import glob
from Bio import SeqIO
from os import path
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
    base = path.splitext(file)[0]
    new_fasta = 'krkc_input_' + base + '.oneline.seq'
    with open(new_fasta, 'w') as outfile:
        for record in records:
            outfile.write(record.id + '\n' + str(record.seq) + '\n')

### script for running Kr/Kc (Hanada) analyses ###
#!/bin/bash
for file in krkc_input_*.oneline.aln; do
  perl KaKs.pl $file > $file.kaks
  perl KrKc.pl $file > $file.classA.krkc
  perl KrKc_classB.pl $file > $file.classB.krkc
done
# --> too many OTUs for Hanada's implementation of Kr/Kc and Ka/Ks analyses <-- #

### batch file for aBSREL analyses ###
fileToExe = "/usr/local/lib/hyphy/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf";
inputRedirect = {};
inputRedirect["01"]="Universal";
inputRedirect["02"]="/Volumes/LaCie/orthogroup_aBSREL_input/$file";
inputRedirect["03"]="y";
inputRedirect["04"]="Foreground";
ExecuteAFile(fileToExe, inputRedirect);

### bash script for running aBSREL via HYPHY ###
#!/bin/bash
for file in trimmed2_OG*.hyphy.aBSREL.bf; do
  sudo mv $file ../orthogroup_aBSREL_output/
  HYPHYMP ../orthogroup_aBSREL_output/$file
  base=$(basename $file .bf)
  sudo mv $base.json ../orthogroup_aBSREL_output/
done

### bash script for extracting relevant information from RELAX output ###
#!/bin/bash
for file in trimmed2*.hyphy.RELAX.json; do
  pval=$(grep "p-value" $file | sed -e "s/\:\(.*\)\,/\1/" | cut -c13-)
  param=$(grep "relaxation or intensification parameter" $file | sed -e "s/\:\(.*\)/\1/" | cut -c45-)
  echo "START" >> RELAX_results.txt
  echo $file >> RELAX_results.txt
  echo $pval >> RELAX_results.txt
  echo $param >> RELAX_results.txt
  echo "END" >> RELAX_results.txt
done

### used for getting reference sets for enrichment calculations of OGs of outlier genes ###
grep -vf file1 file2 > file3
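### note: the grep/sed extraction above is sensitive to the exact JSON formatting; a more robust ###
### alternative (a sketch, not part of the original pipeline) is to parse the files and search ###
### for the same two keys the script greps for ###
import json
from glob import glob

def find_key(obj, key):
    # return the first value stored under `key` anywhere in a nested dict
    if isinstance(obj, dict):
        if key in obj:
            return obj[key]
        for v in obj.values():
            hit = find_key(v, key)
            if hit is not None:
                return hit
    return None

for fname in glob("trimmed2*.hyphy.RELAX.json"):
    with open(fname) as f:
        data = json.load(f)
    pval = find_key(data, "p-value")
    param = find_key(data, "relaxation or intensification parameter")
    print(fname, pval, param)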
### R script for analysing RELAX results ###
### Kevin Schneider; last modified: 07.10.2019 ###
### KevinSchneider@gmx.at ###

library(ggplot2)

setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/")
gotable <- read.csv("blast2go_export_2704orthogroups_salmonids.txt", header = TRUE, sep = "\t")
relax <- readLines("RELAX_results.txt")
# drop the START and END marker lines, keeping file name, p-value, and k per record
relax <- relax[-(seq(from = 1, to = length(relax), by = 5))]
relax <- relax[-(seq(from = 4, to = length(relax), by = 4))]
relaxframe <- data.frame("ID" = as.character(relax[seq(from = 1, to = length(relax), by = 3)]),
                         "pval" = as.numeric(relax[seq(from = 2, to = length(relax), by = 3)]),
                         "qval" = NA,
                         "selparam" = as.numeric(relax[seq(from = 3, to = length(relax), by = 3)]),
                         "sig" = NA)
relaxframe$qval <- p.adjust(relaxframe$pval, method = "fdr", n = nrow(relaxframe))
Bonfcutoff <- (0.05 / nrow(relaxframe))
relaxed_nocorr <- 0
relaxed_FDR <- 0
relaxed_Bonf <- 0
intensified_nocorr <- 0
intensified_FDR <- 0
intensified_Bonf <- 0
for (i in 1:nrow(relaxframe)) {
  if ((relaxframe$pval[i] < 0.05) && (relaxframe$selparam[i] < 1)) {
    relaxed_nocorr <- relaxed_nocorr + 1
    relaxframe$sig[i] <- "relaxed_nocorr"
  }
  if ((relaxframe$qval[i] < 0.10) && (relaxframe$selparam[i] < 1)) {
    relaxed_FDR <- relaxed_FDR + 1
    relaxframe$sig[i] <- "relaxed_FDR"
  }
  if ((relaxframe$pval[i] < Bonfcutoff) && (relaxframe$selparam[i] < 1)) {
    relaxed_Bonf <- relaxed_Bonf + 1
    relaxframe$sig[i] <- "relaxed_Bonf"
  }
  if ((relaxframe$pval[i] < 0.05) && (relaxframe$selparam[i] > 1)) {
    intensified_nocorr <- intensified_nocorr + 1
    relaxframe$sig[i] <- "intensified_nocorr"
  }
  if ((relaxframe$qval[i] < 0.10) && (relaxframe$selparam[i] > 1)) {
    intensified_FDR <- intensified_FDR + 1
    relaxframe$sig[i] <- "intensified_FDR"
  }
  if ((relaxframe$pval[i] < Bonfcutoff) && (relaxframe$selparam[i] > 1)) {
    intensified_Bonf <- intensified_Bonf + 1
    relaxframe$sig[i] <- "intensified_Bonf"
  }
}
freq_table_nocorr <- matrix(c(relaxed_nocorr, intensified_nocorr,
                              (nrow(relaxframe) - relaxed_nocorr), (nrow(relaxframe) - intensified_nocorr)),
                            nrow = 2, byrow = TRUE,
                            dimnames = list("category" = c("outliers", "non-outliers"),
                                            "selection" = c("relaxed", "intensified")))
freq_table_FDR <- matrix(c(relaxed_FDR, intensified_FDR,
                           (nrow(relaxframe) - relaxed_FDR), (nrow(relaxframe) - intensified_FDR)),
                         nrow = 2, byrow = TRUE,
                         dimnames = list("category" = c("outliers", "non-outliers"),
                                         "selection" = c("relaxed", "intensified")))
freq_table_Bonf <- matrix(c(relaxed_Bonf, intensified_Bonf,
                            (nrow(relaxframe) - relaxed_Bonf), (nrow(relaxframe) - intensified_Bonf)),
                          nrow = 2, byrow = TRUE,
                          dimnames = list("category" = c("outliers", "non-outliers"),
                                          "selection" = c("relaxed", "intensified")))
below1 <- relaxframe$selparam[which(relaxframe$selparam < 1)]
above1 <- relaxframe$selparam[which(relaxframe$selparam > 1)]
freq_table_below_or_above_1 <- matrix(c(length(below1), length(above1), 1347.5, 1347.5),
                                      nrow = 2, byrow = TRUE,
                                      dimnames = list("category" = c("outliers", "non-outliers"),
                                                      "selection" = c("relaxed", "intensified")))
fisher.test(freq_table_nocorr)
fisher.test(freq_table_FDR)
fisher.test(freq_table_Bonf)
fisher.test(freq_table_below_or_above_1)
relaxed_nocorr
relaxed_FDR
relaxed_Bonf
intensified_nocorr
intensified_FDR
intensified_Bonf
write.csv(relaxframe, "RELAX_R_dataframe.csv")
median(relaxframe$selparam)
mean(relaxframe$selparam)
freq_table_selparam <- matrix(c((length(relaxframe$selparam[relaxframe$selparam != 1]) / 2),
                                (length(relaxframe$selparam[relaxframe$selparam != 1]) / 2),
                                length(relaxframe$selparam[relaxframe$selparam < 1]),
                                length(relaxframe$selparam[relaxframe$selparam > 1])),
                              nrow = 2, byrow = TRUE,
                              dimnames = list("category" = c("expected", "observed"),
                                              "selection" = c("relaxed", "intensified")))
fisher.test(freq_table_selparam)

# extract orthogroup ID from json file ID
relaxframe$ID <- substr(relaxframe$ID, 10, 18)
gotable$Sequence.Name <- unlist(lapply(gotable$Sequence.Name,
                                       function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                )[-(seq(from = 1, to = (nrow(gotable) * 2), by = 2))]
names(gotable)[1] <- "ID"
# combine the two dataframes
combframe <- merge(relaxframe, gotable, by = "ID")

GO_vec <- c()
GO_code_vec <- c()
for (j in 1:nrow(combframe)) {
  if (!is.na(combframe$Annotation.GO.Term[j])) {
    GO_vec <- append(GO_vec, unlist(strsplit(as.character(combframe$Annotation.GO.Term[j]), split = ";", fixed = TRUE)))
    GO_code_vec <- append(GO_code_vec, unlist(strsplit(as.character(combframe$Annotation.GO.ID[j]), split = ";", fixed = TRUE)))
  }
}
selparam_list <- list()
GO_frame <- data.frame("GO_category" = GO_vec, "GO_code" = GO_code_vec,
                       "pval_median" = NA, "selparam_median" = NA, "GO_frequency" = NA)
# per GO term, collect the p-values and selection parameters k of all orthogroups annotated with it
for (k in 1:length(GO_vec)) {
  pval_median_vec <- c()
  selparam_median_vec <- c()
  print(k)
  for (l in 1:nrow(combframe)) {
    invec <- FALSE
    invec <- grepl(GO_vec[k], combframe$Annotation.GO.Term[l], fixed = TRUE)
    if (invec) {
      pval_median_vec <- append(pval_median_vec, combframe$pval[l])
      selparam_median_vec <- append(selparam_median_vec, combframe$selparam[l])
    }
  }
  if (!is.null(pval_median_vec)) {
    pval_median <- median(pval_median_vec)
    GO_frame$pval_median[k] <- pval_median
  } else {
    GO_frame$pval_median[k] <- NA
  }
  if (!is.null(selparam_median_vec)) {
    selparam_median <- median(selparam_median_vec)
    GO_frame$selparam_median[k] <- selparam_median
    GO_frame$GO_frequency[k] <- length(selparam_median_vec)
  } else {
    GO_frame$selparam_median[k] <- NA
    GO_frame$GO_frequency[k] <- NA
  }
  selparam_list[[GO_vec[k]]] <- selparam_median_vec
}
GO_frame <- GO_frame[!duplicated(GO_frame), ]
write.table(GO_frame, file = "RELAX_and_GO_analysis_out_table.csv", quote = FALSE, sep = ";", row.names = FALSE)
save(selparam_list, file = "RELAX_and_GO_analysis_session.RData")
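### note: the per-GO loop above re-greps every GO term against every orthogroup; a ###
### dictionary-based sketch of the same grouping in Python (annotation rows are made up) ###
from statistics import median

rows = [  # (orthogroup k value, semicolon-separated GO terms), illustrative only
    (0.6, "immune response;metabolic process"),
    (1.4, "immune response"),
    (0.9, "metabolic process"),
]
by_go = {}
for k, terms in rows:
    for term in terms.split(";"):
        by_go.setdefault(term, []).append(k)
for term, ks in by_go.items():
    print(term, len(ks), median(ks))
# immune response 2 1.0
# metabolic process 2 0.75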
# GO term subsets selected for the violin plots; the later assignments overwrite
# the first to plot one subset at a time
violin_list <- selparam_list[c(686, 22, 11, 34, 32, 15, 20, 14, 1, 180, 2, 1327, 723, 35, 645, 51, 84, 248, 250, 324)]
violin_list <- selparam_list[c(686, 22, 11, 34, 32, 15, 20)]
violin_list <- selparam_list[c(14, 1, 180, 2, 1327, 723, 35)]
violin_list <- selparam_list[c(645, 51, 84, 248, 250, 324)]
k_vec <- c()
GO_term_vec <- c()
for (m in 1:length(violin_list)) {
  for (n in 1:length(violin_list[[m]])) {
    k_vec <- append(k_vec, violin_list[[m]][n])
    GO_term_vec <- append(GO_term_vec, names(violin_list)[m])
  }
}
violin_df <- data.frame("k" = k_vec, "GO" = factor(GO_term_vec))
vplot <- ggplot(violin_df, aes(GO, k, fill = GO))
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 18) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 20)
vplot + geom_boxplot() +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3))

# per GO term, test whether the k distribution differs from the null expectation of 1
wc.frame <- data.frame("GO_ID" = names(selparam_list), "psig" = NA, "qsig" = NA,
                       "pval" = NA, "qval" = NA, "Vstat" = NA)
for (o in 1:length(selparam_list)) {
  pval <- NA
  qval <- NA
  psig <- NA
  qsig <- NA
  Vstat <- NA
  wc.test <- wilcox.test(selparam_list[[o]], mu = 1, alternative = "two.sided", na.action = na.omit)
  pval <- wc.test$p.value
  qval <- p.adjust(pval, method = "fdr", n = length(selparam_list))
  Vstat <- wc.test$statistic
  if (!is.na(pval)) {
    if (pval < 0.05) {
      psig <- TRUE
    } else {
      psig <- FALSE
    }
  }
  if (!is.na(qval)) {
    if (qval < 0.10) {
      qsig <- TRUE
    } else {
      qsig <- FALSE
    }
  }
  wc.frame$psig[o] <- psig
  wc.frame$qsig[o] <- qsig
  wc.frame$pval[o] <- pval
  wc.frame$qval[o] <- qval
  wc.frame$Vstat[o] <- Vstat
}
violin_list <- selparam_list[which(wc.frame$psig)]
k_vec <- c()
GO_term_vec <- c()
for (m in 1:length(violin_list)) {
  for (n in 1:length(violin_list[[m]])) {
    k_vec <- append(k_vec, violin_list[[m]][n])
    GO_term_vec <- append(GO_term_vec, names(violin_list)[m])
  }
}
violin_df <- data.frame("k" = k_vec, "GO" = factor(GO_term_vec))
vplot <- ggplot(violin_df, aes(GO, k, fill = GO))
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 18) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1)) +
  scale_x_discrete(name = "GO term")
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 20)
vplot + geom_boxplot() +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3))

### testing the difference of the selection parameter k distribution from the null expectation of 1 ###
wilcox.test(relaxframe$selparam, mu = 1, alternative = "two.sided", na.action = na.omit)
median(relaxframe$selparam)
mean(relaxframe$selparam)
##### GO term enrichment analysis #####
##### using combframe, GO_frame, and outlier IDs (RELAX & aBSREL, incl. reference sets) #####
GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)))
names(GO_frame)[6] <- "GO_freq_ref_RELAX_rel"
names(GO_frame)[7] <- "GO_freq_RELAX_rel"
names(GO_frame)[8] <- "GO_freq_ref_RELAX_int"
names(GO_frame)[9] <- "GO_freq_RELAX_int"
names(GO_frame)[10] <- "GO_freq_ref_aBSREL"
names(GO_frame)[11] <- "GO_freq_aBSREL"

### reference set for FDR relaxed outliers (RELAX)
RELAX_ref_rel_IDs <- readLines("refset_relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_ref_rel_IDs <- unlist(lapply(RELAX_ref_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                            )[-(seq(from = 1, to = (length(RELAX_ref_rel_IDs) * 2), by = 2))]
RELAX_ref_rel <- combframe[which(combframe$ID %in% RELAX_ref_rel_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_ref_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_ref_rel$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_ref_RELAX_rel[o] <- GO_frame$GO_freq_ref_RELAX_rel[o] + 1
    }
  }
}

### FDR relaxed outliers (RELAX)
RELAX_rel_IDs <- readLines("relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_rel_IDs <- unlist(lapply(RELAX_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                        )[-(seq(from = 1, to = (length(RELAX_rel_IDs) * 2), by = 2))]
RELAX_rel <- combframe[which(combframe$ID %in% RELAX_rel_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_rel$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_RELAX_rel[o2] <- GO_frame$GO_freq_RELAX_rel[o2] + 1
    }
  }
}

### reference set for FDR intensified outliers (RELAX)
RELAX_ref_int_IDs <- readLines("refset_intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_ref_int_IDs <- unlist(lapply(RELAX_ref_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                            )[-(seq(from = 1, to = (length(RELAX_ref_int_IDs) * 2), by = 2))]
RELAX_ref_int <- combframe[which(combframe$ID %in% RELAX_ref_int_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_ref_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_ref_int$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_ref_RELAX_int[o] <- GO_frame$GO_freq_ref_RELAX_int[o] + 1
    }
  }
}

### FDR intensified outliers (RELAX)
RELAX_int_IDs <- readLines("intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_int_IDs <- unlist(lapply(RELAX_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                        )[-(seq(from = 1, to = (length(RELAX_int_IDs) * 2), by = 2))]
RELAX_int <- combframe[which(combframe$ID %in% RELAX_int_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_int$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_RELAX_int[o2] <- GO_frame$GO_freq_RELAX_int[o2] + 1
    }
  }
}

### reference set for FDR outliers (aBSREL)
aBSREL_ref_IDs <- readLines("refset_FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_ref_IDs <- unlist(lapply(aBSREL_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                         )[-(seq(from = 1, to = (length(aBSREL_ref_IDs) * 2), by = 2))]
aBSREL_ref <- combframe[which(combframe$ID %in% aBSREL_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(aBSREL_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], aBSREL_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_ref_aBSREL[o] <- GO_frame$GO_freq_ref_aBSREL[o] + 1
    }
  }
}
### FDR outliers (aBSREL)
aBSREL_IDs <- readLines("FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_IDs <- unlist(lapply(aBSREL_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                     )[-(seq(from = 1, to = (length(aBSREL_IDs) * 2), by = 2))]
aBSREL <- combframe[which(combframe$ID %in% aBSREL_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(aBSREL)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], aBSREL$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_aBSREL[o2] <- GO_frame$GO_freq_aBSREL[o2] + 1
    }
  }
}

GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)))
names(GO_frame)[12] <- "GO_freq_both_fore_ref_RELAX_rel"
names(GO_frame)[13] <- "GO_freq_both_fore_RELAX_rel"
names(GO_frame)[14] <- "GO_freq_both_fore_ref_RELAX_int"
names(GO_frame)[15] <- "GO_freq_both_fore_RELAX_int"
names(GO_frame)[16] <- "GO_freq_both_fore_ref_aBSREL"
names(GO_frame)[17] <- "GO_freq_both_fore_aBSREL"
names(GO_frame)[18] <- "GO_freq_both_fore_ref_relaxed_diversifying"
names(GO_frame)[19] <- "GO_freq_both_fore_relaxed_diversifying"
names(GO_frame)[20] <- "GO_freq_both_fore_ref_intensified_diversifying"
names(GO_frame)[21] <- "GO_freq_both_fore_intensified_diversifying"

### reference set for FDR relaxed outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_ref_rel_IDs <- readLines("both_fore_refset_relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_ref_rel_IDs <- unlist(lapply(RELAX_both_fore_ref_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                      )[-(seq(from = 1, to = (length(RELAX_both_fore_ref_rel_IDs) * 2), by = 2))]
RELAX_both_fore_ref_rel <- combframe[which(combframe$ID %in% RELAX_both_fore_ref_rel_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_both_fore_ref_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_both_fore_ref_rel$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_RELAX_rel[o] <- GO_frame$GO_freq_both_fore_ref_RELAX_rel[o] + 1
    }
  }
}

### FDR relaxed outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_rel_IDs <- readLines("both_fore_relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_rel_IDs <- unlist(lapply(RELAX_both_fore_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                  )[-(seq(from = 1, to = (length(RELAX_both_fore_rel_IDs) * 2), by = 2))]
RELAX_both_fore_rel <- combframe[which(combframe$ID %in% RELAX_both_fore_rel_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_both_fore_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_both_fore_rel$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_RELAX_rel[o2] <- GO_frame$GO_freq_both_fore_RELAX_rel[o2] + 1
    }
  }
}

### reference set for FDR intensified outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_ref_int_IDs <- readLines("both_fore_refset_intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_ref_int_IDs <- unlist(lapply(RELAX_both_fore_ref_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                      )[-(seq(from = 1, to = (length(RELAX_both_fore_ref_int_IDs) * 2), by = 2))]
RELAX_both_fore_ref_int <- combframe[which(combframe$ID %in% RELAX_both_fore_ref_int_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_both_fore_ref_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_both_fore_ref_int$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_RELAX_int[o] <- GO_frame$GO_freq_both_fore_ref_RELAX_int[o] + 1
    }
  }
}
### FDR intensified outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_int_IDs <- readLines("both_fore_intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_int_IDs <- unlist(lapply(RELAX_both_fore_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                  )[-(seq(from = 1, to = (length(RELAX_both_fore_int_IDs) * 2), by = 2))]
RELAX_both_fore_int <- combframe[which(combframe$ID %in% RELAX_both_fore_int_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_both_fore_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_both_fore_int$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_RELAX_int[o2] <- GO_frame$GO_freq_both_fore_RELAX_int[o2] + 1
    }
  }
}

### reference set for FDR outliers (aBSREL) with both Coregonus & Salvelinus in alignments
aBSREL_both_fore_ref_IDs <- readLines("both_fore_refset_FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_both_fore_ref_IDs <- unlist(lapply(aBSREL_both_fore_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                   )[-(seq(from = 1, to = (length(aBSREL_both_fore_ref_IDs) * 2), by = 2))]
aBSREL_both_fore_ref <- combframe[which(combframe$ID %in% aBSREL_both_fore_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(aBSREL_both_fore_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], aBSREL_both_fore_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_aBSREL[o] <- GO_frame$GO_freq_both_fore_ref_aBSREL[o] + 1
    }
  }
}

### FDR outliers (aBSREL) with both Coregonus & Salvelinus in alignments
aBSREL_both_fore_IDs <- readLines("both_fore_FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_both_fore_IDs <- unlist(lapply(aBSREL_both_fore_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                               )[-(seq(from = 1, to = (length(aBSREL_both_fore_IDs) * 2), by = 2))]
aBSREL_both_fore <- combframe[which(combframe$ID %in% aBSREL_both_fore_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(aBSREL_both_fore)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], aBSREL_both_fore$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_aBSREL[o2] <- GO_frame$GO_freq_both_fore_aBSREL[o2] + 1
    }
  }
}

### reference set for FDR relaxed & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_rel_aBSREL_both_fore_ref_IDs <- readLines("refset_both_fore_relaxed_diversifying_FDR_seqIDs.txt")
RELAX_rel_aBSREL_both_fore_ref_IDs <- unlist(lapply(RELAX_rel_aBSREL_both_fore_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                             )[-(seq(from = 1, to = (length(RELAX_rel_aBSREL_both_fore_ref_IDs) * 2), by = 2))]
RELAX_rel_aBSREL_both_fore_ref <- combframe[which(combframe$ID %in% RELAX_rel_aBSREL_both_fore_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_rel_aBSREL_both_fore_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_rel_aBSREL_both_fore_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_relaxed_diversifying[o] <- GO_frame$GO_freq_both_fore_ref_relaxed_diversifying[o] + 1
    }
  }
}
### FDR relaxed & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_rel_aBSREL_both_fore_IDs <- readLines("both_fg_taxa_relaxed_FDR_to_FDR_aBSREL1.txt")
RELAX_rel_aBSREL_both_fore_IDs <- unlist(lapply(RELAX_rel_aBSREL_both_fore_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                         )[-(seq(from = 1, to = (length(RELAX_rel_aBSREL_both_fore_IDs) * 2), by = 2))]
RELAX_rel_aBSREL_both_fore <- combframe[which(combframe$ID %in% RELAX_rel_aBSREL_both_fore_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_rel_aBSREL_both_fore)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_rel_aBSREL_both_fore$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_relaxed_diversifying[o2] <- GO_frame$GO_freq_both_fore_relaxed_diversifying[o2] + 1
    }
  }
}

### reference set for FDR intensified & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_int_aBSREL_both_fore_ref_IDs <- readLines("refset_both_fore_intensified_diversifying_FDR_seqIDs.txt")
RELAX_int_aBSREL_both_fore_ref_IDs <- unlist(lapply(RELAX_int_aBSREL_both_fore_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                             )[-(seq(from = 1, to = (length(RELAX_int_aBSREL_both_fore_ref_IDs) * 2), by = 2))]
RELAX_int_aBSREL_both_fore_ref <- combframe[which(combframe$ID %in% RELAX_int_aBSREL_both_fore_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_int_aBSREL_both_fore_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_int_aBSREL_both_fore_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_intensified_diversifying[o] <- GO_frame$GO_freq_both_fore_ref_intensified_diversifying[o] + 1
    }
  }
}

### FDR intensified & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_int_aBSREL_both_fore_IDs <- readLines("both_fg_taxa_intensified_FDR_to_FDR_aBSREL1.txt")
RELAX_int_aBSREL_both_fore_IDs <- unlist(lapply(RELAX_int_aBSREL_both_fore_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                         )[-(seq(from = 1, to = (length(RELAX_int_aBSREL_both_fore_IDs) * 2), by = 2))]
RELAX_int_aBSREL_both_fore <- combframe[which(combframe$ID %in% RELAX_int_aBSREL_both_fore_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_int_aBSREL_both_fore)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_int_aBSREL_both_fore$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_intensified_diversifying[o2] <- GO_frame$GO_freq_both_fore_intensified_diversifying[o2] + 1
    }
  }
}

GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)))
"GO_freq_not_both_fore_relaxed_diversifying" names(GO_frame)[36] <- "GO_freq_not_both_fore_ref_intensified_diversifying" names(GO_frame)[37] <- "GO_freq_not_both_fore_intensified_diversifying" for (q in 1:nrow(GO_frame)) { GO_frame$GO_freq_not_ref_RELAX_rel[q] <- length(RELAX_ref_rel_IDs) - GO_frame$GO_freq_ref_RELAX_rel[q] GO_frame$GO_freq_not_RELAX_rel[q] <- length(RELAX_rel_IDs) - GO_frame$GO_freq_RELAX_rel[q] GO_frame$GO_freq_not_ref_RELAX_int[q] <- length(RELAX_ref_int_IDs) - GO_frame$GO_freq_ref_RELAX_int[q] GO_frame$GO_freq_not_RELAX_int[q] <- length(RELAX_int_IDs) - GO_frame$GO_freq_RELAX_int[q] GO_frame$GO_freq_not_ref_aBSREL[q] <- length(aBSREL_ref_IDs) - GO_frame$GO_freq_ref_aBSREL[q] GO_frame$GO_freq_not_aBSREL[q] <- length(aBSREL_IDs) - GO_frame$GO_freq_aBSREL[q] GO_frame$GO_freq_not_both_fore_ref_RELAX_rel[q] <- length(RELAX_both_fore_ref_rel_IDs) - GO_frame$GO_freq_both_fore_ref_RELAX_rel[q] GO_frame$GO_freq_not_both_fore_RELAX_rel[q] <- length(RELAX_both_fore_rel_IDs) - GO_frame$GO_freq_both_fore_RELAX_rel[q] GO_frame$GO_freq_not_both_fore_ref_RELAX_int[q] <- length(RELAX_both_fore_ref_int_IDs) - GO_frame$GO_freq_both_fore_ref_RELAX_int[q] GO_frame$GO_freq_not_both_fore_RELAX_int[q] <- length(RELAX_both_fore_int_IDs) - GO_frame$GO_freq_both_fore_RELAX_int[q] GO_frame$GO_freq_not_both_fore_ref_aBSREL[q] <- length(aBSREL_both_fore_ref_IDs) - GO_frame$GO_freq_both_fore_ref_aBSREL[q] GO_frame$GO_freq_not_both_fore_aBSREL[q] <- length(aBSREL_both_fore_IDs) - GO_frame$GO_freq_both_fore_aBSREL[q] GO_frame$GO_freq_not_both_fore_ref_relaxed_diversifying[q] <- length(RELAX_rel_aBSREL_both_fore_ref_IDs) - GO_frame$GO_freq_both_fore_ref_relaxed_diversifying[q] GO_frame$GO_freq_not_both_fore_relaxed_diversifying[q] <- length(RELAX_rel_aBSREL_both_fore_IDs) - GO_frame$GO_freq_both_fore_relaxed_diversifying[q] GO_frame$GO_freq_not_both_fore_ref_intensified_diversifying[q] <- length(RELAX_int_aBSREL_both_fore_ref_IDs) - GO_frame$GO_freq_both_fore_ref_intensified_diversifying[q] GO_frame$GO_freq_not_both_fore_intensified_diversifying[q] <- length(RELAX_int_aBSREL_both_fore_IDs) - GO_frame$GO_freq_both_fore_intensified_diversifying[q] } Ntotals <- c(length(which(as.logical(c((GO_frame$GO_freq_ref_RELAX_rel == 0) + (GO_frame$GO_freq_RELAX_rel == 0))))), length(which(as.logical(c((GO_frame$GO_freq_ref_RELAX_int == 0) + (GO_frame$GO_freq_RELAX_int == 0))))), length(which(as.logical(c((GO_frame$GO_freq_ref_aBSREL == 0) + (GO_frame$GO_freq_aBSREL == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_RELAX_rel == 0) + (GO_frame$GO_freq_both_fore_RELAX_rel == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_RELAX_int == 0) + (GO_frame$GO_freq_both_fore_RELAX_int == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_aBSREL == 0) + (GO_frame$GO_freq_both_fore_aBSREL == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_relaxed_diversifying == 0) + (GO_frame$GO_freq_both_fore_relaxed_diversifying == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_intensified_diversifying == 0) + (GO_frame$GO_freq_both_fore_intensified_diversifying == 0)))))) GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame))) names(GO_frame)[38] <- "RELAX_rel_qval" names(GO_frame)[39] <- "RELAX_int_qval" names(GO_frame)[40] <- "aBSREL_qval" names(GO_frame)[41] <- "both_fore_RELAX_rel_qval" 
GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)))
names(GO_frame)[46] <- "RELAX_rel_pval"
names(GO_frame)[47] <- "RELAX_int_pval"
names(GO_frame)[48] <- "aBSREL_pval"
names(GO_frame)[49] <- "both_fore_RELAX_rel_pval"
names(GO_frame)[50] <- "both_fore_RELAX_int_pval"
names(GO_frame)[51] <- "both_fore_aBSREL_pval"
names(GO_frame)[52] <- "both_fore_relaxed_diversifying_pval"
names(GO_frame)[53] <- "both_fore_intensified_diversifying_pval"

# for (r in 1:nrow(GO_frame)) {
#   t <- 30
#   for (s in seq(6, 16, 2)) {
#     freq_table <- matrix(c(GO_frame[r, s + 1], GO_frame[r, s], GO_frame[r, s + 13], GO_frame[r, s + 12]),
#                          nrow = 2,
#                          dimnames = list("category" = c("focal_GO", "other_GOs"),
#                                          "GO term" = c("outliers", "reference")))
#     temp_FET <- fisher.test(freq_table)
#     temp_FET_FDR_qval <- p.adjust(temp_FET$p.value, method = "fdr", n = Ntotals[t - 29])
#     GO_frame[r, t] <- temp_FET_FDR_qval
#     GO_frame[r, t + 6] <- temp_FET$p.value
#     t <- t + 1
#   }
# }
#
# write.csv(GO_frame[which(GO_frame$RELAX_rel_pval < 0.05), ], "twosided_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$RELAX_int_pval < 0.05), ], "twosided_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$aBSREL_pval < 0.05), ], "twosided_aBSREL_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_rel_pval < 0.05), ], "twosided_both_fore_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_int_pval < 0.05), ], "twosided_both_fore_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_aBSREL_pval < 0.05), ], "twosided_both_fore_aBSREL_pval_R_dataframe.csv")

# one-sided Fisher's exact tests for GO term enrichment; for each of the 8 comparisons,
# columns s/s+1 hold the reference/outlier counts and s+16/s+17 the complementary
# "not this GO term" counts; column t takes the FDR q-value, column t+8 the p-value
for (r in 1:nrow(GO_frame)) {
  t <- 38
  for (s in seq(6, 20, 2)) {
    freq_table <- matrix(c(GO_frame[r, s + 1], GO_frame[r, s], GO_frame[r, s + 17], GO_frame[r, s + 16]),
                         nrow = 2,
                         dimnames = list("category" = c("focal_GO", "other_GOs"),
                                         "GO term" = c("outliers", "reference")))
    if (all(freq_table >= 0)) {
      temp_FET <- fisher.test(freq_table, alternative = "greater")
      temp_FET_FDR_qval <- p.adjust(temp_FET$p.value, method = "fdr", n = Ntotals[t - 37])
      GO_frame[r, t] <- temp_FET_FDR_qval
      GO_frame[r, t + 8] <- temp_FET$p.value
    }
    t <- t + 1
  }
}

# write.csv(GO_frame[which(GO_frame$RELAX_rel_pval < 0.05), ], "enriched_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$RELAX_int_pval < 0.05), ], "enriched_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$aBSREL_pval < 0.05), ], "enriched_aBSREL_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_rel_pval < 0.05), ], "enriched_both_fore_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_int_pval < 0.05), ], "enriched_both_fore_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_aBSREL_pval < 0.05), ], "enriched_both_fore_aBSREL_pval_R_dataframe.csv")
write.table(GO_frame[which(GO_frame$both_fore_relaxed_diversifying_pval < 0.05), ],
            "enriched_both_fore_relaxed_diversifying_pval_R_dataframe.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
write.table(GO_frame[which(GO_frame$both_fore_intensified_diversifying_pval < 0.05), ],
            "enriched_both_fore_intensified_diversifying_pval_R_dataframe.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
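### note: each of the tests above is a 2x2 Fisher's exact test of a focal GO term's frequency ###
### among outliers against the reference set; an equivalent toy example in Python (counts are ###
### invented for illustration) ###
from scipy.stats import fisher_exact

#                    outliers  reference
table = [[5, 12],   # annotated with the focal GO term
         [95, 988]] # annotated with other GO terms
odds, p = fisher_exact(table, alternative="greater")  # one-sided: enrichment among outliers
print(odds, p)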
"enriched_both_fore_intensified_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(GO_frame[which(GO_frame$GO_freq_both_fore_relaxed_diversifying > 0), ], "all_FET_both_fore_relaxed_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(GO_frame[which(GO_frame$GO_freq_both_fore_intensified_diversifying > 0), ], "all_FET_both_fore_intensified_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") for (r in 1:nrow(GO_frame)) { t <- 38 for (s in seq(6, 20, 2)) { freq_table <- matrix(c(GO_frame[r, s + 1], GO_frame[r, s], GO_frame[r, s + 17], GO_frame[r, s + 16]), nrow = 2, dimnames = list("category" = c("focal_GO", "other_GOs") , "GO term" = c("outliers", "reference"))) if (all(freq_table >= 0)) { temp_FET <- fisher.test(freq_table, alternative = "less") temp_FET_FDR_qval <- p.adjust(temp_FET$p.value, method = "fdr", n = Ntotals[t - 37]) GO_frame[r, t] <- temp_FET_FDR_qval GO_frame[r, t + 8] <- temp_FET$p.value } t <- t + 1 } } # write.csv(GO_frame[which(GO_frame$RELAX_rel_pval < 0.05), ], "underrepresented_RELAX_rel_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$RELAX_int_pval < 0.05), ], "underrepresented_RELAX_int_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$aBSREL_pval < 0.05), ], "underrepresented_aBSREL_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$both_fore_RELAX_rel_pval < 0.05), ], "underrepresented_both_fore_RELAX_rel_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$both_fore_RELAX_int_pval < 0.05), ], "underrepresented_both_fore_RELAX_int_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$both_fore_aBSREL_pval < 0.05), ], "underrepresented_both_fore_aBSREL_pval_R_dataframe.csv") write.table(GO_frame[which(GO_frame$both_fore_relaxed_diversifying_pval < 0.05), ], "underrepresented_both_fore_relaxed_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(GO_frame[which(GO_frame$both_fore_intensified_diversifying_pval < 0.05), ], "underrepresented_both_fore_intensified_diversfiying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") # outputting GO term information as .csv files both_fore_relaxed_diversifying_frame <- data.frame("OG" = RELAX_rel_aBSREL_both_fore$ID, "Description" = RELAX_rel_aBSREL_both_fore$Sequence.Description, "GO_IDs" = RELAX_rel_aBSREL_both_fore$Annotation.GO.ID, "GO_terms" = RELAX_rel_aBSREL_both_fore$Annotation.GO.Term) both_fore_intensified_diversifying_frame <- data.frame("OG" = RELAX_int_aBSREL_both_fore$ID, "Description" = RELAX_int_aBSREL_both_fore$Sequence.Description, "GO_IDs" = RELAX_int_aBSREL_both_fore$Annotation.GO.ID, "GO_terms" = RELAX_int_aBSREL_both_fore$Annotation.GO.Term) write.table(both_fore_relaxed_diversifying_frame, "both_fore_relaxed_diversifying_FDR_GO_info.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(both_fore_intensified_diversifying_frame, "both_fore_intensified_diversifying_FDR_GO_info.tsv", quote = FALSE, row.names = FALSE, sep = "\t") # flagging orthogroups that include both Coregonus & Salvelinus both_fore_OG <- as.character(readLines("both_fore_OG_IDs.txt")) both_fore <- combframe$ID %in% both_fore_OG combframe <- data.frame(combframe, "both_Cor_Salv" = both_fore) # outputting outlier information (incl. 
# outputting GO term information as .csv files
both_fore_relaxed_diversifying_frame <- data.frame("OG" = RELAX_rel_aBSREL_both_fore$ID,
                                                   "Description" = RELAX_rel_aBSREL_both_fore$Sequence.Description,
                                                   "GO_IDs" = RELAX_rel_aBSREL_both_fore$Annotation.GO.ID,
                                                   "GO_terms" = RELAX_rel_aBSREL_both_fore$Annotation.GO.Term)
both_fore_intensified_diversifying_frame <- data.frame("OG" = RELAX_int_aBSREL_both_fore$ID,
                                                       "Description" = RELAX_int_aBSREL_both_fore$Sequence.Description,
                                                       "GO_IDs" = RELAX_int_aBSREL_both_fore$Annotation.GO.ID,
                                                       "GO_terms" = RELAX_int_aBSREL_both_fore$Annotation.GO.Term)
write.table(both_fore_relaxed_diversifying_frame, "both_fore_relaxed_diversifying_FDR_GO_info.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
write.table(both_fore_intensified_diversifying_frame, "both_fore_intensified_diversifying_FDR_GO_info.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
# flagging orthogroups that include both Coregonus & Salvelinus
both_fore_OG <- as.character(readLines("both_fore_OG_IDs.txt"))
both_fore <- combframe$ID %in% both_fore_OG
combframe <- data.frame(combframe, "both_Cor_Salv" = both_fore)
# outputting outlier information (incl. GO terms) as tab-delimited .txt files
rel_or_int <- rep(NA, nrow(combframe))
rel_or_int[combframe$selparam < 1] <- "relaxed"
rel_or_int[combframe$selparam > 1] <- "intensified"
RELAX_Bonf <- which(combframe$sig %in% c("relaxed_Bonf", "intensified_Bonf"))
RELAX_Bonf_frame <- data.frame("Orthogroup_ID" = combframe$ID[RELAX_Bonf],
                               "both_Cor_Salv" = combframe$both_Cor_Salv[RELAX_Bonf],
                               "Description" = combframe$Sequence.Description[RELAX_Bonf],
                               "GO_terms" = combframe$Annotation.GO.Term[RELAX_Bonf],
                               "p-value" = combframe$pval[RELAX_Bonf],
                               "q-value" = combframe$qval[RELAX_Bonf],
                               "k-value" = combframe$selparam[RELAX_Bonf],
                               "rel_or_int" = rel_or_int[RELAX_Bonf])
write.table(RELAX_Bonf_frame, "additional_file_1_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
RELAX_FDR <- which(combframe$sig %in% c("relaxed_FDR", "intensified_FDR"))
RELAX_FDR_frame <- data.frame("Orthogroup_ID" = combframe$ID[RELAX_FDR],
                              "both_Cor_Salv" = combframe$both_Cor_Salv[RELAX_FDR],
                              "Description" = combframe$Sequence.Description[RELAX_FDR],
                              "GO_terms" = combframe$Annotation.GO.Term[RELAX_FDR],
                              "p-value" = combframe$pval[RELAX_FDR],
                              "q-value" = combframe$qval[RELAX_FDR],
                              "k-value" = combframe$selparam[RELAX_FDR],
                              "rel_or_int" = rel_or_int[RELAX_FDR])
write.table(RELAX_FDR_frame, "additional_file_2_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
RELAX_all_frame <- data.frame("Orthogroup_ID" = combframe$ID,
                              "both_Cor_Salv" = combframe$both_Cor_Salv,
                              "Description" = combframe$Sequence.Description,
                              "GO_terms" = combframe$Annotation.GO.Term,
                              "p-value" = combframe$pval,
                              "q-value" = combframe$qval,
                              "k-value" = combframe$selparam,
                              "rel_or_int" = rel_or_int,
                              "significance" = combframe$sig)
write.table(RELAX_all_frame, "additional_file_3_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
aBSREL_OG_IDs <- as.character(readLines("FDR_OG_IDs_aBSREL_results1.txt"))
aBSREL_FDR <- which(combframe$ID %in% aBSREL_OG_IDs)
# restrict the table to the aBSREL FDR outlier orthogroups
aBSREL_FDR_frame <- data.frame("Orthogroup_ID" = combframe$ID[aBSREL_FDR],
                               "both_Cor_Salv" = combframe$both_Cor_Salv[aBSREL_FDR],
                               "Description" = combframe$Sequence.Description[aBSREL_FDR],
                               "GO_terms" = combframe$Annotation.GO.Term[aBSREL_FDR])
write.table(aBSREL_FDR_frame, "additional_file_4_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
### converting HYPHY sequence/tree files (step 1)
##### 04.12.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####
""" --> script to convert multi-line fasta files to one-line fasta files <-- """
from glob import glob
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
from os import path
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
        base = path.splitext(file)[0]
        new_fasta = 'GO_input_' + base + '.oneline.fasta'
        with open(new_fasta, 'w') as outfile:
            for record in records:
                outfile.write('>' + record.id + '\n' + str(record.seq) + '\n')
### converting HYPHY sequence/tree files (step 2)
##### 04.12.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####
""" --> script to change sequence headers <-- """
from glob import glob
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
from os import path
import argparse
import re
def change_headers(records, file):
    """ append the orthogroup identifier (parsed from the file name) to each sequence header """
    for record in records:
        OG_str = str(file)
        OG_info = re.search('trimmed2_(.+?).oneline', OG_str)
        record.id = record.id + "|" + OG_info.group(1)
    return records

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
        new_records = change_headers(records, file)
        base = path.splitext(file)[0]
        new_fasta = base + '.incl_tree.fasta'
        with open(new_fasta, 'w') as outfile:
            for record in new_records:
                outfile.write('>' + record.id + '\n' + str(record.seq) + '\n')
### converting HYPHY sequence/tree files (step 3)
### extracting sequences from fasta files derived from hyphy files (see above) for GO analysis in Blast2GO
### unnecessary information (from trees) and gaps are removed
#!/bin/bash
# one representative (gap-stripped) sequence is extracted per orthogroup,
# preferring taxa in the order tested below (S_salar first)
for file in GO_input_trimmed2_*.oneline.incl_tree.fasta; do
    if grep -q ">S_salar" $file; then
        grep -A1 ">S_salar" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">S_trutta" $file; then
        grep -A1 ">S_trutta" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">Sa_alpin" $file; then
        grep -A1 ">Sa_alpin" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">Sa_fonti" $file; then
        grep -A1 ">Sa_fonti" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">O_mykiss" $file; then
        grep -A1 ">O_mykiss" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">O_tshawy" $file; then
        grep -A1 ">O_tshawy" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">C_clupea" $file; then
        grep -A1 ">C_clupea" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">C_lavare" $file; then
        grep -A1 ">C_lavare" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    else
        echo "No suitable taxa in $file!"
fi done ### GO analysis in Blast2GO # --> blastn (-task megablast) using QBlast (NCBI Blast Server) # ----> against non-redundant protein sequences (nr) # ----> taxonomy filter: bony fishes (taxa: 7898, Actinopterygii) # ----> blast expectation value (e-value) threshold: 1.0E-3 # ----> number of retained blast hits: 1 # ----> blast descriptor annotator option selected # ----> word size: 28 # ----> low complexity filter selected # ----> HSP length cutoff: 33 # ----> HSP-hit coverage: 0 # ----> filter by description: no filter # --> ### in the meantime: extracting aBSREL results (Holm-Bonferroni corrected p-values) from aBSREL .json output files #!/bin/bash for file in trimmed2_OG*.hyphy.aBSREL.json; do pval=$(grep "\"Corrected P-value\":" $file | sed -e "s/\:\(.*\)\,/\1/" | cut -c26-) echo "START" >> aBSREL_results.txt echo $file >> aBSREL_results.txt echo $pval >> aBSREL_results.txt echo "END" >> aBSREL_results.txt done ### extracting aBSREL results (uncorrected p-values) from aBSREL .json output files #!/bin/bash for file in trimmed2_OG*.hyphy.aBSREL.json; do pval=$(grep "\"Uncorrected P-value\":" $file | sed -e "s/\:\(.*\)\,/\1/" | cut -c28-) echo "START" >> aBSREL_results2.txt echo $file >> aBSREL_results2.txt echo $pval >> aBSREL_results2.txt echo "END" >> aBSREL_results2.txt done ### ### calculation of number of outliers per orthogroup and number of orthogroups with at least one outlier in aBSREL analyses ### Kevin Schneider; last modified: 29.01.2019 ### ### KevinSchneider@gmx.at ### library(stringr) setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") absrel <- readLines("aBSREL_results.txt") absrel <- absrel[-(seq(from = 1, to = length(absrel), by = 4))] absrel <- absrel[-(seq(from = 3, to = length(absrel), by = 3))] absrel_list <- list() Bonfcutoff <- (0.05 / (length(absrel) / 2)) nocorr_count <- 0 FDR_count <- 0 Bonf_count <- 0 for (i in 1:(length(absrel)/2)) { ID <- absrel[(i * 2) - 1] pvals <- absrel[i * 2] filtered_pvals1 <- gsub("\"", "", pvals) filtered_pvals2 <- gsub(" ", "", filtered_pvals1) split_pvals <- as.numeric(unlist(str_split(filtered_pvals2, ":"))) split_pvals <- split_pvals[!is.na(split_pvals)] nocorrs <- c() FDRs <- c() Bonfs <- c() nocorr_ID <- NA FDR_ID <- NA Bonf_ID <- NA for (j in 1:length(split_pvals)) { if (is.numeric(split_pvals[j])) { if (split_pvals[j] < 0.05) { nocorr <- TRUE nocorr_ID <- substr(ID, 10, 18) } else { nocorr <- FALSE } nocorrs <- append(nocorrs, nocorr) if (p.adjust(split_pvals[j], method = "fdr", n = (length(absrel) / 2)) < 0.10) { FDR <- TRUE FDR_ID <- substr(ID, 10, 18) } else { FDR <- FALSE } FDRs <- append(FDRs, FDR) if (split_pvals[j] < Bonfcutoff) { Bonf <- TRUE Bonf_ID <- substr(ID, 10, 18) } else { Bonf <- FALSE } Bonfs <- append(Bonfs, Bonf) } } nocorr_snums <- length(nocorrs[which(nocorrs)]) FDR_snums <- length(FDRs[which(FDRs)]) Bonf_snums <- length(Bonfs[which(Bonfs)]) if (nocorr_snums > 0) { nocorr_count <- nocorr_count + 1 } if (FDR_snums > 0) { FDR_count <- FDR_count + 1 } if (Bonf_snums > 0) { Bonf_count <- Bonf_count + 1 } absrel_list[[ID]] <- list("pvals" = split_pvals, "nocorr_sig" = nocorrs, "nocorr_sig_num" = nocorr_snums, "FDR_sig" = FDRs, "FDR_sig_num" = FDR_snums, "Bonf_sig" = Bonfs, "Bonf_sig_num" = Bonf_snums, "nocorr_ID" = nocorr_ID, "FDR_ID" = FDR_ID, "Bonf_ID" = Bonf_ID) } nocorr_IDs <- unique(unlist(lapply(absrel_list, "[[", "nocorr_ID"))) FDR_IDs <- unique(unlist(lapply(absrel_list, "[[", "FDR_ID"))) Bonf_IDs <- unique(unlist(lapply(absrel_list, "[[", "Bonf_ID"))) nocorr_IDs <- 
nocorr_IDs[-1] FDR_IDs <- FDR_IDs[-1] Bonf_IDs <- Bonf_IDs[-1] nocorr_vec <- c() FDR_vec <- c() Bonf_vec <- c() for (k in 1:length(absrel_list)) { if (!is.na(absrel_list[[k]]$nocorr_ID)) { pval_str <- paste(absrel_list[[k]]$pvals, collapse = "|") nocorr_vec <- append(nocorr_vec, paste(c(absrel_list[[k]]$nocorr_ID, pval_str), collapse = ";")) } if (!is.na(absrel_list[[k]]$FDR_ID)) { pval_str <- paste(absrel_list[[k]]$pvals, collapse = "|") FDR_vec <- append(FDR_vec, paste(c(absrel_list[[k]]$FDR_ID, pval_str), collapse = ";")) } if (!is.na(absrel_list[[k]]$Bonf_ID)) { pval_str <- paste(absrel_list[[k]]$pvals, collapse = "|") Bonf_vec <- append(Bonf_vec, paste(c(absrel_list[[k]]$Bonf_ID, pval_str), collapse = ";")) } } writeLines(nocorr_vec, "nocorr_IDs_aBSREL_results1_w_pvals.txt") writeLines(FDR_vec, "FDR_IDs_aBSREL_results1_w_pvals.txt") writeLines(Bonf_vec, "Bonf_IDs_aBSREL_results1_w_pvals.txt") writeLines(nocorr_IDs, "nocorr_IDs_aBSREL_results1.txt") writeLines(FDR_IDs, "FDR_IDs_aBSREL_results1.txt") writeLines(Bonf_IDs, "Bonf_IDs_aBSREL_results1.txt") absrel <- readLines("aBSREL_results2.txt") absrel <- absrel[-(seq(from = 1, to = length(absrel), by = 4))] absrel <- absrel[-(seq(from = 3, to = length(absrel), by = 3))] absrel_list <- list() Bonfcutoff <- (0.05 / (length(absrel) / 2)) nocorr_count2 <- 0 FDR_count2 <- 0 Bonf_count2 <- 0 for (i in 1:(length(absrel)/2)) { ID <- absrel[(i * 2) - 1] pvals <- absrel[i * 2] filtered_pvals1 <- gsub("\"", "", pvals) split_pvals <- as.numeric(unlist(str_split(filtered_pvals1, " "))) split_pvals <- split_pvals[!is.na(split_pvals)] nocorrs <- c() FDRs <- c() Bonfs <- c() nocorr_ID <- NA FDR_ID <- NA Bonf_ID <- NA for (j in 1:length(split_pvals)) { if (is.numeric(split_pvals[j])) { if (split_pvals[j] < 0.05) { nocorr <- TRUE nocorr_ID <- substr(ID, 10, 18) } else { nocorr <- FALSE } nocorrs <- append(nocorrs, nocorr) if (p.adjust(split_pvals[j], method = "fdr", n = (length(absrel) / 2)) < 0.10) { FDR <- TRUE FDR_ID <- substr(ID, 10, 18) } else { FDR <- FALSE } FDRs <- append(FDRs, FDR) if (split_pvals[j] < Bonfcutoff) { Bonf <- TRUE Bonf_ID <- substr(ID, 10, 18) } else { Bonf <- FALSE } Bonfs <- append(Bonfs, Bonf) } } nocorr_snums <- length(nocorrs[which(nocorrs)]) FDR_snums <- length(FDRs[which(FDRs)]) Bonf_snums <- length(Bonfs[which(Bonfs)]) if (nocorr_snums > 0) { nocorr_count2 <- nocorr_count2 + 1 } if (FDR_snums > 0) { FDR_count2 <- FDR_count2 + 1 } if (Bonf_snums > 0) { Bonf_count2 <- Bonf_count2 + 1 } absrel_list[[ID]] <- list("pvals" = split_pvals, "nocorr_sig" = nocorrs, "nocorr_sig_num" = nocorr_snums, "FDR_sig" = FDRs, "FDR_sig_num" = FDR_snums, "Bonf_sig" = Bonfs, "Bonf_sig_num" = Bonf_snums, "nocorr_ID" = nocorr_ID, "FDR_ID" = FDR_ID, "Bonf_ID" = Bonf_ID) } nocorr_IDs <- unique(unlist(lapply(absrel_list, "[[", "nocorr_ID"))) FDR_IDs <- unique(unlist(lapply(absrel_list, "[[", "FDR_ID"))) Bonf_IDs <- unique(unlist(lapply(absrel_list, "[[", "Bonf_ID"))) writeLines(nocorr_IDs, "nocorr_IDs_aBSREL_results2.txt") writeLines(FDR_IDs, "FDR_IDs_aBSREL_results2.txt") writeLines(Bonf_IDs, "Bonf_IDs_aBSREL_results2.txt") nocorr_count FDR_count Bonf_count nocorr_count2 FDR_count2 Bonf_count2 ### continued: GO analysis in Blast2GO ### reblasting of orthogroups with no blast result ### no taxonomy filter applied this time # --> blastn (-task megablast) using QBlast (NCBI Blast Server) # ----> against non-redundant protein sequences (nr) # ----> taxonomy filter: no filter # ----> blast expectation value (e-value) threshold: 1.0E-3 # ----> number of 
retained blast hits: 1
# ----> blast descriptor annotator option selected
# ----> word size: 28
# ----> low complexity filter selected
# ----> HSP length cutoff: 33
# ----> HSP-hit coverage: 0
# ----> filter by description: no filter
# -->
### next step: GO mapping with Goa version 2018.12 database
# ----> for creating annotations later
### additionally: InterProScan applied due to many unmapped sequences
# ----> using EMBL-EBI InterPro
# ----> default settings applied
### annotation using default parameters
# annotation cut-off: 55
# GO weight: 5
# no taxonomy filter
# e-value-hit-filter: 1.0E-6
# HSP-hit coverage cut-off: 0
# hit filter: 500
### additional annotation using merging of InterProScan GOs with existing annotation
### annotation augmentation by ANNEX
### annotation of remaining unannotated sequences using blast descriptions
# ----> minimum similarity of 85
# ----> validation of annotations
### GO-Slim runs
# ----> using Obo file from GO website
# ----> generic GO slim
### extracting sequence IDs for Blast2GO GO term enrichment analysis from orthogroup outlier text files
#!/bin/bash
while read -r line; do
    grep "$line" "$2" | cut -c 2- >> "$3"
done <"$1"
### extracting sequence IDs from Blast2GO input fasta file for Blast2GO GO term enrichment analysis
#!/bin/bash
grep ">" "$1" | cut -c 2- >> "$2"
### testing the difference of the selection parameter k distribution from the null expectation of 1
wilcox.test(relaxframe$selparam, mu = 1, alternative = "two.sided", na.action = na.omit)
median(relaxframe$selparam)
mean(relaxframe$selparam)
###
### Venn diagrams
library(VennDiagram)
# positional arguments of draw.triple.venn: area1, area2, area3, n12, n23, n13, n123
draw.triple.venn(22, 96, 12, 5, 3, 0, 0, fill = c("blue", "red", "green"), cex = 1.5)
draw.triple.venn(138, 111, 105, 10, 16, 0, 0, fill = c("blue", "red", "green"), cex = 1.5)
draw.triple.venn(272, 580, 235, 58, 87, 0, 0, fill = c("blue", "red", "green"), cex = 1.5)
###
#!/bin/bash
# calculating average gene counts from maximum gene counts per orthogroup
max_array=()
for file in trimmed2_*.RELAX.json; do
    maximum=0
    a=0 b=0 c=0 d=0 e=0 f=0 g=0 h=0 i=0 j=0
    while read -r line; do
        if grep -q 'original name":"C_clupea' <<< "$line"; then
            a=$((a + 1))
        fi
        if grep -q 'original name":"C_lavare' <<< "$line"; then
            b=$((b + 1))
        fi
        if grep -q 'original name":"E_lucius' <<< "$line"; then
            c=$((c + 1))
        fi
        if grep -q 'original name":"O_mykiss' <<< "$line"; then
            d=$((d + 1))
        fi
        if grep -q 'original name":"O_tshawy' <<< "$line"; then
            e=$((e + 1))
        fi
        if grep -q 'original name":"Sa_alpin' <<< "$line"; then
            f=$((f + 1))
        fi
        if grep -q 'original name":"Sa_fonti' <<< "$line"; then
            g=$((g + 1))
        fi
        if grep -q 'original name":"S_salar' <<< "$line"; then
            h=$((h + 1))
        fi
        if grep -q 'original name":"S_trutta' <<< "$line"; then
            i=$((i + 1))
        fi
        if grep -q 'original name":"T_thymal' <<< "$line"; then
            j=$((j + 1))
        fi
    done <$file
    echo ">C_clupea" >> $file.genecounts.txt
    echo $a >> $file.genecounts.txt
    echo ">C_lavare" >> $file.genecounts.txt
    echo $b >> $file.genecounts.txt
    echo ">E_lucius" >> $file.genecounts.txt
    echo $c >> $file.genecounts.txt
    echo ">O_mykiss" >> $file.genecounts.txt
    echo $d >> $file.genecounts.txt
    echo ">O_tshawy" >> $file.genecounts.txt
    echo $e >> $file.genecounts.txt
    echo ">Sa_alpin" >> $file.genecounts.txt
    echo $f >> $file.genecounts.txt
    echo ">Sa_fonti" >> $file.genecounts.txt
    echo $g >> $file.genecounts.txt
    echo ">S_salar" >> $file.genecounts.txt
    echo $h >> $file.genecounts.txt
    echo ">S_trutta" >> $file.genecounts.txt
    echo $i >> $file.genecounts.txt
    echo ">T_thymal" >> $file.genecounts.txt
    echo $j >> $file.genecounts.txt
    count_array=($a $b $c $d $e $f $g $h $i $j)
    maximum=$(python calculate_max.py "${count_array[@]}")
    max_array+=($maximum)
done
sum=0
total=0
for i in "${max_array[@]}"; do
    sum=`expr $sum + $i`
    total=`expr $total + 1`
done
mean=$(bc <<< "scale=5; $sum / $total")
echo $mean > mean_genecount.txt
###
# python script used in the above BASH script
##### written for Python 3 #####
""" --> script to calculate maximum value from BASH array input <-- """
import sys

print(max(int(value) for value in sys.argv[1:]))
###
### removing matching lines to exclude IDs from test sets for GO term enrichment tests from reference sets
#!/bin/bash
# $1 = input file 1; $2 = input file 2
while IFS= read -r line; do
    sed -i .bak "/$line/d" $2
done < "$1"
### more GO term enrichment tests in Blast2GO (Fisher's Exact Tests)
### this time reference sets do not include IDs from test sets (see script above)
# Bonf, FDR, and nocorr sets for RELAX, aBSREL (Bonf-Holm), and aBSREL (nocorr)
# with FDR (q-value) threshold of 0.1 (two-tailed)
###
### obtaining difference between sequence files
#!/bin/bash
grep -v -F -x -f OG_list2.txt OG_list.txt
### calculation of total sequence length and average sequence length per orthogroup
#!/bin/bash
total=0
for file in GO_input_trimmed2_OG*.oneline.incl_tree.fasta; do
    curr=$(sed '2q;d' $file | wc -c)
    total=`expr $total + $curr - 1`
done
echo $total
count=$(ls GO_input_trimmed2_OG*.oneline.incl_tree.fasta | wc -l)
avg=`expr $total / $count`
echo $avg
###
### counting alignment lengths of OG alignments
#!/bin/bash
for file in GO_input_trimmed2_OG*.oneline.incl_tree.fasta; do
    # note: wc -c includes the trailing newline character
    head -2 $file | tail -1 | wc -c | awk '{$1=$1};1' >> alignment_counts.txt
done
###
##### 04.12.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####
##### last modified: 06.03.2018 #####
""" --> script to convert multi-line fasta files to one-line fasta files <-- """
from glob import glob
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
from os import path
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
        base = path.splitext(file)[0]
        new_fasta = base + '.oneline.fasta'
        with open(new_fasta, 'w') as outfile:
            for record in records:
                outfile.write('>' + record.id + '\n' + str(record.seq) + '\n')
###
### obtaining transcript counts of raw transcriptomes
#!/bin/bash
for file in *.oneline.fasta; do
    grep ">" $file | wc -l | awk '{$1=$1};1' > $file.transcript_count.txt
    grep -v ">" $file | awk '{ print length }' | awk '{$1=$1};1' > $file.transcript_lengths.txt
done
###
### script to calculate basic raw transcript metrics based on transcript lengths
t_vec <- Sys.glob(file.path("*.oneline.fasta.transcript_lengths.txt"))
for (i in t_vec) {
  print("##############################")
  print(i)
  print("##############################")
  lengths <- readLines(i)
  lengths <- as.numeric(lengths)
  print("min length")
  print(min(lengths))
  print("max length")
  print(max(lengths))
  print("mean length")
  print(mean(lengths))
  print("median length")
  print(median(lengths))
  print("sum of transcript lengths (in Mbp)")
  print(sum(lengths)/1000000)
  print("transcript count")
  print(length(lengths))
print("##############################") } ### setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") symbols <- read.table("webgestalt_gsea_input_genesymbols.rnk", header = FALSE, sep = "\t") colnames(symbols) <- c("genesymbols", "k") symbols <- symbols[!grepl("!", symbols$genesymbols), ] symbols$k[symbols$k == 1] <- 0 symbols$k[symbols$k < 1] <- symbols$k[symbols$k < 1] - 1 symbols$k[symbols$k > 1] <- symbols$k[symbols$k > 1] / 50 write.table(symbols, file = "transformed_webgestalt_gsea_input_genesymbols_human.rnk", quote = FALSE, sep = "\t", row.names = FALSE) symbols$genesymbols <- tolower(symbols$genesymbols) write.table(symbols, file = "transformed_webgestalt_gsea_input_genesymbols_zebrafish.rnk", quote = FALSE, sep = "\t", row.names = FALSE) ### next for zebrafish symbols <- read.table("webgestalt_gsea_input_genesymbols_zebrafish.rnk", header = FALSE, sep = "\t") colnames(symbols) <- c("genesymbols", "k") symbols <- symbols[!grepl("!", symbols$genesymbols), ] symbols$k[symbols$k == 1] <- 0 symbols$k[symbols$k < 1] <- symbols$k[symbols$k < 1] - 1 symbols$k[symbols$k > 1] <- symbols$k[symbols$k > 1] / 50 symbols$genesymbols <- tolower(symbols$genesymbols) write.table(symbols, file = "transformed_webgestalt_gsea_input_genesymbols_new_zebrafish.rnk", quote = FALSE, sep = "\t", row.names = FALSE) ### ### converting human gene symbols to symbols of zebrafish homologues (for GSEA in WegGestalt) setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") human <- read.csv("gsea_input_gene_symbols.csv", header = TRUE) homologue_df <- read.delim("homology.txt", header = TRUE, sep = "\t", fill = TRUE) human_homologue_df <- homologue_df[which(homologue_df$Common.Organism.Name == "human"), ] zf_homologue_df <- homologue_df[which(homologue_df$Common.Organism.Name == "zebrafish"), ] human <- human[, -6] human$genesymbol_human <- trimws(as.character(human$genesymbol_human)) human <- data.frame(human, "genesymbol_zebrafish" = human$genesymbol_human, stringsAsFactors = FALSE) for (i in 1:nrow(human)) { pos <- grep(human$genesymbol_human[i], human_homologue_df$Symbol) if (length(pos)) { homologue_id <- human_homologue_df$HomoloGene.ID[pos] zf_gene_pos <- grep(homologue_id, zf_homologue_df$HomoloGene.ID) zf_gene <- as.character(zf_homologue_df$Symbol[zf_gene_pos][1]) if (grepl("|", zf_gene)) { zf_gene <- unlist(strsplit(zf_gene, split = "[|]"))[1] } human$genesymbol_zebrafish[i] <- zf_gene } if (is.na(human$genesymbol_zebrafish[i])) { human$genesymbol_zebrafish[i] <- tolower(human$genesymbol_human[i]) } } zf_gsea <- data.frame("symbols" = human$genesymbol_zebrafish, "kval" = human$kval) zf_gsea$kval <- as.numeric(zf_gsea$kval) zf_gsea <- zf_gsea[-which(is.na(zf_gsea$kval)), ] zf_gsea$kval[zf_gsea$kval == 1] <- 0 zf_gsea$kval[zf_gsea$kval < 1] <- zf_gsea$kval[zf_gsea$kval < 1] - 1 zf_gsea$kval[zf_gsea$kval > 1] <- zf_gsea$kval[zf_gsea$kval > 1] / 50 write.table(human, "human_zebrafish_homology.tsv", sep = "\t", row.names = FALSE, quote = FALSE) write.table(zf_gsea, "human_zebrafish_homology.rnk", sep = "\t", row.names = FALSE, quote = FALSE) ### ### extract entries in "zebrafish_symbols_kvalues.csv" based on outlier OGs #!/bin/bash while IFS= read -r line; do grep $line $2 >> $3 done < $1 ### ### setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") # A treemap R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. 
"REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # author: Anton Kratz , RIKEN Omics Science Center, Functional Genomics Technology Team, Japan # created: Fri, Nov 02, 2012 7:25:52 PM # last change: Fri, Nov 09, 2012 3:20:01 PM # ----------------------------------------------------------------------------- # If you don't have the treemap package installed, uncomment the following line: # install.packages( "treemap" ); library(treemap) # treemap package by Martijn Tennekes # Set the working directory if necessary # setwd("C:/Users/username/workingdir"); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. revigo.names <- c("term_ID","description","freqInDbPercent","value","uniqueness","dispensability","representative"); revigo.data <- rbind(c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "a"), c("GO:0006470","protein dephosphorylation",1.272,0.9975,0.678,0.000,"protein dephosphorylation"), c("GO:0070932","histone H3 deacetylation",0.035,1.0000,0.725,0.256,"protein dephosphorylation"), c("GO:0006508","proteolysis",6.441,0.9814,0.707,0.317,"protein dephosphorylation"), c("GO:0006468","protein phosphorylation",7.825,0.9864,0.662,0.519,"protein dephosphorylation"), c("GO:0006886","intracellular protein transport",2.141,0.9978,0.687,0.000,"intracellular protein transport"), c("GO:0016192","vesicle-mediated transport",3.868,0.9992,0.680,0.443,"intracellular protein transport"), c("GO:0006812","cation transport",4.146,0.9995,0.680,0.488,"intracellular protein transport"), c("GO:0006836","neurotransmitter transport",0.733,1.0000,0.682,0.359,"intracellular protein transport"), c("GO:0008217","regulation of blood pressure",0.290,1.0000,0.829,0.000,"regulation of blood pressure"), c("GO:0019885","antigen processing and presentation of endogenous peptide antigen via MHC class I",0.047,1.0000,0.852,0.000,"antigen processing and presentation of endogenous peptide antigen via MHC class I"), c("GO:0019722","calcium-mediated signaling",0.095,1.0000,0.791,0.068,"calcium-mediated signaling"), c("GO:0035556","intracellular signal transduction",6.701,0.9975,0.786,0.241,"calcium-mediated signaling"), c("GO:0009069","serine family amino acid metabolic process",0.118,0.9992,0.699,0.092,"serine family amino acid metabolism"), c("GO:0006570","tyrosine metabolic process",0.030,0.9998,0.705,0.670,"serine family amino acid metabolism"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "z")); stuff <- data.frame(revigo.data); names(stuff) <- revigo.names; stuff$value <- as.numeric( as.character(stuff$value) ); stuff$freqInDbPercent <- as.numeric( as.character(stuff$freqInDbPercent) ); stuff$uniqueness <- as.numeric( as.character(stuff$uniqueness) ); stuff$dispensability <- as.numeric( as.character(stuff$dispensability) ); # equal size since p-value difference negligible stuff$value <- 1 stuff$value[1] <- 0 stuff$value[length(stuff$value)] <- 0 # by default, outputs to a PDF file png(file="revigo_treemap_relaxed_diversifying_bioproc_recoloured.png", width=900, height=500) #pdf( file="revigo_treemap_relaxed_diversifying_bioproc.pdf", width=16, height=9 ) # width and height are in inches # check the tmPlot command documentation for all possible parameters - there are a lot more treemap( stuff, index = c("representative","description"), vSize = "value", type = "categorical", vColor = "representative", title = "GO Terms - Relaxed & Diversifying Selection", inflate.labels = FALSE, # 
set this to TRUE for space-filling group labels - good for posters lowerbound.cex.labels = 0, # try to draw as many labels as possible (still, some small squares may not get a label) bg.labels = "#CCCCCCAA", # define background color of group labels # "#CCCCCC00" is fully transparent, "#CCCCCCAA" is semi-transparent grey, NA is opaque position.legend = "none", fontsize.title = 18, fontsize.label = 18, align.labels = list(c("left", "top"), c("center", "bottom")), drop.unused.levels = FALSE ) dev.off() ### ### setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") # A treemap R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # author: Anton Kratz , RIKEN Omics Science Center, Functional Genomics Technology Team, Japan # created: Fri, Nov 02, 2012 7:25:52 PM # last change: Fri, Nov 09, 2012 3:20:01 PM # ----------------------------------------------------------------------------- # If you don't have the treemap package installed, uncomment the following line: # install.packages( "treemap" ); library(treemap) # treemap package by Martijn Tennekes # Set the working directory if necessary # setwd("C:/Users/username/workingdir"); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. revigo.names <- c("term_ID","description","freqInDbPercent","value","uniqueness","dispensability","representative"); revigo.data <- rbind(c("GO:0005975","carbohydrate metabolic process",1.905,0.9945,0.610,0.000,"carbohydrate metabolism"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "d"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "e"), c("GO:0007269","neurotransmitter secretion",0.432,1.0000,0.651,0.100,"neurotransmitter secretion"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "o"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "p"), c("GO:0009069","serine family amino acid metabolic process",0.118,0.9985,0.385,0.086,"serine family amino acid metabolism"), c("GO:0006486","protein glycosylation",1.183,0.9985,0.346,0.299,"serine family amino acid metabolism"), c("GO:0006508","proteolysis",6.441,0.9975,0.510,0.314,"serine family amino acid metabolism"), c("GO:0045892","negative regulation of transcription, DNA-templated",1.343,0.9999,0.473,0.358,"serine family amino acid metabolism"), c("GO:0006468","protein phosphorylation",7.825,0.9984,0.468,0.451,"serine family amino acid metabolism"), c("GO:0006633","fatty acid biosynthetic process",0.325,0.9998,0.337,0.574,"serine family amino acid metabolism"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "z")); stuff <- data.frame(revigo.data); names(stuff) <- revigo.names; stuff$value <- as.numeric( as.character(stuff$value) ); stuff$freqInDbPercent <- as.numeric( as.character(stuff$freqInDbPercent) ); stuff$uniqueness <- as.numeric( as.character(stuff$uniqueness) ); stuff$dispensability <- as.numeric( as.character(stuff$dispensability) ); # equal size since p-value difference negligible stuff$value <- 1 stuff$value[2] <- 0.00001 stuff$value[3] <- 0.00001 stuff$value[5] <- 0.00001 stuff$value[6] <- 0.00001 stuff$value[length(stuff$value)] <- 0.00001 # by default, outputs to a PDF file png(file="revigo_mod_treemap_intensified_diversifying_bioproc_recoloured.png", width=400, height=700) #pdf( file="revigo_treemap_intensified_diversifying_bioproc.pdf", width=16, height=9 ) # width 
and height are in inches
# check the tmPlot command documentation for all possible parameters - there are a lot more
tmPlot(
    stuff,
    index = c("representative","description"),
    vSize = "value",
    type = "categorical",
    vColor = "representative",
    title = "GO Terms - Intensified & Diversifying Selection",
    inflate.labels = FALSE,     # set this to TRUE for space-filling group labels - good for posters
    lowerbound.cex.labels = 0,  # try to draw as many labels as possible (still, some small squares may not get a label)
    bg.labels = "#CCCCCCAA",    # define background color of group labels
                                # "#CCCCCC00" is fully transparent, "#CCCCCCAA" is semi-transparent grey, NA is opaque
    position.legend = "none",
    fontsize.title = 18,
    fontsize.label = 18,
    align.labels = list(c("left", "top"), c("center", "bottom"))
)
dev.off()
###
###
# A plotting R script produced by the REVIGO server at http://revigo.irb.hr/
# If you found REVIGO useful in your work, please cite the following reference:
# Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology
# terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800
# --------------------------------------------------------------------------
# If you don't have the ggplot2 package installed, uncomment the following line:
# install.packages( "ggplot2" );
library( ggplot2 );
# --------------------------------------------------------------------------
# If you don't have the scales package installed, uncomment the following line:
# install.packages( "scales" );
library( scales );
# --------------------------------------------------------------------------
# Here is your data from REVIGO. Scroll down for plot configuration options.
revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","GO_frequency","log10_p_value","uniqueness","dispensability");
revigo.data <- rbind(c("GO:0007340","acrosome reaction", 0.101, 5.178,-0.261, 2,-2.5187,0.605,0.000),
c("GO:0006629","lipid metabolic process", 3.502,-5.067,-1.917, 2,-2.0576,0.335,0.035),
c("GO:0006355","regulation of transcription, DNA-templated",10.303,-3.787, 4.497, 8,-1.5057,0.440,0.164),
c("GO:0009395","phospholipid catabolic process", 0.089,-3.915,-4.114, 2,-1.5666,0.380,0.322),
c("GO:0009069","serine family amino acid metabolic process", 0.118,-5.034, 0.526, 3,-1.7080,0.245,0.331),
c("GO:0006570","tyrosine metabolic process", 0.030,-4.116, 0.870, 2,-1.4064,0.258,0.670));
one.data <- data.frame(revigo.data);
names(one.data) <- revigo.names;
one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ];
one.data$plot_X <- as.numeric( as.character(one.data$plot_X) );
one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) );
one.data$GO_frequency <- as.numeric( as.character(one.data$GO_frequency) );
one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) );
one.data$frequency <- as.numeric( as.character(one.data$frequency) );
one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) );
one.data$dispensability <- as.numeric( as.character(one.data$dispensability) );
#head(one.data);
# --------------------------------------------------------------------------
# Names of the axes, sizes of the numbers and letters, names of the columns,
# etc.
can be changed below p1 <- ggplot( data = one.data ); p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = GO_frequency), alpha = I(0.6) ) + scale_size_area(); p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) ); p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = GO_frequency), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area(); p1 <- p1 + scale_size( range=c(as.integer(min(one.data$GO_frequency)) * 5, as.integer(max(one.data$GO_frequency) * 5)), breaks = c(2, 3, 8)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) ); #ex <- one.data [ one.data$dispensability < 0.15, ]; #p1 <- p1 + geom_text( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 ); p1 <- p1 + labs (y = "semantic space y", x = "semantic space x"); p1 <- p1 + theme(legend.key = element_blank(), plot.title = element_text(size = 14, face = "bold", hjust = 0.5), axis.title = element_text(size = 13, face = "bold")) ; one.x_range = max(one.data$plot_X) - min(one.data$plot_X); one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y); p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10); p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10); p1 <- p1 + ggtitle("Relaxed Selection (Enrichment p-value < 0.05)"); # -------------------------------------------------------------------------- # Output the plot to screen p1; # Uncomment the line below to also save the plot to a file. # The file type depends on the extension (default=pdf). ggsave("revigo_both_fore_semantic_enriched_relaxed_bioproc_sim0_9.png", device = "png", width = 9, height = 7) ### ### # A plotting R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # -------------------------------------------------------------------------- # If you don't have the ggplot2 package installed, uncomment the following line: # install.packages( "ggplot2" ); library( ggplot2 ); # -------------------------------------------------------------------------- # If you don't have the scales package installed, uncomment the following line: # install.packages( "scales" ); library( scales ); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. 
revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","GO_frequency","log10_p_value","uniqueness","dispensability"); revigo.data <- rbind(c("GO:0002138","retinoic acid biosynthetic process", 0.047, 1.702,-7.269, 1,-1.4540,0.596,0.000), c("GO:0042273","ribosomal large subunit biogenesis", 0.278, 5.582,-0.993, 1,-1.4540,0.722,0.000), c("GO:0030325","adrenal gland development", 0.071,-2.558, 5.046, 1,-1.4540,0.730,0.024), c("GO:0051014","actin filament severing", 0.035, 2.649, 4.117, 1,-1.4540,0.691,0.078), c("GO:0097190","apoptotic signaling pathway", 0.574,-6.360,-3.464, 1,-1.4540,0.570,0.095), c("GO:1901222","regulation of NIK/NF-kappaB signaling", 0.047,-6.136,-0.225, 1,-1.4540,0.628,0.182), c("GO:0034720","histone H3-K4 demethylation", 0.024, 4.160,-4.282, 1,-1.4540,0.602,0.209), c("GO:0006355","regulation of transcription, DNA-templated",10.303,-1.797,-6.211, 8,-2.2859,0.586,0.269), c("GO:0007165","signal transduction",22.310,-4.846,-3.634, 5,-1.6421,0.590,0.330)); one.data <- data.frame(revigo.data); names(one.data) <- revigo.names; one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ]; one.data$plot_X <- as.numeric( as.character(one.data$plot_X) ); one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) ); one.data$GO_frequency <- as.numeric( as.character(one.data$GO_frequency) ); one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) ); one.data$frequency <- as.numeric( as.character(one.data$frequency) ); one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) ); one.data$dispensability <- as.numeric( as.character(one.data$dispensability) ); #head(one.data); # -------------------------------------------------------------------------- # Names of the axes, sizes of the numbers and letters, names of the columns, # etc. can be changed below p1 <- ggplot( data = one.data ); p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = GO_frequency), alpha = I(0.6) ) + scale_size_area(); p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) ); p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = GO_frequency), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area(); p1 <- p1 + scale_size( range=c(as.integer(min(one.data$GO_frequency)) * 5, as.integer(max(one.data$GO_frequency) * 5)), breaks = c(1, 5, 8)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) ); #ex <- one.data [ one.data$dispensability < 0.15, ]; #p1 <- p1 + geom_text( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 ); p1 <- p1 + labs (y = "semantic space y", x = "semantic space x"); p1 <- p1 + theme(legend.key = element_blank(), plot.title = element_text(size = 14, face = "bold", hjust = 0.5), axis.title = element_text(size = 13, face = "bold")) ; one.x_range = max(one.data$plot_X) - min(one.data$plot_X); one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y); p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10); p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10); p1 <- p1 + ggtitle("Intensified Selection (Enrichment p-value < 0.05)"); # -------------------------------------------------------------------------- # Output the plot to screen p1; # Uncomment the line below to also save the plot to a file. # The file type depends on the extension (default=pdf). 
ggsave("revigo_both_fore_semantic_intensified_relaxed_bioproc_sim0_9.png", device = "png", width = 9, height = 7) ### ### # A plotting R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # -------------------------------------------------------------------------- # If you don't have the ggplot2 package installed, uncomment the following line: # install.packages( "ggplot2" ); library( ggplot2 ); # -------------------------------------------------------------------------- # If you don't have the scales package installed, uncomment the following line: # install.packages( "scales" ); library( scales ); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","GO_frequency","log10_p_value","uniqueness","dispensability"); revigo.data <- rbind(c("GO:0006090","pyruvate metabolic process", 0.260, 6.292,-3.272, 2,-2.7579,0.619,0.000), c("GO:0019885","antigen processing and presentation of endogenous peptide antigen via MHC class I", 0.047,-5.378,-1.697, 1,-1.3767,0.877,0.000), c("GO:0071918","urea transmembrane transport", 0.059,-3.965, 4.281, 1,-1.3767,0.854,0.027), c("GO:0007179","transforming growth factor beta receptor signaling pathway", 0.237,-2.254,-6.465, 1,-1.3767,0.797,0.030), c("GO:0000103","sulfate assimilation", 0.018,-5.827, 1.274, 1,-1.3767,0.911,0.070), c("GO:0070932","histone H3 deacetylation", 0.035, 0.654, 5.374, 1,-1.3767,0.763,0.074), c("GO:0008380","RNA splicing", 1.136, 4.489, 4.012, 1,-1.3767,0.743,0.115), c("GO:0019722","calcium-mediated signaling", 0.095,-0.762,-7.475, 1,-1.3767,0.820,0.179), c("GO:0006508","proteolysis", 6.441,-0.531, 2.055, 6,-1.6621,0.772,0.211), c("GO:0080111","DNA demethylation", 0.024, 4.672, 0.679, 1,-1.3767,0.680,0.234), c("GO:0035556","intracellular signal transduction", 6.701,-3.065,-5.690, 3,-1.3697,0.792,0.266), c("GO:0005977","glycogen metabolic process", 0.142, 4.194,-2.539, 1,-1.3767,0.678,0.267), c("GO:0016042","lipid catabolic process", 0.668, 5.777,-1.651, 2,-2.2928,0.576,0.305), c("GO:0055085","transmembrane transport", 6.956,-3.398, 4.995, 6,-1.9046,0.890,0.311), c("GO:0070682","proteasome regulatory particle assembly", 0.012, 1.160, 6.986, 1,-1.3767,0.868,0.329), c("GO:0030111","regulation of Wnt signaling pathway", 0.668,-1.948,-5.306, 1,-1.3767,0.718,0.425), c("GO:0006211","5-methylcytosine catabolic process", 0.015, 6.741, 3.221, 1,-1.3767,0.723,0.427), c("GO:0006956","complement activation", 0.349,-3.369,-2.348, 1,-1.3767,0.691,0.475), c("GO:0006561","proline biosynthetic process", 0.035, 6.530,-0.295, 1,-1.3767,0.623,0.515), c("GO:0006108","malate metabolic process", 0.053, 6.811,-2.827, 1,-1.3767,0.623,0.531), c("GO:0046486","glycerolipid metabolic process", 0.864, 5.020,-3.657, 2,-1.3826,0.604,0.663), c("GO:0009258","10-formyltetrahydrofolate catabolic process", 0.012, 6.646,-0.818, 1,-1.3767,0.578,0.734), c("GO:0006644","phospholipid metabolic process", 1.142, 4.965,-3.249, 1,-1.3767,0.598,0.775)); one.data <- data.frame(revigo.data); names(one.data) <- revigo.names; one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ]; one.data$plot_X <- as.numeric( as.character(one.data$plot_X) ); one.data$plot_Y <- 
as.numeric( as.character(one.data$plot_Y) ); one.data$GO_frequency <- as.numeric( as.character(one.data$GO_frequency) ); one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) ); one.data$frequency <- as.numeric( as.character(one.data$frequency) ); one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) ); one.data$dispensability <- as.numeric( as.character(one.data$dispensability) ); #head(one.data); # -------------------------------------------------------------------------- # Names of the axes, sizes of the numbers and letters, names of the columns, # etc. can be changed below p1 <- ggplot( data = one.data ); p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = GO_frequency), alpha = I(0.6) ) + scale_size_area(); p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) ); p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = GO_frequency), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area(); p1 <- p1 + scale_size( range=c(as.integer(min(one.data$GO_frequency)) * 5, as.integer(max(one.data$GO_frequency) * 5)), breaks = c(1, 2, 3, 6)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) ); #ex <- one.data [ one.data$dispensability < 0.15, ]; #p1 <- p1 + geom_text( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 ); p1 <- p1 + labs (y = "semantic space y", x = "semantic space x"); p1 <- p1 + theme(legend.key = element_blank(), plot.title = element_text(size = 14, face = "bold", hjust = 0.5), axis.title = element_text(size = 13, face = "bold")) ; one.x_range = max(one.data$plot_X) - min(one.data$plot_X); one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y); p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10); p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10); p1 <- p1 + ggtitle("Diversifying Selection (Enrichment p-value < 0.05)"); # -------------------------------------------------------------------------- # Output the plot to screen p1; # Uncomment the line below to also save the plot to a file. # The file type depends on the extension (default=pdf). ggsave("revigo_both_fore_semantic_enriched_diversifying_bioproc_sim0_9.png", device = "png", width = 9, height = 7) ### ### obtaining hypergeometric expectations of overlap between relaxed & diversifying as well as intensified & diversifying outliers ### testing frequency differences using Fisher's Exact Tests reldiv <- (112 * 71) / 2190 reldiv_rest <- 112 - reldiv fisher.test(matrix(c(9, 103, reldiv, reldiv_rest), 2, 2), alternative = "greater") reldiv <- (65 * 71) / 2190 reldiv_rest <- 65 - reldiv fisher.test(matrix(c(12, 53, reldiv, reldiv_rest), 2, 2), alternative = "greater") fisher.test(matrix(c(12, 53, 9, 103), 2, 2), alternative = "greater") ### ### #!/bin/bash # renaming fasta files for i in *.oneline.fasta; do mv -- "$i" "${i/%.oneline.fasta/.aln.fasta}" done for i in *.aln.fasta; do mv -- "$i" "${i:18}" done ### ###