### written by Kevin Schneider ###
### last modified: 08.10.2019 ###
### Kevin.Schneider@glasgow.ac.uk ###
### KevinSchneider@gmx.at ###
### scripts (UNIX, R, and Python) used in Schneider et al. 2019 BMC Genomics ###

### --> TransDecoder script <-- ###
### get the longest ORFs in coding sequences of the transcriptomes ###
#!/bin/bash
for file in *.fsa_nt; do
  ~/apps/TransDecoder/TransDecoder.LongOrfs -t $file -S
done

### --> OrthoFinder script <-- ###
OrthoFinder-2.2.3/orthofinder -f seqfiles -t 16

### --> collapse fastas to one single line each <-- ###
#!/bin/bash
for file in *.fasta; do
  awk '/^>/ {printf("\n%s\n",$0);next;} { printf("%s",$0);} END {printf("\n");}' < $file > collapsed_$file
done

### --> change encoding from DOS to UNIX in one of the transcriptome files <-- ###
#!/bin/bash
tr -d '\015' < collapsed_O_tshawytscha.fasta > collapsed_O_tshawytscha_edited.fasta

### --> combining orthologues with custom Python script (combine_orthologues_20180723.py) <-- ###
### invoke with python3 combine_orthologues_20180723.py ###
##### 23.07.2018 #####
##### by Kevin Schneider #####
##### written for Python 3 #####

"""
--> script to combine genes from different transcriptome files into orthologue files <--
loop through csv files and fetch one salmon accession number in turn
only take orthologues with a higher or equal number (but not larger than four) of orthologous
genes in the focal species compared to the salmon genome (to avoid inclusion of collapsed genes)
then create a file named after the orthologues in the salmon genome
take the gene name of the other species belonging to the same orthologue
parse through this species's transcriptome for the above gene name
take the matched sequence and append it to the orthologue file created above
start from the beginning
"""

from glob import glob

def header_filter(line):
    return not line.startswith("Orthogroup")

def extract_ref_list(ref_match):
    return ref_match.split("\t")[2]

def calc_field_length(field):
    sep_field = field.split(", ")
    return len(sep_field)

def extract_ortho_list(ortho_match):
    return ortho_match.split("\t")[1]

def check_anomality(ref_length, ortho_length):
    if (ref_length >= ortho_length) and (ref_length <= 4):
        return True
    else:
        return False

def extract_spec_name(curr_file):
    spec_name = curr_file.split("_transdecoder")[0]
    return "collapsed_" + spec_name + ".fasta"

def assemble_outfile(ref_field, ortho_field, passed, spec_file_name):
    file_name = ref_field.replace(", ", "_")
    full_file_name = "outfiles/" + file_name + ".fasta"
    full_file_name = full_file_name.replace("\n", "")
    if passed:
        # parse through transcriptome to extract header and sequence
        with open(spec_file_name, "r") as infile:
            for line in infile:
                if line.startswith(">"):
                    line = line.replace("(", "_")
                    line = line.replace(")", "_")
                    line = line.replace(">", "")
                    for ortho_subfield in ortho_field.split(", "):
                        ortho_subfield = ortho_subfield.split("__g.")[0]
                        start_string = ortho_subfield.split("__")[0] + "__"
                        ortho_subfield = ortho_subfield.replace(start_string, "")
                        if ortho_subfield in line:
                            seq = next(infile)
                            if len(seq) >= 200:
                                # append the above header and sequence to the opened orthologue file
                                with open(full_file_name, "a+") as outfile:
                                    species = spec_file_name.replace("collapsed_", "").replace(".fasta", "")
                                    new_header = ">" + ortho_subfield + "|" + species + "\n"
                                    outfile.write(new_header)
                                    outfile.write(seq)
    return True

ortho_files = "*S_salar_genomic.csv" ### change file name as required
for file in glob(ortho_files):
    with open(file) as f:
        ortho_lines = list(filter(header_filter, f))
    ref_list = list(map(extract_ref_list, ortho_lines))
    ref_length = list(map(calc_field_length, ref_list))
    ortho_list = list(map(extract_ortho_list, ortho_lines))
    ortho_length = list(map(calc_field_length, ortho_list))
    i = 0
    check_list = []
    for ref_element in ref_length:
        check_element = check_anomality(ref_element, ortho_length[i])
        check_list.append(check_element)
        i = i + 1
    spec_file_name = extract_spec_name(file)
    j = 0
    for ref_field in ref_list:
        output_True_list = assemble_outfile(ref_field, ortho_list[j], check_list[j], spec_file_name)
        j = j + 1

### --> END: combining orthologues with custom Python script (combine_orthologues_20180723.py) <-- ###
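### note: a minimal sketch of the orthogroup screening above on a made-up Orthogroups.csv row ###
### (accession names are invented; the column layout mirrors the indexing used in the script) ###
# made-up tab-separated row: orthogroup ID, focal-species members, S. salar members
row = "OG0000042\tgene1__c1_g1__g.1, gene1__c2_g1__g.2\tXM_001.1, XM_002.1, XM_003.1"
ortho_len = len(row.split("\t")[1].split(", "))  # 2 focal-species genes
ref_len = len(row.split("\t")[2].split(", "))    # 3 S. salar genes
# the check_anomality() rule above: keep only if the reference copy number is
# at least the focal copy number but no more than four
print(ref_len >= ortho_len and ref_len <= 4)     # True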
### --> match orthologues with custom Python script (match_orthologues_20180720.py) <-- ###
### invoke with python3 match_orthologues_20180720.py ###
##### 20.07.2018 #####
##### by Kevin Schneider #####
##### written for Python 3 #####

"""
--> script to match & combine single-name orthologues which were mapped
to the same gene sets in the reference genome <--
"""

from glob import glob
from os import listdir
from os.path import isfile, basename

def find_ortho_sets(targ_dir):
    file_list = listdir(targ_dir)
    ortho_sets = []
    for file in file_list:
        if len(file.split("_")) > 2:
            ortho_sets.append(file)
    return ortho_sets

def matching_orthologues(file, orthos2match):
    ortho_list = []
    for ortho in orthos2match:
        if file in ortho:
            delstring0 = file.replace(".fasta", "")
            delstring1 = delstring0 + "_"
            ortho = ortho.replace(delstring1, "")
            delstring2 = "_" + delstring0
            ortho = ortho.replace(delstring2, "")
            ortho = ortho.replace(".fasta", "")
            ortho = ortho.split("_")
            # re-join consecutive tokens into two-token accession IDs
            ortho = [a + "_" + b for a, b in zip(ortho[::2], ortho[1::2])]
            ortho_list.append(ortho)
    flatten = lambda l: [item for sublist in l for item in sublist]
    ortho_list = flatten(ortho_list)
    return ortho_list

def combine_files(file, matched_file):
    with open(file, "a+") as f1:
        with open(matched_file, "r") as f2:
            content2 = f2.read()
            f1.write(content2)
    return True

targ_dir = "outfiles/"
orthos2match = find_ortho_sets(targ_dir)
ortho_files = "outfiles/*.fasta" ### change file name as required
for file in glob(ortho_files):
    if isfile(file):
        # match on the base name, since glob() returns paths with the directory prefix
        orthos2append = matching_orthologues(basename(file), orthos2match)
        if len(orthos2append) > 0:
            for matched_file in orthos2append:
                matched_file_name = targ_dir + matched_file + ".fasta"
                if matched_file_name in glob(ortho_files):
                    combine_files(file, matched_file_name)
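### note: a small sketch of the file-name matching above, using made-up accession tokens ###
# sketch: how a multi-orthologue file name is decomposed relative to a
# single-orthologue file (invented tokens, mirroring matching_orthologues())
multi = "XM_001_XM_002_XM_003.fasta"     # hypothetical combined orthogroup file
single = "XM_002.fasta"                  # hypothetical focal file
stem = multi.replace(single.replace(".fasta", "") + "_", "")
stem = stem.replace("_" + single.replace(".fasta", ""), "")
tokens = stem.replace(".fasta", "").split("_")
# pair consecutive tokens back into accession IDs
print([a + "_" + b for a, b in zip(tokens[::2], tokens[1::2])])
# ['XM_001', 'XM_003'] -> the contents of these files get appended to XM_002.fasta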
""" filter by the number of records in the file """ rec_num = len(records) if rec_num > rec_l_limit and rec_num < rec_u_limit: return True def check_taxon_number(records, taxa_limit): """ filter by the number of distinct taxa in the file """ IDs = set() taxa_num = 0 for record in records: if not record.id in IDs: IDs.add(record.id) taxa_num += 1 if taxa_num > taxa_limit: return True def save_seq_headers(file, records): """ save sequence headers to a .txt file """ for record in records: base = path.splitext(file)[0] new_file_name = base + ".headers.txt" with open(new_file_name, "w") as f: f.write(record.id) def simplify_headers(records, separator, ele_num): """ cut down headers depending on chosen number of elements between separators to be retained """ for record in records: record.id = record.id.split(separator)[ele_num] record.id = separator.join(record.id) return records def save_args(args): """ save all command line arguments to a .txt file including date and time """ date_time = datetime.datetime.now() date_time = date_time.strftime("%Y-%m-%d_%H-%M") arg_file_name = "filter_fastas_args_" + date_time + ".txt" with open(arg_file_name, "w") as af: af.write(dumps(vars(args))) """ handling and parsing the input parameters """ parser = argparse.ArgumentParser() parser.add_argument('-i', required=True, type=str, help='input files in fasta format (in quotes if wildcard [*] included)') parser.add_argument('-d', action='store_true', help='remove duplicates') parser.add_argument('-rl', nargs=1, type=int, help='lower limit for number of records') parser.add_argument('-ru', nargs=1, type=int, help='upper limit for number of records') parser.add_argument('-t', nargs=1, type=int, help='lower limit for number of taxa') parser.add_argument('-s', action='store_true', help='saves the original sequence headers to a .txt file') parser.add_argument('-si', action='store_true', help='simplify sequence header') parser.add_argument('-sep', nargs=1, type=str, help='separator (in between quotes) for delimiting sequence headers') parser.add_argument('-e', nargs=1, type=int, help='beginning from the start, number of elements between separators to keep') args = parser.parse_args() files = args.i remove_dup = args.d rec_l_limit = args.rl[0] rec_u_limit = args.ru[0] taxa_limit = args.t[0] save_headers = args.s simplify = args.si separator = args.sep[0] ele_num = args.e[0] """ save command line arguments to a .txt file """ save_args(args) """ loop through all the files with the specified file name (pattern) and open them in turn """ for file in glob(files): with open(file, "rU") as f: """ create a list of sequence records in the file """ records = list(SeqIO.parse(f, "fasta")) if remove_dup: new_records = get_unique(records) if not check_rec_number(records, rec_l_limit, rec_u_limit): continue if not check_taxon_number(records, taxa_limit): continue if save_headers: save_seq_headers(file, records) if simplify: new_records = simplify_headers(records, separator, ele_num) base = path.splitext(file)[0] ext = path.splitext(file)[1] new_file = base + ".filtered" + ext SeqIO.write(new_records, new_file, "fasta") ### --> END: filtering with custom script (filter_fastas_20180810.py) <-- ### ### parameters used in above script for molevol dataset ### {"i": "*.orfpy.fasta", "d": true, "rl": [7], "ru": [80], "t": [7], "s": true, "si": true, "sep": ["|"], "e": [2]} ### --> orfpy script <-- ### ### choose longest ORF again in the case of multiple ORFs per orthogroup ### #!/bin/bash for file in *.fasta; do base=$(basename $file 
### --> orfpy script <-- ###
### choose longest ORF again in the case of multiple ORFs per orthogroup ###
#!/bin/bash
for file in *.fasta; do
  base=$(basename $file .fasta)
  python get_orfs_or_cdss.py -i $file -f fasta -t CDS --on $base.orfpy.fasta --op $base.orfpyprot.fasta --ob $base.bed --og $base.gff3 --min_len 200 -m one
done

### --> prank script <-- ###
### align based on codon model ###
#!/bin/bash
for file in *.fasta; do
  prank -d=$file -o=prank_aligned_$file -codon -F
done

### --> example PBS script for parallelisation of prank script <-- ###
#PBS -N prank_alignment_1_subset_remaining3
#PBS -S /bin/bash
#PBS -m abe
#PBS -M Kevin.Schneider@glasgow.ac.uk
#PBS -l nodes=1:ppn=16,walltime=10000:00:00,cput=160000:00:00
source ~/.bash_profile
cd prank/orthogroups
./prank_orthogroups_commands4server_subset_remaining16_1.sh &
./prank_orthogroups_commands4server_subset_remaining16_2.sh &
./prank_orthogroups_commands4server_subset_remaining16_3.sh &
./prank_orthogroups_commands4server_subset_remaining16_4.sh &
./prank_orthogroups_commands4server_subset_remaining16_5.sh &
./prank_orthogroups_commands4server_subset_remaining16_6.sh &
./prank_orthogroups_commands4server_subset_remaining16_7.sh &
./prank_orthogroups_commands4server_subset_remaining16_8.sh &
./prank_orthogroups_commands4server_subset_remaining16_9.sh &
./prank_orthogroups_commands4server_subset_remaining16_10.sh &
./prank_orthogroups_commands4server_subset_remaining16_11.sh &
./prank_orthogroups_commands4server_subset_remaining16_12.sh &
./prank_orthogroups_commands4server_subset_remaining16_13.sh &
./prank_orthogroups_commands4server_subset_remaining16_14.sh &
./prank_orthogroups_commands4server_subset_remaining16_15.sh &
./prank_orthogroups_commands4server_subset_remaining16_16.sh &
wait
echo "Finished!"

### alignment trimming by nucleotide and taxon occupancy using TrimAl v1.2 software (installed and run on server) ###
# 1st run: -resoverlap 0.5 and -seqoverlap 50, without the -block option (but many overly divergent sequences remained in the alignments)
# 2nd run: as below (file names start with trimmed in the 1st run and trimmed2 in the 2nd run)
#!/bin/bash
cd codonfiles
for file in *.best.fas; do
  trimal -in $file -out ../trimmed_codonfiles/trimmed2_$file -fasta -resoverlap 1.0 -seqoverlap 0.38 -noallgaps
done

### change sequence headers with custom Python script and transform to PHYLIP file (for FastME) ###
##### 25.10.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####

"""
--> script to simplify fasta file headers and convert fasta to phylip files <--
"""

from glob import glob
from Bio import SeqIO
from os import path
import argparse

def simplify_ortho_headers(records):
    ortho_dict = {}
    """ extract last part of header and add the number of the current orthologue in the orthogroup """
    for record in records:
        old_record_id = record.id.split('|')[-1]
        new_record_id = old_record_id[0:8]
        if not new_record_id in ortho_dict.keys():
            ortho_dict[new_record_id] = 1
        else:
            ortho_dict[new_record_id] += 1
        record.id = new_record_id + str(ortho_dict[new_record_id])
    return records

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
    new_records = simplify_ortho_headers(records)
    base = path.splitext(file)[0]
    new_fasta = base + '.simplified.fasta'
    phylip = base + '.simplified.phylip'
    SeqIO.write(new_records, new_fasta, 'fasta')
    SeqIO.write(new_records, phylip, 'phylip')
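### note: the 8-character truncation plus per-name counter above keeps names unique and within ###
### the 10-character limit of strict PHYLIP; a minimal sketch of the renaming (headers invented) ###
from collections import Counter

headers = ["XM_01400012.3|C_lavaretus", "XM_01400012.3|Sa_alpinus"]  # hypothetical headers
seen = Counter()
for h in headers:
    stem = h.split("|")[-1][:8]    # e.g. 'C_lavare', 'Sa_alpin'
    seen[stem] += 1
    print(stem + str(seen[stem]))  # 'C_lavare1', 'Sa_alpin1' -> at most 10 characters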
### BioNJ phylogenetic tree construction using FastME ###
#!/bin/bash
cd orthogroup_intermediate_files/orthogroup_intermediate_files2
for file in trimmed2*.simplified.phylip; do
  base=$(basename $file .simplified.phylip)
  fastme -i $file -o ../fastme_trees/$base.nwk --method=BioNJ --dna=F84 --gamma=1.0 --nni --spr -T 16
done

### removing empty files (due to too large divergence and uncertain phylogenetic relationships) ###
#!/bin/bash
find . -name "*.codon.fasta.best.nwk" -size 0k | xargs rm

### labelling branches in the phylogenetic trees with custom R scripts ###
##### 29.10.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for R version 3.3.1 #####

require(ape)

setwd("/Volumes/LaCie/orthogroup_tree_files/")
path <- "/Volumes/LaCie/orthogroup_tree_files/"
pre_taxa <- c("C_clupea", "C_lavare", "Sa_alpin", "Sa_fonti")
treefiles <- Sys.glob(file.path(paste(path, "trimmed2_OG*.codon.fasta.best.nwk", sep = "")))

# recursively collect all tip numbers descending from the given internal node numbers
descendants <- function (tree, descendnodes, descendtips = NULL) {
  tipnum <- tree$Nnode + 1
  descendnodes_new <- tree$edge[tree$edge[, 1] %in% descendnodes, 2]
  descendtips <- append(descendtips, descendnodes[descendnodes <= tipnum])
  descendtips <- append(descendtips, descendnodes_new[descendnodes_new <= tipnum])
  descendnodes_new <- descendnodes_new[descendnodes_new > tipnum]
  if (length(descendnodes_new) == 0) {
    return(descendtips)
  }
  do.call("descendants", list(tree, descendnodes_new, descendtips))
}

for (i in 1:length(treefiles)) {
  tree <- read.tree(treefiles[i])
  if ("E_lucius1" %in% tree$tip.label) {
    tree <- root(tree, outgroup = "E_lucius1", resolve.root = TRUE)
    taxa <- c()
    for (j in 1:length(pre_taxa)) {
      taxa <- append(taxa, grep(pre_taxa[j], tree$tip.label, value = TRUE))
    }
    if (length(taxa) >= 1) {
      tree <- makeNodeLabel(tree, method = "number", prefix = "")
      tipnum <- length(tree$tip.label)
      for (k in 1:tree$Nnode) {
        # node label k corresponds to internal node number tipnum + k
        desc <- descendants(tree, tipnum + k)
        if (all(tree$tip.label[desc] %in% taxa)) {
          tree$node.label[k] <- "{Foreground}"
        }
      }
      for (l in 1:length(tree$tip.label)) {
        if (tree$tip.label[l] %in% taxa) {
          tree$tip.label[l] <- paste(tree$tip.label[l], "{Foreground}", sep = "")
        }
      }
      write.tree(tree, file = paste(path, "labelled_", basename(treefiles[i]), sep = ""))
    }
  }
}

### shorten headers of codon fasta files ###
##### 02.11.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####

"""
--> script to shorten fasta file headers <--
"""

from glob import glob
from Bio import SeqIO
from os import path
import argparse

def shorten_ortho_headers(records):
    """ shorten header to PHYLIP-style sequence names """
    for record in records:
        record.description = record.description.split(' ')[0]
    return records

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
    new_records = shorten_ortho_headers(records)
    base = path.splitext(file)[0]
    new_fasta = base + '.shortened.header.fasta'
    SeqIO.write(new_records, new_fasta, 'fasta')
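### note: the same descendant-tip collection the R function above performs, sketched in Python ###
### over a toy parent->child edge list (ape convention: tips 1..Ntip, internal nodes above that; ###
### the edge list here is made up for illustration) ###
# toy rooted tree with 3 tips (1-3), internal nodes 4 (root) and 5
edges = [(4, 5), (4, 3), (5, 1), (5, 2)]
ntip = 3

def descendant_tips(node):
    children = [c for p, c in edges if p == node]
    tips = [c for c in children if c <= ntip]
    for c in children:
        if c > ntip:
            tips += descendant_tips(c)
    return tips

print(descendant_tips(5))  # [1, 2]
print(descendant_tips(4))  # [3, 1, 2] -> if all of these are foreground taxa, label the node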
"../orthogroup_tree_files/"${fullname}) treefile=$(grep ${basepath} <(find ../orthogroup_tree_files -name "labelled_trimmed2_OG*")) if [ -f "$treefile" ]; then cat $alfile > ../orthogroup_HYPHY_input/$base.hyphy echo "" >> ../orthogroup_HYPHY_input/$base.hyphy cat $treefile >> ../orthogroup_HYPHY_input/$base.hyphy fi done ### batch file for RELAX analyses ### fileToExe = "/usr/local/lib/hyphy/TemplateBatchFiles/SelectionAnalyses/RELAX.bf"; inputRedirect = {}; inputRedirect["01"]="Universal"; inputRedirect["02"]="/Volumes/LaCie/orthogroup_HYPHY_input/$file"; inputRedirect["03"]="y"; inputRedirect["04"]="Foreground"; inputRedirect["05"]="All"; ExecuteAFile(fileToExe, inputRedirect); ### bash script for running RELAX via HYPHY ### #!/bin/bash for file in trimmed2_OG*.hyphy.bf; do sudo mv $file ../orthogroup_RELAX_output/ HYPHYMP ../orthogroup_RELAX_output/$file base=$(basename $file .bf) sudo mv $base.RELAX.json ../orthogroup_RELAX_output/ done ### create input tree file for relative rate analyses ### #!/bin/bash for file in trimmed2_OG*.codon.fasta.best.nwk; do base=$(basename $file .codon.fasta.best.nwk) base=${base:9} printf $base"\t" >> pre_all_OG_trees.nwk cat $file >> pre_all_OG_trees.nwk done sed '/^$/d' pre_all_OG_trees.nwk > all_OG_trees.nwk ### relative rate analyses not meaningful for orthogroups (since specific genes are also compared to other genes in the data) ### setwd("/Users/kevinschneider/Documents/relrates") source("helperFuncs_mod.R") source("projection_coevo_new.R") #this will take some time mamTrees=readTrees("all_OG_trees.nwk") #cutoff = exp(-7) based on the peak of the mean/variance plot #this is the basic method mamRER=getAllResiduals(mamTrees,useSpecies=mamTrees$masterTree$tip.label, transform = "none",weighted = F, cutoff=0.001) #For some datasets scaling improves results mamRERs=scale(mamRER) #this method peforms better on benchmarks mamRERlogW=getAllResiduals(mamTrees,useSpecies=mamTrees$masterTree$tip.label, transform = "log",weighted = T, cutoff=0.001) #again can be scaled, almost always improves results in our exprience mamRERlogWs=scale(mamRERlogW) #read the binary tree marineb=read.tree("MasterTree_bin_CorSalv_OG.nwk") plot(marineb) #controlb <- read.tree("control_bin.nwk") #convert it to a paths vector phenvMarine=allPathMasterRelative(marineb, mamTrees$masterTree, mamTrees$ap) #control_phenvMarine <- allPathMasterRelative(controlb, mamTrees$masterTree, mamTrees$ap) #this isn't binary anymore due to addition along branches table(phenvMarine) #we can fix that phenvMarine[phenvMarine>1]=1 #control_phenvMarine[control_phenvMarine > 1] <- 1 corMarine=getAllCor(mamRER, phenvMarine) hist(corMarine$P) #control_corMarine <- getAllCor(mamRER, control_phenvMarine) corMarineLogW=getAllCor(mamRERlogW, phenvMarine) hist(corMarineLogW$P) #control_corMarineLogW <- getAllCor(mamRERlogW, control_phenvMarine) corMarineLogWs=getAllCor(mamRERlogWs, phenvMarine) hist(corMarineLogWs$P) #control_corMarineLogWs <- getAllCor(mamRERlogWs, control_phenvMarine) corMarineLogWs_cleaned <- corMarineLogWs[!is.na(corMarineLogWs[, 3]), ] # comment out later (Kevin) # additional manipulation of pipeline output (Kevin) # coreset (10 species) #salm_fore_pos_sel_Bonf <- readLines("Bonf_sigfiles_gene_M1_BS2_salm_fore_core.txt") #CorSalv_pos_sel_Bonf <- readLines("Bonf_sigfiles_gene_M1_BS2_CorSalv_core.txt") # extended transcriptome set (20 species across teleosts) #salm_fore_pos_sel_ext <- readLines("sigfiles_gene_BS1_BS2_salm_fore_ext.txt") #CorSalv_pos_sel_ext <- 
readLines("sigfiles_gene_BS1_BS2_CorSalv_ext.txt") corMarineLogWs_cleaned <- corMarineLogWs[!is.na(corMarineLogWs[, 3]), ] control_corMarineLogWs_cleaned <- control_corMarineLogWs[!is.na(control_corMarineLogWs[, 3]), ] sig_genes <- corMarineLogWs_cleaned[corMarineLogWs_cleaned[, 3] < 0.05, ] accelerated_sig <- sig_genes[sig_genes[, 1] > 0, ] decelerated_sig <- sig_genes[sig_genes[, 1] < 0, ] sig_genes_Bonf <- corMarineLogWs_cleaned[corMarineLogWs_cleaned[, 3] < (0.05 / nrow(corMarineLogWs_cleaned)), ] accelerated_sig_Bonf <- sig_genes_Bonf[sig_genes_Bonf[, 1] > 0, ] decelerated_sig_Bonf <- sig_genes_Bonf[sig_genes_Bonf[, 1] < 0, ] p_FDR <- p.adjust(corMarineLogWs_cleaned[, 3], method = "fdr", n = length(corMarineLogWs_cleaned[, 3])) sig_genes_FDR <- corMarineLogWs_cleaned[p_FDR < 0.05, ] accelerated_sig_FDR <- sig_genes_FDR[sig_genes_FDR[, 1] > 0, ] decelerated_sig_FDR <- sig_genes_FDR[sig_genes_FDR[, 1] < 0, ] # number of significantly accelerated genes (p < 0.05) without Bonferroni correction nrow(accelerated_sig) # number of significantly accelerated genes (p < (0.05/1,446)) after Bonferroni correction nrow(accelerated_sig_Bonf) # number of significanlty accelerated genes (FDR-q < 0.05) after FDR correction nrow(accelerated_sig_FDR) # number of significantly decelerated genes (p < 0.05) without Bonferroni correction nrow(decelerated_sig) # number of significantly decelerated genes (p < (0.05/1,446)) after Bonferroni correction nrow(decelerated_sig_Bonf) # number of significantly decelerated genes (FDR-q < 0.05) after FDR correction nrow(decelerated_sig_FDR) # changing gene name prefixes write(accelerated_sig, "accelerated_genes_OG_20181105.txt", sep = "\n") write(decelerated_sig, "decelerated_genes_OG_20181105.txt", sep = "\n") ### transform multi-line fasta files to single-line sequence alignment files (for Kr/Kc analyses [Hanada]) ### ##### 07.11.2018 ##### ##### by Kevin Schneider ##### ##### KevinSchneider@gmx.at ##### ##### written for Python 3 ##### """ --> script to convert mutli-line fasta files to one-line sequence alignment files <-- """ from glob import glob from Bio import SeqIO from Bio.SeqUtils.CheckSum import seguid from os import path import argparse parser = argparse.ArgumentParser() parser.add_argument('-i', required=True, type=str, help='input files in fasta format (in quotes if wildcard [*] included)') args = parser.parse_args() files = args.i for file in glob(files): with open(file, 'rU') as f: """ create a list of sequence records in the file """ records = list(SeqIO.parse(f, 'fasta')) base = path.splitext(file)[0] new_fasta = 'krkc_input_' + base + '.oneline.seq' with open(new_fasta, 'w') as outfile: for record in records: outfile.write(record.id + '\n' + str(record.seq) + '\n') ### script for running Kr/Kc (Hanada) analyses ### #!/bin/bash for file in krkc_input_*.oneline.aln; do perl KaKs.pl $file > $file.kaks perl KrKc.pl $file > $file.classA.krkc perl KrKc_classB.pl $file > $file.classB.krkc done # --> too many OTUs for Hanada's implementation of Kr/Kc and Ka/Ks analyses <-- # ### batch file for aBSREL analyses ### fileToExe = "/usr/local/lib/hyphy/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf"; inputRedirect = {}; inputRedirect["01"]="Universal"; inputRedirect["02"]="/Volumes/LaCie/orthogroup_aBSREL_input/£"; inputRedirect["03"]="y"; inputRedirect["04"]="Foreground"; ExecuteAFile(fileToExe, inputRedirect); ### bash script for running aBSREL via HYPHY ### #!/bin/bash for file in trimmed2_OG*.hyphy.aBSREL.bf; do sudo mv $file ../orthogroup_aBSREL_output/ 
### transform multi-line fasta files to single-line sequence alignment files (for Kr/Kc analyses [Hanada]) ###
##### 07.11.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####

"""
--> script to convert multi-line fasta files to one-line sequence alignment files <--
"""

from glob import glob
from Bio import SeqIO
from os import path
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
    base = path.splitext(file)[0]
    new_fasta = 'krkc_input_' + base + '.oneline.seq'
    with open(new_fasta, 'w') as outfile:
        for record in records:
            outfile.write(record.id + '\n' + str(record.seq) + '\n')

### script for running Kr/Kc (Hanada) analyses ###
#!/bin/bash
for file in krkc_input_*.oneline.aln; do
  perl KaKs.pl $file > $file.kaks
  perl KrKc.pl $file > $file.classA.krkc
  perl KrKc_classB.pl $file > $file.classB.krkc
done
# --> too many OTUs for Hanada's implementation of Kr/Kc and Ka/Ks analyses <-- #

### batch file for aBSREL analyses ###
fileToExe = "/usr/local/lib/hyphy/TemplateBatchFiles/SelectionAnalyses/aBSREL.bf";
inputRedirect = {};
inputRedirect["01"]="Universal";
inputRedirect["02"]="/Volumes/LaCie/orthogroup_aBSREL_input/$file";
inputRedirect["03"]="y";
inputRedirect["04"]="Foreground";
ExecuteAFile(fileToExe, inputRedirect);

### bash script for running aBSREL via HYPHY ###
#!/bin/bash
for file in trimmed2_OG*.hyphy.aBSREL.bf; do
  sudo mv $file ../orthogroup_aBSREL_output/
  HYPHYMP ../orthogroup_aBSREL_output/$file
  base=$(basename $file .bf)
  sudo mv $base.json ../orthogroup_aBSREL_output/
done

### bash script for extracting relevant information from RELAX output ###
#!/bin/bash
for file in trimmed2*.hyphy.RELAX.json; do
  pval=$(grep "p-value" $file | sed -e "s/\:\(.*\)\,/\1/" | cut -c13-)
  param=$(grep "relaxation or intensification parameter" $file | sed -e "s/\:\(.*\)/\1/" | cut -c45-)
  echo "START" >> RELAX_results.txt
  echo $file >> RELAX_results.txt
  echo $pval >> RELAX_results.txt
  echo $param >> RELAX_results.txt
  echo "END" >> RELAX_results.txt
done

### used for getting reference sets for enrichment calculations of OGs of outlier genes ###
grep -vf file1 file2 > file3
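### note: the grep/sed extraction above is sensitive to the exact JSON formatting; a more robust ###
### alternative (a sketch, not part of the original pipeline) is to parse the files and search ###
### for the same two keys the script greps for ###
import json
from glob import glob

def find_key(obj, key):
    # return the first value stored under `key` anywhere in a nested dict
    if isinstance(obj, dict):
        if key in obj:
            return obj[key]
        for v in obj.values():
            hit = find_key(v, key)
            if hit is not None:
                return hit
    return None

for fname in glob("trimmed2*.hyphy.RELAX.json"):
    with open(fname) as f:
        data = json.load(f)
    pval = find_key(data, "p-value")
    param = find_key(data, "relaxation or intensification parameter")
    print(fname, pval, param)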
### R script for analysing RELAX results ###
### Kevin Schneider; last modified: 07.10.2019 ###
### KevinSchneider@gmx.at ###

library(ggplot2)

setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/")
gotable <- read.csv("blast2go_export_2704orthogroups_salmonids.txt", header = TRUE, sep = "\t")
relax <- readLines("RELAX_results.txt")
# drop the START and END marker lines, keeping file name, p-value, and k per record
relax <- relax[-(seq(from = 1, to = length(relax), by = 5))]
relax <- relax[-(seq(from = 4, to = length(relax), by = 4))]
relaxframe <- data.frame("ID" = as.character(relax[seq(from = 1, to = length(relax), by = 3)]),
                         "pval" = as.numeric(relax[seq(from = 2, to = length(relax), by = 3)]),
                         "qval" = NA,
                         "selparam" = as.numeric(relax[seq(from = 3, to = length(relax), by = 3)]),
                         "sig" = NA)
relaxframe$qval <- p.adjust(relaxframe$pval, method = "fdr", n = nrow(relaxframe))
Bonfcutoff <- (0.05 / nrow(relaxframe))
relaxed_nocorr <- 0
relaxed_FDR <- 0
relaxed_Bonf <- 0
intensified_nocorr <- 0
intensified_FDR <- 0
intensified_Bonf <- 0
for (i in 1:nrow(relaxframe)) {
  if ((relaxframe$pval[i] < 0.05) && (relaxframe$selparam[i] < 1)) {
    relaxed_nocorr <- relaxed_nocorr + 1
    relaxframe$sig[i] <- "relaxed_nocorr"
  }
  if ((relaxframe$qval[i] < 0.10) && (relaxframe$selparam[i] < 1)) {
    relaxed_FDR <- relaxed_FDR + 1
    relaxframe$sig[i] <- "relaxed_FDR"
  }
  if ((relaxframe$pval[i] < Bonfcutoff) && (relaxframe$selparam[i] < 1)) {
    relaxed_Bonf <- relaxed_Bonf + 1
    relaxframe$sig[i] <- "relaxed_Bonf"
  }
  if ((relaxframe$pval[i] < 0.05) && (relaxframe$selparam[i] > 1)) {
    intensified_nocorr <- intensified_nocorr + 1
    relaxframe$sig[i] <- "intensified_nocorr"
  }
  if ((relaxframe$qval[i] < 0.10) && (relaxframe$selparam[i] > 1)) {
    intensified_FDR <- intensified_FDR + 1
    relaxframe$sig[i] <- "intensified_FDR"
  }
  if ((relaxframe$pval[i] < Bonfcutoff) && (relaxframe$selparam[i] > 1)) {
    intensified_Bonf <- intensified_Bonf + 1
    relaxframe$sig[i] <- "intensified_Bonf"
  }
}
freq_table_nocorr <- matrix(c(relaxed_nocorr, intensified_nocorr,
                              (nrow(relaxframe) - relaxed_nocorr), (nrow(relaxframe) - intensified_nocorr)),
                            nrow = 2, byrow = TRUE,
                            dimnames = list("category" = c("outliers", "non-outliers"),
                                            "selection" = c("relaxed", "intensified")))
freq_table_FDR <- matrix(c(relaxed_FDR, intensified_FDR,
                           (nrow(relaxframe) - relaxed_FDR), (nrow(relaxframe) - intensified_FDR)),
                         nrow = 2, byrow = TRUE,
                         dimnames = list("category" = c("outliers", "non-outliers"),
                                         "selection" = c("relaxed", "intensified")))
freq_table_Bonf <- matrix(c(relaxed_Bonf, intensified_Bonf,
                            (nrow(relaxframe) - relaxed_Bonf), (nrow(relaxframe) - intensified_Bonf)),
                          nrow = 2, byrow = TRUE,
                          dimnames = list("category" = c("outliers", "non-outliers"),
                                          "selection" = c("relaxed", "intensified")))
below1 <- relaxframe$selparam[which(relaxframe$selparam < 1)]
above1 <- relaxframe$selparam[which(relaxframe$selparam > 1)]
freq_table_below_or_above_1 <- matrix(c(length(below1), length(above1), 1347.5, 1347.5),
                                      nrow = 2, byrow = TRUE,
                                      dimnames = list("category" = c("outliers", "non-outliers"),
                                                      "selection" = c("relaxed", "intensified")))
fisher.test(freq_table_nocorr)
fisher.test(freq_table_FDR)
fisher.test(freq_table_Bonf)
fisher.test(freq_table_below_or_above_1)
relaxed_nocorr
relaxed_FDR
relaxed_Bonf
intensified_nocorr
intensified_FDR
intensified_Bonf
write.csv(relaxframe, "RELAX_R_dataframe.csv")
median(relaxframe$selparam)
mean(relaxframe$selparam)
freq_table_selparam <- matrix(c((length(relaxframe$selparam[relaxframe$selparam != 1]) / 2),
                                (length(relaxframe$selparam[relaxframe$selparam != 1]) / 2),
                                length(relaxframe$selparam[relaxframe$selparam < 1]),
                                length(relaxframe$selparam[relaxframe$selparam > 1])),
                              nrow = 2, byrow = TRUE,
                              dimnames = list("category" = c("expected", "observed"),
                                              "selection" = c("relaxed", "intensified")))
fisher.test(freq_table_selparam)

# extract orthogroup ID from json file ID
relaxframe$ID <- substr(relaxframe$ID, 10, 18)
gotable$Sequence.Name <- unlist(lapply(gotable$Sequence.Name,
                                       function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                )[-(seq(from = 1, to = (nrow(gotable) * 2), by = 2))]
names(gotable)[1] <- "ID"
# combine the two dataframes
combframe <- merge(relaxframe, gotable, by = "ID")

GO_vec <- c()
GO_code_vec <- c()
for (j in 1:nrow(combframe)) {
  if (!is.na(combframe$Annotation.GO.Term[j])) {
    GO_vec <- append(GO_vec, unlist(strsplit(as.character(combframe$Annotation.GO.Term[j]), split = ";", fixed = TRUE)))
    GO_code_vec <- append(GO_code_vec, unlist(strsplit(as.character(combframe$Annotation.GO.ID[j]), split = ";", fixed = TRUE)))
  }
}
selparam_list <- list()
GO_frame <- data.frame("GO_category" = GO_vec, "GO_code" = GO_code_vec,
                       "pval_median" = NA, "selparam_median" = NA, "GO_frequency" = NA)
# per GO term, collect the p-values and selection parameters k of all orthogroups annotated with it
for (k in 1:length(GO_vec)) {
  pval_median_vec <- c()
  selparam_median_vec <- c()
  print(k)
  for (l in 1:nrow(combframe)) {
    invec <- FALSE
    invec <- grepl(GO_vec[k], combframe$Annotation.GO.Term[l], fixed = TRUE)
    if (invec) {
      pval_median_vec <- append(pval_median_vec, combframe$pval[l])
      selparam_median_vec <- append(selparam_median_vec, combframe$selparam[l])
    }
  }
  if (!is.null(pval_median_vec)) {
    pval_median <- median(pval_median_vec)
    GO_frame$pval_median[k] <- pval_median
  } else {
    GO_frame$pval_median[k] <- NA
  }
  if (!is.null(selparam_median_vec)) {
    selparam_median <- median(selparam_median_vec)
    GO_frame$selparam_median[k] <- selparam_median
    GO_frame$GO_frequency[k] <- length(selparam_median_vec)
  } else {
    GO_frame$selparam_median[k] <- NA
    GO_frame$GO_frequency[k] <- NA
  }
  selparam_list[[GO_vec[k]]] <- selparam_median_vec
}
GO_frame <- GO_frame[!duplicated(GO_frame), ]
write.table(GO_frame, file = "RELAX_and_GO_analysis_out_table.csv", quote = FALSE, sep = ";", row.names = FALSE)
save(selparam_list, file = "RELAX_and_GO_analysis_session.RData")
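### note: the per-GO loop above re-greps every GO term against every orthogroup; a ###
### dictionary-based sketch of the same grouping in Python (annotation rows are made up) ###
from statistics import median

rows = [  # (orthogroup k value, semicolon-separated GO terms), illustrative only
    (0.6, "immune response;metabolic process"),
    (1.4, "immune response"),
    (0.9, "metabolic process"),
]
by_go = {}
for k, terms in rows:
    for term in terms.split(";"):
        by_go.setdefault(term, []).append(k)
for term, ks in by_go.items():
    print(term, len(ks), median(ks))
# immune response 2 1.0
# metabolic process 2 0.75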
# GO term subsets selected for the violin plots; the later assignments overwrite
# the first to plot one subset at a time
violin_list <- selparam_list[c(686, 22, 11, 34, 32, 15, 20, 14, 1, 180, 2, 1327, 723, 35, 645, 51, 84, 248, 250, 324)]
violin_list <- selparam_list[c(686, 22, 11, 34, 32, 15, 20)]
violin_list <- selparam_list[c(14, 1, 180, 2, 1327, 723, 35)]
violin_list <- selparam_list[c(645, 51, 84, 248, 250, 324)]
k_vec <- c()
GO_term_vec <- c()
for (m in 1:length(violin_list)) {
  for (n in 1:length(violin_list[[m]])) {
    k_vec <- append(k_vec, violin_list[[m]][n])
    GO_term_vec <- append(GO_term_vec, names(violin_list)[m])
  }
}
violin_df <- data.frame("k" = k_vec, "GO" = factor(GO_term_vec))
vplot <- ggplot(violin_df, aes(GO, k, fill = GO))
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 18) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 20)
vplot + geom_boxplot() +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3))

# per GO term, test whether the k distribution differs from the null expectation of 1
wc.frame <- data.frame("GO_ID" = names(selparam_list), "psig" = NA, "qsig" = NA,
                       "pval" = NA, "qval" = NA, "Vstat" = NA)
for (o in 1:length(selparam_list)) {
  pval <- NA
  qval <- NA
  psig <- NA
  qsig <- NA
  Vstat <- NA
  wc.test <- wilcox.test(selparam_list[[o]], mu = 1, alternative = "two.sided", na.action = na.omit)
  pval <- wc.test$p.value
  qval <- p.adjust(pval, method = "fdr", n = length(selparam_list))
  Vstat <- wc.test$statistic
  if (!is.na(pval)) {
    if (pval < 0.05) {
      psig <- TRUE
    } else {
      psig <- FALSE
    }
  }
  if (!is.na(qval)) {
    if (qval < 0.10) {
      qsig <- TRUE
    } else {
      qsig <- FALSE
    }
  }
  wc.frame$psig[o] <- psig
  wc.frame$qsig[o] <- qsig
  wc.frame$pval[o] <- pval
  wc.frame$qval[o] <- qval
  wc.frame$Vstat[o] <- Vstat
}
violin_list <- selparam_list[which(wc.frame$psig)]
k_vec <- c()
GO_term_vec <- c()
for (m in 1:length(violin_list)) {
  for (n in 1:length(violin_list[[m]])) {
    k_vec <- append(k_vec, violin_list[[m]][n])
    GO_term_vec <- append(GO_term_vec, names(violin_list)[m])
  }
}
violin_df <- data.frame("k" = k_vec, "GO" = factor(GO_term_vec))
vplot <- ggplot(violin_df, aes(GO, k, fill = GO))
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 18) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1)) +
  scale_x_discrete(name = "GO term")
vplot + geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3)) +
  theme_grey(base_size = 20)
vplot + geom_boxplot() +
  geom_jitter(height = 0, width = 0.1) +
  guides(fill = FALSE) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "black", size = 1) +
  coord_cartesian(ylim = c(0, 3))

### testing the difference of the selection parameter k distribution from the null expectation of 1 ###
wilcox.test(relaxframe$selparam, mu = 1, alternative = "two.sided", na.action = na.omit)
median(relaxframe$selparam)
mean(relaxframe$selparam)
##### GO term enrichment analysis #####
##### using combframe, GO_frame, and outlier IDs (RELAX & aBSREL, incl. reference sets) #####
GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)))
names(GO_frame)[6] <- "GO_freq_ref_RELAX_rel"
names(GO_frame)[7] <- "GO_freq_RELAX_rel"
names(GO_frame)[8] <- "GO_freq_ref_RELAX_int"
names(GO_frame)[9] <- "GO_freq_RELAX_int"
names(GO_frame)[10] <- "GO_freq_ref_aBSREL"
names(GO_frame)[11] <- "GO_freq_aBSREL"

### reference set for FDR relaxed outliers (RELAX)
RELAX_ref_rel_IDs <- readLines("refset_relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_ref_rel_IDs <- unlist(lapply(RELAX_ref_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                            )[-(seq(from = 1, to = (length(RELAX_ref_rel_IDs) * 2), by = 2))]
RELAX_ref_rel <- combframe[which(combframe$ID %in% RELAX_ref_rel_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_ref_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_ref_rel$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_ref_RELAX_rel[o] <- GO_frame$GO_freq_ref_RELAX_rel[o] + 1
    }
  }
}

### FDR relaxed outliers (RELAX)
RELAX_rel_IDs <- readLines("relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_rel_IDs <- unlist(lapply(RELAX_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                        )[-(seq(from = 1, to = (length(RELAX_rel_IDs) * 2), by = 2))]
RELAX_rel <- combframe[which(combframe$ID %in% RELAX_rel_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_rel$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_RELAX_rel[o2] <- GO_frame$GO_freq_RELAX_rel[o2] + 1
    }
  }
}

### reference set for FDR intensified outliers (RELAX)
RELAX_ref_int_IDs <- readLines("refset_intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_ref_int_IDs <- unlist(lapply(RELAX_ref_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                            )[-(seq(from = 1, to = (length(RELAX_ref_int_IDs) * 2), by = 2))]
RELAX_ref_int <- combframe[which(combframe$ID %in% RELAX_ref_int_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_ref_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_ref_int$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_ref_RELAX_int[o] <- GO_frame$GO_freq_ref_RELAX_int[o] + 1
    }
  }
}

### FDR intensified outliers (RELAX)
RELAX_int_IDs <- readLines("intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_int_IDs <- unlist(lapply(RELAX_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                        )[-(seq(from = 1, to = (length(RELAX_int_IDs) * 2), by = 2))]
RELAX_int <- combframe[which(combframe$ID %in% RELAX_int_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_int$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_RELAX_int[o2] <- GO_frame$GO_freq_RELAX_int[o2] + 1
    }
  }
}

### reference set for FDR outliers (aBSREL)
aBSREL_ref_IDs <- readLines("refset_FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_ref_IDs <- unlist(lapply(aBSREL_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                         )[-(seq(from = 1, to = (length(aBSREL_ref_IDs) * 2), by = 2))]
aBSREL_ref <- combframe[which(combframe$ID %in% aBSREL_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(aBSREL_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], aBSREL_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_ref_aBSREL[o] <- GO_frame$GO_freq_ref_aBSREL[o] + 1
    }
  }
}
### FDR outliers (aBSREL)
aBSREL_IDs <- readLines("FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_IDs <- unlist(lapply(aBSREL_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                     )[-(seq(from = 1, to = (length(aBSREL_IDs) * 2), by = 2))]
aBSREL <- combframe[which(combframe$ID %in% aBSREL_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(aBSREL)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], aBSREL$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_aBSREL[o2] <- GO_frame$GO_freq_aBSREL[o2] + 1
    }
  }
}

GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)))
names(GO_frame)[12] <- "GO_freq_both_fore_ref_RELAX_rel"
names(GO_frame)[13] <- "GO_freq_both_fore_RELAX_rel"
names(GO_frame)[14] <- "GO_freq_both_fore_ref_RELAX_int"
names(GO_frame)[15] <- "GO_freq_both_fore_RELAX_int"
names(GO_frame)[16] <- "GO_freq_both_fore_ref_aBSREL"
names(GO_frame)[17] <- "GO_freq_both_fore_aBSREL"
names(GO_frame)[18] <- "GO_freq_both_fore_ref_relaxed_diversifying"
names(GO_frame)[19] <- "GO_freq_both_fore_relaxed_diversifying"
names(GO_frame)[20] <- "GO_freq_both_fore_ref_intensified_diversifying"
names(GO_frame)[21] <- "GO_freq_both_fore_intensified_diversifying"

### reference set for FDR relaxed outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_ref_rel_IDs <- readLines("both_fore_refset_relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_ref_rel_IDs <- unlist(lapply(RELAX_both_fore_ref_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                      )[-(seq(from = 1, to = (length(RELAX_both_fore_ref_rel_IDs) * 2), by = 2))]
RELAX_both_fore_ref_rel <- combframe[which(combframe$ID %in% RELAX_both_fore_ref_rel_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_both_fore_ref_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_both_fore_ref_rel$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_RELAX_rel[o] <- GO_frame$GO_freq_both_fore_ref_RELAX_rel[o] + 1
    }
  }
}

### FDR relaxed outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_rel_IDs <- readLines("both_fore_relaxed_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_rel_IDs <- unlist(lapply(RELAX_both_fore_rel_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                  )[-(seq(from = 1, to = (length(RELAX_both_fore_rel_IDs) * 2), by = 2))]
RELAX_both_fore_rel <- combframe[which(combframe$ID %in% RELAX_both_fore_rel_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_both_fore_rel)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_both_fore_rel$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_RELAX_rel[o2] <- GO_frame$GO_freq_both_fore_RELAX_rel[o2] + 1
    }
  }
}

### reference set for FDR intensified outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_ref_int_IDs <- readLines("both_fore_refset_intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_ref_int_IDs <- unlist(lapply(RELAX_both_fore_ref_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                      )[-(seq(from = 1, to = (length(RELAX_both_fore_ref_int_IDs) * 2), by = 2))]
RELAX_both_fore_ref_int <- combframe[which(combframe$ID %in% RELAX_both_fore_ref_int_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_both_fore_ref_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_both_fore_ref_int$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_RELAX_int[o] <- GO_frame$GO_freq_both_fore_ref_RELAX_int[o] + 1
    }
  }
}
### FDR intensified outliers (RELAX) with both Coregonus & Salvelinus in alignments
RELAX_both_fore_int_IDs <- readLines("both_fore_intensified_FDR_seq_IDs_RELAX_results.txt")
RELAX_both_fore_int_IDs <- unlist(lapply(RELAX_both_fore_int_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                  )[-(seq(from = 1, to = (length(RELAX_both_fore_int_IDs) * 2), by = 2))]
RELAX_both_fore_int <- combframe[which(combframe$ID %in% RELAX_both_fore_int_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_both_fore_int)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_both_fore_int$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_RELAX_int[o2] <- GO_frame$GO_freq_both_fore_RELAX_int[o2] + 1
    }
  }
}

### reference set for FDR outliers (aBSREL) with both Coregonus & Salvelinus in alignments
aBSREL_both_fore_ref_IDs <- readLines("both_fore_refset_FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_both_fore_ref_IDs <- unlist(lapply(aBSREL_both_fore_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                   )[-(seq(from = 1, to = (length(aBSREL_both_fore_ref_IDs) * 2), by = 2))]
aBSREL_both_fore_ref <- combframe[which(combframe$ID %in% aBSREL_both_fore_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(aBSREL_both_fore_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], aBSREL_both_fore_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_aBSREL[o] <- GO_frame$GO_freq_both_fore_ref_aBSREL[o] + 1
    }
  }
}

### FDR outliers (aBSREL) with both Coregonus & Salvelinus in alignments
aBSREL_both_fore_IDs <- readLines("both_fore_FDR_seq_IDs_aBSREL_results1.txt")
aBSREL_both_fore_IDs <- unlist(lapply(aBSREL_both_fore_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                               )[-(seq(from = 1, to = (length(aBSREL_both_fore_IDs) * 2), by = 2))]
aBSREL_both_fore <- combframe[which(combframe$ID %in% aBSREL_both_fore_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(aBSREL_both_fore)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], aBSREL_both_fore$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_aBSREL[o2] <- GO_frame$GO_freq_both_fore_aBSREL[o2] + 1
    }
  }
}

### reference set for FDR relaxed & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_rel_aBSREL_both_fore_ref_IDs <- readLines("refset_both_fore_relaxed_diversifying_FDR_seqIDs.txt")
RELAX_rel_aBSREL_both_fore_ref_IDs <- unlist(lapply(RELAX_rel_aBSREL_both_fore_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                             )[-(seq(from = 1, to = (length(RELAX_rel_aBSREL_both_fore_ref_IDs) * 2), by = 2))]
RELAX_rel_aBSREL_both_fore_ref <- combframe[which(combframe$ID %in% RELAX_rel_aBSREL_both_fore_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_rel_aBSREL_both_fore_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_rel_aBSREL_both_fore_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_relaxed_diversifying[o] <- GO_frame$GO_freq_both_fore_ref_relaxed_diversifying[o] + 1
    }
  }
}
### FDR relaxed & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_rel_aBSREL_both_fore_IDs <- readLines("both_fg_taxa_relaxed_FDR_to_FDR_aBSREL1.txt")
RELAX_rel_aBSREL_both_fore_IDs <- unlist(lapply(RELAX_rel_aBSREL_both_fore_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                         )[-(seq(from = 1, to = (length(RELAX_rel_aBSREL_both_fore_IDs) * 2), by = 2))]
RELAX_rel_aBSREL_both_fore <- combframe[which(combframe$ID %in% RELAX_rel_aBSREL_both_fore_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_rel_aBSREL_both_fore)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_rel_aBSREL_both_fore$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_relaxed_diversifying[o2] <- GO_frame$GO_freq_both_fore_relaxed_diversifying[o2] + 1
    }
  }
}

### reference set for FDR intensified & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_int_aBSREL_both_fore_ref_IDs <- readLines("refset_both_fore_intensified_diversifying_FDR_seqIDs.txt")
RELAX_int_aBSREL_both_fore_ref_IDs <- unlist(lapply(RELAX_int_aBSREL_both_fore_ref_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                             )[-(seq(from = 1, to = (length(RELAX_int_aBSREL_both_fore_ref_IDs) * 2), by = 2))]
RELAX_int_aBSREL_both_fore_ref <- combframe[which(combframe$ID %in% RELAX_int_aBSREL_both_fore_ref_IDs), ]
for (o in 1:nrow(GO_frame)) {
  for (p in 1:nrow(RELAX_int_aBSREL_both_fore_ref)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o], RELAX_int_aBSREL_both_fore_ref$Annotation.GO.ID[p], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_ref_intensified_diversifying[o] <- GO_frame$GO_freq_both_fore_ref_intensified_diversifying[o] + 1
    }
  }
}

### FDR intensified & diversifying outliers (RELAX & aBSREL) with both Coregonus & Salvelinus in alignments
RELAX_int_aBSREL_both_fore_IDs <- readLines("both_fg_taxa_intensified_FDR_to_FDR_aBSREL1.txt")
RELAX_int_aBSREL_both_fore_IDs <- unlist(lapply(RELAX_int_aBSREL_both_fore_IDs, function(x) strsplit(as.character(x), "|", fixed = TRUE))
                                         )[-(seq(from = 1, to = (length(RELAX_int_aBSREL_both_fore_IDs) * 2), by = 2))]
RELAX_int_aBSREL_both_fore <- combframe[which(combframe$ID %in% RELAX_int_aBSREL_both_fore_IDs), ]
for (o2 in 1:nrow(GO_frame)) {
  for (p2 in 1:nrow(RELAX_int_aBSREL_both_fore)) {
    greptrue <- FALSE
    greptrue <- grepl(GO_frame$GO_code[o2], RELAX_int_aBSREL_both_fore$Annotation.GO.ID[p2], fixed = TRUE)
    if (greptrue) {
      GO_frame$GO_freq_both_fore_intensified_diversifying[o2] <- GO_frame$GO_freq_both_fore_intensified_diversifying[o2] + 1
    }
  }
}

GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)))
"GO_freq_not_both_fore_relaxed_diversifying" names(GO_frame)[36] <- "GO_freq_not_both_fore_ref_intensified_diversifying" names(GO_frame)[37] <- "GO_freq_not_both_fore_intensified_diversifying" for (q in 1:nrow(GO_frame)) { GO_frame$GO_freq_not_ref_RELAX_rel[q] <- length(RELAX_ref_rel_IDs) - GO_frame$GO_freq_ref_RELAX_rel[q] GO_frame$GO_freq_not_RELAX_rel[q] <- length(RELAX_rel_IDs) - GO_frame$GO_freq_RELAX_rel[q] GO_frame$GO_freq_not_ref_RELAX_int[q] <- length(RELAX_ref_int_IDs) - GO_frame$GO_freq_ref_RELAX_int[q] GO_frame$GO_freq_not_RELAX_int[q] <- length(RELAX_int_IDs) - GO_frame$GO_freq_RELAX_int[q] GO_frame$GO_freq_not_ref_aBSREL[q] <- length(aBSREL_ref_IDs) - GO_frame$GO_freq_ref_aBSREL[q] GO_frame$GO_freq_not_aBSREL[q] <- length(aBSREL_IDs) - GO_frame$GO_freq_aBSREL[q] GO_frame$GO_freq_not_both_fore_ref_RELAX_rel[q] <- length(RELAX_both_fore_ref_rel_IDs) - GO_frame$GO_freq_both_fore_ref_RELAX_rel[q] GO_frame$GO_freq_not_both_fore_RELAX_rel[q] <- length(RELAX_both_fore_rel_IDs) - GO_frame$GO_freq_both_fore_RELAX_rel[q] GO_frame$GO_freq_not_both_fore_ref_RELAX_int[q] <- length(RELAX_both_fore_ref_int_IDs) - GO_frame$GO_freq_both_fore_ref_RELAX_int[q] GO_frame$GO_freq_not_both_fore_RELAX_int[q] <- length(RELAX_both_fore_int_IDs) - GO_frame$GO_freq_both_fore_RELAX_int[q] GO_frame$GO_freq_not_both_fore_ref_aBSREL[q] <- length(aBSREL_both_fore_ref_IDs) - GO_frame$GO_freq_both_fore_ref_aBSREL[q] GO_frame$GO_freq_not_both_fore_aBSREL[q] <- length(aBSREL_both_fore_IDs) - GO_frame$GO_freq_both_fore_aBSREL[q] GO_frame$GO_freq_not_both_fore_ref_relaxed_diversifying[q] <- length(RELAX_rel_aBSREL_both_fore_ref_IDs) - GO_frame$GO_freq_both_fore_ref_relaxed_diversifying[q] GO_frame$GO_freq_not_both_fore_relaxed_diversifying[q] <- length(RELAX_rel_aBSREL_both_fore_IDs) - GO_frame$GO_freq_both_fore_relaxed_diversifying[q] GO_frame$GO_freq_not_both_fore_ref_intensified_diversifying[q] <- length(RELAX_int_aBSREL_both_fore_ref_IDs) - GO_frame$GO_freq_both_fore_ref_intensified_diversifying[q] GO_frame$GO_freq_not_both_fore_intensified_diversifying[q] <- length(RELAX_int_aBSREL_both_fore_IDs) - GO_frame$GO_freq_both_fore_intensified_diversifying[q] } Ntotals <- c(length(which(as.logical(c((GO_frame$GO_freq_ref_RELAX_rel == 0) + (GO_frame$GO_freq_RELAX_rel == 0))))), length(which(as.logical(c((GO_frame$GO_freq_ref_RELAX_int == 0) + (GO_frame$GO_freq_RELAX_int == 0))))), length(which(as.logical(c((GO_frame$GO_freq_ref_aBSREL == 0) + (GO_frame$GO_freq_aBSREL == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_RELAX_rel == 0) + (GO_frame$GO_freq_both_fore_RELAX_rel == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_RELAX_int == 0) + (GO_frame$GO_freq_both_fore_RELAX_int == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_aBSREL == 0) + (GO_frame$GO_freq_both_fore_aBSREL == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_relaxed_diversifying == 0) + (GO_frame$GO_freq_both_fore_relaxed_diversifying == 0))))), length(which(as.logical(c((GO_frame$GO_freq_both_fore_ref_intensified_diversifying == 0) + (GO_frame$GO_freq_both_fore_intensified_diversifying == 0)))))) GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame))) names(GO_frame)[38] <- "RELAX_rel_qval" names(GO_frame)[39] <- "RELAX_int_qval" names(GO_frame)[40] <- "aBSREL_qval" names(GO_frame)[41] <- "both_fore_RELAX_rel_qval" 
GO_frame <- cbind(GO_frame, rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)),
                  rep(0, nrow(GO_frame)), rep(0, nrow(GO_frame)))
names(GO_frame)[46] <- "RELAX_rel_pval"
names(GO_frame)[47] <- "RELAX_int_pval"
names(GO_frame)[48] <- "aBSREL_pval"
names(GO_frame)[49] <- "both_fore_RELAX_rel_pval"
names(GO_frame)[50] <- "both_fore_RELAX_int_pval"
names(GO_frame)[51] <- "both_fore_aBSREL_pval"
names(GO_frame)[52] <- "both_fore_relaxed_diversifying_pval"
names(GO_frame)[53] <- "both_fore_intensified_diversifying_pval"

# for (r in 1:nrow(GO_frame)) {
#   t <- 30
#   for (s in seq(6, 16, 2)) {
#     freq_table <- matrix(c(GO_frame[r, s + 1], GO_frame[r, s], GO_frame[r, s + 13], GO_frame[r, s + 12]),
#                          nrow = 2,
#                          dimnames = list("category" = c("focal_GO", "other_GOs"),
#                                          "GO term" = c("outliers", "reference")))
#     temp_FET <- fisher.test(freq_table)
#     temp_FET_FDR_qval <- p.adjust(temp_FET$p.value, method = "fdr", n = Ntotals[t - 29])
#     GO_frame[r, t] <- temp_FET_FDR_qval
#     GO_frame[r, t + 6] <- temp_FET$p.value
#     t <- t + 1
#   }
# }
#
# write.csv(GO_frame[which(GO_frame$RELAX_rel_pval < 0.05), ], "twosided_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$RELAX_int_pval < 0.05), ], "twosided_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$aBSREL_pval < 0.05), ], "twosided_aBSREL_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_rel_pval < 0.05), ], "twosided_both_fore_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_int_pval < 0.05), ], "twosided_both_fore_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_aBSREL_pval < 0.05), ], "twosided_both_fore_aBSREL_pval_R_dataframe.csv")

# one-sided Fisher's exact tests for GO term enrichment; for each of the 8 comparisons,
# columns s/s+1 hold the reference/outlier counts and s+16/s+17 the complementary
# "not this GO term" counts; column t takes the FDR q-value, column t+8 the p-value
for (r in 1:nrow(GO_frame)) {
  t <- 38
  for (s in seq(6, 20, 2)) {
    freq_table <- matrix(c(GO_frame[r, s + 1], GO_frame[r, s], GO_frame[r, s + 17], GO_frame[r, s + 16]),
                         nrow = 2,
                         dimnames = list("category" = c("focal_GO", "other_GOs"),
                                         "GO term" = c("outliers", "reference")))
    if (all(freq_table >= 0)) {
      temp_FET <- fisher.test(freq_table, alternative = "greater")
      temp_FET_FDR_qval <- p.adjust(temp_FET$p.value, method = "fdr", n = Ntotals[t - 37])
      GO_frame[r, t] <- temp_FET_FDR_qval
      GO_frame[r, t + 8] <- temp_FET$p.value
    }
    t <- t + 1
  }
}

# write.csv(GO_frame[which(GO_frame$RELAX_rel_pval < 0.05), ], "enriched_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$RELAX_int_pval < 0.05), ], "enriched_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$aBSREL_pval < 0.05), ], "enriched_aBSREL_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_rel_pval < 0.05), ], "enriched_both_fore_RELAX_rel_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_RELAX_int_pval < 0.05), ], "enriched_both_fore_RELAX_int_pval_R_dataframe.csv")
# write.csv(GO_frame[which(GO_frame$both_fore_aBSREL_pval < 0.05), ], "enriched_both_fore_aBSREL_pval_R_dataframe.csv")
write.table(GO_frame[which(GO_frame$both_fore_relaxed_diversifying_pval < 0.05), ],
            "enriched_both_fore_relaxed_diversifying_pval_R_dataframe.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
write.table(GO_frame[which(GO_frame$both_fore_intensified_diversifying_pval < 0.05), ],
            "enriched_both_fore_intensified_diversifying_pval_R_dataframe.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
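### note: each of the tests above is a 2x2 Fisher's exact test of a focal GO term's frequency ###
### among outliers against the reference set; an equivalent toy example in Python (counts are ###
### invented for illustration) ###
from scipy.stats import fisher_exact

#                    outliers  reference
table = [[5, 12],   # annotated with the focal GO term
         [95, 988]] # annotated with other GO terms
odds, p = fisher_exact(table, alternative="greater")  # one-sided: enrichment among outliers
print(odds, p)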
"enriched_both_fore_intensified_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(GO_frame[which(GO_frame$GO_freq_both_fore_relaxed_diversifying > 0), ], "all_FET_both_fore_relaxed_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(GO_frame[which(GO_frame$GO_freq_both_fore_intensified_diversifying > 0), ], "all_FET_both_fore_intensified_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") for (r in 1:nrow(GO_frame)) { t <- 38 for (s in seq(6, 20, 2)) { freq_table <- matrix(c(GO_frame[r, s + 1], GO_frame[r, s], GO_frame[r, s + 17], GO_frame[r, s + 16]), nrow = 2, dimnames = list("category" = c("focal_GO", "other_GOs") , "GO term" = c("outliers", "reference"))) if (all(freq_table >= 0)) { temp_FET <- fisher.test(freq_table, alternative = "less") temp_FET_FDR_qval <- p.adjust(temp_FET$p.value, method = "fdr", n = Ntotals[t - 37]) GO_frame[r, t] <- temp_FET_FDR_qval GO_frame[r, t + 8] <- temp_FET$p.value } t <- t + 1 } } # write.csv(GO_frame[which(GO_frame$RELAX_rel_pval < 0.05), ], "underrepresented_RELAX_rel_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$RELAX_int_pval < 0.05), ], "underrepresented_RELAX_int_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$aBSREL_pval < 0.05), ], "underrepresented_aBSREL_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$both_fore_RELAX_rel_pval < 0.05), ], "underrepresented_both_fore_RELAX_rel_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$both_fore_RELAX_int_pval < 0.05), ], "underrepresented_both_fore_RELAX_int_pval_R_dataframe.csv") # write.csv(GO_frame[which(GO_frame$both_fore_aBSREL_pval < 0.05), ], "underrepresented_both_fore_aBSREL_pval_R_dataframe.csv") write.table(GO_frame[which(GO_frame$both_fore_relaxed_diversifying_pval < 0.05), ], "underrepresented_both_fore_relaxed_diversifying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(GO_frame[which(GO_frame$both_fore_intensified_diversifying_pval < 0.05), ], "underrepresented_both_fore_intensified_diversfiying_pval_R_dataframe.tsv", quote = FALSE, row.names = FALSE, sep = "\t") # outputting GO term information as .csv files both_fore_relaxed_diversifying_frame <- data.frame("OG" = RELAX_rel_aBSREL_both_fore$ID, "Description" = RELAX_rel_aBSREL_both_fore$Sequence.Description, "GO_IDs" = RELAX_rel_aBSREL_both_fore$Annotation.GO.ID, "GO_terms" = RELAX_rel_aBSREL_both_fore$Annotation.GO.Term) both_fore_intensified_diversifying_frame <- data.frame("OG" = RELAX_int_aBSREL_both_fore$ID, "Description" = RELAX_int_aBSREL_both_fore$Sequence.Description, "GO_IDs" = RELAX_int_aBSREL_both_fore$Annotation.GO.ID, "GO_terms" = RELAX_int_aBSREL_both_fore$Annotation.GO.Term) write.table(both_fore_relaxed_diversifying_frame, "both_fore_relaxed_diversifying_FDR_GO_info.tsv", quote = FALSE, row.names = FALSE, sep = "\t") write.table(both_fore_intensified_diversifying_frame, "both_fore_intensified_diversifying_FDR_GO_info.tsv", quote = FALSE, row.names = FALSE, sep = "\t") # flagging orthogroups that include both Coregonus & Salvelinus both_fore_OG <- as.character(readLines("both_fore_OG_IDs.txt")) both_fore <- combframe$ID %in% both_fore_OG combframe <- data.frame(combframe, "both_Cor_Salv" = both_fore) # outputting outlier information (incl. 
# outputting GO term information as .csv files
both_fore_relaxed_diversifying_frame <- data.frame("OG" = RELAX_rel_aBSREL_both_fore$ID,
                                                   "Description" = RELAX_rel_aBSREL_both_fore$Sequence.Description,
                                                   "GO_IDs" = RELAX_rel_aBSREL_both_fore$Annotation.GO.ID,
                                                   "GO_terms" = RELAX_rel_aBSREL_both_fore$Annotation.GO.Term)
both_fore_intensified_diversifying_frame <- data.frame("OG" = RELAX_int_aBSREL_both_fore$ID,
                                                       "Description" = RELAX_int_aBSREL_both_fore$Sequence.Description,
                                                       "GO_IDs" = RELAX_int_aBSREL_both_fore$Annotation.GO.ID,
                                                       "GO_terms" = RELAX_int_aBSREL_both_fore$Annotation.GO.Term)
write.table(both_fore_relaxed_diversifying_frame, "both_fore_relaxed_diversifying_FDR_GO_info.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
write.table(both_fore_intensified_diversifying_frame, "both_fore_intensified_diversifying_FDR_GO_info.tsv",
            quote = FALSE, row.names = FALSE, sep = "\t")
# flagging orthogroups that include both Coregonus & Salvelinus
both_fore_OG <- as.character(readLines("both_fore_OG_IDs.txt"))
both_fore <- combframe$ID %in% both_fore_OG
combframe <- data.frame(combframe, "both_Cor_Salv" = both_fore)
# outputting outlier information (incl. GO terms) as tab-delimited .txt files
rel_or_int <- rep(NA, nrow(combframe))
rel_or_int[combframe$selparam < 1] <- "relaxed"
rel_or_int[combframe$selparam > 1] <- "intensified"
RELAX_Bonf <- which(combframe$sig %in% c("relaxed_Bonf", "intensified_Bonf"))
RELAX_Bonf_frame <- data.frame("Orthogroup_ID" = combframe$ID[RELAX_Bonf],
                               "both_Cor_Salv" = combframe$both_Cor_Salv[RELAX_Bonf],
                               "Description" = combframe$Sequence.Description[RELAX_Bonf],
                               "GO_terms" = combframe$Annotation.GO.Term[RELAX_Bonf],
                               "p-value" = combframe$pval[RELAX_Bonf],
                               "q-value" = combframe$qval[RELAX_Bonf],
                               "k-value" = combframe$selparam[RELAX_Bonf],
                               "rel_or_int" = rel_or_int[RELAX_Bonf])
write.table(RELAX_Bonf_frame, "additional_file_1_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
RELAX_FDR <- which(combframe$sig %in% c("relaxed_FDR", "intensified_FDR"))
RELAX_FDR_frame <- data.frame("Orthogroup_ID" = combframe$ID[RELAX_FDR],
                              "both_Cor_Salv" = combframe$both_Cor_Salv[RELAX_FDR],
                              "Description" = combframe$Sequence.Description[RELAX_FDR],
                              "GO_terms" = combframe$Annotation.GO.Term[RELAX_FDR],
                              "p-value" = combframe$pval[RELAX_FDR],
                              "q-value" = combframe$qval[RELAX_FDR],
                              "k-value" = combframe$selparam[RELAX_FDR],
                              "rel_or_int" = rel_or_int[RELAX_FDR])
write.table(RELAX_FDR_frame, "additional_file_2_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
RELAX_all_frame <- data.frame("Orthogroup_ID" = combframe$ID,
                              "both_Cor_Salv" = combframe$both_Cor_Salv,
                              "Description" = combframe$Sequence.Description,
                              "GO_terms" = combframe$Annotation.GO.Term,
                              "p-value" = combframe$pval,
                              "q-value" = combframe$qval,
                              "k-value" = combframe$selparam,
                              "rel_or_int" = rel_or_int,
                              "significance" = combframe$sig)
write.table(RELAX_all_frame, "additional_file_3_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
aBSREL_OG_IDs <- as.character(readLines("FDR_OG_IDs_aBSREL_results1.txt"))
aBSREL_FDR <- which(combframe$ID %in% aBSREL_OG_IDs)
# restrict the table to the aBSREL FDR outlier orthogroups
aBSREL_FDR_frame <- data.frame("Orthogroup_ID" = combframe$ID[aBSREL_FDR],
                               "both_Cor_Salv" = combframe$both_Cor_Salv[aBSREL_FDR],
                               "Description" = combframe$Sequence.Description[aBSREL_FDR],
                               "GO_terms" = combframe$Annotation.GO.Term[aBSREL_FDR])
write.table(aBSREL_FDR_frame, "additional_file_4_new.txt", quote = FALSE, row.names = FALSE, sep = "\t")
### converting HYPHY sequence/tree files (step 1)
##### 04.12.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####
""" --> script to convert multi-line fasta files to one-line fasta files <-- """
from glob import glob
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
from os import path
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
        base = path.splitext(file)[0]
        new_fasta = 'GO_input_' + base + '.oneline.fasta'
        with open(new_fasta, 'w') as outfile:
            for record in records:
                outfile.write('>' + record.id + '\n' + str(record.seq) + '\n')
### converting HYPHY sequence/tree files (step 2)
##### 04.12.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####
""" --> script to change sequence headers <-- """
from glob import glob
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
from os import path
import argparse
import re
def change_headers(records, file):
    """ append the orthogroup identifier (parsed from the file name) to each sequence header """
    for record in records:
        OG_str = str(file)
        OG_info = re.search('trimmed2_(.+?).oneline', OG_str)
        record.id = record.id + "|" + OG_info.group(1)
    return records

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
        new_records = change_headers(records, file)
        base = path.splitext(file)[0]
        new_fasta = base + '.incl_tree.fasta'
        with open(new_fasta, 'w') as outfile:
            for record in new_records:
                outfile.write('>' + record.id + '\n' + str(record.seq) + '\n')
### converting HYPHY sequence/tree files (step 3)
### extracting sequences from fasta files derived from hyphy files (see above) for GO analysis in Blast2GO
### unnecessary information (from trees) and gaps are removed
#!/bin/bash
# one representative (gap-stripped) sequence is extracted per orthogroup,
# preferring taxa in the order tested below (S_salar first)
for file in GO_input_trimmed2_*.oneline.incl_tree.fasta; do
    if grep -q ">S_salar" $file; then
        grep -A1 ">S_salar" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">S_trutta" $file; then
        grep -A1 ">S_trutta" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">Sa_alpin" $file; then
        grep -A1 ">Sa_alpin" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">Sa_fonti" $file; then
        grep -A1 ">Sa_fonti" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">O_mykiss" $file; then
        grep -A1 ">O_mykiss" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">O_tshawy" $file; then
        grep -A1 ">O_tshawy" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">C_clupea" $file; then
        grep -A1 ">C_clupea" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    elif grep -q ">C_lavare" $file; then
        grep -A1 ">C_lavare" $file | head -n2 | cut -d "(" -f1 | sed 's/-//g' | sed '/^$/d' >> allseqs4GO.fasta
    else
        echo "No suitable taxa in $file!"
fi done ### GO analysis in Blast2GO # --> blastn (-task megablast) using QBlast (NCBI Blast Server) # ----> against non-redundant protein sequences (nr) # ----> taxonomy filter: bony fishes (taxa: 7898, Actinopterygii) # ----> blast expectation value (e-value) threshold: 1.0E-3 # ----> number of retained blast hits: 1 # ----> blast descriptor annotator option selected # ----> word size: 28 # ----> low complexity filter selected # ----> HSP length cutoff: 33 # ----> HSP-hit coverage: 0 # ----> filter by description: no filter # --> ### in the meantime: extracting aBSREL results (Holm-Bonferroni corrected p-values) from aBSREL .json output files #!/bin/bash for file in trimmed2_OG*.hyphy.aBSREL.json; do pval=$(grep "\"Corrected P-value\":" $file | sed -e "s/\:\(.*\)\,/\1/" | cut -c26-) echo "START" >> aBSREL_results.txt echo $file >> aBSREL_results.txt echo $pval >> aBSREL_results.txt echo "END" >> aBSREL_results.txt done ### extracting aBSREL results (uncorrected p-values) from aBSREL .json output files #!/bin/bash for file in trimmed2_OG*.hyphy.aBSREL.json; do pval=$(grep "\"Uncorrected P-value\":" $file | sed -e "s/\:\(.*\)\,/\1/" | cut -c28-) echo "START" >> aBSREL_results2.txt echo $file >> aBSREL_results2.txt echo $pval >> aBSREL_results2.txt echo "END" >> aBSREL_results2.txt done ### ### calculation of number of outliers per orthogroup and number of orthogroups with at least one outlier in aBSREL analyses ### Kevin Schneider; last modified: 29.01.2019 ### ### KevinSchneider@gmx.at ### library(stringr) setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") absrel <- readLines("aBSREL_results.txt") absrel <- absrel[-(seq(from = 1, to = length(absrel), by = 4))] absrel <- absrel[-(seq(from = 3, to = length(absrel), by = 3))] absrel_list <- list() Bonfcutoff <- (0.05 / (length(absrel) / 2)) nocorr_count <- 0 FDR_count <- 0 Bonf_count <- 0 for (i in 1:(length(absrel)/2)) { ID <- absrel[(i * 2) - 1] pvals <- absrel[i * 2] filtered_pvals1 <- gsub("\"", "", pvals) filtered_pvals2 <- gsub(" ", "", filtered_pvals1) split_pvals <- as.numeric(unlist(str_split(filtered_pvals2, ":"))) split_pvals <- split_pvals[!is.na(split_pvals)] nocorrs <- c() FDRs <- c() Bonfs <- c() nocorr_ID <- NA FDR_ID <- NA Bonf_ID <- NA for (j in 1:length(split_pvals)) { if (is.numeric(split_pvals[j])) { if (split_pvals[j] < 0.05) { nocorr <- TRUE nocorr_ID <- substr(ID, 10, 18) } else { nocorr <- FALSE } nocorrs <- append(nocorrs, nocorr) if (p.adjust(split_pvals[j], method = "fdr", n = (length(absrel) / 2)) < 0.10) { FDR <- TRUE FDR_ID <- substr(ID, 10, 18) } else { FDR <- FALSE } FDRs <- append(FDRs, FDR) if (split_pvals[j] < Bonfcutoff) { Bonf <- TRUE Bonf_ID <- substr(ID, 10, 18) } else { Bonf <- FALSE } Bonfs <- append(Bonfs, Bonf) } } nocorr_snums <- length(nocorrs[which(nocorrs)]) FDR_snums <- length(FDRs[which(FDRs)]) Bonf_snums <- length(Bonfs[which(Bonfs)]) if (nocorr_snums > 0) { nocorr_count <- nocorr_count + 1 } if (FDR_snums > 0) { FDR_count <- FDR_count + 1 } if (Bonf_snums > 0) { Bonf_count <- Bonf_count + 1 } absrel_list[[ID]] <- list("pvals" = split_pvals, "nocorr_sig" = nocorrs, "nocorr_sig_num" = nocorr_snums, "FDR_sig" = FDRs, "FDR_sig_num" = FDR_snums, "Bonf_sig" = Bonfs, "Bonf_sig_num" = Bonf_snums, "nocorr_ID" = nocorr_ID, "FDR_ID" = FDR_ID, "Bonf_ID" = Bonf_ID) } nocorr_IDs <- unique(unlist(lapply(absrel_list, "[[", "nocorr_ID"))) FDR_IDs <- unique(unlist(lapply(absrel_list, "[[", "FDR_ID"))) Bonf_IDs <- unique(unlist(lapply(absrel_list, "[[", "Bonf_ID"))) nocorr_IDs <- 
nocorr_IDs[-1] FDR_IDs <- FDR_IDs[-1] Bonf_IDs <- Bonf_IDs[-1] nocorr_vec <- c() FDR_vec <- c() Bonf_vec <- c() for (k in 1:length(absrel_list)) { if (!is.na(absrel_list[[k]]$nocorr_ID)) { pval_str <- paste(absrel_list[[k]]$pvals, collapse = "|") nocorr_vec <- append(nocorr_vec, paste(c(absrel_list[[k]]$nocorr_ID, pval_str), collapse = ";")) } if (!is.na(absrel_list[[k]]$FDR_ID)) { pval_str <- paste(absrel_list[[k]]$pvals, collapse = "|") FDR_vec <- append(FDR_vec, paste(c(absrel_list[[k]]$FDR_ID, pval_str), collapse = ";")) } if (!is.na(absrel_list[[k]]$Bonf_ID)) { pval_str <- paste(absrel_list[[k]]$pvals, collapse = "|") Bonf_vec <- append(Bonf_vec, paste(c(absrel_list[[k]]$Bonf_ID, pval_str), collapse = ";")) } } writeLines(nocorr_vec, "nocorr_IDs_aBSREL_results1_w_pvals.txt") writeLines(FDR_vec, "FDR_IDs_aBSREL_results1_w_pvals.txt") writeLines(Bonf_vec, "Bonf_IDs_aBSREL_results1_w_pvals.txt") writeLines(nocorr_IDs, "nocorr_IDs_aBSREL_results1.txt") writeLines(FDR_IDs, "FDR_IDs_aBSREL_results1.txt") writeLines(Bonf_IDs, "Bonf_IDs_aBSREL_results1.txt") absrel <- readLines("aBSREL_results2.txt") absrel <- absrel[-(seq(from = 1, to = length(absrel), by = 4))] absrel <- absrel[-(seq(from = 3, to = length(absrel), by = 3))] absrel_list <- list() Bonfcutoff <- (0.05 / (length(absrel) / 2)) nocorr_count2 <- 0 FDR_count2 <- 0 Bonf_count2 <- 0 for (i in 1:(length(absrel)/2)) { ID <- absrel[(i * 2) - 1] pvals <- absrel[i * 2] filtered_pvals1 <- gsub("\"", "", pvals) split_pvals <- as.numeric(unlist(str_split(filtered_pvals1, " "))) split_pvals <- split_pvals[!is.na(split_pvals)] nocorrs <- c() FDRs <- c() Bonfs <- c() nocorr_ID <- NA FDR_ID <- NA Bonf_ID <- NA for (j in 1:length(split_pvals)) { if (is.numeric(split_pvals[j])) { if (split_pvals[j] < 0.05) { nocorr <- TRUE nocorr_ID <- substr(ID, 10, 18) } else { nocorr <- FALSE } nocorrs <- append(nocorrs, nocorr) if (p.adjust(split_pvals[j], method = "fdr", n = (length(absrel) / 2)) < 0.10) { FDR <- TRUE FDR_ID <- substr(ID, 10, 18) } else { FDR <- FALSE } FDRs <- append(FDRs, FDR) if (split_pvals[j] < Bonfcutoff) { Bonf <- TRUE Bonf_ID <- substr(ID, 10, 18) } else { Bonf <- FALSE } Bonfs <- append(Bonfs, Bonf) } } nocorr_snums <- length(nocorrs[which(nocorrs)]) FDR_snums <- length(FDRs[which(FDRs)]) Bonf_snums <- length(Bonfs[which(Bonfs)]) if (nocorr_snums > 0) { nocorr_count2 <- nocorr_count2 + 1 } if (FDR_snums > 0) { FDR_count2 <- FDR_count2 + 1 } if (Bonf_snums > 0) { Bonf_count2 <- Bonf_count2 + 1 } absrel_list[[ID]] <- list("pvals" = split_pvals, "nocorr_sig" = nocorrs, "nocorr_sig_num" = nocorr_snums, "FDR_sig" = FDRs, "FDR_sig_num" = FDR_snums, "Bonf_sig" = Bonfs, "Bonf_sig_num" = Bonf_snums, "nocorr_ID" = nocorr_ID, "FDR_ID" = FDR_ID, "Bonf_ID" = Bonf_ID) } nocorr_IDs <- unique(unlist(lapply(absrel_list, "[[", "nocorr_ID"))) FDR_IDs <- unique(unlist(lapply(absrel_list, "[[", "FDR_ID"))) Bonf_IDs <- unique(unlist(lapply(absrel_list, "[[", "Bonf_ID"))) writeLines(nocorr_IDs, "nocorr_IDs_aBSREL_results2.txt") writeLines(FDR_IDs, "FDR_IDs_aBSREL_results2.txt") writeLines(Bonf_IDs, "Bonf_IDs_aBSREL_results2.txt") nocorr_count FDR_count Bonf_count nocorr_count2 FDR_count2 Bonf_count2 ### continued: GO analysis in Blast2GO ### reblasting of orthogroups with no blast result ### no taxonomy filter applied this time # --> blastn (-task megablast) using QBlast (NCBI Blast Server) # ----> against non-redundant protein sequences (nr) # ----> taxonomy filter: no filter # ----> blast expectation value (e-value) threshold: 1.0E-3 # ----> number of 
retained blast hits: 1
# ----> blast descriptor annotator option selected
# ----> word size: 28
# ----> low complexity filter selected
# ----> HSP length cutoff: 33
# ----> HSP-hit coverage: 0
# ----> filter by description: no filter
# -->
### next step: GO mapping with Goa version 2018.12 database
# ----> for creating annotations later
### additionally: InterProScan applied due to many unmapped sequences
# ----> using EMBL-EBI InterPro
# ----> default settings applied
### annotation using default parameters
# annotation cut-off: 55
# GO weight: 5
# no taxonomy filter
# e-value-hit-filter: 1.0E-6
# HSP-hit coverage cut-off: 0
# hit filter: 500
### additional annotation using merging of InterProScan GOs with existing annotation
### annotation augmentation by ANNEX
### annotation of remaining unannotated sequences using blast descriptions
# ----> minimum similarity of 85
# ----> validation of annotations
### GO-Slim runs
# ----> using Obo file from GO website
# ----> generic GO slim
### extracting sequence IDs for Blast2GO GO term enrichment analysis from orthogroup outlier text files
#!/bin/bash
while read -r line; do
    grep "$line" "$2" | cut -c 2- >> "$3"
done <"$1"
### extracting sequence IDs from Blast2GO input fasta file for Blast2GO GO term enrichment analysis
#!/bin/bash
grep ">" "$1" | cut -c 2- >> "$2"
### testing the difference of the selection parameter k distribution from the null expectation of 1
wilcox.test(relaxframe$selparam, mu = 1, alternative = "two.sided", na.action = na.omit)
median(relaxframe$selparam)
mean(relaxframe$selparam)
###
### Venn diagrams
library(VennDiagram)
# positional arguments of draw.triple.venn: area1, area2, area3, n12, n23, n13, n123
draw.triple.venn(22, 96, 12, 5, 3, 0, 0, fill = c("blue", "red", "green"), cex = 1.5)
draw.triple.venn(138, 111, 105, 10, 16, 0, 0, fill = c("blue", "red", "green"), cex = 1.5)
draw.triple.venn(272, 580, 235, 58, 87, 0, 0, fill = c("blue", "red", "green"), cex = 1.5)
###
#!/bin/bash
# calculating average gene counts from maximum gene counts per orthogroup
max_array=()
for file in trimmed2_*.RELAX.json; do
    maximum=0
    a=0 b=0 c=0 d=0 e=0 f=0 g=0 h=0 i=0 j=0
    while read -r line; do
        if grep -q 'original name":"C_clupea' <<< "$line"; then
            a=$((a + 1))
        fi
        if grep -q 'original name":"C_lavare' <<< "$line"; then
            b=$((b + 1))
        fi
        if grep -q 'original name":"E_lucius' <<< "$line"; then
            c=$((c + 1))
        fi
        if grep -q 'original name":"O_mykiss' <<< "$line"; then
            d=$((d + 1))
        fi
        if grep -q 'original name":"O_tshawy' <<< "$line"; then
            e=$((e + 1))
        fi
        if grep -q 'original name":"Sa_alpin' <<< "$line"; then
            f=$((f + 1))
        fi
        if grep -q 'original name":"Sa_fonti' <<< "$line"; then
            g=$((g + 1))
        fi
        if grep -q 'original name":"S_salar' <<< "$line"; then
            h=$((h + 1))
        fi
        if grep -q 'original name":"S_trutta' <<< "$line"; then
            i=$((i + 1))
        fi
        if grep -q 'original name":"T_thymal' <<< "$line"; then
            j=$((j + 1))
        fi
    done <$file
    echo ">C_clupea" >> $file.genecounts.txt
    echo $a >> $file.genecounts.txt
    echo ">C_lavare" >> $file.genecounts.txt
    echo $b >> $file.genecounts.txt
    echo ">E_lucius" >> $file.genecounts.txt
    echo $c >> $file.genecounts.txt
    echo ">O_mykiss" >> $file.genecounts.txt
    echo $d >> $file.genecounts.txt
    echo ">O_tshawy" >> $file.genecounts.txt
    echo $e >> $file.genecounts.txt
    echo ">Sa_alpin" >> $file.genecounts.txt
    echo $f >> $file.genecounts.txt
    echo ">Sa_fonti" >> $file.genecounts.txt
    echo $g >> $file.genecounts.txt
    echo ">S_salar" >> $file.genecounts.txt
    echo $h >> $file.genecounts.txt
    echo ">S_trutta" >> $file.genecounts.txt
    echo $i >> $file.genecounts.txt
    echo ">T_thymal" >> $file.genecounts.txt
    echo $j >> $file.genecounts.txt
    count_array=($a $b $c $d $e $f $g $h $i $j)
    maximum=$(python calculate_max.py "${count_array[@]}")
    max_array+=($maximum)
done
sum=0
total=0
for i in "${max_array[@]}"; do
    sum=`expr $sum + $i`
    total=`expr $total + 1`
done
mean=$(bc <<< "scale=5; $sum / $total")
echo $mean > mean_genecount.txt
###
# python script used in the above BASH script
##### written for Python 3 #####
""" --> script to calculate maximum value from BASH array input <-- """
import sys

print(max(int(value) for value in sys.argv[1:]))
###
### removing matching lines to exclude IDs from test sets for GO term enrichment tests from reference sets
#!/bin/bash
# $1 = input file 1; $2 = input file 2
while IFS= read -r line; do
    sed -i .bak "/$line/d" $2
done < "$1"
### more GO term enrichment tests in Blast2GO (Fisher's Exact Tests)
### this time reference sets do not include IDs from test sets (see script above)
# Bonf, FDR, and nocorr sets for RELAX, aBSREL (Bonf-Holm), and aBSREL (nocorr)
# with FDR (q-value) threshold of 0.1 (two-tailed)
###
### obtaining difference between sequence files
#!/bin/bash
grep -v -F -x -f OG_list2.txt OG_list.txt
### calculation of total sequence length and average sequence length per orthogroup
#!/bin/bash
total=0
for file in GO_input_trimmed2_OG*.oneline.incl_tree.fasta; do
    curr=$(sed '2q;d' $file | wc -c)
    total=`expr $total + $curr - 1`
done
echo $total
count=$(ls GO_input_trimmed2_OG*.oneline.incl_tree.fasta | wc -l)
avg=`expr $total / $count`
echo $avg
###
### counting alignment lengths of OG alignments
#!/bin/bash
for file in GO_input_trimmed2_OG*.oneline.incl_tree.fasta; do
    # note: wc -c includes the trailing newline character
    head -2 $file | tail -1 | wc -c | awk '{$1=$1};1' >> alignment_counts.txt
done
###
##### 04.12.2018 #####
##### by Kevin Schneider #####
##### KevinSchneider@gmx.at #####
##### written for Python 3 #####
##### last modified: 06.03.2018 #####
""" --> script to convert multi-line fasta files to one-line fasta files <-- """
from glob import glob
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid
from os import path
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-i', required=True, type=str,
                    help='input files in fasta format (in quotes if wildcard [*] included)')
args = parser.parse_args()
files = args.i
for file in glob(files):
    with open(file, 'r') as f:
        """ create a list of sequence records in the file """
        records = list(SeqIO.parse(f, 'fasta'))
        base = path.splitext(file)[0]
        new_fasta = base + '.oneline.fasta'
        with open(new_fasta, 'w') as outfile:
            for record in records:
                outfile.write('>' + record.id + '\n' + str(record.seq) + '\n')
###
### obtaining transcript counts of raw transcriptomes
#!/bin/bash
for file in *.oneline.fasta; do
    grep ">" $file | wc -l | awk '{$1=$1};1' > $file.transcript_count.txt
    grep -v ">" $file | awk '{ print length }' | awk '{$1=$1};1' > $file.transcript_lengths.txt
done
###
### script to calculate basic raw transcript metrics based on transcript lengths
t_vec <- Sys.glob(file.path("*.oneline.fasta.transcript_lengths.txt"))
for (i in t_vec) {
  print("##############################")
  print(i)
  print("##############################")
  lengths <- readLines(i)
  lengths <- as.numeric(lengths)
  print("min length")
  print(min(lengths))
  print("max length")
  print(max(lengths))
  print("mean length")
  print(mean(lengths))
  print("median length")
  print(median(lengths))
  print("sum of transcript lengths (in Mbp)")
  print(sum(lengths)/1000000)
  print("transcript count")
  print(length(lengths))
print("##############################") } ### setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") symbols <- read.table("webgestalt_gsea_input_genesymbols.rnk", header = FALSE, sep = "\t") colnames(symbols) <- c("genesymbols", "k") symbols <- symbols[!grepl("!", symbols$genesymbols), ] symbols$k[symbols$k == 1] <- 0 symbols$k[symbols$k < 1] <- symbols$k[symbols$k < 1] - 1 symbols$k[symbols$k > 1] <- symbols$k[symbols$k > 1] / 50 write.table(symbols, file = "transformed_webgestalt_gsea_input_genesymbols_human.rnk", quote = FALSE, sep = "\t", row.names = FALSE) symbols$genesymbols <- tolower(symbols$genesymbols) write.table(symbols, file = "transformed_webgestalt_gsea_input_genesymbols_zebrafish.rnk", quote = FALSE, sep = "\t", row.names = FALSE) ### next for zebrafish symbols <- read.table("webgestalt_gsea_input_genesymbols_zebrafish.rnk", header = FALSE, sep = "\t") colnames(symbols) <- c("genesymbols", "k") symbols <- symbols[!grepl("!", symbols$genesymbols), ] symbols$k[symbols$k == 1] <- 0 symbols$k[symbols$k < 1] <- symbols$k[symbols$k < 1] - 1 symbols$k[symbols$k > 1] <- symbols$k[symbols$k > 1] / 50 symbols$genesymbols <- tolower(symbols$genesymbols) write.table(symbols, file = "transformed_webgestalt_gsea_input_genesymbols_new_zebrafish.rnk", quote = FALSE, sep = "\t", row.names = FALSE) ### ### converting human gene symbols to symbols of zebrafish homologues (for GSEA in WegGestalt) setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") human <- read.csv("gsea_input_gene_symbols.csv", header = TRUE) homologue_df <- read.delim("homology.txt", header = TRUE, sep = "\t", fill = TRUE) human_homologue_df <- homologue_df[which(homologue_df$Common.Organism.Name == "human"), ] zf_homologue_df <- homologue_df[which(homologue_df$Common.Organism.Name == "zebrafish"), ] human <- human[, -6] human$genesymbol_human <- trimws(as.character(human$genesymbol_human)) human <- data.frame(human, "genesymbol_zebrafish" = human$genesymbol_human, stringsAsFactors = FALSE) for (i in 1:nrow(human)) { pos <- grep(human$genesymbol_human[i], human_homologue_df$Symbol) if (length(pos)) { homologue_id <- human_homologue_df$HomoloGene.ID[pos] zf_gene_pos <- grep(homologue_id, zf_homologue_df$HomoloGene.ID) zf_gene <- as.character(zf_homologue_df$Symbol[zf_gene_pos][1]) if (grepl("|", zf_gene)) { zf_gene <- unlist(strsplit(zf_gene, split = "[|]"))[1] } human$genesymbol_zebrafish[i] <- zf_gene } if (is.na(human$genesymbol_zebrafish[i])) { human$genesymbol_zebrafish[i] <- tolower(human$genesymbol_human[i]) } } zf_gsea <- data.frame("symbols" = human$genesymbol_zebrafish, "kval" = human$kval) zf_gsea$kval <- as.numeric(zf_gsea$kval) zf_gsea <- zf_gsea[-which(is.na(zf_gsea$kval)), ] zf_gsea$kval[zf_gsea$kval == 1] <- 0 zf_gsea$kval[zf_gsea$kval < 1] <- zf_gsea$kval[zf_gsea$kval < 1] - 1 zf_gsea$kval[zf_gsea$kval > 1] <- zf_gsea$kval[zf_gsea$kval > 1] / 50 write.table(human, "human_zebrafish_homology.tsv", sep = "\t", row.names = FALSE, quote = FALSE) write.table(zf_gsea, "human_zebrafish_homology.rnk", sep = "\t", row.names = FALSE, quote = FALSE) ### ### extract entries in "zebrafish_symbols_kvalues.csv" based on outlier OGs #!/bin/bash while IFS= read -r line; do grep $line $2 >> $3 done < $1 ### ### setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") # A treemap R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. 
"REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # author: Anton Kratz , RIKEN Omics Science Center, Functional Genomics Technology Team, Japan # created: Fri, Nov 02, 2012 7:25:52 PM # last change: Fri, Nov 09, 2012 3:20:01 PM # ----------------------------------------------------------------------------- # If you don't have the treemap package installed, uncomment the following line: # install.packages( "treemap" ); library(treemap) # treemap package by Martijn Tennekes # Set the working directory if necessary # setwd("C:/Users/username/workingdir"); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. revigo.names <- c("term_ID","description","freqInDbPercent","value","uniqueness","dispensability","representative"); revigo.data <- rbind(c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "a"), c("GO:0006470","protein dephosphorylation",1.272,0.9975,0.678,0.000,"protein dephosphorylation"), c("GO:0070932","histone H3 deacetylation",0.035,1.0000,0.725,0.256,"protein dephosphorylation"), c("GO:0006508","proteolysis",6.441,0.9814,0.707,0.317,"protein dephosphorylation"), c("GO:0006468","protein phosphorylation",7.825,0.9864,0.662,0.519,"protein dephosphorylation"), c("GO:0006886","intracellular protein transport",2.141,0.9978,0.687,0.000,"intracellular protein transport"), c("GO:0016192","vesicle-mediated transport",3.868,0.9992,0.680,0.443,"intracellular protein transport"), c("GO:0006812","cation transport",4.146,0.9995,0.680,0.488,"intracellular protein transport"), c("GO:0006836","neurotransmitter transport",0.733,1.0000,0.682,0.359,"intracellular protein transport"), c("GO:0008217","regulation of blood pressure",0.290,1.0000,0.829,0.000,"regulation of blood pressure"), c("GO:0019885","antigen processing and presentation of endogenous peptide antigen via MHC class I",0.047,1.0000,0.852,0.000,"antigen processing and presentation of endogenous peptide antigen via MHC class I"), c("GO:0019722","calcium-mediated signaling",0.095,1.0000,0.791,0.068,"calcium-mediated signaling"), c("GO:0035556","intracellular signal transduction",6.701,0.9975,0.786,0.241,"calcium-mediated signaling"), c("GO:0009069","serine family amino acid metabolic process",0.118,0.9992,0.699,0.092,"serine family amino acid metabolism"), c("GO:0006570","tyrosine metabolic process",0.030,0.9998,0.705,0.670,"serine family amino acid metabolism"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "z")); stuff <- data.frame(revigo.data); names(stuff) <- revigo.names; stuff$value <- as.numeric( as.character(stuff$value) ); stuff$freqInDbPercent <- as.numeric( as.character(stuff$freqInDbPercent) ); stuff$uniqueness <- as.numeric( as.character(stuff$uniqueness) ); stuff$dispensability <- as.numeric( as.character(stuff$dispensability) ); # equal size since p-value difference negligible stuff$value <- 1 stuff$value[1] <- 0 stuff$value[length(stuff$value)] <- 0 # by default, outputs to a PDF file png(file="revigo_treemap_relaxed_diversifying_bioproc_recoloured.png", width=900, height=500) #pdf( file="revigo_treemap_relaxed_diversifying_bioproc.pdf", width=16, height=9 ) # width and height are in inches # check the tmPlot command documentation for all possible parameters - there are a lot more treemap( stuff, index = c("representative","description"), vSize = "value", type = "categorical", vColor = "representative", title = "GO Terms - Relaxed & Diversifying Selection", inflate.labels = FALSE, # 
set this to TRUE for space-filling group labels - good for posters lowerbound.cex.labels = 0, # try to draw as many labels as possible (still, some small squares may not get a label) bg.labels = "#CCCCCCAA", # define background color of group labels # "#CCCCCC00" is fully transparent, "#CCCCCCAA" is semi-transparent grey, NA is opaque position.legend = "none", fontsize.title = 18, fontsize.label = 18, align.labels = list(c("left", "top"), c("center", "bottom")), drop.unused.levels = FALSE ) dev.off() ### ### setwd("/Users/kevinschneider/Documents/transcriptomics/HYPHY_analyses/") # A treemap R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # author: Anton Kratz , RIKEN Omics Science Center, Functional Genomics Technology Team, Japan # created: Fri, Nov 02, 2012 7:25:52 PM # last change: Fri, Nov 09, 2012 3:20:01 PM # ----------------------------------------------------------------------------- # If you don't have the treemap package installed, uncomment the following line: # install.packages( "treemap" ); library(treemap) # treemap package by Martijn Tennekes # Set the working directory if necessary # setwd("C:/Users/username/workingdir"); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. revigo.names <- c("term_ID","description","freqInDbPercent","value","uniqueness","dispensability","representative"); revigo.data <- rbind(c("GO:0005975","carbohydrate metabolic process",1.905,0.9945,0.610,0.000,"carbohydrate metabolism"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "d"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "e"), c("GO:0007269","neurotransmitter secretion",0.432,1.0000,0.651,0.100,"neurotransmitter secretion"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "o"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "p"), c("GO:0009069","serine family amino acid metabolic process",0.118,0.9985,0.385,0.086,"serine family amino acid metabolism"), c("GO:0006486","protein glycosylation",1.183,0.9985,0.346,0.299,"serine family amino acid metabolism"), c("GO:0006508","proteolysis",6.441,0.9975,0.510,0.314,"serine family amino acid metabolism"), c("GO:0045892","negative regulation of transcription, DNA-templated",1.343,0.9999,0.473,0.358,"serine family amino acid metabolism"), c("GO:0006468","protein phosphorylation",7.825,0.9984,0.468,0.451,"serine family amino acid metabolism"), c("GO:0006633","fatty acid biosynthetic process",0.325,0.9998,0.337,0.574,"serine family amino acid metabolism"), c("foo", "bar", 0.0, 0.0, 0.0, 0.0, "z")); stuff <- data.frame(revigo.data); names(stuff) <- revigo.names; stuff$value <- as.numeric( as.character(stuff$value) ); stuff$freqInDbPercent <- as.numeric( as.character(stuff$freqInDbPercent) ); stuff$uniqueness <- as.numeric( as.character(stuff$uniqueness) ); stuff$dispensability <- as.numeric( as.character(stuff$dispensability) ); # equal size since p-value difference negligible stuff$value <- 1 stuff$value[2] <- 0.00001 stuff$value[3] <- 0.00001 stuff$value[5] <- 0.00001 stuff$value[6] <- 0.00001 stuff$value[length(stuff$value)] <- 0.00001 # by default, outputs to a PDF file png(file="revigo_mod_treemap_intensified_diversifying_bioproc_recoloured.png", width=400, height=700) #pdf( file="revigo_treemap_intensified_diversifying_bioproc.pdf", width=16, height=9 ) # width 
and height are in inches
# check the tmPlot command documentation for all possible parameters - there are a lot more
tmPlot(
    stuff,
    index = c("representative","description"),
    vSize = "value",
    type = "categorical",
    vColor = "representative",
    title = "GO Terms - Intensified & Diversifying Selection",
    inflate.labels = FALSE,     # set this to TRUE for space-filling group labels - good for posters
    lowerbound.cex.labels = 0,  # try to draw as many labels as possible (still, some small squares may not get a label)
    bg.labels = "#CCCCCCAA",    # define background color of group labels
                                # "#CCCCCC00" is fully transparent, "#CCCCCCAA" is semi-transparent grey, NA is opaque
    position.legend = "none",
    fontsize.title = 18,
    fontsize.label = 18,
    align.labels = list(c("left", "top"), c("center", "bottom"))
)
dev.off()
###
###
# A plotting R script produced by the REVIGO server at http://revigo.irb.hr/
# If you found REVIGO useful in your work, please cite the following reference:
# Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology
# terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800
# --------------------------------------------------------------------------
# If you don't have the ggplot2 package installed, uncomment the following line:
# install.packages( "ggplot2" );
library( ggplot2 );
# --------------------------------------------------------------------------
# If you don't have the scales package installed, uncomment the following line:
# install.packages( "scales" );
library( scales );
# --------------------------------------------------------------------------
# Here is your data from REVIGO. Scroll down for plot configuration options.
revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","GO_frequency","log10_p_value","uniqueness","dispensability");
revigo.data <- rbind(c("GO:0007340","acrosome reaction", 0.101, 5.178,-0.261, 2,-2.5187,0.605,0.000),
c("GO:0006629","lipid metabolic process", 3.502,-5.067,-1.917, 2,-2.0576,0.335,0.035),
c("GO:0006355","regulation of transcription, DNA-templated",10.303,-3.787, 4.497, 8,-1.5057,0.440,0.164),
c("GO:0009395","phospholipid catabolic process", 0.089,-3.915,-4.114, 2,-1.5666,0.380,0.322),
c("GO:0009069","serine family amino acid metabolic process", 0.118,-5.034, 0.526, 3,-1.7080,0.245,0.331),
c("GO:0006570","tyrosine metabolic process", 0.030,-4.116, 0.870, 2,-1.4064,0.258,0.670));
one.data <- data.frame(revigo.data);
names(one.data) <- revigo.names;
one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ];
one.data$plot_X <- as.numeric( as.character(one.data$plot_X) );
one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) );
one.data$GO_frequency <- as.numeric( as.character(one.data$GO_frequency) );
one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) );
one.data$frequency <- as.numeric( as.character(one.data$frequency) );
one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) );
one.data$dispensability <- as.numeric( as.character(one.data$dispensability) );
#head(one.data);
# --------------------------------------------------------------------------
# Names of the axes, sizes of the numbers and letters, names of the columns,
# etc.
can be changed below p1 <- ggplot( data = one.data ); p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = GO_frequency), alpha = I(0.6) ) + scale_size_area(); p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) ); p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = GO_frequency), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area(); p1 <- p1 + scale_size( range=c(as.integer(min(one.data$GO_frequency)) * 5, as.integer(max(one.data$GO_frequency) * 5)), breaks = c(2, 3, 8)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) ); #ex <- one.data [ one.data$dispensability < 0.15, ]; #p1 <- p1 + geom_text( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 ); p1 <- p1 + labs (y = "semantic space y", x = "semantic space x"); p1 <- p1 + theme(legend.key = element_blank(), plot.title = element_text(size = 14, face = "bold", hjust = 0.5), axis.title = element_text(size = 13, face = "bold")) ; one.x_range = max(one.data$plot_X) - min(one.data$plot_X); one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y); p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10); p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10); p1 <- p1 + ggtitle("Relaxed Selection (Enrichment p-value < 0.05)"); # -------------------------------------------------------------------------- # Output the plot to screen p1; # Uncomment the line below to also save the plot to a file. # The file type depends on the extension (default=pdf). ggsave("revigo_both_fore_semantic_enriched_relaxed_bioproc_sim0_9.png", device = "png", width = 9, height = 7) ### ### # A plotting R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # -------------------------------------------------------------------------- # If you don't have the ggplot2 package installed, uncomment the following line: # install.packages( "ggplot2" ); library( ggplot2 ); # -------------------------------------------------------------------------- # If you don't have the scales package installed, uncomment the following line: # install.packages( "scales" ); library( scales ); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. 
revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","GO_frequency","log10_p_value","uniqueness","dispensability"); revigo.data <- rbind(c("GO:0002138","retinoic acid biosynthetic process", 0.047, 1.702,-7.269, 1,-1.4540,0.596,0.000), c("GO:0042273","ribosomal large subunit biogenesis", 0.278, 5.582,-0.993, 1,-1.4540,0.722,0.000), c("GO:0030325","adrenal gland development", 0.071,-2.558, 5.046, 1,-1.4540,0.730,0.024), c("GO:0051014","actin filament severing", 0.035, 2.649, 4.117, 1,-1.4540,0.691,0.078), c("GO:0097190","apoptotic signaling pathway", 0.574,-6.360,-3.464, 1,-1.4540,0.570,0.095), c("GO:1901222","regulation of NIK/NF-kappaB signaling", 0.047,-6.136,-0.225, 1,-1.4540,0.628,0.182), c("GO:0034720","histone H3-K4 demethylation", 0.024, 4.160,-4.282, 1,-1.4540,0.602,0.209), c("GO:0006355","regulation of transcription, DNA-templated",10.303,-1.797,-6.211, 8,-2.2859,0.586,0.269), c("GO:0007165","signal transduction",22.310,-4.846,-3.634, 5,-1.6421,0.590,0.330)); one.data <- data.frame(revigo.data); names(one.data) <- revigo.names; one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ]; one.data$plot_X <- as.numeric( as.character(one.data$plot_X) ); one.data$plot_Y <- as.numeric( as.character(one.data$plot_Y) ); one.data$GO_frequency <- as.numeric( as.character(one.data$GO_frequency) ); one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) ); one.data$frequency <- as.numeric( as.character(one.data$frequency) ); one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) ); one.data$dispensability <- as.numeric( as.character(one.data$dispensability) ); #head(one.data); # -------------------------------------------------------------------------- # Names of the axes, sizes of the numbers and letters, names of the columns, # etc. can be changed below p1 <- ggplot( data = one.data ); p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = GO_frequency), alpha = I(0.6) ) + scale_size_area(); p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) ); p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = GO_frequency), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area(); p1 <- p1 + scale_size( range=c(as.integer(min(one.data$GO_frequency)) * 5, as.integer(max(one.data$GO_frequency) * 5)), breaks = c(1, 5, 8)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) ); #ex <- one.data [ one.data$dispensability < 0.15, ]; #p1 <- p1 + geom_text( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 ); p1 <- p1 + labs (y = "semantic space y", x = "semantic space x"); p1 <- p1 + theme(legend.key = element_blank(), plot.title = element_text(size = 14, face = "bold", hjust = 0.5), axis.title = element_text(size = 13, face = "bold")) ; one.x_range = max(one.data$plot_X) - min(one.data$plot_X); one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y); p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10); p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10); p1 <- p1 + ggtitle("Intensified Selection (Enrichment p-value < 0.05)"); # -------------------------------------------------------------------------- # Output the plot to screen p1; # Uncomment the line below to also save the plot to a file. # The file type depends on the extension (default=pdf). 
ggsave("revigo_both_fore_semantic_intensified_relaxed_bioproc_sim0_9.png", device = "png", width = 9, height = 7) ### ### # A plotting R script produced by the REVIGO server at http://revigo.irb.hr/ # If you found REVIGO useful in your work, please cite the following reference: # Supek F et al. "REVIGO summarizes and visualizes long lists of Gene Ontology # terms" PLoS ONE 2011. doi:10.1371/journal.pone.0021800 # -------------------------------------------------------------------------- # If you don't have the ggplot2 package installed, uncomment the following line: # install.packages( "ggplot2" ); library( ggplot2 ); # -------------------------------------------------------------------------- # If you don't have the scales package installed, uncomment the following line: # install.packages( "scales" ); library( scales ); # -------------------------------------------------------------------------- # Here is your data from REVIGO. Scroll down for plot configuration options. revigo.names <- c("term_ID","description","frequency_%","plot_X","plot_Y","GO_frequency","log10_p_value","uniqueness","dispensability"); revigo.data <- rbind(c("GO:0006090","pyruvate metabolic process", 0.260, 6.292,-3.272, 2,-2.7579,0.619,0.000), c("GO:0019885","antigen processing and presentation of endogenous peptide antigen via MHC class I", 0.047,-5.378,-1.697, 1,-1.3767,0.877,0.000), c("GO:0071918","urea transmembrane transport", 0.059,-3.965, 4.281, 1,-1.3767,0.854,0.027), c("GO:0007179","transforming growth factor beta receptor signaling pathway", 0.237,-2.254,-6.465, 1,-1.3767,0.797,0.030), c("GO:0000103","sulfate assimilation", 0.018,-5.827, 1.274, 1,-1.3767,0.911,0.070), c("GO:0070932","histone H3 deacetylation", 0.035, 0.654, 5.374, 1,-1.3767,0.763,0.074), c("GO:0008380","RNA splicing", 1.136, 4.489, 4.012, 1,-1.3767,0.743,0.115), c("GO:0019722","calcium-mediated signaling", 0.095,-0.762,-7.475, 1,-1.3767,0.820,0.179), c("GO:0006508","proteolysis", 6.441,-0.531, 2.055, 6,-1.6621,0.772,0.211), c("GO:0080111","DNA demethylation", 0.024, 4.672, 0.679, 1,-1.3767,0.680,0.234), c("GO:0035556","intracellular signal transduction", 6.701,-3.065,-5.690, 3,-1.3697,0.792,0.266), c("GO:0005977","glycogen metabolic process", 0.142, 4.194,-2.539, 1,-1.3767,0.678,0.267), c("GO:0016042","lipid catabolic process", 0.668, 5.777,-1.651, 2,-2.2928,0.576,0.305), c("GO:0055085","transmembrane transport", 6.956,-3.398, 4.995, 6,-1.9046,0.890,0.311), c("GO:0070682","proteasome regulatory particle assembly", 0.012, 1.160, 6.986, 1,-1.3767,0.868,0.329), c("GO:0030111","regulation of Wnt signaling pathway", 0.668,-1.948,-5.306, 1,-1.3767,0.718,0.425), c("GO:0006211","5-methylcytosine catabolic process", 0.015, 6.741, 3.221, 1,-1.3767,0.723,0.427), c("GO:0006956","complement activation", 0.349,-3.369,-2.348, 1,-1.3767,0.691,0.475), c("GO:0006561","proline biosynthetic process", 0.035, 6.530,-0.295, 1,-1.3767,0.623,0.515), c("GO:0006108","malate metabolic process", 0.053, 6.811,-2.827, 1,-1.3767,0.623,0.531), c("GO:0046486","glycerolipid metabolic process", 0.864, 5.020,-3.657, 2,-1.3826,0.604,0.663), c("GO:0009258","10-formyltetrahydrofolate catabolic process", 0.012, 6.646,-0.818, 1,-1.3767,0.578,0.734), c("GO:0006644","phospholipid metabolic process", 1.142, 4.965,-3.249, 1,-1.3767,0.598,0.775)); one.data <- data.frame(revigo.data); names(one.data) <- revigo.names; one.data <- one.data [(one.data$plot_X != "null" & one.data$plot_Y != "null"), ]; one.data$plot_X <- as.numeric( as.character(one.data$plot_X) ); one.data$plot_Y <- 
as.numeric( as.character(one.data$plot_Y) ); one.data$GO_frequency <- as.numeric( as.character(one.data$GO_frequency) ); one.data$log10_p_value <- as.numeric( as.character(one.data$log10_p_value) ); one.data$frequency <- as.numeric( as.character(one.data$frequency) ); one.data$uniqueness <- as.numeric( as.character(one.data$uniqueness) ); one.data$dispensability <- as.numeric( as.character(one.data$dispensability) ); #head(one.data); # -------------------------------------------------------------------------- # Names of the axes, sizes of the numbers and letters, names of the columns, # etc. can be changed below p1 <- ggplot( data = one.data ); p1 <- p1 + geom_point( aes( plot_X, plot_Y, colour = log10_p_value, size = GO_frequency), alpha = I(0.6) ) + scale_size_area(); p1 <- p1 + scale_colour_gradientn( colours = c("blue", "green", "yellow", "red"), limits = c( min(one.data$log10_p_value), 0) ); p1 <- p1 + geom_point( aes(plot_X, plot_Y, size = GO_frequency), shape = 21, fill = "transparent", colour = I (alpha ("black", 0.6) )) + scale_size_area(); p1 <- p1 + scale_size( range=c(as.integer(min(one.data$GO_frequency)) * 5, as.integer(max(one.data$GO_frequency) * 5)), breaks = c(1, 2, 3, 6)) + theme_bw(); # + scale_fill_gradientn(colours = heat_hcl(7), limits = c(-300, 0) ); #ex <- one.data [ one.data$dispensability < 0.15, ]; #p1 <- p1 + geom_text( data = ex, aes(plot_X, plot_Y, label = description), colour = I(alpha("black", 0.85)), size = 3 ); p1 <- p1 + labs (y = "semantic space y", x = "semantic space x"); p1 <- p1 + theme(legend.key = element_blank(), plot.title = element_text(size = 14, face = "bold", hjust = 0.5), axis.title = element_text(size = 13, face = "bold")) ; one.x_range = max(one.data$plot_X) - min(one.data$plot_X); one.y_range = max(one.data$plot_Y) - min(one.data$plot_Y); p1 <- p1 + xlim(min(one.data$plot_X)-one.x_range/10,max(one.data$plot_X)+one.x_range/10); p1 <- p1 + ylim(min(one.data$plot_Y)-one.y_range/10,max(one.data$plot_Y)+one.y_range/10); p1 <- p1 + ggtitle("Diversifying Selection (Enrichment p-value < 0.05)"); # -------------------------------------------------------------------------- # Output the plot to screen p1; # Uncomment the line below to also save the plot to a file. # The file type depends on the extension (default=pdf). ggsave("revigo_both_fore_semantic_enriched_diversifying_bioproc_sim0_9.png", device = "png", width = 9, height = 7) ### ### obtaining hypergeometric expectations of overlap between relaxed & diversifying as well as intensified & diversifying outliers ### testing frequency differences using Fisher's Exact Tests reldiv <- (112 * 71) / 2190 reldiv_rest <- 112 - reldiv fisher.test(matrix(c(9, 103, reldiv, reldiv_rest), 2, 2), alternative = "greater") reldiv <- (65 * 71) / 2190 reldiv_rest <- 65 - reldiv fisher.test(matrix(c(12, 53, reldiv, reldiv_rest), 2, 2), alternative = "greater") fisher.test(matrix(c(12, 53, 9, 103), 2, 2), alternative = "greater") ### ### #!/bin/bash # renaming fasta files for i in *.oneline.fasta; do mv -- "$i" "${i/%.oneline.fasta/.aln.fasta}" done for i in *.aln.fasta; do mv -- "$i" "${i:18}" done ### ###