Repository 'shm_csr'
hg clone https://radegast.galaxyproject.org/repos/davidvanzessen/shm_csr

Changeset 81:b6f9a640e098 (2021-02-19)
Previous changeset 80:a4617f1d1d89 (2021-02-19) Next changeset 82:a103134ee6e0 (2021-02-25)
Commit message:
Uploaded
added:
.gitattributes
.gitignore
LICENSE
README.md
aa_histogram.r
baseline/Baseline_Functions.r
baseline/Baseline_Main.r
baseline/FiveS_Mutability.RData
baseline/FiveS_Substitution.RData
baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
baseline/IMGTVHreferencedataset20161215.fa
baseline/IMGTVHreferencedataset20161215.fasta
baseline/baseline_url.txt
baseline/comparePDFs.r
baseline/filter.r
baseline/script_imgt.py
baseline/script_xlsx.py
baseline/wrapper.sh
change_o/change_o_url.txt
change_o/define_clones.r
change_o/define_clones.sh
change_o/makedb.sh
change_o/select_first_in_clone.r
check_unique_id.r
datatypes_conf.xml
gene_identification.py
imgt_loader.r
merge.r
merge_and_filter.r
mutation_column_checker.py
naive_output.r
new_imgt.r
pattern_plots.r
plot_pdf.r
sequence_overview.r
shm_clonality.htm
shm_csr.htm
shm_csr.py
shm_csr.r
shm_csr.xml
shm_downloads.htm
shm_first.htm
shm_frequency.htm
shm_overview.htm
shm_selection.htm
shm_transition.htm
style.tar.gz
subclass_definition.db.nhr
subclass_definition.db.nin
subclass_definition.db.nsq
summary_to_fasta.py
wrapper.sh
removed:
shm_csr/.gitattributes
shm_csr/.gitignore
shm_csr/LICENSE
shm_csr/README.md
shm_csr/aa_histogram.r
shm_csr/baseline/Baseline_Functions.r
shm_csr/baseline/Baseline_Main.r
shm_csr/baseline/FiveS_Mutability.RData
shm_csr/baseline/FiveS_Substitution.RData
shm_csr/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
shm_csr/baseline/IMGTVHreferencedataset20161215.fa
shm_csr/baseline/IMGTVHreferencedataset20161215.fasta
shm_csr/baseline/baseline_url.txt
shm_csr/baseline/comparePDFs.r
shm_csr/baseline/filter.r
shm_csr/baseline/script_imgt.py
shm_csr/baseline/script_xlsx.py
shm_csr/baseline/wrapper.sh
shm_csr/change_o/change_o_url.txt
shm_csr/change_o/define_clones.r
shm_csr/change_o/define_clones.sh
shm_csr/change_o/makedb.sh
shm_csr/change_o/select_first_in_clone.r
shm_csr/check_unique_id.r
shm_csr/datatypes_conf.xml
shm_csr/gene_identification.py
shm_csr/imgt_loader.r
shm_csr/merge.r
shm_csr/merge_and_filter.r
shm_csr/mutation_column_checker.py
shm_csr/naive_output.r
shm_csr/new_imgt.r
shm_csr/pattern_plots.r
shm_csr/plot_pdf.r
shm_csr/sequence_overview.r
shm_csr/shm_clonality.htm
shm_csr/shm_csr.htm
shm_csr/shm_csr.py
shm_csr/shm_csr.r
shm_csr/shm_csr.xml
shm_csr/shm_downloads.htm
shm_csr/shm_first.htm
shm_csr/shm_frequency.htm
shm_csr/shm_overview.htm
shm_csr/shm_selection.htm
shm_csr/shm_transition.htm
shm_csr/style.tar.gz
shm_csr/subclass_definition.db.nhr
shm_csr/subclass_definition.db.nin
shm_csr/subclass_definition.db.nsq
shm_csr/summary_to_fasta.py
shm_csr/wrapper.sh
diff -r a4617f1d1d89 -r b6f9a640e098 .gitattributes
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.gitattributes Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff -r a4617f1d1d89 -r b6f9a640e098 .gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.gitignore Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,4 @@
+
+shm_csr\.tar\.gz
+
+\.vscode/settings\.json
diff -r a4617f1d1d89 -r b6f9a640e098 LICENSE
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 david
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff -r a4617f1d1d89 -r b6f9a640e098 README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,13 @@
+# SHM CSR
+
+Somatic hypermutation and class switch recombination pipeline.  
+The docker version can be found [here](https://github.com/ErasmusMC-Bioinformatics/ARGalaxy-docker).
+
+# Dependencies
+--------------------
+[Python 2.7](https://www.python.org/)  
+[Change-O](https://changeo.readthedocs.io/en/version-0.4.4/)  
+[Baseline](http://selection.med.yale.edu/baseline/)  
+[R data.table](https://cran.r-project.org/web/packages/data.table/data.table.pdf)
+[R ggplot2](https://cran.r-project.org/web/packages/ggplot2/ggplot2.pdf)
+[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)
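A minimal sketch, assuming a stock R installation with a default CRAN mirror configured, for installing the three R packages listed in the README above (this helper is not part of the repository):

# Sketch only: install the R dependencies named in the README.
pkgs <- c("data.table", "ggplot2", "reshape2")
missing <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing) > 0) install.packages(missing)   # assumes a CRAN mirror is configured
invisible(lapply(pkgs, library, character.only = TRUE))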
diff -r a4617f1d1d89 -r b6f9a640e098 aa_histogram.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aa_histogram.r Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,69 @@
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+mutations.by.id.file = args[1]
+absent.aa.by.id.file = args[2]
+genes = strsplit(args[3], ",")[[1]]
+genes = c(genes, "")
+outdir = args[4]
+
+
+print("---------------- read input ----------------")
+
+mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
+absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
+
+for(gene in genes){
+ graph.title = paste(gene, "AA mutation frequency")
+ if(gene == ""){
+ mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
+ absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
+
+ graph.title = "AA mutation frequency all"
+ } else {
+ mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
+ absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
+ }
+ print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
+ if(nrow(mutations.by.id.gene) == 0){
+ next
+ }
+
+ mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
+ aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
+
+ dat_freq = mutations.at.position / aa.at.position
+ dat_freq[is.na(dat_freq)] = 0
+ dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
+
+
+ print("---------------- plot ----------------")
+
+ m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black"))
+ m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
+ m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
+ m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
+ m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
+ m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
+ m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
+ m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(graph.title) 
+ m = m + theme(panel.background = element_rect(fill = "white", colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
+ #m = m + scale_colour_manual(values=c("black"))
+
+ print("---------------- write/print ----------------")
+
+
+ dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
+
+ write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+ write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+ write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+ write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
+
+ png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
+ print(m)
+ dev.off()
+
+ ggsave(paste(outdir, "/aa_histogram_", gene, ".pdf", sep=""), m, width=14, height=7)
+}
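A minimal sketch with synthetic input of the core computation in aa_histogram.r above: per-position mutation counts divided by the number of sequences covering each AA position, with NA frequencies zeroed before plotting. The column names (Sample, best_match, start, end, pos1..pos3) are illustrative, not the tool's actual output format; the column-dropping mirrors the script.

# Sketch only: per-position AA mutation frequency, as derived before plotting.
mutations.by.id.gene <- data.frame(Sample = "s1", best_match = "IGA",
                                   pos1 = 2, pos2 = 0, pos3 = 1)
absent.aa.by.id.gene <- data.frame(Sample = "s1", best_match = "IGA",
                                   start = 1, end = 3,
                                   pos1 = 10, pos2 = 10, pos3 = 8)
mutations.at.position <- colSums(mutations.by.id.gene[, -c(1, 2)])      # drop ID columns
aa.at.position <- colSums(absent.aa.by.id.gene[, -c(1, 2, 3, 4)])       # drop ID/range columns
dat_freq <- mutations.at.position / aa.at.position
dat_freq[is.na(dat_freq)] <- 0
dat_dt <- data.frame(i = seq_along(dat_freq), freq = dat_freq)
print(dat_dt)   # frequency per AA position, the data behind the ggplot2 bar chart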
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/Baseline_Functions.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/Baseline_Functions.r Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,2287 @@
[2,287 lines added: BASELIne (Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences) helper functions by Mohamed Uduman & Gur Yaari, Kleinstein Lab, version 1.3 (01/23/2014), distributed under a Creative Commons Public License. Defines the nucleotide, amino-acid and codon tables used to classify mutations as replacement (R) or silent (S), the substitution and mutability models (uniform, Smith et al. 1996, Shapiro et al. 2002 tri-nucleotide, and the FiveS RData matrices), and functions that compute expected and observed R/S mutation counts per codon.]
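The replacement/silent distinction that these functions encode in their codon table can be shown with a self-contained sketch; the lookup table below is a tiny hypothetical subset, not the CODON_TABLE used in Baseline_Functions.r.

# Sketch only: classify a point mutation as replacement (R) or silent (S).
AMINO_ACIDS <- c(TTT = "F", TTC = "F", TCT = "S", TCC = "S")   # tiny hypothetical subset
classify_mutation <- function(germline_codon, mutated_codon) {
  if (AMINO_ACIDS[[germline_codon]] == AMINO_ACIDS[[mutated_codon]]) "S" else "R"
}
print(classify_mutation("TTT", "TCT"))  # "R": Phe -> Ser, replacement mutation
print(classify_mutation("TTT", "TTC"))  # "S": Phe -> Phe, silent mutation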
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/Baseline_Main.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/Baseline_Main.r Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,388 @@
[388 lines added: BASELIne main driver, version 1.3. Sources Baseline_Functions.r, reads ten positional arguments (test type, species, substitution model, mutability model, clonal mode, indel fixing, FWR/CDR region boundaries, input FASTA, output path, output ID), computes observed and expected R/S mutations and selection-strength (sigma) densities per sequence, germline and group, writes a tab-separated results table plus an .RData file of the densities, and emits warnings for sequences with indels and for clones whose sequences differ in length.]
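A sketch of the ten positional arguments Baseline_Main.r reads via commandArgs(TRUE), patterned on the example values commented out in the script itself; the input and output paths are illustrative only.

# Sketch only: the positional arguments expected by Baseline_Main.r.
arg <- c("1",                      # testID: 1 = Focused, 2 = Local
         "1",                      # species: 1 = Human, 2 = Mouse
         "5",                      # substitutionModel: 0 = uniform, 1 = Smith et al. 1996, 5 = FiveS
         "5",                      # mutabilityModel: 0 = uniform, 1 = tri-nucleotide, 5 = FiveS
         "0",                      # clonal: 0 = independent sequences
         "1",                      # fixIndels: 1 = try to fix indels
         "1:26:38:55:65:104:116",  # region: StartPos:LastNucleotideF1:C1:F2:C2:F3:C3
         "test.fasta",             # inputFilePath (illustrative)
         "output/",                # outputPath (illustrative)
         "sample")                 # outputID
testID <- as.numeric(arg[1])
region <- as.numeric(strsplit(arg[7], ":")[[1]])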
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/FiveS_Mutability.RData
Binary file baseline/FiveS_Mutability.RData has changed

diff -r a4617f1d1d89 -r b6f9a640e098 baseline/FiveS_Substitution.RData
Binary file baseline/FiveS_Substitution.RData has changed
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,703 @@
[703 lines added: IMGT gapped IGHV germline reference sequences (release 2015-11-05) in FASTA format, records IGHV1-18*01 through IGHV7-81*01.]
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/IMGTVHreferencedataset20161215.fa
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/IMGTVHreferencedataset20161215.fa Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,1 @@
[1 line added: IGHV germline reference dataset (2016-12-15); gapped IGHV FASTA records stored on a single carriage-return-separated line.]
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/IMGTVHreferencedataset20161215.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/IMGTVHreferencedataset20161215.fasta Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,1 @@
[1 line added: the IGHV germline reference dataset (2016-12-15) again, under the .fasta extension; gapped IGHV FASTA records stored on a single carriage-return-separated line.]
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/baseline_url.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/baseline_url.txt Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,1 @@
+http://selection.med.yale.edu/baseline/
\ No newline at end of file
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/comparePDFs.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/comparePDFs.r Fri Feb 19 15:10:54 2021 +0000
@@ -0,0 +1,225 @@
[225 lines added: R/grid plotting script. Loads the per-sequence selection-strength densities from the BASELIne .RData output, computes pairwise comparison values between CDR and FWR densities with compareTwoDistsFaster, and renders a matrix of density plots and p-values for up to ten selected sequences into a PDF.]
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/filter.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/filter.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,55 @@
+arg = commandArgs(TRUE)
+summaryfile = arg[1]
+gappedfile = arg[2]
+selection = arg[3]
+output = arg[4]
+print(paste("selection = ", selection))
+
+
+summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote = "")
+gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote = "")
+
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+gappeddat = fix_column_names(gappeddat)
+
+#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
+
+dat = cbind(gappeddat, summarydat$AA.JUNCTION)
+
+colnames(dat)[length(dat)] = "AA.JUNCTION"
+
+dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
+dat$VGene = gsub("[*].*", "", dat$VGene)
+
+dat$DGene = gsub("^Homsap ", "", dat$D.GENE.and.allele)
+dat$DGene = gsub("[*].*", "", dat$DGene)
+
+dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
+dat$JGene = gsub("[*].*", "", dat$JGene)
+
+print(str(dat))
+
+dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
+
+dat = dat[!duplicated(dat$past), ]
+
+print(paste("Sequences remaining after duplicate filter:", nrow(dat)))
+
+dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
+
+print(paste("Sequences remaining after functionality filter:", nrow(dat)))
+
+print(paste("Sequences remaining:", nrow(dat)))
+
+write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
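
A minimal invocation sketch for the filter script above, assuming IMGT/HighV-QUEST exports as input; the file names and the selection columns are illustrative only. The third argument is a comma-separated list of column names whose pasted values define a duplicate row.

    # hypothetical example: summary file, gapped-nt file, dedup columns, output
    Rscript baseline/filter.r \
        1_Summary_sample.txt \
        2_IMGT-gapped-nt-sequences_sample.txt \
        "VGene,AA.JUNCTION" \
        filtered_sample.txt
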
b
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/script_imgt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/script_imgt.py Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,86 @@
+#import xlrd #avoid dep
+import argparse
+import re
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
+
+args = parser.parse_args()
+
+print "script_imgt.py"
+print "input:", args.input
+print "ref:", args.ref
+print "output:", args.output
+print "id:", args.id
+
+refdic = dict()
+with open(args.ref, 'rU') as ref:
+ currentSeq = ""
+ currentId = ""
+ for line in ref:
+ if line.startswith(">"):
+ if currentSeq is not "" and currentId is not "":
+ refdic[currentId[1:]] = currentSeq
+ currentId = line.rstrip()
+ currentSeq = ""
+ else:
+ currentSeq += line.rstrip()
+ refdic[currentId[1:]] = currentSeq
+
+print "Have", str(len(refdic)), "reference sequences"
+
+vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
+# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
+# r"(IGKV[0-3]D?-[0-9]{1,2})",
+# r"(IGLV[0-9]-[0-9]{1,2})",
+# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
+# r"(TRGV[234589])",
+# r"(TRDV[1-3])"]
+
+#vPattern = re.compile(r"|".join(vPattern))
+vPattern = re.compile("|".join(vPattern))
+
+def filterGene(s, pattern):
+    if type(s) is not str:
+        return None
+    res = pattern.search(s)
+    if res:
+        return res.group(0)
+    return None
+
+
+
+currentSeq = ""
+currentId = ""
+first=True
+with open(args.input, 'r') as i:
+ with open(args.output, 'a') as o:
+ o.write(">>>" + args.id + "\n")
+ outputdic = dict()
+ for line in i:
+ if first:
+ first = False
+ continue
+ linesplt = line.split("\t")
+ ref = filterGene(linesplt[1], vPattern)
+ if not ref or not linesplt[2].rstrip():
+ continue
+ if ref in outputdic:
+ outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+ else:
+ outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
+ #print outputdic
+
+ for k in outputdic.keys():
+ if k in refdic:
+ o.write(">>" + k + "\n")
+ o.write(refdic[k] + "\n")
+ for seq in outputdic[k]:
+ #print seq
+ o.write(">" + seq[0] + "\n")
+ o.write(seq[1] + "\n")
+ else:
+ print k + " not in reference, skipping " + k
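
The script above is Python 2 (bare print statements). A hypothetical invocation, assuming a three-column tab-separated input of sequence ID, V gene annotation and sequence, which is what the cut -f2,4,7 step in baseline/wrapper.sh produces; the reference FASTA shipped in baseline/ is real, the other names are placeholders. Note that --output is opened in append mode, so successive samples accumulate in the same FASTA.

    python baseline/script_imgt.py \
        --input final_sample1.txt \
        --ref baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa \
        --output baseline.fasta \
        --id sample1
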
b
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/script_xlsx.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/script_xlsx.py Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,58 @@
+import xlrd
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
+parser.add_argument("--ref", help="Reference file")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+gene_column = 6
+id_column = 7
+seq_column = 8
+LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
+
+
+refdic = dict()
+with open(args.ref, 'r') as ref:
+ currentSeq = ""
+ currentId = ""
+ for line in ref.readlines():
+ if line[0] is ">":
+ if currentSeq is not "" and currentId is not "":
+ refdic[currentId[1:]] = currentSeq
+ currentId = line.rstrip()
+ currentSeq = ""
+ else:
+ currentSeq += line.rstrip()
+ refdic[currentId[1:]] = currentSeq
+
+currentSeq = ""
+currentId = ""
+with xlrd.open_workbook(args.input, 'r') as wb:
+ with open(args.output, 'a') as o:
+ for sheet in wb.sheets():
+ if sheet.cell(1,gene_column).value.find("IGHV") < 0:
+ print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
+ continue
+ o.write(">>>" + sheet.name + "\n")
+ outputdic = dict()
+ for rowindex in range(1, sheet.nrows):
+ ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
+ if ref in outputdic:
+ outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+ else:
+ outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
+ #print outputdic
+
+ for k in outputdic.keys():
+ if k in refdic:
+ o.write(">>" + k + "\n")
+ o.write(refdic[k] + "\n")
+ for seq in outputdic[k]:
+ #print seq
+ o.write(">" + seq[0] + "\n")
+ o.write(seq[1] + "\n")
+ else:
+ print k + " not in reference, skipping " + k
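
The xlsx variant needs the xlrd package and Python 2. A hypothetical call with a placeholder workbook name; each sheet whose gene column contains IGHV annotations is written to the output as its own '>>>' section.

    python baseline/script_xlsx.py \
        --input subclass_sheets.xlsx \
        --ref baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa \
        --output baseline.fasta
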
b
diff -r a4617f1d1d89 -r b6f9a640e098 baseline/wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/baseline/wrapper.sh Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,92 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+testID=$1
+species=$2
+substitutionModel=$3
+mutabilityModel=$4
+clonal=$5
+fixIndels=$6
+region=$7
+inputs=$8
+inputs=($inputs)
+IDs=$9
+IDs=($IDs)
+ref=${10}
+output=${11}
+selection=${12}
+output_table=${13}
+outID="result"
+
+echo "$PWD"
+
+echo "testID = $testID"
+echo "species = $species"
+echo "substitutionModel = $substitutionModel"
+echo "mutabilityModel = $mutabilityModel"
+echo "clonal = $clonal"
+echo "fixIndels = $fixIndels"
+echo "region = $region"
+echo "inputs = ${inputs[@]}"
+echo "IDs = ${IDs[@]}"
+echo "ref = $ref"
+echo "output = $output"
+echo "outID = $outID"
+
+fasta="$PWD/baseline.fasta"
+
+
+count=0
+for current in ${inputs[@]}
+do
+ f=$(file $current)
+ zipType="Zip archive"
+ if [[ "$f" == *"Zip archive"* ]] || [[ "$f" == *"XZ compressed data"* ]]
+ then
+ id=${IDs[$count]}
+ echo "id=$id"
+ if [[ "$f" == *"Zip archive"* ]] ; then
+ echo "Zip archive"
+ echo "unzip $input -d $PWD/files/"
+ unzip $current -d "$PWD/$id/"
+ elif [[ "$f" == *"XZ compressed data"* ]] ; then
+ echo "ZX archive"
+ echo "tar -xJf $input -C $PWD/files/"
+ mkdir -p "$PWD/$id/files"
+ tar -xJf $current -C "$PWD/$id/files/"
+ fi
+ filtered="$PWD/filtered_${id}.txt"
+ imgt_1_file="`find $PWD/$id -name '1_*.txt'`"
+ imgt_2_file="`find $PWD/$id -name '2_*.txt'`"
+ echo "1_Summary file: ${imgt_1_file}"
+ echo "2_IMGT-gapped file: ${imgt_2_file}"
+ echo "filter.r for $id"
+ Rscript $dir/filter.r ${imgt_1_file} ${imgt_2_file} "$selection" $filtered 2>&1
+
+ final="$PWD/final_${id}.txt"
+ cat $filtered | cut -f2,4,7 > $final
+ python $dir/script_imgt.py --input $final --ref $ref --output $fasta --id $id
+ else
+ python $dir/script_xlsx.py --input $current --ref $ref --output $fasta
+ fi
+ count=$((count+1))
+done
+workdir="$PWD"
+cd $dir
+echo "file: ${inputs[0]}"
+#Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region ${inputs[0]} $workdir/ $outID 2>&1
+Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $fasta $workdir/ $outID 2>&1
+
+echo "$workdir/${outID}.txt"
+
+rows=`tail -n +2 $workdir/${outID}.txt | grep -v "All sequences combined" | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' '`
+rows=($rows)
+#unset rows[${#rows[@]}-1]
+
+cd $dir
+Rscript --verbose $dir/comparePDFs.r $workdir/${outID}.RData $output ${rows[@]} 2>&1
+cp $workdir/result.txt ${output_table}
+
+
+
+
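
A hedged sketch of the thirteen positional arguments this wrapper takes. Every value below is a placeholder chosen for illustration; in particular the numeric BASELINe codes for test, species, substitution/mutability model, clonal, fixIndels and region are guesses and should be checked against Baseline_Main.r before use.

    # args: testID species substitutionModel mutabilityModel clonal fixIndels region
    #       "inputs" "IDs" ref output selection output_table
    bash baseline/wrapper.sh 1 1 5 5 1 0 3 \
        "sample1.imgt.zip sample2.imgt.zip" \
        "sample1 sample2" \
        baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa \
        baseline.pdf \
        "VGene,AA.JUNCTION" \
        baseline_table.txt
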
b
diff -r a4617f1d1d89 -r b6f9a640e098 change_o/change_o_url.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/change_o_url.txt Fri Feb 19 15:10:54 2021 +0000
b
@@ -0,0 +1,1 @@
+https://changeo.readthedocs.io/en/version-0.4.4/
\ No newline at end of file
b
diff -r a4617f1d1d89 -r b6f9a640e098 change_o/define_clones.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,15 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input=args[1]
+output=args[2]
+
+change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F)
+
+freq = data.frame(table(change.o$CLONE))
+freq2 = data.frame(table(freq$Freq))
+
+freq2$final = as.numeric(freq2$Freq) * as.numeric(as.character(freq2$Var1))
+
+names(freq2) = c("Clone size", "Nr of clones", "Nr of sequences")
+
+write.table(x=freq2, file=output, sep="\t",quote=F,row.names=F,col.names=T)
b
diff -r a4617f1d1d89 -r b6f9a640e098 change_o/define_clones.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/define_clones.sh Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,39 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+#define_clones.sh $input $noparse $scores $regions $out_file
+
+type=$1
+input=$2
+
+mkdir -p $PWD/outdir
+
+cp $input $PWD/input.tab #file has to have a ".tab" extension
+
+if [ "bygroup" == "$type" ] ; then
+ mode=$3
+ act=$4
+ model=$5
+ norm=$6
+ sym=$7
+ link=$8
+ dist=$9
+ output=${10}
+ output2=${11}
+
+ DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
+
+ Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+else
+ method=$3
+ output=$4
+ output2=$5
+
+ DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
+
+ Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
+fi
+
+cp $PWD/outdir/output_clone-pass.tab $output
+
+rm -rf $PWD/outdir/
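
A hypothetical "bygroup" call. The Change-O values for mode, act, model, norm, sym, link and dist are illustrative choices for DefineClones.py 0.4.x, not defaults documented in this commit, and the file names are placeholders; the tenth argument receives the clone-pass table and the eleventh the clone-size summary produced by define_clones.r.

    # args: bygroup <input.tab> <mode> <act> <model> <norm> <sym> <link> <dist> <clone table> <clone size summary>
    bash change_o/define_clones.sh bygroup makedb_output.tab \
        gene first ham len min single 0.15 \
        clones_defined.tab clone_sizes.txt
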
b
diff -r a4617f1d1d89 -r b6f9a640e098 change_o/makedb.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/makedb.sh Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,36 @@
+#!/bin/bash
+dir="$(cd "$(dirname "$0")" && pwd)"
+
+input=$1
+noparse=$2
+scores=$3
+regions=$4
+output=$5
+
+if [ "true" == "$noparse" ] ; then
+ noparse="--noparse"
+else
+ noparse=""
+fi
+
+if [ "true" == "$scores" ] ; then
+ scores="--scores"
+else
+ scores=""
+fi
+
+if [ "true" == "$regions" ] ; then
+ regions="--regions"
+else
+ regions=""
+fi
+
+mkdir $PWD/outdir
+
+echo "makedb: $PWD/outdir"
+
+MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
+
+mv $PWD/outdir/output_db-pass.tab $output
+
+rm -rf $PWD/outdir/
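
A hypothetical call; the three true/false flags toggle MakeDb.py's --noparse, --scores and --regions options, and the input is an IMGT/HighV-QUEST archive (all names are placeholders).

    # args: <imgt archive> <noparse> <scores> <regions> <output.tab>
    bash change_o/makedb.sh sample.imgt.zip false true true makedb_output.tab
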
b
diff -r a4617f1d1d89 -r b6f9a640e098 change_o/select_first_in_clone.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/change_o/select_first_in_clone.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,16 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input.file = args[1]
+output.file = args[2]
+
+print("select_in_first_clone.r")
+print(input.file)
+print(output.file)
+
+input = read.table(input.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
+
+input = input[!duplicated(input$CLONE),]
+
+names(input)[1] = "Sequence.ID"
+
+write.table(input, output.file, quote=F, sep="\t", row.names=F, col.names=T, na="")
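
A hypothetical call (placeholder names): the script keeps the first row of every CLONE group in a Change-O clone-pass table and renames the first column to Sequence.ID.

    Rscript change_o/select_first_in_clone.r clones_defined.tab first_seq_per_clone.txt
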
b
diff -r a4617f1d1d89 -r b6f9a640e098 check_unique_id.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/check_unique_id.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,25 @@
+args <- commandArgs(trailingOnly = TRUE) #first argument must be the summary file so it can grab the 'Sequence number' and 'Sequence ID' columns
+
+current_file = args[1]
+
+current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F)
+
+if(!("Sequence number" %in% names(current))){
+ stop("First argument doesn't contain the 'Sequence number' column")
+}
+
+tbl = table(current[,"Sequence ID"])
+l_tbl = length(tbl)
+check = any(tbl > 1)
+
+#if(l_tbl != nrow(current)){ # non unique IDs?
+if(check){
+ print("Sequence.ID is not unique for every sequence, adding sequence number to IDs")
+ for(i in 1:length(args)){
+ current_file = args[i]
+ print(paste("Appending 'Sequence number' column to 'Sequence ID' column in", current_file))
+ current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F)
+ current[,"Sequence ID"] = paste(current[,"Sequence ID"], current[,"Sequence number"], sep="_")
+ write.table(x = current, file = current_file, quote = F, sep = "\t", na = "", row.names = F, col.names = T)
+ }
+}
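
A sketch of the intended invocation with placeholder file names: the 1_Summary file must come first, and when duplicate Sequence IDs are detected every listed file is rewritten in place with "_<Sequence number>" appended to its IDs.

    Rscript check_unique_id.r \
        1_Summary_sample.txt \
        5_AA-sequences_sample.txt \
        7_V-REGION-mutation-and-AA-change-table_sample.txt
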
b
diff -r a4617f1d1d89 -r b6f9a640e098 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Fri Feb 19 15:10:54 2021 +0000
b
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<datatypes>
+    <registration>
+        <datatype extension="imgt_archive" type="galaxy.datatypes.binary:CompressedArchive" display_in_upload="True" subclass="True"/>
+    </registration>
+</datatypes>
b
diff -r a4617f1d1d89 -r b6f9a640e098 gene_identification.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_identification.py Fri Feb 19 15:10:54 2021 +0000
[
b'@@ -0,0 +1,226 @@\n+import re\n+import argparse\n+import time\n+starttime= int(time.time() * 1000)\n+\n+parser = argparse.ArgumentParser()\n+parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")\n+parser.add_argument("--output", help="The annotated output file to be merged back with the summary file")\n+\n+args = parser.parse_args()\n+\n+infile = args.input\n+#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"\n+output = args.output\n+#outfile = "identified.txt"\n+\n+dic = dict()\n+total = 0\n+\n+\n+first = True\n+IDIndex = 0\n+seqIndex = 0\n+\n+with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n+\tfor line in f:\n+\t\ttotal += 1\n+\t\tlinesplt = line.split("\\t")\n+\t\tif first:\n+\t\t\tprint "linesplt", linesplt\n+\t\t\tIDIndex = linesplt.index("Sequence ID")\n+\t\t\tseqIndex = linesplt.index("Sequence")\n+\t\t\tfirst = False\n+\t\t\tcontinue\n+\t\t\n+\t\tID = linesplt[IDIndex]\n+\t\tif len(linesplt) < 28: #weird rows without a sequence\n+\t\t\tdic[ID] = ""\n+\t\telse:\n+\t\t\tdic[ID] = linesplt[seqIndex]\n+\t\t\t\n+print "Number of input sequences:", len(dic)\n+\n+#old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc\n+#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag\n+\n+#lambda/kappa reference sequence\n+searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",\n+                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",\n+                 "ce": "gcctccacacagagcccatccgtcttccccttgacccgctgctgcaaaaacattccctcc",\n+                 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence\n+\n+compiledregex = {"ca": [],\n+                 "cg": [],\n+                 "ce": [],\n+                 "cm": []}\n+\n+#lambda/kappa reference sequence variable nucleotides\n+ca1 = {38: \'t\', 39: \'g\', 48: \'a\', 49: \'g\', 51: \'c\', 68: \'a\', 73: \'c\'}\n+ca2 = {38: \'g\', 39: \'a\', 48: \'c\', 49: \'c\', 51: \'a\', 68: \'g\', 73: \'a\'}\n+cg1 = {0: \'c\', 33: \'a\', 38: \'c\', 44: \'a\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n+cg2 = {0: \'c\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'g\', 132: \'t\'}\n+cg3 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n+cg4 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'c\', 132: \'c\'}\n+\n+#remove last snp for shorter cg sequence --- note, also change varsInCG\n+del cg1[132]\n+del cg2[132]\n+del cg3[132]\n+del cg4[132]\n+\n+#reference sequences are cut into smaller parts of \'chunklength\' length, and with \'chunklength\' / 2 overlap\n+chunklength = 8\n+\n+#create the chunks of the reference sequence with regular expressions for the variable nucleotides\n+for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):\n+  pos = i\n+  chunk = searchstrings["ca"][i:i+chunklength]\n+  result = ""\n+  varsInResult = 0\n+  for c in chunk:\n+    if pos in ca1.keys():\n+      varsInResult += 1\n+      result += "[" + ca1[pos] + ca2[pos] + "]"\n+    else:\n+      result += c\n+    pos += 1\n+  compiledregex["ca"].append((re.compile(result), varsInResult))\n+\n+for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):\n+  pos = i\n+  chunk = searchstrings["cg"][i:i+chunklength]\n+  result = 
""\n+  varsInResult = 0\n+  for c in chunk:\n+    if pos in cg1.keys():\n+      varsInResult += 1\n+      result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"\n+    else:\n+      result += c\n+    pos += 1\n+  compiledregex["cg"].append((re.compile(result), varsInResult))\n+\n+for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):\n+  compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))\n+\n+for i in range(0, len(searchstrings["ce"]) - chunklength + 1, chunklength / 2):\n+  compiledregex["ce"].append((re.compile(searchstrings["ce"][i:i+chunklength]), False))\n+\n+def removeAndReturnMaxIndex(x): #simplifies a list compr'..b'kstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])\n+\t\t\t\t\telse: #key == "cm" #no variable regions in \'cm\' or \'ce\'\n+\t\t\t\t\t\tpass\n+\t\t\t\tbreak #this only breaks when there was a match with the regex, breaking means the \'else:\' clause is skipped\n+\t\t\telse: #only runs if there were no hits\n+\t\t\t\tcontinue\n+\t\t\t#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it\'s now:", start[lastindex - chunklength / 2 * i]\n+\t\t\tcurrentIDHits[key + "_hits"] += 1\n+\t\tstart_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1])\n+\t\t#start_location[ID + "_" + key] = str(start.index(max(start)))\n+\n+\n+varsInCA = float(len(ca1.keys()) * 2)\n+varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn\'t hit the first and last nt twice\n+varsInCM = 0\n+varsInCE = 0\n+\n+def round_int(val):\n+\treturn int(round(val))\n+\n+first = True\n+seq_write_count=0\n+with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n+\twith open(output, \'w\') as o:\n+\t\tfor line in f:\n+\t\t\ttotal += 1\n+\t\t\tif first:\n+\t\t\t\to.write("Sequence ID\\tbest_match\\tnt_hit_percentage\\tchunk_hit_percentage\\tstart_locations\\n")\n+\t\t\t\tfirst = False\n+\t\t\t\tcontinue\n+\t\t\tlinesplt = line.split("\\t")\n+\t\t\tif linesplt[2] == "No results":\n+\t\t\t\tpass\n+\t\t\tID = linesplt[1]\n+\t\t\tcurrentIDHits = hits[ID]\n+\t\t\tpossibleca = float(len(compiledregex["ca"]))\n+\t\t\tpossiblecg = float(len(compiledregex["cg"]))\n+\t\t\tpossiblecm = float(len(compiledregex["cm"]))\n+\t\t\tpossiblece = float(len(compiledregex["ce"]))\n+\t\t\tcahits = currentIDHits["ca_hits"]\n+\t\t\tcghits = currentIDHits["cg_hits"]\n+\t\t\tcmhits = currentIDHits["cm_hits"]\n+\t\t\tcehits = currentIDHits["ce_hits"]\n+\t\t\tif cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene\n+\t\t\t\tca1hits = currentIDHits["ca1"]\n+\t\t\t\tca2hits = currentIDHits["ca2"]\n+\t\t\t\tif ca1hits >= ca2hits:\n+\t\t\t\t\to.write(ID + "\\tIGA1\\t" + str(round_int(ca1hits / varsInCA * 100)) + "\\t" + str(round_int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+\t\t\t\telse:\n+\t\t\t\t\to.write(ID + "\\tIGA2\\t" + str(round_int(ca2hits / varsInCA * 100)) + "\\t" + str(round_int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n+\t\t\telif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene\n+\t\t\t\tcg1hits = currentIDHits["cg1"]\n+\t\t\t\tcg2hits = currentIDHits["cg2"]\n+\t\t\t\tcg3hits = currentIDHits["cg3"]\n+\t\t\t\tcg4hits = currentIDHits["cg4"]\n+\t\t\t\tif cg1hits >= cg2hits and cg1hits 
>= cg3hits and cg1hits >= cg4hits: #cg1 gene\n+\t\t\t\t\to.write(ID + "\\tIGG1\\t" + str(round_int(cg1hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene\n+\t\t\t\t\to.write(ID + "\\tIGG2\\t" + str(round_int(cg2hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene\n+\t\t\t\t\to.write(ID + "\\tIGG3\\t" + str(round_int(cg3hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\t\telse: #cg4 gene\n+\t\t\t\t\to.write(ID + "\\tIGG4\\t" + str(round_int(cg4hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n+\t\t\telse: #its a cm or ce gene\n+\t\t\t\tif cmhits >= cehits:\n+\t\t\t\t\to.write(ID + "\\tIGM\\t100\\t" + str(round_int(cmhits / possiblecm * 100)) + "\\t" + start_location[ID + "_cm"] + "\\n")\n+\t\t\t\telse:\n+\t\t\t\t\to.write(ID + "\\tIGE\\t100\\t" + str(round_int(cehits / possiblece * 100)) + "\\t" + start_location[ID + "_ce"] + "\\n")\n+\t\t\tseq_write_count += 1\n+\n+print "Time: %i" % (int(time.time() * 1000) - starttime)\n+\n+print "Number of sequences written to file:", seq_write_count\n+\n+\n+\n+\n+\n'
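
Run with Python 2. A hypothetical invocation with placeholder file names; the script reads an IMGT 1_Summary file and writes a five-column table (Sequence ID, best_match, nt_hit_percentage, chunk_hit_percentage, start_locations) that merge_and_filter.r later consumes.

    python gene_identification.py \
        --input 1_Summary_sample.txt \
        --output identified_subclasses.txt
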
b
diff -r a4617f1d1d89 -r b6f9a640e098 imgt_loader.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/imgt_loader.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,98 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+summ.file = args[1]
+aa.file = args[2]
+junction.file = args[3]
+out.file = args[4]
+
+summ = read.table(summ.file, sep="\t", header=T, quote="", fill=T)
+aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
+junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
+
+fix_column_names = function(df){
+    if("V.DOMAIN.Functionality" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
+        print("found V.DOMAIN.Functionality, changed")
+    }
+    if("V.DOMAIN.Functionality.comment" %in% names(df)){
+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
+        print("found V.DOMAIN.Functionality.comment, changed")
+    }
+    return(df)
+}
+
+summ = fix_column_names(summ)
+aa = fix_column_names(aa)
+junction = fix_column_names(junction)
+
+old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
+old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
+old_junction_columns=c('JUNCTION')
+
+added_summary_columns=c('Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence')
+added_sequence_columns=c('FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT')
+
+added_junction_columns=c('P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION')
+added_junction_columns=c(added_junction_columns, 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
+
+out=summ[,c("Sequence.ID","JUNCTION.frame","V.GENE.and.allele","D.GENE.and.allele","J.GENE.and.allele")]
+
+out[,"CDR1.Seq"] = aa[,"CDR1.IMGT"]
+out[,"CDR1.Length"] = summ[,"CDR1.IMGT.length"]
+
+out[,"CDR2.Seq"] = aa[,"CDR2.IMGT"]
+out[,"CDR2.Length"] = summ[,"CDR2.IMGT.length"]
+
+out[,"CDR3.Seq"] = aa[,"CDR3.IMGT"]
+out[,"CDR3.Length"] = summ[,"CDR3.IMGT.length"]
+
+out[,"CDR3.Seq.DNA"] = junction[,"JUNCTION"]
+out[,"CDR3.Length.DNA"] = nchar(as.character(junction[,"JUNCTION"]))
+out[,"Strand"] = summ[,"Orientation"]
+out[,"CDR3.Found.How"] = "a"
+
+out[,added_summary_columns] = summ[,added_summary_columns]
+
+out[,added_sequence_columns] = aa[,added_sequence_columns]
+
+out[,added_junction_columns] = junction[,added_junction_columns]
+
+out[,"Top V Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"V.GENE.and.allele"]))
+out[,"Top D Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"D.GENE.and.allele"]))
+out[,"Top J Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"J.GENE.and.allele"]))
+
+out = out[,c('Sequence.ID','JUNCTION.frame','Top V Gene','Top D Gene','Top J Gene','CDR1.Seq','CDR1.Length','CDR2.Seq','CDR2.Length','CDR3.Seq','CDR3.Length','CDR3.Seq.DNA','CDR3.Length.DNA','Strand','CDR3.Found.How','Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence','FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT','P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')]
+
+names(out) = c('ID','VDJ Frame','Top V Gene','Top D Gene','Top J Gene','CDR1 Seq','CDR1 Length','CDR2 Seq','CDR2 Length','CDR3 Seq','CDR3 Length','CDR3 Seq DNA','CDR3 Length DNA','Strand','CDR3 Found How','Functionality','V-REGION identity %','V-REGION identity nt','D-REGION reading frame','AA JUNCTION','Functionality comment','Sequence','FR1-IMGT','FR2-IMGT','FR3-IMGT','CDR3-IMGT','JUNCTION','J-REGION','FR4-IMGT','P3V-nt nb','N-REGION-nt nb','N1-REGION-nt nb','P5D-nt nb','P3D-nt nb','N2-REGION-nt nb','P5J-nt nb','3V-REGION trimmed-nt nb','5D-REGION trimmed-nt nb','3D-REGION trimmed-nt nb','5J-REGION trimmed-nt nb','N-REGION','N1-REGION','N2-REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
+
+out[,"VDJ Frame"] = as.character(out[,"VDJ Frame"])
+
+fltr = out[,"VDJ Frame"] == "in-frame"
+if(any(fltr, na.rm = T)){
+ out[fltr, "VDJ Frame"] = "In-frame"
+}
+
+fltr = out[,"VDJ Frame"] == "null"
+if(any(fltr, na.rm = T)){
+ out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+fltr = out[,"VDJ Frame"] == "out-of-frame"
+if(any(fltr, na.rm = T)){
+ out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+fltr = out[,"VDJ Frame"] == ""
+if(any(fltr, na.rm = T)){
+ out[fltr, "VDJ Frame"] = "Out-of-frame"
+}
+
+for(col in c('Top V Gene','Top D Gene','Top J Gene')){
+ out[,col] = as.character(out[,col])
+ fltr = out[,col] == ""
+ if(any(fltr, na.rm = T)){
+ out[fltr,col] = "NA"
+ }
+}
+
+write.table(out, out.file, sep="\t", quote=F, row.names=F, col.names=T)
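
A hypothetical call with placeholder file names; the three inputs are the IMGT summary, AA-sequences and junction exports, and the output is the single tab-separated table with the renamed columns above.

    Rscript imgt_loader.r \
        1_Summary_sample.txt \
        5_AA-sequences_sample.txt \
        6_Junction_sample.txt \
        loaded_sample.txt
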
b
diff -r a4617f1d1d89 -r b6f9a640e098 merge.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,27 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+input.1 = args[1]
+input.2 = args[2]
+
+fields.1 = args[3]
+fields.2 = args[4]
+
+field.1 = args[5]
+field.2 = args[6]
+
+output = args[7]
+
+dat1 = read.table(input.1, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL)
+if(fields.1 != "all"){
+ fields.1 = unlist(strsplit(fields.1, ","))
+ dat1 = dat1[,fields.1]
+}
+dat2 = read.table(input.2, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL)
+if(fields.2 != "all"){
+ fields.2 = unlist(strsplit(fields.2, ","))
+ dat2 = dat2[,fields.2]
+}
+
+dat3 = merge(dat1, dat2, by.x=field.1, by.y=field.2)
+
+write.table(dat3, output, sep="\t",quote=F,row.names=F,col.names=T)
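
A hypothetical call with placeholder file names: pass "all" to keep every column of an input, or a comma-separated column list to subset it before the merge; the fifth and sixth arguments are the join keys of the first and second table respectively.

    Rscript merge.r \
        1_Summary_sample.txt identified_subclasses.txt \
        all "Sequence.ID,best_match" \
        Sequence.ID Sequence.ID \
        merged_sample.txt
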
b
diff -r a4617f1d1d89 -r b6f9a640e098 merge_and_filter.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_and_filter.r Fri Feb 19 15:10:54 2021 +0000
[
b'@@ -0,0 +1,304 @@\n+args <- commandArgs(trailingOnly = TRUE)\r\n+\r\n+\r\n+summaryfile = args[1]\r\n+sequencesfile = args[2]\r\n+mutationanalysisfile = args[3]\r\n+mutationstatsfile = args[4]\r\n+hotspotsfile = args[5]\r\n+aafile = args[6]\r\n+gene_identification_file= args[7]\r\n+output = args[8]\r\n+before.unique.file = args[9]\r\n+unmatchedfile = args[10]\r\n+method=args[11]\r\n+functionality=args[12]\r\n+unique.type=args[13]\r\n+filter.unique=args[14]\r\n+filter.unique.count=as.numeric(args[15])\r\n+class.filter=args[16]\r\n+empty.region.filter=args[17]\r\n+\r\n+print(paste("filter.unique.count:", filter.unique.count))\r\n+\r\n+summ = read.table(summaryfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+sequences = read.table(sequencesfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+mutationstats = read.table(mutationstatsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+hotspots = read.table(hotspotsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+AAs = read.table(aafile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+gene_identification = read.table(gene_identification_file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n+\r\n+fix_column_names = function(df){\r\n+    if("V.DOMAIN.Functionality" %in% names(df)){\r\n+        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"\r\n+        print("found V.DOMAIN.Functionality, changed")\r\n+    }\r\n+    if("V.DOMAIN.Functionality.comment" %in% names(df)){\r\n+        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"\r\n+        print("found V.DOMAIN.Functionality.comment, changed")\r\n+    }\r\n+    return(df)\r\n+}\r\n+\r\n+fix_non_unique_ids = function(df){\r\n+\tdf$Sequence.ID = paste(df$Sequence.ID, 1:nrow(df))\r\n+\treturn(df)\r\n+}\r\n+\r\n+summ = fix_column_names(summ)\r\n+sequences = fix_column_names(sequences)\r\n+mutationanalysis = fix_column_names(mutationanalysis)\r\n+mutationstats = fix_column_names(mutationstats)\r\n+hotspots = fix_column_names(hotspots)\r\n+AAs = fix_column_names(AAs)\r\n+\r\n+if(method == "blastn"){\r\n+\t#"qseqid\\tsseqid\\tpident\\tlength\\tmismatch\\tgapopen\\tqstart\\tqend\\tsstart\\tsend\\tevalue\\tbitscore"\r\n+\tgene_identification = gene_identification[!duplicated(gene_identification$qseqid),]\r\n+\tref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))\r\n+\tgene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)\r\n+\tgene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100\r\n+\tgene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")]\r\n+\tcolnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")\r\n+}\r\n+\r\n+#print("Summary analysis files columns")\r\n+#print(names(summ))\r\n+\r\n+\r\n+\r\n+input.sequence.count = nrow(summ)\r\n+print(paste("Number of sequences in summary file:", input.sequence.count))\r\n+\r\n+filtering.steps = data.frame(character(0), numeric(0))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count))\r\n+\r\n+filtering.steps[,1] = as.character(filtering.steps[,1])\r\n+filtering.steps[,2] = 
as.character(filtering.steps[,2])\r\n+#filtering.steps[,3] = as.numeric(filtering.steps[,3])\r\n+\r\n+#print("summary files columns")\r\n+#print(names(summ))\r\n+\r\n+summ = merge(summ, gene_identification, by="Sequence.ID")\r\n+\r\n+print(paste("Number of sequences after merging with gene identification:", nrow(summ)))\r\n+\r\n+summ = summ[summ$Functionality != "No results",]\r\n+\r\n+print(paste("Number of sequences after \'No results\' filter:", nrow(summ)))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After \'No results\' filter", nrow(summ)))\r\n+\r\n+if(functionality == "productive"){\r\n+\tsumm = summ[summ$Functionality == "productive (see comment)" | summ$Functionality'..b'ion.filter == "leader"){\r\n+\t\tresult$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n+\t} else if(empty.region.filter == "FR1"){\r\n+\t\tresult$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n+\t} else if(empty.region.filter == "CDR1"){\r\n+\t\tresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n+\t} else if(empty.region.filter == "FR2"){\r\n+\t\tresult$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n+\t}\r\n+\t\r\n+\tif(grepl("remove", filter.unique)){\r\n+\t\tresult = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]\r\n+\t\tunique.defs = data.frame(table(result$unique.def))\r\n+\t\tunique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,]\r\n+\t\tresult = result[result$unique.def %in% unique.defs$Var1,]\r\n+\t}\r\n+\r\n+\tif(filter.unique != "remove_vjaa"){\r\n+\t\tresult$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don\'t have a class after it\r\n+\t}\r\n+\r\n+\tresult = result[!duplicated(result$unique.def),]\r\n+}\r\n+\r\n+write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\\t", quote=F,row.names=F,col.names=T)\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))\r\n+\r\n+print(paste("Number of sequences in result after unique filtering:", nrow(result)))\r\n+\r\n+if(nrow(summ) == 0){\r\n+\tstop("No data remaining after filter")\r\n+}\r\n+\r\n+result$best_match_class = gsub(",.*", "", result$best_match) #gsub so the unmatched don\'t have a class after it\r\n+\r\n+#result$past = ""\r\n+#cls = unlist(strsplit(unique.type, ","))\r\n+#for (i in 1:nrow(result)){\r\n+#\tresult[i,"past"] = paste(result[i,cls], collapse=":")\r\n+#}\r\n+\r\n+\r\n+\r\n+result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))\r\n+\r\n+result.matched = result[!grepl("unmatched", result$best_match),]\r\n+result.unmatched = result[grepl("unmatched", result$best_match),]\r\n+\r\n+result = rbind(result.matched, result.unmatched)\r\n+\r\n+result = result[!(duplicated(result$past)), ]\r\n+\r\n+result = result[,!(names(result) %in% c("past", "best_match_class"))]\r\n+\r\n+print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result)))\r\n+\r\n+unmatched = result[grepl("^unmatched", result$best_match),c("Sequence.ID", 
"chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]\r\n+\r\n+print(paste("Number of rows in result:", nrow(result)))\r\n+print(paste("Number of rows in unmatched:", nrow(unmatched)))\r\n+\r\n+matched.sequences = result[!grepl("^unmatched", result$best_match),]\r\n+\r\n+write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\\t",quote=F,row.names=F,col.names=T)\r\n+\r\n+matched.sequences.count = nrow(matched.sequences)\r\n+unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))\r\n+if(matched.sequences.count <= unmatched.sequences.count){\r\n+\tprint("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")\r\n+}\r\n+\r\n+filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))\r\n+filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))\r\n+filtering.steps[,2] = as.numeric(filtering.steps[,2])\r\n+filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2)\r\n+\r\n+write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\\t",quote=F,row.names=F,col.names=F)\r\n+\r\n+write.table(x=result, file=output, sep="\\t",quote=F,row.names=F,col.names=T)\r\n+write.table(x=unmatched, file=unmatchedfile, sep="\\t",quote=F,row.names=F,col.names=T)\r\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 mutation_column_checker.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_column_checker.py Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,27 @@
+import re
+
+mutationMatcher = re.compile("^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
+
+with open("7_V-REGION-mutation-and-AA-change-table.txt", 'r') as file_handle:
+    first = True
+    fr3_index = -1
+    for i, line in enumerate(file_handle):
+        line_split = line.split("\t")
+        if first:
+            fr3_index = line_split.index("FR3-IMGT")
+            first = False
+            continue
+
+        if len(line_split) < fr3_index:
+            continue
+        
+        fr3_data = line_split[fr3_index]
+        if len(fr3_data) > 5:
+            try:
+                test = [mutationMatcher.match(x).groups() for x in fr3_data.split("|") if x]
+            except:
+                print(line_split[1])
+                print("Something went wrong at line {line} with:".format(line=line_split[0]))
+                #print([x for x in fr3_data.split("|") if not mutationMatcher.match(x)])
+        if i % 100000 == 0:
+            print(i)
b
diff -r a4617f1d1d89 -r b6f9a640e098 naive_output.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/naive_output.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,45 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+naive.file = args[1]
+shm.file = args[2]
+output.file.ca = args[3]
+output.file.cg = args[4]
+output.file.cm = args[5]
+
+naive = read.table(naive.file, sep="\t", header=T, quote="", fill=T)
+shm.merge = read.table(shm.file, sep="\t", header=T, quote="", fill=T)
+
+
+final = merge(naive, shm.merge[,c("Sequence.ID", "best_match")], by.x="ID", by.y="Sequence.ID")
+print(paste("nrow final:", nrow(final)))
+names(final)[names(final) == "best_match"] = "Sample"
+final.numeric = final[,sapply(final, is.numeric)]
+final.numeric[is.na(final.numeric)] = 0
+final[,sapply(final, is.numeric)] = final.numeric
+
+final.ca = final[grepl("^ca", final$Sample),]
+final.cg = final[grepl("^cg", final$Sample),]
+final.cm = final[grepl("^cm", final$Sample),]
+
+if(nrow(final.ca) > 0){
+ final.ca$Replicate = 1
+}
+
+if(nrow(final.cg) > 0){
+ final.cg$Replicate = 1
+}
+
+if(nrow(final.cm) > 0){
+ final.cm$Replicate = 1
+}
+
+#print(paste("nrow final:", nrow(final)))
+#final2 = final
+#final2$Sample = gsub("[0-9]", "", final2$Sample)
+#final = rbind(final, final2)
+#final$Replicate = 1
+
+write.table(final.ca, output.file.ca, quote=F, sep="\t", row.names=F, col.names=T)
+write.table(final.cg, output.file.cg, quote=F, sep="\t", row.names=F, col.names=T)
+write.table(final.cm, output.file.cm, quote=F, sep="\t", row.names=F, col.names=T)
+
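
A hypothetical call with placeholder file names; the first input is the naive/clonality table keyed on ID, the second is the merged SHM table that supplies best_match, and the last three arguments are the per-subclass (ca/cg/cm) outputs.

    Rscript naive_output.r \
        naive_input.txt merged_sample.txt \
        naive_ca.txt naive_cg.txt naive_cm.txt
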
b
diff -r a4617f1d1d89 -r b6f9a640e098 new_imgt.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/new_imgt.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,40 @@
+args <- commandArgs(trailingOnly = TRUE)
+
+imgt.dir = args[1]
+merged.file = args[2]
+gene = args[3]
+
+merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, comment.char="", quote="")
+
+if(!("Sequence.ID" %in% names(merged))){ #change-o db
+ print("Change-O DB changing 'SEQUENCE_ID' to 'Sequence.ID'")
+ names(merged)[which(names(merged) == "SEQUENCE_ID")] = "Sequence.ID"
+}
+
+if(gene != "-"){
+ merged = merged[grepl(paste("^", gene, sep=""), merged$best_match),]
+}
+
+if("best_match" %in% names(merged)){
+ merged = merged[!grepl("unmatched", merged$best_match),]
+}
+
+nrow_dat = 0
+
+for(f in list.files(imgt.dir, pattern="*.txt$")){
+ #print(paste("filtering", f))
+ path = file.path(imgt.dir, f)
+ dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE, comment.char="")
+
+ dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,]
+
+ nrow_dat = nrow(dat)
+
+ if(nrow(dat) > 0 & grepl("^8_", f)){ #change the FR1 columns to 0 in the "8_..." file
+ dat[,grepl("^FR1", names(dat))] = 0
+ }
+
+ write.table(dat, path, quote=F, sep="\t", row.names=F, col.names=T, na="")
+}
+
+print(paste("Creating new zip for ", gene, "with", nrow_dat, "sequences"))
b
diff -r a4617f1d1d89 -r b6f9a640e098 pattern_plots.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pattern_plots.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,178 @@
+library(ggplot2)
+library(reshape2)
+library(scales)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+input.file = args[1] #the data that's get turned into the "SHM overview" table in the html report "data_sum.txt"
+
+plot1.path = args[2]
+plot1.png = paste(plot1.path, ".png", sep="")
+plot1.txt = paste(plot1.path, ".txt", sep="")
+plot1.pdf = paste(plot1.path, ".pdf", sep="")
+
+plot2.path = args[3]
+plot2.png = paste(plot2.path, ".png", sep="")
+plot2.txt = paste(plot2.path, ".txt", sep="")
+plot2.pdf = paste(plot2.path, ".pdf", sep="")
+
+plot3.path = args[4]
+plot3.png = paste(plot3.path, ".png", sep="")
+plot3.txt = paste(plot3.path, ".txt", sep="")
+plot3.pdf = paste(plot3.path, ".pdf", sep="")
+
+clean.output = args[5]
+
+dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T, row.names=1)
+
+classes = c("IGA", "IGA1", "IGA2", "IGG", "IGG1", "IGG2", "IGG3", "IGG4", "IGM", "IGE")
+xyz = c("x", "y", "z")
+new.names = c(paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep="."))
+
+names(dat) = new.names
+
+clean.dat = dat
+clean.dat = clean.dat[,c(paste(rep(classes, each=3), xyz, sep="."), paste("all", xyz, sep="."), paste("un", xyz, sep="."))]
+
+write.table(clean.dat, clean.output, quote=F, sep="\t", na="", row.names=T, col.names=NA)
+
+dat["RGYW.WRCY",] = colSums(dat[c(14,15),], na.rm=T)
+dat["TW.WA",] = colSums(dat[c(16,17),], na.rm=T)
+
+data1 = dat[c("RGYW.WRCY", "TW.WA"),]
+
+data1 = data1[,names(data1)[grepl(".z", names(data1))]]
+names(data1) = gsub("\\..*", "", names(data1))
+
+data1 = melt(t(data1))
+
+names(data1) = c("Class", "Type", "value")
+
+chk = is.na(data1$value)
+if(any(chk)){
+ data1[chk, "value"] = 0
+}
+
+data1 = data1[order(data1$Type),]
+
+write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
+
+p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL)) + ggtitle("Percentage of mutations in AID and pol eta motives")
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4"))
+#p = p + scale_colour_manual(values=c("RGYW.WRCY" = "black", "TW.WA" = "blue4"))
+png(filename=plot1.png, width=510, height=300)
+print(p)
+dev.off()
+
+ggsave(plot1.pdf, p)
+
+data2 = dat[c(1, 5:8),]
+
+data2 = data2[,names(data2)[grepl("\\.x", names(data2))]]
+names(data2) = gsub(".x", "", names(data2))
+
+data2["A/T",] = dat["Targeting of A T (%)",names(dat)[grepl("\\.z", names(dat))]]
+
+data2["G/C transitions",] = round(data2["Transitions at G C (%)",] / data2["Number of Mutations (%)",] * 100, 1)
+
+data2["mutation.at.gc",] = dat["Transitions at G C (%)",names(dat)[grepl("\\.y", names(dat))]]
+data2["G/C transversions",] = round((data2["mutation.at.gc",] - data2["Transitions at G C (%)",]) / data2["Number of Mutations (%)",] * 100, 1)
+
+data2["G/C transversions",is.nan(unlist(data2["G/C transversions",]))] = 0
+data2["G/C transversions",is.infinite(unlist(data2["G/C transversions",]))] = 0
+data2["G/C transitions",is.nan(unlist(data2["G/C transitions",]))] = 0
+data2["G/C transitions",is.infinite(unlist(data2["G/C transitions",]))] = 0
+
+data2 = melt(t(data2[c("A/T","G/C transitions","G/C transversions"),]))
+
+names(data2) = c("Class", "Type", "value")
+
+chk = is.na(data2$value)
+if(any(chk)){
+ data2[chk, "value"] = 0
+}
+
+data2 = data2[order(data2$Type),]
+
+write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
+
+p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity", colour = "black") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations") + ggtitle("Relative mutation patterns")
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
+#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
+png(filename=plot2.png, width=480, height=300)
+print(p)
+dev.off()
+
+ggsave(plot2.pdf, p)
+
+data3 = dat[c(5, 6, 8, 18:21),]
+data3 = data3[,names(data3)[grepl("\\.x", names(data3))]]
+names(data3) = gsub(".x", "", names(data3))
+
+data3["G/C transitions",] = round(data3["Transitions at G C (%)",] / (data3["C",] + data3["G",]) * 100, 1)
+
+data3["G/C transversions",] = round((data3["Targeting of G C (%)",] - data3["Transitions at G C (%)",]) / (data3["C",] + data3["G",]) * 100, 1)
+
+data3["A/T",] = round(data3["Targeting of A T (%)",] / (data3["A",] + data3["T",]) * 100, 1)
+
+data3["G/C transitions",is.nan(unlist(data3["G/C transitions",]))] = 0
+data3["G/C transitions",is.infinite(unlist(data3["G/C transitions",]))] = 0
+
+data3["G/C transversions",is.nan(unlist(data3["G/C transversions",]))] = 0
+data3["G/C transversions",is.infinite(unlist(data3["G/C transversions",]))] = 0
+
+data3["A/T",is.nan(unlist(data3["A/T",]))] = 0
+data3["A/T",is.infinite(unlist(data3["A/T",]))] = 0
+
+data3 = melt(t(data3[8:10,]))
+names(data3) = c("Class", "Type", "value")
+
+chk = is.na(data3$value)
+if(any(chk)){
+ data3[chk, "value"] = 0
+}
+
+data3 = data3[order(data3$Type),]
+
+write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
+
+p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL)) + ggtitle("Absolute mutation patterns")
+p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
+#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
+png(filename=plot3.png, width=480, height=300)
+print(p)
+dev.off()
+
+ggsave(plot3.pdf, p)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
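
A hypothetical call; data_sum.txt is the comma-separated input named in the comment above, the next three arguments are path prefixes that each get .png, .pdf and .txt appended, and the last argument is the cleaned overview table (prefix and output names are placeholders).

    Rscript pattern_plots.r \
        data_sum.txt \
        aid_motives relative_mutations absolute_mutations \
        shm_overview_clean.txt
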
b
diff -r a4617f1d1d89 -r b6f9a640e098 plot_pdf.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/plot_pdf.r Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,17 @@
+library(ggplot2)
+
+args <- commandArgs(trailingOnly = TRUE)
+print(args)
+
+input = args[1]
+outputdir = args[2]
+setwd(outputdir)
+
+load(input)
+
+print(names(pdfplots))
+
+for(n in names(pdfplots)){
+    print(paste("n:", n))
+    ggsave(pdfplots[[n]], file=n)
+}
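
A hypothetical call with placeholder names; the RData file must contain a named list "pdfplots" of ggplot objects whose names are the file names to save inside the output directory.

    Rscript plot_pdf.r shm_csr_plots.RData pdf_output_dir/
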
b
diff -r a4617f1d1d89 -r b6f9a640e098 sequence_overview.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sequence_overview.r Fri Feb 19 15:10:54 2021 +0000
[
b'@@ -0,0 +1,363 @@\n+library(reshape2)\n+\n+args <- commandArgs(trailingOnly = TRUE)\n+\n+before.unique.file = args[1]\n+merged.file = args[2]\n+outputdir = args[3]\n+gene.classes = unlist(strsplit(args[4], ","))\n+hotspot.analysis.sum.file = args[5]\n+NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")\n+NTsum.file = paste(outputdir, "ntsum.txt", sep="/")\n+main.html = "index.html"\n+empty.region.filter = args[6]\n+\n+\n+setwd(outputdir)\n+\n+before.unique = read.table(before.unique.file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\n+merged = read.table(merged.file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\n+hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")\n+\n+#before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]\n+\n+if(empty.region.filter == "leader"){\n+\tbefore.unique$seq_conc = paste(before.unique$FR1.IMGT.seq, before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n+} else if(empty.region.filter == "FR1"){\n+\tbefore.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n+} else if(empty.region.filter == "CDR1"){\n+\tbefore.unique$seq_conc = paste(before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n+} else if(empty.region.filter == "FR2"){\n+\tbefore.unique$seq_conc = paste(before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n+}\n+\n+IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]\n+IDs$best_match = as.character(IDs$best_match)\n+\n+dat = data.frame(table(before.unique$seq_conc))\n+\n+names(dat) = c("seq_conc", "Freq")\n+\n+dat$seq_conc = factor(dat$seq_conc)\n+\n+dat = dat[order(as.character(dat$seq_conc)),]\n+\n+#writing html from R...\n+get.bg.color = function(val){\n+\tif(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color\n+\t\treturn(ifelse(val,"#eafaf1","#f9ebea"))\n+\t} else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0\n+\t\treturn(ifelse(val > 0,"#eaecee","white"))\n+\t} else {\n+\t\treturn("white")\n+\t}\n+}\n+td = function(val) {\n+  return(paste("<td bgcolor=\'", get.bg.color(val), "\'>", val, "</td>", sep=""))\n+}\n+tr = function(val) { \n+\treturn(paste(c("<tr>", sapply(val, td), "</tr>"), collapse="")) \n+}\n+\n+make.link = function(id, clss, val) { \n+\tpaste("<a href=\'", clss, "_", id, ".html\'>", val, "</a>", sep="") \n+}\n+tbl = function(df) {\n+\tres = "<table border=\'1\'>"\n+\tfor(i in 1:nrow(df)){ \n+\t\tres = paste(res, tr(df[i,]), sep="")\n+\t}\n+\tres = paste(res, "</table>")\n+}\n+\n+cat("<center><img src=\'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQVQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8XlAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3Xn+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgLRLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fLorZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5VfdATueO1fdLdI0AAAAAElFTkSuQmCC\'> Please note that this tab is based on all sequences before filter unique sequences and the remove duplicates based on filters are applied. In this table only sequences occuring more than once are included. 
</center>", file=main.html, append=F)\n+cat("<table border=\'1\' class=\'pure-table pure-table-striped\'>", file=main.html, append=T)\n+\n+if(empty.region.filter == "leader"){\n+\tcat("<caption>FR1+CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)\n+} else if(empty.region.filter == "FR1"){\n+\tcat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)\n+} else if(empty.region.filter == "CDR1"){\n+\tcat("<caption>FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file='..b' & cg3.n > 0 & cg4.n > 0)\n+\t\n+\tin.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)\n+\t\n+\t#rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)\n+\trw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, ce.html, un.html)\n+\trw = c(rw, ca.n, cg.n, cm.n, ce.n, in.classes, in.ca.cg, in.ca.cg.cm, in.ca.cg.ce, in.ca.cg.cm.ce, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)\n+\t\n+\t\n+\n+\tcat(tr(rw), file=main.html, append=T)\n+\t\n+\t\n+\tfor(i in 1:nrow(allc)){ #generate html by id\n+\t\thtml = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])\n+\t\tcat(paste(html, "<br />"), file=sequence.id.page, append=T)\n+\t}\n+}\n+\n+cat("</table>", file=main.html, append=T)\n+\n+print(paste("Single sequences:", single.sequences))\n+print(paste("Sequences in multiple subclasses:", in.multiple))\n+print(paste("Multiple sequences in one subclass:", multiple.in.one))\n+print(paste("Matched with unmatched:", some.unmatched))\n+print(paste("Count that should match \'matched\' sequences:", matched))\n+\n+#ACGT overview\n+\n+#NToverview = merged[!grepl("^unmatched", merged$best_match),]\n+NToverview = merged\n+\n+if(empty.region.filter == "leader"){\n+\tNToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n+} else if(empty.region.filter == "FR1"){\n+\tNToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n+} else if(empty.region.filter == "CDR1"){\n+\tNToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n+} else if(empty.region.filter == "FR2"){\n+\tNToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n+}\n+\n+NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))\n+NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))\n+NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))\n+NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))\n+\n+#Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))\n+\n+#NToverview = rbind(NToverview, NTsum)\n+\n+NTresult = data.frame(nt=c("A", "C", "T", "G"))\n+\n+for(clazz in gene.classes){\n+\tprint(paste("class:", clazz))\n+\tNToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]\n+\tprint(paste("nrow:", nrow(NToverview.sub)))\n+\tnew.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))\n+\tnew.col.y = sum(new.col.x)\n+\tnew.col.z = round(new.col.x / new.col.y * 100, 2)\n+\t\n+\ttmp = 
names(NTresult)\n+\tNTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))\n+\tnames(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))\n+}\n+\n+NToverview.tmp = NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")]\n+\n+names(NToverview.tmp) = c("Sequence.ID", "best_match", "Sequence of the analysed region", "A", "C", "G", "T")\n+\n+write.table(NToverview.tmp, NToverview.file, quote=F, sep="\\t", row.names=F, col.names=T)\n+\n+NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]\n+\n+new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))\n+new.col.y = sum(new.col.x)\n+new.col.z = round(new.col.x / new.col.y * 100, 2)\n+\n+tmp = names(NTresult)\n+NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))\n+names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))\n+\n+names(hotspot.analysis.sum) = names(NTresult)\n+\n+hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)\n+\n+write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_clonality.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_clonality.htm Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,144 @@
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+<!--
+ /* Font Definitions */
+ @font-face
+ {font-family:Calibri;
+ panose-1:2 15 5 2 2 2 4 3 2 4;}
+@font-face
+ {font-family:Tahoma;
+ panose-1:2 11 6 4 3 5 4 4 2 4;}
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {margin-top:0in;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";}
+a:link, span.MsoHyperlink
+ {color:blue;
+ text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+ {color:purple;
+ text-decoration:underline;}
+p
+ {margin-right:0in;
+ margin-left:0in;
+ font-size:12.0pt;
+ font-family:"Times New Roman","serif";}
+p.MsoAcetate, li.MsoAcetate, div.MsoAcetate
+ {mso-style-link:"Balloon Text Char";
+ margin:0in;
+ margin-bottom:.0001pt;
+ font-size:8.0pt;
+ font-family:"Tahoma","sans-serif";}
+p.msochpdefault, li.msochpdefault, div.msochpdefault
+ {mso-style-name:msochpdefault;
+ margin-right:0in;
+ margin-left:0in;
+ font-size:12.0pt;
+ font-family:"Calibri","sans-serif";}
+p.msopapdefault, li.msopapdefault, div.msopapdefault
+ {mso-style-name:msopapdefault;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:12.0pt;
+ font-family:"Times New Roman","serif";}
+span.apple-converted-space
+ {mso-style-name:apple-converted-space;}
+span.BalloonTextChar
+ {mso-style-name:"Balloon Text Char";
+ mso-style-link:"Balloon Text";
+ font-family:"Tahoma","sans-serif";}
+.MsoChpDefault
+ {font-size:10.0pt;
+ font-family:"Calibri","sans-serif";}
+.MsoPapDefault
+ {margin-bottom:10.0pt;
+ line-height:115%;}
+@page WordSection1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+ {page:WordSection1;}
+-->
+</style>
+
+</head>
+
+<body lang=EN-US link=blue vlink=purple>
+
+<div class=WordSection1>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><b><span lang=EN-GB style='color:black'>References</span></b></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><span lang=EN-GB style='color:black'>Gupta,
+Namita T. and Vander Heiden, Jason A. and Uduman, Mohamed and Gadala-Maria,
+Daniel and Yaari, Gur and Kleinstein, Steven H. (2015). <a name="OLE_LINK106"></a><a
+name="OLE_LINK107"></a>Change-O: a toolkit for analyzing large-scale B cell
+immunoglobulin repertoire sequencing data. In<span
+class=apple-converted-space>&nbsp;</span><em>Bioinformatics, 31 (20), pp.
+3356&#8211;3358.</em><span class=apple-converted-space><i>&nbsp;</i></span>[</span><a
+href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span
+lang=EN-GB style='color:#303030'>doi:10.1093/bioinformatics/btv359</span></a><span
+lang=EN-GB style='color:black'>][</span><a
+href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span
+lang=EN-GB style='color:#303030'>Link</span></a><span lang=EN-GB
+style='color:black'>]</span></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><span lang=EN-GB style='color:black'>&nbsp;</span></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><a name="OLE_LINK110"><u><span lang=EN-GB
+style='color:black'>All, IGA, IGG, IGM and IGE tabs</span></u></a></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><span lang=EN-GB style='color:black'>These
+tabs contain information on the clonal relationship of transcripts. Clonal
+relationships are calculated with Change-O (Gupta et al, PMID: 26069265).
+Transcripts are considered clonally related if they differ by at most three
+nucleotides in their CDR3 sequence and have the same first V segment (as
+assigned by IMGT). Results are presented in a table showing each clone size and
+the number of clones or sequences with that clone size. The Change-O settings used are
+the </span><span lang=EN-GB>nucleotide Hamming distance substitution model with
+a complete distance of at most three. For clonal assignment the first gene
+segments were used, and the distances were not normalized. In case of
+asymmetric distances, the minimal distance was used.<span style='color:black'> </span></span></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><span lang=EN-GB style='color:black'>&nbsp;</span></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><u><span lang=EN-GB style='color:black'>Overlap
+tab</span></u><span lang=EN-GB style='color:black'> </span></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><span lang=EN-GB style='color:black'>This
+tab shows in which (sub)class(es) each unique analyzed region
+(based on the exact nucleotide sequence of the analyzed region and the CDR3
+nucleotide sequence) is found. It therefore indicates whether the combination
+of the exact same nucleotide sequence of the analyzed region and the CDR3
+sequence occurs in multiple (sub)classes.</span></p>
+
+<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
+text-align:justify;background:white'><span style='color:black'><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQVQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8XlAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3Xn+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgLRLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fLorZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5VfdATueO1fdLdI0AAAAAElFTkSuQmCC"> Please note that this tab is based on all
+sequences, before the 'Filter unique sequences' and 'Remove duplicates based on'
+filters are applied. In this table only sequences occurring more than once are
+included. </span></p>
+
+</div>
+
+</body>
+
+</html>
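The clone definition in the shm_clonality.htm help text above (same first V segment, at most three nucleotide differences in the CDR3) is applied in this pipeline through Change-O (see change_o/define_clones.sh). The following is only a minimal R sketch of that grouping rule for illustration; the column names v_gene and cdr3_nt are made up and this is not the Change-O implementation.

# Minimal R sketch of the clone definition described above: sequences are grouped
# when they share the first assigned V gene and their CDR3 nucleotide sequences
# differ at no more than three positions. Illustrative only; the pipeline runs Change-O.
hamming <- function(a, b) {
  if (nchar(a) != nchar(b)) return(Inf)  # unequal CDR3 lengths never merge in this sketch
  as.numeric(sum(strsplit(a, "")[[1]] != strsplit(b, "")[[1]]))
}

assign_clones <- function(df, max_dist = 3) {
  df$clone <- NA_integer_
  next_clone <- 0L
  for (i in seq_len(nrow(df))) {
    candidates <- which(df$v_gene == df$v_gene[i] & !is.na(df$clone))
    dists <- vapply(df$cdr3_nt[candidates], hamming, numeric(1), b = df$cdr3_nt[i])
    hits <- candidates[dists <= max_dist]
    if (length(hits) > 0) {
      df$clone[i] <- df$clone[hits[1]]   # join the first existing clone within the cut-off
    } else {
      next_clone <- next_clone + 1L      # otherwise start a new clone
      df$clone[i] <- next_clone
    }
  }
  df
}

toy <- data.frame(v_gene  = c("IGHV1-18", "IGHV1-18", "IGHV3-23"),
                  cdr3_nt = c("TGTGCGAGAGAT", "TGTGCGAGAGAA", "TGTGCGAGAGAT"),
                  stringsAsFactors = FALSE)
assign_clones(toy)  # rows 1 and 2 end up in the same clone, row 3 in its own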
b
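The Overlap tab that shm_clonality.htm describes is generated by sequence_overview.r (the first diff in this changeset). Below is a small R sketch of the underlying counting, reusing the seq_conc and best_match column names from that script; the toy data are illustrative.

# R sketch of the Overlap-tab idea: for every unique analysed-region nucleotide
# sequence, list the (sub)classes in which it occurs and keep only sequences
# seen more than once. Not the pipeline code itself.
overlap_table <- function(df) {
  classes <- aggregate(best_match ~ seq_conc, data = df,
                       FUN = function(x) paste(sort(unique(x)), collapse = ","))
  classes$n <- as.vector(table(df$seq_conc)[classes$seq_conc])
  classes[classes$n > 1, ]
}

toy <- data.frame(seq_conc   = c("ACGTTGCA", "ACGTTGCA", "GGGGCCCC"),
                  best_match = c("IGG1", "IGA1", "IGM"),
                  stringsAsFactors = FALSE)
overlap_table(toy)  # ACGTTGCA occurs twice, once as IGG1 and once as IGA1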
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_csr.htm Fri Feb 19 15:10:54 2021 +0000
b
@@ -0,0 +1,95 @@
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+<!--
+ /* Font Definitions */
+ @font-face
+ {font-family:Calibri;
+ panose-1:2 15 5 2 2 2 4 3 2 4;}
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {margin-top:0in;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";}
+a:link, span.MsoHyperlink
+ {color:blue;
+ text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+ {color:purple;
+ text-decoration:underline;}
+span.apple-converted-space
+ {mso-style-name:apple-converted-space;}
+.MsoChpDefault
+ {font-family:"Calibri","sans-serif";}
+.MsoPapDefault
+ {margin-bottom:10.0pt;
+ line-height:115%;}
+@page WordSection1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+ {page:WordSection1;}
+-->
+</style>
+
+</head>
+
+<body lang=EN-US link=blue vlink=purple>
+
+<div class=WordSection1>
+
+<p class=MsoNormalCxSpFirst style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>The
+graphs in this tab give insight into the subclass distribution of IGG and IGA
+transcripts. </span><span lang=EN-GB style='font-size:12.0pt;line-height:115%;
+font-family:"Times New Roman","serif"'>Human C&#181;, C&#945;, C&#947; and C&#949;
+constant genes are assigned using a </span><span lang=EN-GB style='font-size:
+12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>custom script
+specifically designed for human (sub)class assignment in repertoire data, as
+described in van Schouwenburg and IJspeert et al, submitted for publication. In
+this script the reference sequences for the subclasses are divided into
+8-nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are
+then individually aligned, in the right order, to each input sequence. The
+percentage of chunks identified in each rearrangement is reported as the
+'chunk hit percentage'. </span><span lang=EN-GB style='font-size:12.0pt;
+line-height:115%;font-family:"Times New Roman","serif"'>The C&#945; and C&#947;
+subclasses are highly homologous and differ in only a few nucleotides. To assign
+subclasses the </span><span lang=EN-GB style='font-size:12.0pt;line-height:
+115%;font-family:"Times New Roman","serif"'>'nt hit percentage' is calculated.
+This percentage indicates how well the chunks covering the subclass-specific
+nucleotides match the different subclasses. </span><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Information
+on the normal distribution of subclasses in healthy individuals of different ages
+can be found in IJspeert and van Schouwenburg et al, PMID: 27799928.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK100"></a><a
+name="OLE_LINK99"></a><a name="OLE_LINK25"><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>IGA
+subclass distribution</span></u></a></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Pie
+chart showing the relative distribution of IGA1 and IGA2 transcripts in the
+sample.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>IGG
+subclass distribution</span></u></p>
+
+<p class=MsoNormalCxSpLast style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Pie
+chart showing the relative distribution of IGG1, IGG2, IGG3 and IGG4
+transcripts in the sample.</span></p>
+
+</div>
+
+</body>
+
+</html>
b
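The chunk-based (sub)class assignment described in the shm_csr.htm help text above is implemented in gene_identification.py. Below is a minimal R sketch of the 'chunk hit percentage' idea only, using plain substring matching and made-up sequences rather than the ordered chunk alignment the script performs.

# R sketch: split a constant-region reference into 8 nt chunks overlapping by 4 nt
# and report the percentage of chunks found back in an input sequence. This is a
# simplification of gene_identification.py, which aligns the chunks in order.
make_chunks <- function(ref, size = 8, step = 4) {
  starts <- seq(1, nchar(ref) - size + 1, by = step)
  substring(ref, starts, starts + size - 1)
}

chunk_hit_percentage <- function(input_seq, ref) {
  chunks <- make_chunks(ref)
  100 * mean(vapply(chunks, grepl, logical(1), x = input_seq, fixed = TRUE))
}

ref  <- "ACGTACGGTTCAGCCTAAGGCTAGCTAGGT"  # made-up reference, not a real constant region
read <- substr(ref, 1, 22)                # a read covering only part of the reference
chunk_hit_percentage(read, ref)           # fraction of 8 nt chunks recovered, as a percentage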
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_csr.py Fri Feb 19 15:10:54 2021 +0000
[
b'@@ -0,0 +1,508 @@\n+import argparse\n+import logging\n+import sys\n+import os\n+import re\n+\n+from collections import defaultdict\n+\n+def main():\n+\tparser = argparse.ArgumentParser()\n+\tparser.add_argument("--input", help="The \'7_V-REGION-mutation-and-AA-change-table\' and \'10_V-REGION-mutation-hotspots\' merged together, with an added \'best_match\' annotation")\n+\tparser.add_argument("--genes", help="The genes available in the \'best_match\' column")\n+\tparser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=[\'leader\', \'FR1\', \'CDR1\', \'FR2\'])\n+\tparser.add_argument("--output", help="Output file")\n+\n+\targs = parser.parse_args()\n+\n+\tinfile = args.input\n+\tgenes = str(args.genes).split(",")\n+\tempty_region_filter = args.empty_region_filter\n+\toutfile = args.output\n+\n+\tgenedic = dict()\n+\n+\tmutationdic = dict()\n+\tmutationMatcher = re.compile("^(.)(\\d+).(.),?[ ]?(.)?(\\d+)?.?(.)?(.?.?.?.?.?)?")\n+\tmutationMatcher = re.compile("^([actg])(\\d+).([actg]),?[ ]?([A-Z])?(\\d+)?.?([A-Z])?(.*)?")\n+\tmutationMatcher = re.compile("^([actg])(\\d+).([actg]),?[ ]?([A-Z])?(\\d+)?[>]?([A-Z;])?(.*)?")\n+\tmutationMatcher = re.compile("^([nactg])(\\d+).([nactg]),?[ ]?([A-Z])?(\\d+)?[>]?([A-Z;])?(.*)?")\n+\tNAMatchResult = (None, None, None, None, None, None, \'\')\n+\tgeneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}\n+\tlinecount = 0\n+\n+\tIDIndex = 0\n+\tbest_matchIndex = 0\n+\tfr1Index = 0\n+\tcdr1Index = 0\n+\tfr2Index = 0\n+\tcdr2Index = 0\n+\tfr3Index = 0\n+\tfirst = True\n+\tIDlist = []\n+\tmutationList = []\n+\tmutationListByID = {}\n+\tcdr1LengthDic = {}\n+\tcdr2LengthDic = {}\n+\n+\tfr1LengthDict = {}\n+\tfr2LengthDict = {}\n+\tfr3LengthDict = {}\n+\n+\tcdr1LengthIndex = 0\n+\tcdr2LengthIndex = 0\n+\n+\tfr1SeqIndex = 0\n+\tfr2SeqIndex = 0\n+\tfr3SeqIndex = 0\n+\n+\ttandem_sum_by_class = defaultdict(int)\n+\texpected_tandem_sum_by_class = defaultdict(float)\n+\n+\twith open(infile, \'ru\') as i:\n+\t\tfor line in i:\n+\t\t\tif first:\n+\t\t\t\tlinesplt = line.split("\\t")\n+\t\t\t\tIDIndex = linesplt.index("Sequence.ID")\n+\t\t\t\tbest_matchIndex = linesplt.index("best_match")\n+\t\t\t\tfr1Index = linesplt.index("FR1.IMGT")\n+\t\t\t\tcdr1Index = linesplt.index("CDR1.IMGT")\n+\t\t\t\tfr2Index = linesplt.index("FR2.IMGT")\n+\t\t\t\tcdr2Index = linesplt.index("CDR2.IMGT")\n+\t\t\t\tfr3Index = linesplt.index("FR3.IMGT")\n+\t\t\t\tcdr1LengthIndex = linesplt.index("CDR1.IMGT.length")\n+\t\t\t\tcdr2LengthIndex = linesplt.index("CDR2.IMGT.length")\n+\t\t\t\tfr1SeqIndex = linesplt.index("FR1.IMGT.seq")\n+\t\t\t\tfr2SeqIndex = linesplt.index("FR2.IMGT.seq")\n+\t\t\t\tfr3SeqIndex = linesplt.index("FR3.IMGT.seq")\n+\t\t\t\tfirst = False\n+\t\t\t\tcontinue\n+\t\t\tlinecount += 1\n+\t\t\tlinesplt = line.split("\\t")\n+\t\t\tID = linesplt[IDIndex]\n+\t\t\tgenedic[ID] = linesplt[best_matchIndex]\n+\t\t\t\n+\t\t\tmutationdic[ID + "_FR1"] = []\n+\t\t\tif len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader":\n+\t\t\t\tmutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x]\n+\n+\t\t\tmutationdic[ID + "_CDR1"] = []\n+\t\t\tif len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]:\n+\t\t\t\tmutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]\n+\n+\t\t\tmutationdic[ID + "_FR2"] = []\n+\t\t\tif len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", 
"CDR1"]:\n+\t\t\t\tmutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]\n+\n+\t\t\tmutationdic[ID + "_CDR2"] = []\n+\t\t\tif len(linesplt[cdr2Index]) > 5:\n+\t\t\t\tmutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]\n+\t\t\t\n+\t\t\tmutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]\n+\n+\t\t\tmutationdic[ID + "_FR3"] = []\n+\t\t\tif len(linesplt[fr3Index]) > 5:\n+\t\t\t\tmutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]\n+\t\t\t\t\n+\t\t\tmutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"'..b'utation_in_TW])\n+\n+\t\t\t\tif in_how_many_motifs > 0:\n+\t\t\t\t\tRGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs\n+\t\t\t\t\tWRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs\n+\t\t\t\t\tWACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs\n+\t\t\t\t\tTWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs\n+\t\t\t\n+\t\t\tmutations_in_motifs_file = os.path.join(os.path.dirname(os.path.abspath(infile)), "mutation_in_motifs.txt")\n+\t\t\tif not os.path.exists(mutation_by_id_file):\n+\t\t\t\twith open(mutations_in_motifs_file, \'w\') as out_handle:\n+\t\t\t\t\tout_handle.write("{0}\\n".format("\\t".join([\n+\t\t\t\t\t\t"Sequence.ID",\n+\t\t\t\t\t\t"mutation_position",\n+\t\t\t\t\t\t"region",\n+\t\t\t\t\t\t"from_nt",\n+\t\t\t\t\t\t"to_nt",\n+\t\t\t\t\t\t"mutation_position_AA",\n+\t\t\t\t\t\t"from_AA",\n+\t\t\t\t\t\t"to_AA",\n+\t\t\t\t\t\t"motif",\n+\t\t\t\t\t\t"motif_start_nt",\n+\t\t\t\t\t\t"motif_end_nt",\n+\t\t\t\t\t\t"rest"\n+\t\t\t\t\t])))\n+\n+\t\t\twith open(mutations_in_motifs_file, \'a\') as out_handle:\n+\t\t\t\tmotif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW}\n+\t\t\t\tfor mutation in mutationList:\n+\t\t\t\t\tfrm, where, to, AAfrm, AAwhere, AAto, junk = mutation\n+\t\t\t\t\tfor motif in motif_dic.keys():\n+\t\t\t\t\t\t\t\n+\t\t\t\t\t\tfor start, end, region in motif_dic[motif]:\n+\t\t\t\t\t\t\tif start <= int(where) <= end:\n+\t\t\t\t\t\t\t\tout_handle.write("{0}\\n".format(\n+\t\t\t\t\t\t\t\t\t"\\t".join([\n+\t\t\t\t\t\t\t\t\t\tID,\n+\t\t\t\t\t\t\t\t\t\twhere,\n+\t\t\t\t\t\t\t\t\t\tregion,\n+\t\t\t\t\t\t\t\t\t\tfrm,\n+\t\t\t\t\t\t\t\t\t\tto,\n+\t\t\t\t\t\t\t\t\t\tstr(AAwhere),\n+\t\t\t\t\t\t\t\t\t\tstr(AAfrm),\n+\t\t\t\t\t\t\t\t\t\tstr(AAto),\n+\t\t\t\t\t\t\t\t\t\tmotif,\n+\t\t\t\t\t\t\t\t\t\tstr(start),\n+\t\t\t\t\t\t\t\t\t\tstr(end),\n+\t\t\t\t\t\t\t\t\t\tstr(junk)\n+\t\t\t\t\t\t\t\t\t])\n+\t\t\t\t\t\t\t\t))\n+\n+\n+\n+\tdef mean(lst):\n+\t\treturn (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0\n+\n+\n+\tdef median(lst):\n+\t\tlst = sorted(lst)\n+\t\tl = len(lst)\n+\t\tif l == 0:\n+\t\t\treturn 0\n+\t\tif l == 1:\n+\t\t\treturn lst[0]\n+\t\t\t\n+\t\tl = int(l / 2)\n+\t\t\n+\t\tif len(lst) % 2 == 0:\n+\t\t\treturn float(lst[l] + lst[(l - 1)]) / 2.0\n+\t\telse:\n+\t\t\treturn lst[l]\n+\n+\tfuncs = {"mean": mean, "median": median, "sum": sum}\n+\n+\tdirectory = outfile[:outfile.rfind("/") + 1]\n+\tvalue = 0\n+\tvaluedic = dict()\n+\n+\tfor fname in funcs.keys():\n+\t\tfor gene in genes:\n+\t\t\twith open(directory + gene + "_" + fname + "_value.txt", \'r\') as v:\n+\t\t\t\tvaluedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())\n+\t\twith open(directory + "all_" + fname + "_value.txt", \'r\') as 
v:\n+\t\t\tvaluedic["total_" + fname] = float(v.readlines()[0].rstrip())\n+\t\t\n+\n+\tdef get_xyz(lst, gene, f, fname):\n+\t\tx = round(round(f(lst), 1))\n+\t\ty = valuedic[gene + "_" + fname]\n+\t\tz = str(round(x / float(y) * 100, 1)) if y != 0 else "0"\n+\t\treturn (str(x), str(y), z)\n+\n+\tdic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}\n+\tarr = ["RGYW", "WRCY", "WA", "TW"]\n+\n+\tfor fname in funcs.keys():\n+\t\tfunc = funcs[fname]\n+\t\tfoutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"\n+\t\twith open(foutfile, \'w\') as o:\n+\t\t\tfor typ in arr:\n+\t\t\t\to.write(typ + " (%)")\n+\t\t\t\tcurr = dic[typ]\n+\t\t\t\tfor gene in genes:\n+\t\t\t\t\tgeneMatcher = geneMatchers[gene]\n+\t\t\t\t\tif valuedic[gene + "_" + fname] is 0:\n+\t\t\t\t\t\to.write(",0,0,0")\n+\t\t\t\t\telse:\n+\t\t\t\t\t\tx, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)\n+\t\t\t\t\t\to.write("," + x + "," + y + "," + z)\n+\t\t\t\tx, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)\n+\t\t\t\t#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)\n+\t\t\t\to.write("," + x + "," + y + "," + z + "\\n")\n+\n+\n+\t# for testing\n+\tseq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"\n+\twith open(seq_motif_file, \'w\') as o:\n+\t\to.write("ID\\tRGYW\\tWRCY\\tWA\\tTW\\n")\n+\t\tfor ID in IDlist:\n+\t\t\t#o.write(ID + "\\t" + str(round(RGYWCount[ID], 2)) + "\\t" + str(round(WRCYCount[ID], 2)) + "\\t" + str(round(WACount[ID], 2)) + "\\t" + str(round(TWCount[ID], 2)) + "\\n")\n+\t\t\to.write(ID + "\\t" + str(RGYWCount[ID]) + "\\t" + str(WRCYCount[ID]) + "\\t" + str(WACount[ID]) + "\\t" + str(TWCount[ID]) + "\\n")\n+\n+if __name__ == "__main__":\n+\tmain()\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_csr.r Fri Feb 19 15:10:54 2021 +0000
[
b'@@ -0,0 +1,561 @@\n+library(data.table)\n+library(ggplot2)\n+library(reshape2)\n+\n+args <- commandArgs(trailingOnly = TRUE)\n+\n+input = args[1]\n+genes = unlist(strsplit(args[2], ","))\n+outputdir = args[3]\n+empty.region.filter = args[4]\n+setwd(outputdir)\n+\n+#dat = read.table(input, header=T, sep="\\t", fill=T, stringsAsFactors=F)\n+\n+dat = data.frame(fread(input, sep="\\t", header=T, stringsAsFactors=F)) #fread because read.table suddenly skips certain rows...\n+\n+if(length(dat$Sequence.ID) == 0){\n+  setwd(outputdir)\n+  result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))\n+  row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)")\n+  write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)\n+  transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))\n+  row.names(transitionTable) = c("A", "C", "G", "T")\n+  transitionTable["A","A"] = NA\n+  transitionTable["C","C"] = NA\n+  transitionTable["G","G"] = NA\n+  transitionTable["T","T"] = NA\n+\n+  write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)\n+  cat("0", file="n.txt")\n+  stop("No data")\n+}\n+\n+cleanup_columns = c("FR1.IMGT.c.a",\n+\t\t\t\t\t"FR2.IMGT.g.t",\n+\t\t\t\t\t"CDR1.IMGT.Nb.of.nucleotides",\n+\t\t\t\t\t"CDR2.IMGT.t.a",\n+\t\t\t\t\t"FR1.IMGT.c.g",\n+\t\t\t\t\t"CDR1.IMGT.c.t",\n+\t\t\t\t\t"FR2.IMGT.a.c",\n+\t\t\t\t\t"FR2.IMGT.Nb.of.mutations",\n+\t\t\t\t\t"FR2.IMGT.g.c",\n+\t\t\t\t\t"FR2.IMGT.a.g",\n+\t\t\t\t\t"FR3.IMGT.t.a",\n+\t\t\t\t\t"FR3.IMGT.t.c",\n+\t\t\t\t\t"FR2.IMGT.g.a",\n+\t\t\t\t\t"FR3.IMGT.c.g",\n+\t\t\t\t\t"FR1.IMGT.Nb.of.mutations",\n+\t\t\t\t\t"CDR1.IMGT.g.a",\n+\t\t\t\t\t"CDR1.IMGT.t.g",\n+\t\t\t\t\t"CDR1.IMGT.g.c",\n+\t\t\t\t\t"CDR2.IMGT.Nb.of.nucleotides",\n+\t\t\t\t\t"FR2.IMGT.a.t",\n+\t\t\t\t\t"CDR1.IMGT.Nb.of.mutations",\n+\t\t\t\t\t"CDR3.IMGT.Nb.of.nucleotides",\n+\t\t\t\t\t"CDR1.IMGT.a.g",\n+\t\t\t\t\t"FR3.IMGT.a.c",\n+\t\t\t\t\t"FR1.IMGT.g.a",\n+\t\t\t\t\t"FR3.IMGT.a.g",\n+\t\t\t\t\t"FR1.IMGT.a.t",\n+\t\t\t\t\t"CDR2.IMGT.a.g",\n+\t\t\t\t\t"CDR2.IMGT.Nb.of.mutations",\n+\t\t\t\t\t"CDR2.IMGT.g.t",\n+\t\t\t\t\t"CDR2.IMGT.a.c",\n+\t\t\t\t\t"CDR1.IMGT.t.c",\n+\t\t\t\t\t"FR3.IMGT.g.c",\n+\t\t\t\t\t"FR1.IMGT.g.t",\n+\t\t\t\t\t"FR3.IMGT.g.t",\n+\t\t\t\t\t"CDR1.IMGT.a.t",\n+\t\t\t\t\t"FR1.IMGT.a.g",\n+\t\t\t\t\t"FR3.IMGT.a.t",\n+\t\t\t\t\t"FR3.IMGT.Nb.of.nucleotides",\n+\t\t\t\t\t"FR2.IMGT.t.c",\n+\t\t\t\t\t"CDR2.IMGT.g.a",\n+\t\t\t\t\t"FR2.IMGT.t.a",\n+\t\t\t\t\t"CDR1.IMGT.t.a",\n+\t\t\t\t\t"FR2.IMGT.t.g",\n+\t\t\t\t\t"FR3.IMGT.t.g",\n+\t\t\t\t\t"FR2.IMGT.Nb.of.nucleotides",\n+\t\t\t\t\t"FR1.IMGT.t.a",\n+\t\t\t\t\t"FR1.IMGT.t.g",\n+\t\t\t\t\t"FR3.IMGT.c.t",\n+\t\t\t\t\t"FR1.IMGT.t.c",\n+\t\t\t\t\t"CDR2.IMGT.a.t",\n+\t\t\t\t\t"FR2.IMGT.c.t",\n+\t\t\t\t\t"CDR1.IMGT.g.t",\n+\t\t\t\t\t"CDR2.IMGT.t.g",\n+\t\t\t\t\t"FR1.IMGT.Nb.of.nucleotides",\n+\t\t\t\t\t"CDR1.IMGT.c.g",\n+\t\t\t\t\t"CDR2.IMGT.t.c",\n+\t\t\t\t\t"FR3.IMGT.g.a",\n+\t\t\t\t\t"CDR1.IMGT.a.c",\n+\t\t\t\t\t"FR2.IMGT.c.a",\n+\t\t\t\t\t"FR3.IMGT.Nb.of.mutations",\n+\t\t\t\t\t"FR2.IMGT.c.g",\n+\t\t\t\t\t"CDR2.IMGT.g.c",\n+\t\t\t\t\t"FR1.IMGT.g.c",\n+\t\t\t\t\t"CDR2.IMGT.c.t",\n+\t\t\t\t\t"FR3.IMGT.c.a",\n+\t\t\t\t\t"CDR1.IMGT.c.a",\n+\t\t\t\t\t"CDR2.IMGT.c.g",\n+\t\t\t\t\t"CDR2.IMGT.c.a",\n+\t\t\t\t\t"FR1.IMGT.c.t",\n+\t\t\t\t\t"FR1.IMGT.Nb.of.silent.mutations",\n+\t\t\t\t\t"FR2.IMGT.Nb.of.silent.mutations",\n+\t\t\t\t\t"FR3.IMGT.Nb.of.silent.mutations",\n+\t
\t\t\t\t"FR1.IMGT.Nb.of.nonsilent.mutations",\n+\t\t\t\t\t"FR2.IMGT.Nb.of.nonsilent.mutations",\n+\t\t\t\t\t"FR3.IMGT.Nb.of.nonsilent.mutations")\n+\n+print("Cleaning up columns")\n+\n+for(col in cleanup_columns){\n+  dat[,col] = gsub("\\\\(.*\\\\)", "", dat[,col])\n+  #dat[dat[,col] == "",] = "0"\n+  dat[,col] = as.numeric(dat[,col])\n+  dat[is.na(dat[,col]),col] = 0\n+}\n+\n+regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3")\n+if(empty.region.filter == "FR1") {\n+\tregions = c("CDR1", "FR2", "CDR2", "FR3")\n+} else if (empty.region.filter == "CDR1") {\n+\tregions = c("FR2", "CDR2", "FR3")\n+} else if (empty.region.filter == "FR2") {\n+\tregions = c("CDR2", "FR3")\n+}\n+\n+pdfplots = list() #save() this later to create the pdf plots in another script (maybe avoids the "address (nil), cause memory not mapped")\n+\n+sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }\n+\n+print("aggregating data into new columns")\n+\n+VRegionMutat'..b'ur="black"))\n+p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))\n+p = p + scale_colour_manual(guide = guide_legend(title = "Subclass"), values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))\n+\n+png(filename="scatter.png")\n+print(p)\n+dev.off()\n+\n+pdfplots[["scatter.pdf"]] <- p\n+\n+write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+print("Plotting frequency ranges plot")\n+\n+dat$best_match_class = substr(dat$best_match, 0, 3)\n+freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")\n+dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)\n+\n+frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match_class")])\n+\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])\n+\n+frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")\n+\n+frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n+\n+p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))\n+p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))\n+p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(guide = guide_legend(title = "Class"), values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))\n+\n+png(filename="frequency_ranges.png")\n+print(p)\n+dev.off()\n+\n+pdfplots[["frequency_ranges.pdf"]] <- p\n+\n+save(pdfplots, file="pdfplots.RData")\n+\n+frequency_bins_data_by_class = frequency_bins_data\n+\n+frequency_bins_data_by_class = frequency_bins_data_by_class[order(frequency_bins_data_by_class$best_match_class, frequency_bins_data_by_class$frequency_bins),]\n+\n+frequency_bins_data_by_class$frequency_bins = gsub("-", " to ", 
frequency_bins_data_by_class$frequency_bins)\n+frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "20", c("frequency_bins")] = "20 or higher"\n+frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "0", c("frequency_bins")] = "0 or lower"\n+\n+write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "best_match_class", "frequency_bins")])\n+\n+frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match")])\n+\n+frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match")\n+\n+frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n+\n+frequency_bins_data = frequency_bins_data[order(frequency_bins_data$best_match, frequency_bins_data$frequency_bins),]\n+frequency_bins_data$frequency_bins = gsub("-", " to ", frequency_bins_data$frequency_bins)\n+frequency_bins_data[frequency_bins_data$frequency_bins == "20", c("frequency_bins")] = "20 or higher"\n+frequency_bins_data[frequency_bins_data$frequency_bins == "0", c("frequency_bins")] = "0 or lower"\n+\n+write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_csr.xml Fri Feb 19 15:10:54 2021 +0000
b
b'@@ -0,0 +1,240 @@\n+<tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0">\r\n+\t<description></description>\r\n+\t<requirements>\r\n+\t\t<requirement type="package" version="2.7">python</requirement>\r\n+\t\t<requirement type="package" version="1.16.0">numpy</requirement>\r\n+\t\t<requirement type="package" version="1.2.0">xlrd</requirement>\r\n+\t\t<requirement type="package" version="3.0.0">r-ggplot2</requirement>\r\n+\t\t<requirement type="package" version="1.4.3">r-reshape2</requirement>\r\n+\t\t<requirement type="package" version="0.5.0">r-scales</requirement>\r\n+\t\t<requirement type="package" version="3.4_5">r-seqinr</requirement>\r\n+\t\t<requirement type="package" version="1.11.4">r-data.table</requirement>\r\n+\t</requirements>\r\n+\t<command interpreter="bash">\r\n+\t\t#if str ( $filter_unique.filter_unique_select ) == "remove":\r\n+\t\t\twrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast\r\n+\t\t#else:\r\n+\t\t\twrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast\r\n+\t\t#end if\r\n+\t</command>\r\n+\t<inputs>\r\n+\t\t<param name="in_file" type="data" format="data" label="IMGT zip file to be analysed" />\r\n+\t\t<param name="empty_region_filter" type="select" label="Sequence starts at" help="" >\r\n+\t\t\t<option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>\r\n+\t\t\t<option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option>\r\n+\t\t\t<option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option>\r\n+\t\t\t<option value="FR2">FR2: include CDR2,FR3 in filters</option>\r\n+\t\t</param>\r\n+\t\t<param name="functionality" type="select" label="Functionality filter" help="" >\r\n+\t\t\t<option value="productive" selected="true">Productive (Productive and Productive see comment)</option>\r\n+\t\t\t<option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option>\r\n+\t\t\t<option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option>\r\n+\t\t</param>\r\n+\t\t<conditional name="filter_unique">\r\n+\t\t\t<param name="filter_unique_select" type="select" label="Filter unique sequences" help="See below for an example.">\r\n+\t\t\t\t<option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option>\r\n+\t\t\t\t<option value="remove_vjaa">Remove uniques (Based on V+J+CDR3 (AA))</option>\r\n+\t\t\t\t<option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>\r\n+\t\t\t\t<option value="no">No</option>\r\n+\t\t\t</param>\r\n+\t\t\t<when value="remove">\r\n+\t\t\t\t<param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/>\r\n+\t\t\t</when>\r\n+\t\t\t<when value="keep"></when>\r\n+\t\t\t<when value="no"></when>\r\n+\t\t</conditional>\r\n+\t\t<param name="unique" type="select" 
label="Remove duplicates based on" help="" >\r\n+\t\t\t<option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>\r\n+\t\t\t<option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>\r\n+\t\t\t<option value="CDR3.IMGT.AA,best_match_class">CDR3 (AA), C region</option>\r\n+\t\t\t<option value="CDR3.IMGT.AA">CDR3 (AA)</option>\r\n+\t\t\t\r\n+\t\t\t<option value="VGene,CDR3.IMGT.seq,best_match_class">Top.V.Gene, CDR3 (nt), C region</option>\r\n+\t\t\t<option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3 (nt)</option>\r\n+\t\t\t<option value="CDR3.IMGT.seq,best_match_class">CDR3 (nt), C region</option>\r\n+\t\t\t<option value="CDR3.IMGT.seq">CDR3 (nt)'..b'either the \xe2\x80\x9cremove unique filter\xe2\x80\x9d or the \xe2\x80\x9ckeep unique filter\xe2\x80\x9d\r\n+\r\n++--------------------------+\r\n+|       unique filter      |\r\n++--------+--------+--------+\r\n+| values | remove | keep   |\r\n++--------+--------+--------+\r\n+|   A    |   A    |   A    |\r\n++--------+--------+--------+\r\n+|   A    |   B    |   B    |\r\n++--------+--------+--------+\r\n+|   B    |   D    |   C    |\r\n++--------+--------+--------+\r\n+|   B    |        |   D    |\r\n++--------+--------+--------+\r\n+|   C    |        |        |\r\n++--------+--------+--------+\r\n+|   D    |        |        |\r\n++--------+--------+--------+\r\n+|   D    |        |        |\r\n++--------+--------+--------+\r\n+\r\n+-----\r\n+ \r\n+**Remove duplicates based on**\r\n+\r\n+Allows the selection of a single sequence per clone. Different definitions of a clone can be chosen. \r\n+\r\n+.. class:: infomark\r\n+\r\n+Note: The first sequence (in the data set) of each clone is always included in the analysis. When the first matched sequence is unmatched (no subclass assigned) the first matched sequence will be included. This means that altering the data order (by for instance sorting) can change the sequence which is included in the analysis and therefore slightly influences the results. \r\n+\r\n+-----\r\n+\r\n+**Human Class/Subclass filter**\r\n+\r\n+.. class:: warningmark\r\n+\r\n+Note: This filter should only be applied when analysing human IGH data in which a (sub)class specific sequence is present. Otherwise please select the do not assign (sub)class option to prevent errors when running the pipeline. \r\n+\r\n+The class percentage is based on the \xe2\x80\x98chunk hit percentage\xe2\x80\x99 (see below). The subclass percentage is based on the \xe2\x80\x98nt hit percentage\xe2\x80\x99 (see below).\r\n+\r\n+The SHM & CSR pipeline identifies human C\xc2\xb5, C\xce\xb1, C\xce\xb3 and C\xce\xb5 constant genes by dividing the reference sequences for the subclasses (NG_001019) in 8 nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are then individually aligned in the right order to each input sequence. This alignment is used to calculate the chunck hit percentage and the nt hit percentage. \r\n+\r\n+*Chunk hit percentage*: The percentage of the chunks that is aligned \r\n+\r\n+*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% \xe2\x80\x98nt hit percentage\xe2\x80\x99 which means that 5 out of 7 subclass specific nucleotides for C\xce\xb1 or 6 out of 8 subclass specific nucleotides of C\xce\xb3 should match with the specific subclass. 
\r\n+The option \xe2\x80\x9c>25% class\xe2\x80\x9d can be chosen when you only are interested in the class (C\xce\xb1/C\xce\xb3/C\xc2\xb5/C\xc9\x9b) of  your sequences and the length of your sequence is not long enough to assign the subclasses.\r\n+\r\n+-----\r\n+\r\n+**Output new IMGT archives per class into your history?**\r\n+\r\n+If yes is selected, additional output files (one for each class) will be added to the history which contain information of the sequences that passed the selected filtering criteria. These files are in the same format as the IMGT/HighV-QUEST output files and therefore are also compatible with many other analysis programs, such as the Immune repertoire pipeline.  \r\n+\r\n+-----\r\n+\r\n+**Execute**\r\n+\r\n+Upon pressing execute a new analysis is added to your history (right side of the page). Initially this analysis will be grey, after initiating the analysis colour of the analysis in the history will change to yellow. When the analysis is finished it will turn green in the history. Now the analysis can be opened by clicking on the eye icon on the analysis of interest. When an analysis turns red an error has occurred when running the analysis. If you click on the analysis title additional information can be found on the analysis. In addition a bug icon appears. Here more information on the error can be found.\r\n+\r\n+]]>\r\n+\t</help>\r\n+\t<citations>\r\n+\t\t<citation type="doi">10.1093/nar/gks457</citation>\r\n+\t\t<citation type="doi">10.1093/bioinformatics/btv359</citation>\r\n+\t</citations>\r\n+</tool>\r\n'
b
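The 'Filter unique sequences' example table in the shm_csr.xml help text above (remove vs keep) boils down to a simple frequency filter. Below is a minimal R sketch of the two modes on the same example values, assuming the default group size of 2; the real filter operates on the full nucleotide sequence plus C region as described in the help text.

# R sketch of the two unique-sequence filter modes from the help table above:
# "remove" keeps one representative of every value occurring at least
# `clone_count` times (default 2) and drops singletons; "keep" keeps one
# representative of every value, including singletons.
filter_unique <- function(values, mode = c("remove", "keep"), clone_count = 2) {
  mode <- match.arg(mode)
  counts <- table(values)
  if (mode == "remove") names(counts[counts >= clone_count]) else names(counts)
}

vals <- c("A", "A", "B", "B", "C", "D", "D")   # the example values from the help table
filter_unique(vals, "remove")  # "A" "B" "D"
filter_unique(vals, "keep")    # "A" "B" "C" "D"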
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/.gitattributes
--- a/shm_csr/.gitattributes Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,2 +0,0 @@
-# Auto detect text files and perform LF normalization
-* text=auto
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/.gitignore
--- a/shm_csr/.gitignore Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,4 +0,0 @@
-
-shm_csr\.tar\.gz
-
-\.vscode/settings\.json
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/LICENSE
--- a/shm_csr/LICENSE Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2019 david
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/README.md
--- a/shm_csr/README.md Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,13 +0,0 @@
-# SHM CSR
-
-Somatic hypermutation and class switch recombination pipeline.  
-The docker version can be found [here](https://github.com/ErasmusMC-Bioinformatics/ARGalaxy-docker).
-
-# Dependencies
---------------------
-[Python 2.7](https://www.python.org/)  
-[Change-O](https://changeo.readthedocs.io/en/version-0.4.4/)  
-[Baseline](http://selection.med.yale.edu/baseline/)  
-[R data.table](https://cran.r-project.org/web/packages/data.table/data.table.pdf)
-[R ggplot2](https://cran.r-project.org/web/packages/ggplot2/ggplot2.pdf)
-[R reshape2](https://cran.r-project.org/web/packages/reshape/reshape.pdf)
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/aa_histogram.r
--- a/shm_csr/aa_histogram.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,69 +0,0 @@
-library(ggplot2)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-mutations.by.id.file = args[1]
-absent.aa.by.id.file = args[2]
-genes = strsplit(args[3], ",")[[1]]
-genes = c(genes, "")
-outdir = args[4]
-
-
-print("---------------- read input ----------------")
-
-mutations.by.id = read.table(mutations.by.id.file, sep="\t", fill=T, header=T, quote="")
-absent.aa.by.id = read.table(absent.aa.by.id.file, sep="\t", fill=T, header=T, quote="")
-
-for(gene in genes){
- graph.title = paste(gene, "AA mutation frequency")
- if(gene == ""){
- mutations.by.id.gene = mutations.by.id[!grepl("unmatched", mutations.by.id$best_match),]
- absent.aa.by.id.gene = absent.aa.by.id[!grepl("unmatched", absent.aa.by.id$best_match),]
-
- graph.title = "AA mutation frequency all"
- } else {
- mutations.by.id.gene = mutations.by.id[grepl(paste("^", gene, sep=""), mutations.by.id$best_match),]
- absent.aa.by.id.gene = absent.aa.by.id[grepl(paste("^", gene, sep=""), absent.aa.by.id$best_match),]
- }
- print(paste("nrow", gene, nrow(absent.aa.by.id.gene)))
- if(nrow(mutations.by.id.gene) == 0){
- next
- }
-
- mutations.at.position = colSums(mutations.by.id.gene[,-c(1,2)])
- aa.at.position = colSums(absent.aa.by.id.gene[,-c(1,2,3,4)])
-
- dat_freq = mutations.at.position / aa.at.position
- dat_freq[is.na(dat_freq)] = 0
- dat_dt = data.frame(i=1:length(dat_freq), freq=dat_freq)
-
-
- print("---------------- plot ----------------")
-
- m = ggplot(dat_dt, aes(x=i, y=freq)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=13, colour="black"))
- m = m + geom_bar(stat="identity", colour = "black", fill = "darkgrey", alpha=0.8) + scale_x_continuous(breaks=dat_dt$i, labels=dat_dt$i)
- m = m + annotate("segment", x = 0.5, y = -0.05, xend=26.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 13, y = -0.1, label="FR1")
- m = m + annotate("segment", x = 26.5, y = -0.07, xend=38.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 32.5, y = -0.15, label="CDR1")
- m = m + annotate("segment", x = 38.5, y = -0.05, xend=55.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 47, y = -0.1, label="FR2")
- m = m + annotate("segment", x = 55.5, y = -0.07, xend=65.5, yend=-0.07, colour="darkblue", size=1) + annotate("text", x = 60.5, y = -0.15, label="CDR2")
- m = m + annotate("segment", x = 65.5, y = -0.05, xend=104.5, yend=-0.05, colour="darkgreen", size=1) + annotate("text", x = 85, y = -0.1, label="FR3")
- m = m + expand_limits(y=c(-0.1,1)) + xlab("AA position") + ylab("Frequency") + ggtitle(graph.title) 
- m = m + theme(panel.background = element_rect(fill = "white", colour="black"), panel.grid.major.y = element_line(colour = "black"), panel.grid.major.x = element_blank())
- #m = m + scale_colour_manual(values=c("black"))
-
- print("---------------- write/print ----------------")
-
-
- dat.sums = data.frame(index=1:length(mutations.at.position), mutations.at.position=mutations.at.position, aa.at.position=aa.at.position)
-
- write.table(dat.sums, paste(outdir, "/aa_histogram_sum_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
- write.table(mutations.by.id.gene, paste(outdir, "/aa_histogram_count_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
- write.table(absent.aa.by.id.gene, paste(outdir, "/aa_histogram_absent_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
- write.table(dat_dt, paste(outdir, "/aa_histogram_", gene, ".txt", sep=""), sep="\t",quote=F,row.names=F,col.names=T)
-
- png(filename=paste(outdir, "/aa_histogram_", gene, ".png", sep=""), width=1280, height=720)
- print(m)
- dev.off()
-
- ggsave(paste(outdir, "/aa_histogram_", gene, ".pdf", sep=""), m, width=14, height=7)
-}
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/Baseline_Functions.r
--- a/shm_csr/baseline/Baseline_Functions.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,2287 +0,0 @@\n-#########################################################################################\r\n-# License Agreement\r\n-# \r\n-# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE \r\n-# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER \r\n-# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE \r\n-# OR COPYRIGHT LAW IS PROHIBITED.\r\n-# \r\n-# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE \r\n-# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED \r\n-# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN \r\n-# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\r\n-#\r\n-# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences\r\n-# Coded by: Mohamed Uduman & Gur Yaari\r\n-# Copyright 2012 Kleinstein Lab\r\n-# Version: 1.3 (01/23/2014)\r\n-#########################################################################################\r\n-\r\n-# Global variables  \r\n-  \r\n-  FILTER_BY_MUTATIONS = 1000\r\n-\r\n-  # Nucleotides\r\n-  NUCLEOTIDES = c("A","C","G","T")\r\n-  \r\n-  # Amino Acids\r\n-  AMINO_ACIDS <- c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y", "*", "*", "C", "C", "*", "W", "L", "L", "L", "L", "P", "P", "P", "P", "H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I", "M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S", "R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D", "D", "E", "E", "G", "G", "G", "G")\r\n-  names(AMINO_ACIDS) <- c("TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC", "TAA", "TAG", "TGT", "TGC", "TGA", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA", "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT", "ATC", "ATA", "ATG", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG", "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC", "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG")\r\n-  names(AMINO_ACIDS) <- names(AMINO_ACIDS)\r\n-\r\n-  #Amino Acid Traits\r\n-  #"*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y"\r\n-  #B = "Hydrophobic/Burried"  N = "Intermediate/Neutral"  S="Hydrophilic/Surface") \r\n-  TRAITS_AMINO_ACIDS_CHOTHIA98 <- c("*","N","B","S","S","B","N","N","B","S","B","B","S","N","S","S","N","N","B","B","N")\r\n-  names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS))\r\n-  TRAITS_AMINO_ACIDS <- array(NA,21)\r\n-  \r\n-  # Codon Table\r\n-  CODON_TABLE <- as.data.frame(matrix(NA,ncol=64,nrow=12))\r\n-\r\n-  # Substitution Model: Smith DS et al. 1996\r\n-  substitution_Literature_Mouse <- matrix(c(0, 0.156222928, 0.601501588, 0.242275484, 0.172506739, 0, 0.241239892, 0.586253369, 0.54636291, 0.255795364, 0, 0.197841727, 0.290240811, 0.467680608, 0.24207858, 0),nrow=4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n-  substitution_Flu_Human <- matrix(c(0,0.2795596,0.5026927,0.2177477,0.1693210,0,0.3264723,0.5042067,0.4983549,0.3328321,0,0.1688130,0.2021079,0.4696077,0.3282844,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n-  substitution_Flu25_Human <- matrix(c(0,0.2580641,0.5163685,0.2255674,0.1541125,0,0.3210224,0.5248651,0.5239281,0.3101292,0,0.1659427,0.1997207,0.4579444,0.3423350,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES))\r\n-  load("FiveS_Substitution.RData")\r\n-\r\n-  # Mutability Models: Shapiro GS et al. 
2002\r\n-  triMutability_Literature_Human <- matrix(c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31, 0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57, 0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68, 0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58, 0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11, 1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57, 0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6, 2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.2, '..b'U = lapply(1:length(facLevels),  function(x){\r\n-      computeMutabilities(facLevels[x])\r\n-    })\r\n-    facIndex = match(facGL,facLevels)\r\n-    \r\n-    LisGLs_Mutability = lapply(1:nrow(matInput),  function(x){\r\n-      cInput = rep(NA,nchar(matInput[x,1]))\r\n-      cInput[s2c(matInput[x,1])!="N"] = 1\r\n-      LisGLs_MutabilityU[[facIndex[x]]] * cInput                                                   \r\n-    })\r\n-    \r\n-    LisGLs_Targeting =  lapply(1:dim(matInput)[1],  function(x){\r\n-      computeTargeting(matInput[x,2],LisGLs_Mutability[[x]])\r\n-    })\r\n-    \r\n-    LisGLs_MutationTypes  = lapply(1:length(matInput[,2]),function(x){\r\n-      #print(x)\r\n-      computeMutationTypes(matInput[x,2])\r\n-    })\r\n-    \r\n-    LisGLs_R_Exp = lapply(1:nrow(matInput),  function(x){\r\n-      Exp_R <-  rollapply(as.zoo(1:readEnd),width=3,by=3,\r\n-                          function(codonNucs){                                                      \r\n-                            RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") \r\n-                            sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) \r\n-                          }\r\n-      )                                                   \r\n-    })\r\n-    \r\n-    LisGLs_S_Exp = lapply(1:nrow(matInput),  function(x){\r\n-      Exp_S <-  rollapply(as.zoo(1:readEnd),width=3,by=3,\r\n-                          function(codonNucs){                                                      \r\n-                            SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S")   \r\n-                            sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T )\r\n-                          }\r\n-      )                                                 \r\n-    })                                                \r\n-    \r\n-    Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)  \r\n-    Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T)  \r\n-    return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) )    \r\n-  }\r\n-}\r\n-\r\n-# getObservedMutationsByCodon <- function(listMutations){\r\n-#   numbSeqs <- length(listMutations) \r\n-#   obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))\r\n-#   obsMu_S <- obsMu_R\r\n-#   temp <- mclapply(1:length(listMutations), function(i){\r\n-#     arrMutations = listMutations[[i]]\r\n-#     RPos = as.numeric(names(arrMutations)[arrMutations=="R"])\r\n-#     RPos <- sapply(RPos,getCodonNumb)                                                                    \r\n-#     if(any(RPos)){\r\n-#       tabR <- table(RPos)\r\n-#       obsMu_R[i,as.numeric(names(tabR))] <<- tabR\r\n-#     }                                    \r\n-#     \r\n-#     SPos = as.numeric(names(arrMutations)[arrMutations=="S"])\r\n-#     SPos <- sapply(SPos,getCodonNumb)\r\n-#     if(any(SPos)){\r\n-# 
      tabS <- table(SPos)\r\n-#       obsMu_S[i,names(tabS)] <<- tabS\r\n-#     }                                          \r\n-#   }\r\n-#   )\r\n-#   return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) \r\n-# }\r\n-\r\n-getObservedMutationsByCodon <- function(listMutations){\r\n-  numbSeqs <- length(listMutations) \r\n-  obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3))))\r\n-  obsMu_S <- obsMu_R\r\n-  temp <- lapply(1:length(listMutations), function(i){\r\n-    arrMutations = listMutations[[i]]\r\n-    RPos = as.numeric(names(arrMutations)[arrMutations=="R"])\r\n-    RPos <- sapply(RPos,getCodonNumb)                                                                    \r\n-    if(any(RPos)){\r\n-      tabR <- table(RPos)\r\n-      obsMu_R[i,as.numeric(names(tabR))] <<- tabR\r\n-    }                                    \r\n-    \r\n-    SPos = as.numeric(names(arrMutations)[arrMutations=="S"])\r\n-    SPos <- sapply(SPos,getCodonNumb)\r\n-    if(any(SPos)){\r\n-      tabS <- table(SPos)\r\n-      obsMu_S[i,names(tabS)] <<- tabS\r\n-    }                                          \r\n-  }\r\n-  )\r\n-  return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) \r\n-}\r\n-\r\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/Baseline_Main.r
--- a/shm_csr/baseline/Baseline_Main.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,388 +0,0 @@\n-#########################################################################################\r\n-# License Agreement\r\n-# \r\n-# THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE \r\n-# ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER \r\n-# APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE \r\n-# OR COPYRIGHT LAW IS PROHIBITED.\r\n-# \r\n-# BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE \r\n-# BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED \r\n-# TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN \r\n-# CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.\r\n-#\r\n-# BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences\r\n-# Coded by: Mohamed Uduman & Gur Yaari\r\n-# Copyright 2012 Kleinstein Lab\r\n-# Version: 1.3 (01/23/2014)\r\n-#########################################################################################\r\n-\r\n-op <- options();\r\n-options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1)\r\n-library(\'seqinr\')\r\n-if( F & Sys.info()[1]=="Linux"){\r\n-  library("multicore")\r\n-}\r\n-\r\n-# Load functions and initialize global variables\r\n-source("Baseline_Functions.r")\r\n-\r\n-# Initialize parameters with user provided arguments\r\n-  arg <- commandArgs(TRUE)                       \r\n-  #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample")\r\n-  #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample")\r\n-  #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu")\r\n-  testID <- as.numeric(arg[1])                    # 1 = Focused, 2 = Local\r\n-  species <- as.numeric(arg[2])                   # 1 = Human. 2 = Mouse\r\n-  substitutionModel <- as.numeric(arg[3])         # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS\r\n-  mutabilityModel <- as.numeric(arg[4])           # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 
2002)  , 5 = FiveS\r\n-  clonal <- as.numeric(arg[5])                    # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations\r\n-  fixIndels <- as.numeric(arg[6])                 # 0 = Do nothing, 1 = Try and fix Indels\r\n-  region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3\r\n-  inputFilePath <- arg[8]                         # Full path to input file\r\n-  outputPath <- arg[9]                            # Full path to location of output files\r\n-  outputID <- arg[10]                             # ID for session output  \r\n-  \r\n-\r\n-  if(testID==5){\r\n-    traitChangeModel <- 1\r\n-    if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11])    # 1 <- Chothia 1998\r\n-    initializeTraitChange(traitChangeModel)    \r\n-  }\r\n-  \r\n-# Initialize other parameters/variables\r\n-    \r\n-  # Initialzie the codon table ( definitions of R/S )\r\n-  computeCodonTable(testID) \r\n-\r\n-  # Initialize   \r\n-  # Test Name\r\n-  testName<-"Focused"\r\n-  if(testID==2) testName<-"Local"\r\n-  if(testID==3) testName<-"Imbalanced"    \r\n-  if(testID==4) testName<-"ImbalancedSilent"    \r\n-    \r\n-  # Indel placeholders initialization\r\n-  indelPos <- NULL\r\n-  delPos <- NULL\r\n-  insPos <- NULL\r\n-\r\n-  # Initialize in Tranistion & Mutability matrixes\r\n-  substitution <- initializeSubstitutionMatrix(substitutionModel,species)\r\n-  mutability <- initializeMutabilityMatrix(mutabilityModel,species)\r\n-  \r\n-  # FWR/CDR boundaries\r\n-  flagTrim <- F\r\n-  if( is.na(region[7])){\r\n-    flagTrim <- T\r\n-    region[7]<-region[6]\r\n-  }\r\n-  readStart = min(region,na.rm=T)\r\n-  readEnd = max(region,na.rm=T)\r\n-  if(readStart>1){\r\n-    region = region - (readStart - 1)\r\n-  }\r\n-  region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) )\r\n-  region_Cod = region\r\n-  \r\n-  readStart = (readStart*3)-2\r\n-  readEnd = (readEnd*3)\r\n-    \r\n-    FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])),\r\n-                  '..b'umb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]])\r\n-      names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1]\r\n-      #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){\r\n-      gs = unique(germlines[groups==G])\r\n-      rowNumb = rowNumb+1\r\n-      if( !is.na(gs) ){\r\n-        for( g in gs ){\r\n-          matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g])\r\n-          listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]])\r\n-          names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1]\r\n-          rowNumb = rowNumb+1\r\n-          indexesOfInterest = which(germlines==g)\r\n-          numbSeqsOfInterest =  length(indexesOfInterest)\r\n-          rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1))\r\n-          matOutput[rowNumb,] = matrix(   c(  rep("Sequence",numbSeqsOfInterest),\r\n-                                              rownames(matInput)[indexesOfInterest],\r\n-                                              c(matMutationInfo[indexesOfInterest,1:4]),\r\n-                                              c(matMutationInfo[indexesOfInterest,5:8]),\r\n-                                              c(bayes_cdr[indexesOfInterest,]),\r\n-                                              c(bayes_fwr[indexesOfInterest,]),\r\n-          
                                    c(simgaP_cdr[indexesOfInterest]),\r\n-                                              c(simgaP_fwr[indexesOfInterest])                                              \r\n-          ), ncol=18, nrow=numbSeqsOfInterest,byrow=F)\r\n-          increment=0\r\n-          for( ioi in indexesOfInterest){\r\n-            listPDFs[[min(rowNumb)+increment]] =  list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]])\r\n-            names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi]\r\n-            increment = increment + 1\r\n-          }\r\n-          rowNumb=max(rowNumb)+1\r\n-\r\n-        }\r\n-      }\r\n-    }\r\n-    colsToFormat = 11:18\r\n-    matOutput[,colsToFormat] = formatC(  matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) ,  digits=3)\r\n-    matOutput[matOutput== " NaN"] = NA\r\n-    \r\n-    \r\n-    \r\n-    colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S",\r\n-                            "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S",\r\n-                            paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"),\r\n-                            paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_")\r\n-    )\r\n-    fileName = paste(outputPath,outputID,".txt",sep="")\r\n-    write.table(matOutput,file=fileName,quote=F,sep="\\t",row.names=T,col.names=NA)\r\n-    fileName = paste(outputPath,outputID,".RData",sep="")\r\n-    save(listPDFs,file=fileName)\r\n-\r\n-indelWarning = FALSE\r\n-if(sum(indelPos)>0){\r\n-  indelWarning = "<P>Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis.";\r\n-  indelWarning = paste( indelWarning , "<UL>", sep="" )\r\n-  for(indels in names(indelPos)[indelPos]){\r\n-    indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" )\r\n-  }\r\n-  indelWarning = paste( indelWarning , "</UL></P>", sep="" )\r\n-}\r\n-\r\n-cloneWarning = FALSE\r\n-if(clonal==1){\r\n-  if(sum(matInputErrors)>0){\r\n-    cloneWarning = "<P>Warning: The following clones have sequences of unequal length.";\r\n-    cloneWarning = paste( cloneWarning , "<UL>", sep="" )\r\n-    for(clone in names(matInputErrors)[matInputErrors]){\r\n-      cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" )\r\n-    }\r\n-    cloneWarning = paste( cloneWarning , "</UL></P>", sep="" )\r\n-  }\r\n-}\r\n-cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|"))\r\n'
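For orientation: the removed baseline wrapper.sh further down in this changeset drives Baseline_Main.r with ten positional arguments, in the order testID, species, substitutionModel, mutabilityModel, clonal, fixIndels, region, input FASTA, output directory and output ID. A sketch of such a call, with purely illustrative values:

  Rscript Baseline_Main.r 1 1 5 5 1 0 1:26:38:55:65:104:116 baseline.fasta ./workdir/ result  # all values are placeholders, not repository defaults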
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/FiveS_Mutability.RData
b
Binary file shm_csr/baseline/FiveS_Mutability.RData has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/FiveS_Substitution.RData
b
Binary file shm_csr/baseline/FiveS_Substitution.RData has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa
--- a/shm_csr/baseline/IMGT-reference-seqs-IGHV-2015-11-05.fa Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,703 +0,0 @@\n->IGHV1-18*01\n-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\n->IGHV1-18*02\n-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctaagatctgacgacacggcc\n->IGHV1-18*03\n-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacatggccgtgtattactgtgcgagaga\n->IGHV1-18*04\n-caggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctacggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\n->IGHV1-2*01\n-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\n->IGHV1-2*02\n-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\n->IGHV1-2*03\n-caggtgcagctggtgcagtctggggct...gaggtgaagaagcttggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcnacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\n->IGHV1-2*04\n-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggctgggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\n->IGHV1-2*05\n-caggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\n->IGHV1-24*01\n-caggtccagctggtacagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggtttccggatacaccctc............actgaattatccatgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggaggttttgatcctgaa......gatggtgaaacaatctacgcacagaagttccag...ggcagagtcaccatgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga\n->IGHV1-3*01\n-caggtccagcttgtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatg
gatcaacgctggc......aatggtaacacaaaatattcacagaagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtgcgagaga\n->IGHV1-3*02\n-caggttcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggagcaacgctggc......aatggtaacacaaaatattcacaggagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaggacatg'..b'aggtgcagctgttgcagtctgcagca...gaggtgaaaagacccggggagtctctgaggatctcctgtaagacttctggatacagcttt............accagctactggatccactgggtgcgccagatgcccgggaaagaactggagtggatggggagcatctatcctggg......aactctgataccagatacagcccatccttccaa...ggccacgtcaccatctcagccgacagctccagcagcaccgcctacctgcagtggagcagcctgaaggcctcggacgccgccatgtattattgtgtgaga\n->IGHV6-1*01\n-caggtacagctgcagcagtcaggtcca...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\n->IGHV6-1*02\n-caggtacagctgcagcagtcaggtccg...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\n->IGHV7-34-1*01\n-...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......actgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\n->IGHV7-34-1*02\n-...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......aatgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\n->IGHV7-4-1*01\n-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatctgcagcctaaaggctgaggacactgccgtgtattactgtgcgaga\n->IGHV7-4-1*02\n-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\n->IGHV7-4-1*03\n-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcacgctaaaggctgaggacactg\n->IGHV7-4-1*04\n-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\n->IGHV7-4-1*05\n-caggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtga
aggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtgttactgtgcgagaga\n->AIGHV7-40*03|\n-ttttcaatagaaaagtcaaataatcta...agtgtcaatcagtggatgattagataaaatatgatatatgtaaatcatggaatactatgc............agccagtatggtatgaattcagtgtgaccagcccctggacaagggcttgagtggatgggatggatcatcacctac......actgggaacccaacatataccaacggcttcaca...ggacggtttctattctccatggacacctctgtcagcatggcgtatctgcagatcagcagcctaaaggctgaggacacggccgtgtatgactgtatgagaga\n->IGHV7-81*01\n-caggtgcagctggtgcagtctggccat...gaggtgaagcagcctggggcctcagtgaaggtctcctgcaaggcttctggttacagtttc............accacctatggtatgaattgggtgccacaggcccctggacaagggcttgagtggatgggatggttcaacacctac......actgggaacccaacatatgcccagggcttcaca...ggacggtttgtcttctccatggacacctctgccagcacagcatacctgcagatcagcagcctaaaggctgaggacatggccatgtattactgtgcgagata\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/IMGTVHreferencedataset20161215.fa
--- a/shm_csr/baseline/IMGTVHreferencedataset20161215.fa Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,1 +0,0 @@\n->IGHV1-18*01\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-18*02\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctaagatctgacgacacggcc\r>IGHV1-18*03\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacatggccgtgtattactgtgcgagaga\r>IGHV1-18*04\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctacggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*01\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\r>IGHV1-2*02\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*03\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcttggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcnacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*04\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggctgggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*05\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\r>IGHV1-24*01\rcaggtccagctggtacagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggtttccggatacaccctc............actgaattatccatgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggaggttttgatcctgaa......gatggtgaaacaatctacgcacagaagttccag...ggcagagtcaccatgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga\r>IGHV1-3*01\rcaggtccagcttgtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggatcaacgctggc......aatg
gtaacacaaaatattcacagaagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtgcgagaga\r>IGHV1-3*02\rcaggttcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggagcaacgctggc......aatggtaacacaaaatattcacaggagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaggacatggctgtgtattactgtgcgagaga\r>'..b'cgccatg\r>IGHV5-78*01\rgaggtgcagctgttgcagtctgcagca...gaggtgaaaagacccggggagtctctgaggatctcctgtaagacttctggatacagcttt............accagctactggatccactgggtgcgccagatgcccgggaaagaactggagtggatggggagcatctatcctggg......aactctgataccagatacagcccatccttccaa...ggccacgtcaccatctcagccgacagctccagcagcaccgcctacctgcagtggagcagcctgaaggcctcggacgccgccatgtattattgtgtgaga\r>IGHV6-1*01\rcaggtacagctgcagcagtcaggtcca...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\r>IGHV6-1*02\rcaggtacagctgcagcagtcaggtccg...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\r>IGHV7-34-1*01\r...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......actgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\r>IGHV7-34-1*02\r...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......aatgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\r>IGHV7-4-1*01\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatctgcagcctaaaggctgaggacactgccgtgtattactgtgcgaga\r>IGHV7-4-1*02\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\r>IGHV7-4-1*03\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcacgctaaaggctgaggacactg\r>IGHV7-4-1*04\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\r>IGHV7-4-1*05\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcc
tcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtgttactgtgcgagaga\r>IGHV7-40*03\rttttcaatagaaaagtcaaataatcta...agtgtcaatcagtggatgattagataaaatatgatatatgtaaatcatggaatactatgc............agccagtatggtatgaattcagtgtgaccagcccctggacaagggcttgagtggatgggatggatcatcacctac......actgggaacccaacatataccaacggcttcaca...ggacggtttctattctccatggacacctctgtcagcatggcgtatctgcagatcagcagcctaaaggctgaggacacggccgtgtatgactgtatgagaga\r>IGHV7-81*01\rcaggtgcagctggtgcagtctggccat...gaggtgaagcagcctggggcctcagtgaaggtctcctgcaaggcttctggttacagtttc............accacctatggtatgaattgggtgccacaggcccctggacaagggcttgagtggatgggatggttcaacacctac......actgggaacccaacatatgcccagggcttcaca...ggacggtttgtcttctccatggacacctctgccagcacagcatacctgcagatcagcagcctaaaggctgaggacatggccatgtattactgtgcgagata\r\r\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/IMGTVHreferencedataset20161215.fasta
--- a/shm_csr/baseline/IMGTVHreferencedataset20161215.fasta Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,1 +0,0 @@\n->IGHV1-18*01\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-18*02\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctaagatctgacgacacggcc\r>IGHV1-18*03\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctatggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacatggccgtgtattactgtgcgagaga\r>IGHV1-18*04\rcaggttcagctggtgcagtctggagct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggttacaccttt............accagctacggtatcagctgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcagcgcttac......aatggtaacacaaactatgcacagaagctccag...ggcagagtcaccatgaccacagacacatccacgagcacagcctacatggagctgaggagcctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*01\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccagtaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\r>IGHV1-2*02\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*03\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcttggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcnacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*04\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggctgggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggccgtgtattactgtgcgagaga\r>IGHV1-2*05\rcaggtgcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggcttctggatacaccttc............accggctactatatgcactgggtgcgacaggcccctggacaagggcttgagtggatgggacggatcaaccctaac......agtggtggcacaaactatgcacagaagtttcag...ggcagggtcaccatgaccagggacacgtccatcagcacagcctacatggagctgagcaggctgagatctgacgacacggtcgtgtattactgtgcgagaga\r>IGHV1-24*01\rcaggtccagctggtacagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtctcctgcaaggtttccggatacaccctc............actgaattatccatgcactgggtgcgacaggctcctggaaaagggcttgagtggatgggaggttttgatcctgaa......gatggtgaaacaatctacgcacagaagttccag...ggcagagtcaccatgaccgaggacacatctacagacacagcctacatggagctgagcagcctgagatctgaggacacggccgtgtattactgtgcaacaga\r>IGHV1-3*01\rcaggtccagcttgtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggatcaacgctggc......aatg
gtaacacaaaatattcacagaagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaagacacggctgtgtattactgtgcgagaga\r>IGHV1-3*02\rcaggttcagctggtgcagtctggggct...gaggtgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgcattgggtgcgccaggcccccggacaaaggcttgagtggatgggatggagcaacgctggc......aatggtaacacaaaatattcacaggagttccag...ggcagagtcaccattaccagggacacatccgcgagcacagcctacatggagctgagcagcctgagatctgaggacatggctgtgtattactgtgcgagaga\r>'..b'accgccatg\r>IGHV5-78*01\rgaggtgcagctgttgcagtctgcagca...gaggtgaaaagacccggggagtctctgaggatctcctgtaagacttctggatacagcttt............accagctactggatccactgggtgcgccagatgcccgggaaagaactggagtggatggggagcatctatcctggg......aactctgataccagatacagcccatccttccaa...ggccacgtcaccatctcagccgacagctccagcagcaccgcctacctgcagtggagcagcctgaaggcctcggacgccgccatgtattattgtgtgaga\r>IGHV6-1*01\rcaggtacagctgcagcagtcaggtcca...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\r>IGHV6-1*02\rcaggtacagctgcagcagtcaggtccg...ggactggtgaagccctcgcagaccctctcactcacctgtgccatctccggggacagtgtctct......agcaacagtgctgcttggaactggatcaggcagtccccatcgagaggccttgagtggctgggaaggacatactacaggtcc...aagtggtataatgattatgcagtatctgtgaaa...agtcgaataaccatcaacccagacacatccaagaaccagttctccctgcagctgaactctgtgactcccgaggacacggctgtgtattactgtgcaagaga\r>IGHV7-34-1*01\r...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......actgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\r>IGHV7-34-1*02\r...ctgcagctggtgcagtctgggcct...gaggtgaagaagcctggggcctcagtgaaggtctcctataagtcttctggttacaccttc............accatctatggtatgaattgggtatgatagacccctggacagggctttgagtggatgtgatggatcatcacctac......aatgggaacccaacgtatacccacggcttcaca...ggatggtttgtcttctccatggacacgtctgtcagcacggcgtgtcttcagatcagcagcctaaaggctgaggacacggccgagtattactgtgcgaagta\r>IGHV7-4-1*01\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatctgcagcctaaaggctgaggacactgccgtgtattactgtgcgaga\r>IGHV7-4-1*02\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\r>IGHV7-4-1*03\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcacggcatatctgcagatcagcacgctaaaggctgaggacactg\r>IGHV7-4-1*04\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctggggcctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtattactgtgcgagaga\r>IGHV7-4-1*05\rcaggtgcagctggtgcaatctgggtct...gagttgaagaagcctgggg
cctcagtgaaggtttcctgcaaggcttctggatacaccttc............actagctatgctatgaattgggtgcgacaggcccctggacaagggcttgagtggatgggatggatcaacaccaac......actgggaacccaacgtatgcccagggcttcaca...ggacggtttgtcttctccttggacacctctgtcagcatggcatatctgcagatcagcagcctaaaggctgaggacactgccgtgtgttactgtgcgagaga\r>IGHV7-40*03\rttttcaatagaaaagtcaaataatcta...agtgtcaatcagtggatgattagataaaatatgatatatgtaaatcatggaatactatgc............agccagtatggtatgaattcagtgtgaccagcccctggacaagggcttgagtggatgggatggatcatcacctac......actgggaacccaacatataccaacggcttcaca...ggacggtttctattctccatggacacctctgtcagcatggcgtatctgcagatcagcagcctaaaggctgaggacacggccgtgtatgactgtatgagaga\r>IGHV7-81*01\rcaggtgcagctggtgcagtctggccat...gaggtgaagcagcctggggcctcagtgaaggtctcctgcaaggcttctggttacagtttc............accacctatggtatgaattgggtgccacaggcccctggacaagggcttgagtggatgggatggttcaacacctac......actgggaacccaacatatgcccagggcttcaca...ggacggtttgtcttctccatggacacctctgccagcacagcatacctgcagatcagcagcctaaaggctgaggacatggccatgtattactgtgcgagata\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/baseline_url.txt
--- a/shm_csr/baseline/baseline_url.txt Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,1 +0,0 @@
-http://selection.med.yale.edu/baseline/
\ No newline at end of file
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/comparePDFs.r
--- a/shm_csr/baseline/comparePDFs.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,225 +0,0 @@\n-options("warn"=-1)\r\n-\r\n-#from http://selection.med.yale.edu/baseline/Archive/Baseline%20Version%201.3/Baseline_Functions_Version1.3.r\r\n-# Compute p-value of two distributions\r\n-compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){\r\n-#print(c(length(dens1),length(dens2)))\r\n-if(length(dens1)>1 & length(dens2)>1 ){\r\n-\tdens1<-dens1/sum(dens1)\r\n-\tdens2<-dens2/sum(dens2)\r\n-\tcum2 <- cumsum(dens2)-dens2/2\r\n-\ttmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i])))\r\n-\t#print(tmp)\r\n-\tif(tmp>0.5)tmp<-tmp-1\r\n-\treturn( tmp )\r\n-\t}\r\n-\telse {\r\n-\treturn(NA)\r\n-\t}\r\n-\t#return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N)\r\n-}  \r\n-\r\n-\r\n-require("grid")\r\n-arg <- commandArgs(TRUE)\r\n-#arg <- c("300143","4","5")\r\n-arg[!arg=="clonal"]\r\n-input <- arg[1]\r\n-output <- arg[2]\r\n-rowIDs <- as.numeric(  sapply(arg[3:(max(3,length(arg)))],function(x){ gsub("chkbx","",x) } )  )\r\n-\r\n-numbSeqs = length(rowIDs)\r\n-\r\n-if ( is.na(rowIDs[1]) | numbSeqs>10 ) {\r\n-  stop( paste("Error: Please select between one and 10 seqeunces to compare.") )\r\n-}\r\n-\r\n-#load( paste("output/",sessionID,".RData",sep="") )\r\n-load( input )\r\n-#input\r\n-\r\n-xMarks = seq(-20,20,length.out=4001)\r\n-\r\n-plot_grid_s<-function(pdf1,pdf2,Sample=100,cex=1,xlim=NULL,xMarks = seq(-20,20,length.out=4001)){\r\n-  yMax = max(c(abs(as.numeric(unlist(listPDFs[pdf1]))),abs(as.numeric(unlist(listPDFs[pdf2]))),0),na.rm=T) * 1.1\r\n-\r\n-  if(length(xlim==2)){\r\n-    xMin=xlim[1]\r\n-    xMax=xlim[2]\r\n-  } else {\r\n-    xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]\r\n-    xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]\r\n-    xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]\r\n-    xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]\r\n-  \r\n-    xMin_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][1]\r\n-    xMin_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][1]\r\n-    xMax_CDR2 = xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["CDR"]]>0.001])]\r\n-    xMax_FWR2 = xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf2][[1]][["FWR"]]>0.001])]\r\n-  \r\n-    xMin=min(c(xMin_CDR,xMin_FWR,xMin_CDR2,xMin_FWR2,0),na.rm=TRUE)\r\n-    xMax=max(c(xMax_CDR,xMax_FWR,xMax_CDR2,xMax_FWR2,0),na.rm=TRUE)\r\n-  }\r\n-\r\n-  sigma<-approx(xMarks,xout=seq(xMin,xMax,length.out=Sample))$x\r\n-  grid.rect(gp = gpar(col=gray(0.6),fill="white",cex=cex))\r\n-  x <- sigma\r\n-  pushViewport(viewport(x=0.175,y=0.175,width=0.825,height=0.825,just=c("left","bottom"),default.units="npc"))\r\n-  #pushViewport(plotViewport(c(1.8, 1.8, 0.25, 0.25)*cex))\r\n-  pushViewport(dataViewport(x, c(yMax,-yMax),gp = gpar(cex=cex),extension=c(0.05)))\r\n-  grid.polygon(c(0,0,1,1),c(0,0.5,0.5,0),gp=gpar(col=grey(0.95),fill=grey(0.95)),default.units="npc")\r\n-  grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.9),fill=grey(0.9)),default.units="npc")\r\n-  grid.rect()\r\n-  grid.xaxis(gp = gpar(cex=cex/1.1))\r\n-  yticks = pretty(c(-yMax,yMax),8)\r\n-  yticks = yticks[yticks>(-yMax) & yticks<(yMax)]\r\n-  grid.yaxis(at=yticks,label=abs(yticks),gp = gpar(cex=cex/1.1))\r\n-  if(length(listPDFs[pdf1][[1]][["CDR"]])>1){\r\n-    
ycdr<-approx(xMarks,listPDFs[pdf1][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n-    grid.lines(unit(x,"native"), unit(ycdr,"native"),gp=gpar(col=2,lwd=2))\r\n-  }\r\n-  if(length(listPDFs[pdf1][[1]][["FWR"]])>1){\r\n-    yfwr<-approx(xMarks,listPDFs[pdf1][[1]][["FWR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n-    grid.lines(unit(x,"native"), unit(-yfwr,"native"),gp=gpar(col=4,lwd=2))\r\n-   }\r\n-\r\n-  if(length(listPDFs[pdf2][[1]][["CDR"]])>1){\r\n-    ycdr2<-approx(xMarks,listPDFs[pdf2][[1]][["CDR"]],xout=seq(xMin,xMax,length.out=Sample),yleft=0,yright=0)$y\r\n-    grid.lines(unit(x,"native"), unit(ycdr2,"native"),gp=gpar(col=2,lwd'..b'npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\r\n-    grid.text(formatC(as.numeric(pCDR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.75, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\r\n-    grid.text(formatC(as.numeric(pFWR1CDR2),digits=3), x = unit(0.25, "npc"),y = unit(0.25, "npc"),just=c("center", "center"),gp = gpar(cex=cex))\r\n-    \r\n-           \r\n- #   grid.text(paste("P = ",formatC(pCDRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.98, "npc"),just=c("center", "top"),gp = gpar(cex=cex))\r\n- #   grid.text(paste("P = ",formatC(pFWRFWR,digits=3)), x = unit(0.5, "npc"),y = unit(0.02, "npc"),just=c("center", "bottom"),gp = gpar(cex=cex))\r\n-  }\r\n-  else{\r\n-  }\r\n-}\r\n-\r\n-\r\n-##################################################################################\r\n-################## The whole OCD\'s matrix ########################################\r\n-##################################################################################\r\n-\r\n-#pdf(width=4*numbSeqs+1/3,height=4*numbSeqs+1/3)\r\n-pdf( output ,width=4*numbSeqs+1/3,height=4*numbSeqs+1/3) \r\n-\r\n-pushViewport(viewport(x=0.02,y=0.02,just = c("left", "bottom"),w =0.96,height=0.96,layout = grid.layout(numbSeqs+1,numbSeqs+1,widths=unit.c(unit(rep(1,numbSeqs),"null"),unit(4,"lines")),heights=unit.c(unit(4,"lines"),unit(rep(1,numbSeqs),"null")))))\r\n-\r\n-for( seqOne in 1:numbSeqs+1){\r\n-  pushViewport(viewport(layout.pos.col = seqOne-1, layout.pos.row = 1))\r\n-  if(seqOne>2){ \r\n-    grid.polygon(c(0,0,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")\r\n-    grid.polygon(c(1,1,0.5,0.5),c(0,0.5,0.5,0),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")\r\n-    grid.polygon(c(0,0,1,1),c(1,0.5,0.5,1),gp=gpar(col=grey(0.5)),default.units="npc")\r\n-       \r\n-    grid.text(y=.25,x=0.75,"FWR",gp = gpar(cex=1.5),just="center")\r\n-    grid.text(y=.25,x=0.25,"CDR",gp = gpar(cex=1.5),just="center")\r\n-  }\r\n-  grid.rect(gp = gpar(col=grey(0.9)))\r\n-  grid.text(y=.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),just="center")\r\n-  popViewport(1)\r\n-}\r\n-\r\n-for( seqOne in 1:numbSeqs+1){\r\n-  pushViewport(viewport(layout.pos.row = seqOne, layout.pos.col = numbSeqs+1))\r\n-  if(seqOne<=numbSeqs){   \r\n-    grid.polygon(c(0,0.5,0.5,0),c(0,0,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.95)),default.units="npc")\r\n-    grid.polygon(c(0,0.5,0.5,0),c(1,1,0.5,0.5),gp=gpar(col=grey(0.5),fill=grey(0.9)),default.units="npc")\r\n-    grid.polygon(c(1,0.5,0.5,1),c(0,0,1,1),gp=gpar(col=grey(0.5)),default.units="npc")\r\n-    grid.text(x=.25,y=0.75,"CDR",gp = gpar(cex=1.5),just="center",rot=270)\r\n-    grid.text(x=.25,y=0.25,"FWR",gp = gpar(cex=1.5),just="center",rot=270)\r\n-  }\r\n-  grid.rect(gp = gpar(col=grey(0.9)))\r\n-  
grid.text(x=0.75,substr(paste(names(listPDFs)[rowIDs[seqOne-1]]),1,16),gp = gpar(cex=2),rot=270,just="center")\r\n-  popViewport(1)\r\n-}\r\n-\r\n-for( seqOne in 1:numbSeqs+1){\r\n-  for(seqTwo in 1:numbSeqs+1){\r\n-    pushViewport(viewport(layout.pos.col = seqTwo-1, layout.pos.row = seqOne))\r\n-    if(seqTwo>seqOne){\r\n-      plot_pvals(rowIDs[seqOne-1],rowIDs[seqTwo-1],cex=2)\r\n-      grid.rect()\r\n-    }    \r\n-    popViewport(1)\r\n-  }\r\n-}\r\n-   \r\n-\r\n-xMin=0\r\n-xMax=0.01\r\n-for(pdf1 in rowIDs){\r\n-  xMin_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][1]\r\n-  xMin_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][1]\r\n-  xMax_CDR = xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["CDR"]]>0.001])]\r\n-  xMax_FWR = xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001][length(xMarks[listPDFs[pdf1][[1]][["FWR"]]>0.001])]\r\n-  xMin=min(c(xMin_CDR,xMin_FWR,xMin),na.rm=TRUE)\r\n-  xMax=max(c(xMax_CDR,xMax_FWR,xMax),na.rm=TRUE)\r\n-}\r\n-\r\n-\r\n-\r\n-for(i in 1:numbSeqs+1){\r\n-  for(j in (i-1):numbSeqs){    \r\n-    pushViewport(viewport(layout.pos.col = i-1, layout.pos.row = j+1))\r\n-    grid.rect()\r\n-    plot_grid_s(rowIDs[i-1],rowIDs[j],cex=1)\r\n-    popViewport(1)\r\n-  }\r\n-}\r\n-\r\n-dev.off() \r\n-\r\n-cat("Success", paste(rowIDs,collapse="_"),sep=":")\r\n-\r\n'
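comparePDFs.r, shown above, expects the .RData file written by Baseline_Main.r, an output PDF path, and between one and ten row selectors whose leading "chkbx" is stripped off. An illustrative call (file names and row ids are placeholders):

  Rscript comparePDFs.r ./workdir/result.RData comparison.pdf chkbx2 chkbx5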
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/filter.r
--- a/shm_csr/baseline/filter.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,55 +0,0 @@
-arg = commandArgs(TRUE)
-summaryfile = arg[1]
-gappedfile = arg[2]
-selection = arg[3]
-output = arg[4]
-print(paste("selection = ", selection))
-
-
-summarydat = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote = "")
-gappeddat = read.table(gappedfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote = "")
-
-fix_column_names = function(df){
-    if("V.DOMAIN.Functionality" %in% names(df)){
-        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
-        print("found V.DOMAIN.Functionality, changed")
-    }
-    if("V.DOMAIN.Functionality.comment" %in% names(df)){
-        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
-        print("found V.DOMAIN.Functionality.comment, changed")
-    }
-    return(df)
-}
-
-gappeddat = fix_column_names(gappeddat)
-
-#dat = data.frame(merge(gappeddat, summarydat, by="Sequence.ID", all.x=T))
-
-dat = cbind(gappeddat, summarydat$AA.JUNCTION)
-
-colnames(dat)[length(dat)] = "AA.JUNCTION"
-
-dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele)
-dat$VGene = gsub("[*].*", "", dat$VGene)
-
-dat$DGene = gsub("^Homsap ", "", dat$D.GENE.and.allele)
-dat$DGene = gsub("[*].*", "", dat$DGene)
-
-dat$JGene = gsub("^Homsap ", "", dat$J.GENE.and.allele)
-dat$JGene = gsub("[*].*", "", dat$JGene)
-
-print(str(dat))
-
-dat$past = do.call(paste, c(dat[unlist(strsplit(selection, ","))], sep = ":"))
-
-dat = dat[!duplicated(dat$past), ]
-
-print(paste("Sequences remaining after duplicate filter:", nrow(dat)))
-
-dat = dat[dat$Functionality != "No results" & dat$Functionality != "unproductive",]
-
-print(paste("Sequences remaining after functionality filter:", nrow(dat)))
-
-print(paste("Sequences remaining:", nrow(dat)))
-
-write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T)
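filter.r expects the IMGT 1_Summary file, the IMGT-gapped table, a comma-separated list of column names used for duplicate removal, and an output path. An illustrative call (file names and the column selection are placeholders):

  Rscript filter.r 1_Summary.txt 2_IMGT-gapped-nt-sequences.txt "Sequence.ID,AA.JUNCTION" filtered.txt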
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/script_imgt.py
--- a/shm_csr/baseline/script_imgt.py Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,86 +0,0 @@
-#import xlrd #avoid dep
-import argparse
-import re
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
-parser.add_argument("--ref", help="Reference file")
-parser.add_argument("--output", help="Output file")
-parser.add_argument("--id", help="ID to be used at the '>>>' line in the output")
-
-args = parser.parse_args()
-
-print "script_imgt.py"
-print "input:", args.input
-print "ref:", args.ref
-print "output:", args.output
-print "id:", args.id
-
-refdic = dict()
-with open(args.ref, 'rU') as ref:
- currentSeq = ""
- currentId = ""
- for line in ref:
- if line.startswith(">"):
- if currentSeq is not "" and currentId is not "":
- refdic[currentId[1:]] = currentSeq
- currentId = line.rstrip()
- currentSeq = ""
- else:
- currentSeq += line.rstrip()
- refdic[currentId[1:]] = currentSeq
-
-print "Have", str(len(refdic)), "reference sequences"
-
-vPattern = [r"(IGHV[0-9]-[0-9ab]+-?[0-9]?D?\*\d{1,2})"]#,
-# r"(TRBV[0-9]{1,2}-?[0-9]?-?[123]?)",
-# r"(IGKV[0-3]D?-[0-9]{1,2})",
-# r"(IGLV[0-9]-[0-9]{1,2})",
-# r"(TRAV[0-9]{1,2}(-[1-46])?(/DV[45678])?)",
-# r"(TRGV[234589])",
-# r"(TRDV[1-3])"]
-
-#vPattern = re.compile(r"|".join(vPattern))
-vPattern = re.compile("|".join(vPattern))
-
-def filterGene(s, pattern):
-    if type(s) is not str:
-        return None
-    res = pattern.search(s)
-    if res:
-        return res.group(0)
-    return None
-
-
-
-currentSeq = ""
-currentId = ""
-first=True
-with open(args.input, 'r') as i:
- with open(args.output, 'a') as o:
- o.write(">>>" + args.id + "\n")
- outputdic = dict()
- for line in i:
- if first:
- first = False
- continue
- linesplt = line.split("\t")
- ref = filterGene(linesplt[1], vPattern)
- if not ref or not linesplt[2].rstrip():
- continue
- if ref in outputdic:
- outputdic[ref] += [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
- else:
- outputdic[ref] = [(linesplt[0].replace(">", ""), linesplt[2].replace(">", "").rstrip())]
- #print outputdic
-
- for k in outputdic.keys():
- if k in refdic:
- o.write(">>" + k + "\n")
- o.write(refdic[k] + "\n")
- for seq in outputdic[k]:
- #print seq
- o.write(">" + seq[0] + "\n")
- o.write(seq[1] + "\n")
- else:
- print k + " not in reference, skipping " + k
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/script_xlsx.py
--- a/shm_csr/baseline/script_xlsx.py Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,58 +0,0 @@
-import xlrd
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="Excel input file containing one or more sheets where column G has the gene annotation, H has the sequence id and J has the sequence")
-parser.add_argument("--ref", help="Reference file")
-parser.add_argument("--output", help="Output file")
-
-args = parser.parse_args()
-
-gene_column = 6
-id_column = 7
-seq_column = 8
-LETTERS = [x for x in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
-
-
-refdic = dict()
-with open(args.ref, 'r') as ref:
- currentSeq = ""
- currentId = ""
- for line in ref.readlines():
- if line[0] is ">":
- if currentSeq is not "" and currentId is not "":
- refdic[currentId[1:]] = currentSeq
- currentId = line.rstrip()
- currentSeq = ""
- else:
- currentSeq += line.rstrip()
- refdic[currentId[1:]] = currentSeq
-
-currentSeq = ""
-currentId = ""
-with xlrd.open_workbook(args.input, 'r') as wb:
- with open(args.output, 'a') as o:
- for sheet in wb.sheets():
- if sheet.cell(1,gene_column).value.find("IGHV") < 0:
- print "Genes not in column " + LETTERS[gene_column] + ", skipping sheet " + sheet.name
- continue
- o.write(">>>" + sheet.name + "\n")
- outputdic = dict()
- for rowindex in range(1, sheet.nrows):
- ref = sheet.cell(rowindex, gene_column).value.replace(">", "")
- if ref in outputdic:
- outputdic[ref] += [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
- else:
- outputdic[ref] = [(sheet.cell(rowindex, id_column).value.replace(">", ""), sheet.cell(rowindex, seq_column).value)]
- #print outputdic
-
- for k in outputdic.keys():
- if k in refdic:
- o.write(">>" + k + "\n")
- o.write(refdic[k] + "\n")
- for seq in outputdic[k]:
- #print seq
- o.write(">" + seq[0] + "\n")
- o.write(seq[1] + "\n")
- else:
- print k + " not in reference, skipping " + k
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/baseline/wrapper.sh
--- a/shm_csr/baseline/wrapper.sh Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,92 +0,0 @@
-#!/bin/bash
-dir="$(cd "$(dirname "$0")" && pwd)"
-
-testID=$1
-species=$2
-substitutionModel=$3
-mutabilityModel=$4
-clonal=$5
-fixIndels=$6
-region=$7
-inputs=$8
-inputs=($inputs)
-IDs=$9
-IDs=($IDs)
-ref=${10}
-output=${11}
-selection=${12}
-output_table=${13}
-outID="result"
-
-echo "$PWD"
-
-echo "testID = $testID"
-echo "species = $species"
-echo "substitutionModel = $substitutionModel"
-echo "mutabilityModel = $mutabilityModel"
-echo "clonal = $clonal"
-echo "fixIndels = $fixIndels"
-echo "region = $region"
-echo "inputs = ${inputs[@]}"
-echo "IDs = ${IDs[@]}"
-echo "ref = $ref"
-echo "output = $output"
-echo "outID = $outID"
-
-fasta="$PWD/baseline.fasta"
-
-
-count=0
-for current in ${inputs[@]}
-do
- f=$(file $current)
- zipType="Zip archive"
- if [[ "$f" == *"Zip archive"* ]] || [[ "$f" == *"XZ compressed data"* ]]
- then
- id=${IDs[$count]}
- echo "id=$id"
- if [[ "$f" == *"Zip archive"* ]] ; then
- echo "Zip archive"
- echo "unzip $input -d $PWD/files/"
- unzip $current -d "$PWD/$id/"
- elif [[ "$f" == *"XZ compressed data"* ]] ; then
- echo "ZX archive"
- echo "tar -xJf $input -C $PWD/files/"
- mkdir -p "$PWD/$id/files"
- tar -xJf $current -C "$PWD/$id/files/"
- fi
- filtered="$PWD/filtered_${id}.txt"
- imgt_1_file="`find $PWD/$id -name '1_*.txt'`"
- imgt_2_file="`find $PWD/$id -name '2_*.txt'`"
- echo "1_Summary file: ${imgt_1_file}"
- echo "2_IMGT-gapped file: ${imgt_2_file}"
- echo "filter.r for $id"
- Rscript $dir/filter.r ${imgt_1_file} ${imgt_2_file} "$selection" $filtered 2>&1
-
- final="$PWD/final_${id}.txt"
- cat $filtered | cut -f2,4,7 > $final
- python $dir/script_imgt.py --input $final --ref $ref --output $fasta --id $id
- else
- python $dir/script_xlsx.py --input $current --ref $ref --output $fasta
- fi
- count=$((count+1))
-done
-workdir="$PWD"
-cd $dir
-echo "file: ${inputs[0]}"
-#Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region ${inputs[0]} $workdir/ $outID 2>&1
-Rscript --verbose $dir/Baseline_Main.r $testID $species $substitutionModel $mutabilityModel $clonal $fixIndels $region $fasta $workdir/ $outID 2>&1
-
-echo "$workdir/${outID}.txt"
-
-rows=`tail -n +2 $workdir/${outID}.txt | grep -v "All sequences combined" | grep -n 'Group' | grep -Eoh '^[0-9]+' | tr '\n' ' '`
-rows=($rows)
-#unset rows[${#rows[@]}-1]
-
-cd $dir
-Rscript --verbose $dir/comparePDFs.r $workdir/${outID}.RData $output ${rows[@]} 2>&1
-cp $workdir/result.txt ${output_table}
-
-
-
-
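This wrapper takes thirteen positional arguments, matching the echo block at its top plus the selection and output_table values. A sketch of a manual call, with every value purely illustrative:

  bash wrapper.sh 1 1 5 5 1 0 1:26:38:55:65:104:116 "sample1.imgt_archive" "sample1" IMGT-reference-seqs-IGHV-2015-11-05.fa baseline.pdf "Sequence.ID,AA.JUNCTION" baseline_table.txt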
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/change_o/change_o_url.txt
--- a/shm_csr/change_o/change_o_url.txt Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,1 +0,0 @@
-https://changeo.readthedocs.io/en/version-0.4.4/
\ No newline at end of file
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/change_o/define_clones.r
--- a/shm_csr/change_o/define_clones.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,15 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-input=args[1]
-output=args[2]
-
-change.o = read.table(input, header=T, sep="\t", quote="", stringsAsFactors=F)
-
-freq = data.frame(table(change.o$CLONE))
-freq2 = data.frame(table(freq$Freq))
-
-freq2$final = as.numeric(freq2$Freq) * as.numeric(as.character(freq2$Var1))
-
-names(freq2) = c("Clone size", "Nr of clones", "Nr of sequences")
-
-write.table(x=freq2, file=output, sep="\t",quote=F,row.names=F,col.names=T)
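define_clones.r condenses the CLONE column of a Change-O clone-pass table into a small clone-size summary. An illustrative call (file names are placeholders):

  Rscript define_clones.r output_clone-pass.tab clone_sizes.txt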
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/change_o/define_clones.sh
--- a/shm_csr/change_o/define_clones.sh Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,39 +0,0 @@
-#!/bin/bash
-dir="$(cd "$(dirname "$0")" && pwd)"
-
-#define_clones.sh $type $input [$mode $act $model $norm $sym $link $dist | $method] $output $output2
-
-type=$1
-input=$2
-
-mkdir -p $PWD/outdir
-
-cp $input $PWD/input.tab #file has to have a ".tab" extension
-
-if [ "bygroup" == "$type" ] ; then
- mode=$3
- act=$4
- model=$5
- norm=$6
- sym=$7
- link=$8
- dist=$9
- output=${10}
- output2=${11}
-
- DefineClones.py -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --mode $mode --act $act --model $model --dist $dist --norm $norm --sym $sym --link $link
-
- Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
-else
- method=$3
- output=$4
- output2=$5
-
- DefineClones.py hclust -d $PWD/input.tab --nproc 4 --outdir $PWD/outdir --outname output --method $method
-
- Rscript $dir/define_clones.r $PWD/outdir/output_clone-pass.tab $output2 2>&1
-fi
-
-cp $PWD/outdir/output_clone-pass.tab $output
-
-rm -rf $PWD/outdir/
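In "bygroup" mode the wrapper passes its options straight through to Change-O's DefineClones.py. A sketch of such a call; the option values here are illustrative choices, not defaults taken from this repository:

  bash define_clones.sh bygroup input.tab gene first ham none min single 0.15 clones.tab clone_sizes.txt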
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/change_o/makedb.sh
--- a/shm_csr/change_o/makedb.sh Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,36 +0,0 @@
-#!/bin/bash
-dir="$(cd "$(dirname "$0")" && pwd)"
-
-input=$1
-noparse=$2
-scores=$3
-regions=$4
-output=$5
-
-if [ "true" == "$noparse" ] ; then
- noparse="--noparse"
-else
- noparse=""
-fi
-
-if [ "true" == "$scores" ] ; then
- scores="--scores"
-else
- scores=""
-fi
-
-if [ "true" == "$regions" ] ; then
- regions="--regions"
-else
- regions=""
-fi
-
-mkdir $PWD/outdir
-
-echo "makedb: $PWD/outdir"
-
-MakeDb.py imgt -i $input --outdir $PWD/outdir --outname output $noparse $scores $regions
-
-mv $PWD/outdir/output_db-pass.tab $output
-
-rm -rf $PWD/outdir/
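makedb.sh expects the IMGT archive, three true/false switches that map to --noparse, --scores and --regions, and the output path. An illustrative call (the archive name is a placeholder):

  bash makedb.sh sample1.imgt_archive false true true parsed_db.tab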
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/change_o/select_first_in_clone.r
--- a/shm_csr/change_o/select_first_in_clone.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,16 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-input.file = args[1]
-output.file = args[2]
-
-print("select_in_first_clone.r")
-print(input.file)
-print(output.file)
-
-input = read.table(input.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
-
-input = input[!duplicated(input$CLONE),]
-
-names(input)[1] = "Sequence.ID"
-
-write.table(input, output.file, quote=F, sep="\t", row.names=F, col.names=T, na="")
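select_first_in_clone.r keeps the first record of every CLONE and renames the first column to Sequence.ID. An illustrative call (file names are placeholders):

  Rscript select_first_in_clone.r output_clone-pass.tab first_of_each_clone.tab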
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/check_unique_id.r
--- a/shm_csr/check_unique_id.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,25 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE) #first argument must be the summary file so it can grab the 
-
-current_file = args[1]
-
-current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F)
-
-if(!("Sequence number" %in% names(current))){
- stop("First argument doesn't contain the 'Sequence number' column")
-}
-
-tbl = table(current[,"Sequence ID"])
-l_tbl = length(tbl)
-check = any(tbl > 1)
-
-#if(l_tbl != nrow(current)){ # non unique IDs?
-if(check){
- print("Sequence.ID is not unique for every sequence, adding sequence number to IDs")
- for(i in 1:length(args)){
- current_file = args[i]
- print(paste("Appending 'Sequence number' column to 'Sequence ID' column in", current_file))
- current = read.table(current_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="", check.names=F)
- current[,"Sequence ID"] = paste(current[,"Sequence ID"], current[,"Sequence number"], sep="_")
- write.table(x = current, file = current_file, quote = F, sep = "\t", na = "", row.names = F, col.names = T)
- }
-}
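check_unique_id.r must be given the 1_Summary file first; if any Sequence ID occurs more than once, it rewrites every listed file in place with the sequence number appended to the ID. An illustrative call (file names are placeholders following IMGT naming):

  Rscript check_unique_id.r 1_Summary.txt 5_AA-sequences.txt 6_Junction.txt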
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/datatypes_conf.xml
--- a/shm_csr/datatypes_conf.xml Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<datatypes>
-    <registration>
-        <datatype extension="imgt_archive" type="galaxy.datatypes.binary:CompressedArchive" display_in_upload="True" subclass="True"/>
-    </registration>
-</datatypes>
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/gene_identification.py
--- a/shm_csr/gene_identification.py Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,226 +0,0 @@\n-import re\n-import argparse\n-import time\n-starttime= int(time.time() * 1000)\n-\n-parser = argparse.ArgumentParser()\n-parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file")\n-parser.add_argument("--output", help="The annotated output file to be merged back with the summary file")\n-\n-args = parser.parse_args()\n-\n-infile = args.input\n-#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt"\n-output = args.output\n-#outfile = "identified.txt"\n-\n-dic = dict()\n-total = 0\n-\n-\n-first = True\n-IDIndex = 0\n-seqIndex = 0\n-\n-with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n-\tfor line in f:\n-\t\ttotal += 1\n-\t\tlinesplt = line.split("\\t")\n-\t\tif first:\n-\t\t\tprint "linesplt", linesplt\n-\t\t\tIDIndex = linesplt.index("Sequence ID")\n-\t\t\tseqIndex = linesplt.index("Sequence")\n-\t\t\tfirst = False\n-\t\t\tcontinue\n-\t\t\n-\t\tID = linesplt[IDIndex]\n-\t\tif len(linesplt) < 28: #weird rows without a sequence\n-\t\t\tdic[ID] = ""\n-\t\telse:\n-\t\t\tdic[ID] = linesplt[seqIndex]\n-\t\t\t\n-print "Number of input sequences:", len(dic)\n-\n-#old cm sequence: gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc\n-#old cg sequence: ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccag\n-\n-#lambda/kappa reference sequence\n-searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctgg",\n-                 "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggcc",\n-                 "ce": "gcctccacacagagcccatccgtcttccccttgacccgctgctgcaaaaacattccctcc",\n-                 "cm": "gggagtgcatccgccccaacc"} #new (shorter) cm sequence\n-\n-compiledregex = {"ca": [],\n-                 "cg": [],\n-                 "ce": [],\n-                 "cm": []}\n-\n-#lambda/kappa reference sequence variable nucleotides\n-ca1 = {38: \'t\', 39: \'g\', 48: \'a\', 49: \'g\', 51: \'c\', 68: \'a\', 73: \'c\'}\n-ca2 = {38: \'g\', 39: \'a\', 48: \'c\', 49: \'c\', 51: \'a\', 68: \'g\', 73: \'a\'}\n-cg1 = {0: \'c\', 33: \'a\', 38: \'c\', 44: \'a\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n-cg2 = {0: \'c\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'g\', 132: \'t\'}\n-cg3 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'t\', 56: \'g\', 58: \'g\', 66: \'g\', 132: \'c\'}\n-cg4 = {0: \'t\', 33: \'g\', 38: \'g\', 44: \'g\', 54: \'c\', 56: \'a\', 58: \'a\', 66: \'c\', 132: \'c\'}\n-\n-#remove last snp for shorter cg sequence --- note, also change varsInCG\n-del cg1[132]\n-del cg2[132]\n-del cg3[132]\n-del cg4[132]\n-\n-#reference sequences are cut into smaller parts of \'chunklength\' length, and with \'chunklength\' / 2 overlap\n-chunklength = 8\n-\n-#create the chunks of the reference sequence with regular expressions for the variable nucleotides\n-for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2):\n-  pos = i\n-  chunk = searchstrings["ca"][i:i+chunklength]\n-  result = ""\n-  varsInResult = 0\n-  for c in chunk:\n-    if pos in ca1.keys():\n-      varsInResult += 1\n-      result += "[" + ca1[pos] + ca2[pos] + "]"\n-    else:\n-      result += c\n-    pos += 1\n-  compiledregex["ca"].append((re.compile(result), varsInResult))\n-\n-for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2):\n-  pos = i\n-  chunk = searchstrings["cg"][i:i+chunklength]\n-  result = 
""\n-  varsInResult = 0\n-  for c in chunk:\n-    if pos in cg1.keys():\n-      varsInResult += 1\n-      result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]"\n-    else:\n-      result += c\n-    pos += 1\n-  compiledregex["cg"].append((re.compile(result), varsInResult))\n-\n-for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2):\n-  compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False))\n-\n-for i in range(0, len(searchstrings["ce"]) - chunklength + 1, chunklength / 2):\n-  compiledregex["ce"].append((re.compile(searchstrings["ce"][i:i+chunklength]), False))\n-\n-def removeAndReturnMaxIndex(x): #simplifies a list compr'..b'kstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]])\n-\t\t\t\t\telse: #key == "cm" #no variable regions in \'cm\' or \'ce\'\n-\t\t\t\t\t\tpass\n-\t\t\t\tbreak #this only breaks when there was a match with the regex, breaking means the \'else:\' clause is skipped\n-\t\t\telse: #only runs if there were no hits\n-\t\t\t\tcontinue\n-\t\t\t#print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it\'s now:", start[lastindex - chunklength / 2 * i]\n-\t\t\tcurrentIDHits[key + "_hits"] += 1\n-\t\tstart_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1 - start_zero) for x in range(5) if len(start) > 0 and max(start) > 1])\n-\t\t#start_location[ID + "_" + key] = str(start.index(max(start)))\n-\n-\n-varsInCA = float(len(ca1.keys()) * 2)\n-varsInCG = float(len(cg1.keys()) * 2) - 2 # -2 because the sliding window doesn\'t hit the first and last nt twice\n-varsInCM = 0\n-varsInCE = 0\n-\n-def round_int(val):\n-\treturn int(round(val))\n-\n-first = True\n-seq_write_count=0\n-with open(infile, \'r\') as f: #read all sequences into a dictionary as key = ID, value = sequence\n-\twith open(output, \'w\') as o:\n-\t\tfor line in f:\n-\t\t\ttotal += 1\n-\t\t\tif first:\n-\t\t\t\to.write("Sequence ID\\tbest_match\\tnt_hit_percentage\\tchunk_hit_percentage\\tstart_locations\\n")\n-\t\t\t\tfirst = False\n-\t\t\t\tcontinue\n-\t\t\tlinesplt = line.split("\\t")\n-\t\t\tif linesplt[2] == "No results":\n-\t\t\t\tpass\n-\t\t\tID = linesplt[1]\n-\t\t\tcurrentIDHits = hits[ID]\n-\t\t\tpossibleca = float(len(compiledregex["ca"]))\n-\t\t\tpossiblecg = float(len(compiledregex["cg"]))\n-\t\t\tpossiblecm = float(len(compiledregex["cm"]))\n-\t\t\tpossiblece = float(len(compiledregex["ce"]))\n-\t\t\tcahits = currentIDHits["ca_hits"]\n-\t\t\tcghits = currentIDHits["cg_hits"]\n-\t\t\tcmhits = currentIDHits["cm_hits"]\n-\t\t\tcehits = currentIDHits["ce_hits"]\n-\t\t\tif cahits >= cghits and cahits >= cmhits and cahits >= cehits: #its a ca gene\n-\t\t\t\tca1hits = currentIDHits["ca1"]\n-\t\t\t\tca2hits = currentIDHits["ca2"]\n-\t\t\t\tif ca1hits >= ca2hits:\n-\t\t\t\t\to.write(ID + "\\tIGA1\\t" + str(round_int(ca1hits / varsInCA * 100)) + "\\t" + str(round_int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n-\t\t\t\telse:\n-\t\t\t\t\to.write(ID + "\\tIGA2\\t" + str(round_int(ca2hits / varsInCA * 100)) + "\\t" + str(round_int(cahits / possibleca * 100)) + "\\t" + start_location[ID + "_ca"] + "\\n")\n-\t\t\telif cghits >= cahits and cghits >= cmhits and cghits >= cehits: #its a cg gene\n-\t\t\t\tcg1hits = currentIDHits["cg1"]\n-\t\t\t\tcg2hits = currentIDHits["cg2"]\n-\t\t\t\tcg3hits = currentIDHits["cg3"]\n-\t\t\t\tcg4hits = currentIDHits["cg4"]\n-\t\t\t\tif cg1hits >= cg2hits and cg1hits 
>= cg3hits and cg1hits >= cg4hits: #cg1 gene\n-\t\t\t\t\to.write(ID + "\\tIGG1\\t" + str(round_int(cg1hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n-\t\t\t\telif cg2hits >= cg1hits and cg2hits >= cg3hits and cg2hits >= cg4hits: #cg2 gene\n-\t\t\t\t\to.write(ID + "\\tIGG2\\t" + str(round_int(cg2hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n-\t\t\t\telif cg3hits >= cg1hits and cg3hits >= cg2hits and cg3hits >= cg4hits: #cg3 gene\n-\t\t\t\t\to.write(ID + "\\tIGG3\\t" + str(round_int(cg3hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n-\t\t\t\telse: #cg4 gene\n-\t\t\t\t\to.write(ID + "\\tIGG4\\t" + str(round_int(cg4hits / varsInCG * 100)) + "\\t" + str(round_int(cghits / possiblecg * 100)) + "\\t" + start_location[ID + "_cg"] + "\\n")\n-\t\t\telse: #its a cm or ce gene\n-\t\t\t\tif cmhits >= cehits:\n-\t\t\t\t\to.write(ID + "\\tIGM\\t100\\t" + str(round_int(cmhits / possiblecm * 100)) + "\\t" + start_location[ID + "_cm"] + "\\n")\n-\t\t\t\telse:\n-\t\t\t\t\to.write(ID + "\\tIGE\\t100\\t" + str(round_int(cehits / possiblece * 100)) + "\\t" + start_location[ID + "_ce"] + "\\n")\n-\t\t\tseq_write_count += 1\n-\n-print "Time: %i" % (int(time.time() * 1000) - starttime)\n-\n-print "Number of sequences written to file:", seq_write_count\n-\n-\n-\n-\n-\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/imgt_loader.r
--- a/shm_csr/imgt_loader.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,98 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-summ.file = args[1]
-aa.file = args[2]
-junction.file = args[3]
-out.file = args[4]
-
-summ = read.table(summ.file, sep="\t", header=T, quote="", fill=T)
-aa = read.table(aa.file, sep="\t", header=T, quote="", fill=T)
-junction = read.table(junction.file, sep="\t", header=T, quote="", fill=T)
-
-fix_column_names = function(df){
-    if("V.DOMAIN.Functionality" %in% names(df)){
-        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"
-        print("found V.DOMAIN.Functionality, changed")
-    }
-    if("V.DOMAIN.Functionality.comment" %in% names(df)){
-        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"
-        print("found V.DOMAIN.Functionality.comment, changed")
-    }
-    return(df)
-}
-
-summ = fix_column_names(summ)
-aa = fix_column_names(aa)
-junction = fix_column_names(junction)
-
-old_summary_columns=c('Sequence.ID','JUNCTION.frame','V.GENE.and.allele','D.GENE.and.allele','J.GENE.and.allele','CDR1.IMGT.length','CDR2.IMGT.length','CDR3.IMGT.length','Orientation')
-old_sequence_columns=c('CDR1.IMGT','CDR2.IMGT','CDR3.IMGT')
-old_junction_columns=c('JUNCTION')
-
-added_summary_columns=c('Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence')
-added_sequence_columns=c('FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT')
-
-added_junction_columns=c('P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION')
-added_junction_columns=c(added_junction_columns, 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
-
-out=summ[,c("Sequence.ID","JUNCTION.frame","V.GENE.and.allele","D.GENE.and.allele","J.GENE.and.allele")]
-
-out[,"CDR1.Seq"] = aa[,"CDR1.IMGT"]
-out[,"CDR1.Length"] = summ[,"CDR1.IMGT.length"]
-
-out[,"CDR2.Seq"] = aa[,"CDR2.IMGT"]
-out[,"CDR2.Length"] = summ[,"CDR2.IMGT.length"]
-
-out[,"CDR3.Seq"] = aa[,"CDR3.IMGT"]
-out[,"CDR3.Length"] = summ[,"CDR3.IMGT.length"]
-
-out[,"CDR3.Seq.DNA"] = junction[,"JUNCTION"]
-out[,"CDR3.Length.DNA"] = nchar(as.character(junction[,"JUNCTION"]))
-out[,"Strand"] = summ[,"Orientation"]
-out[,"CDR3.Found.How"] = "a"
-
-out[,added_summary_columns] = summ[,added_summary_columns]
-
-out[,added_sequence_columns] = aa[,added_sequence_columns]
-
-out[,added_junction_columns] = junction[,added_junction_columns]
-
-out[,"Top V Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"V.GENE.and.allele"]))
-out[,"Top D Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"D.GENE.and.allele"]))
-out[,"Top J Gene"] = gsub(".* ", "", gsub("\\*.*", "", summ[,"J.GENE.and.allele"]))
-
-out = out[,c('Sequence.ID','JUNCTION.frame','Top V Gene','Top D Gene','Top J Gene','CDR1.Seq','CDR1.Length','CDR2.Seq','CDR2.Length','CDR3.Seq','CDR3.Length','CDR3.Seq.DNA','CDR3.Length.DNA','Strand','CDR3.Found.How','Functionality','V.REGION.identity..','V.REGION.identity.nt','D.REGION.reading.frame','AA.JUNCTION','Functionality.comment','Sequence','FR1.IMGT','FR2.IMGT','FR3.IMGT','CDR3.IMGT','JUNCTION','J.REGION','FR4.IMGT','P3.V.nt.nb','N.REGION.nt.nb','N1.REGION.nt.nb','P5.D.nt.nb','P3.D.nt.nb','N2.REGION.nt.nb','P5.J.nt.nb','X3.V.REGION.trimmed.nt.nb','X5.D.REGION.trimmed.nt.nb','X3.D.REGION.trimmed.nt.nb','X5.J.REGION.trimmed.nt.nb','N.REGION','N1.REGION','N2.REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')]
-
-names(out) = c('ID','VDJ Frame','Top V Gene','Top D Gene','Top J Gene','CDR1 Seq','CDR1 Length','CDR2 Seq','CDR2 Length','CDR3 Seq','CDR3 Length','CDR3 Seq DNA','CDR3 Length DNA','Strand','CDR3 Found How','Functionality','V-REGION identity %','V-REGION identity nt','D-REGION reading frame','AA JUNCTION','Functionality comment','Sequence','FR1-IMGT','FR2-IMGT','FR3-IMGT','CDR3-IMGT','JUNCTION','J-REGION','FR4-IMGT','P3V-nt nb','N-REGION-nt nb','N1-REGION-nt nb','P5D-nt nb','P3D-nt nb','N2-REGION-nt nb','P5J-nt nb','3V-REGION trimmed-nt nb','5D-REGION trimmed-nt nb','3D-REGION trimmed-nt nb','5J-REGION trimmed-nt nb','N-REGION','N1-REGION','N2-REGION', 'P5.D1.nt.nb', 'P3.D1.nt.nb', 'N2.REGION.nt.nb', 'P5.D2.nt.nb', 'P3.D2.nt.nb', 'N3.REGION.nt.nb', 'P5.D3.nt.nb', 'P3.D2.nt.nb', 'N4.REGION.nt.nb', 'X5.D1.REGION.trimmed.nt.nb', 'X3.D1.REGION.trimmed.nt.nb', 'X5.D2.REGION.trimmed.nt.nb', 'X3.D2.REGION.trimmed.nt.nb', 'X5.D3.REGION.trimmed.nt.nb', 'X3.D3.REGION.trimmed.nt.nb', 'D.REGION.nt.nb', 'D1.REGION.nt.nb', 'D2.REGION.nt.nb', 'D3.REGION.nt.nb')
-
-out[,"VDJ Frame"] = as.character(out[,"VDJ Frame"])
-
-fltr = out[,"VDJ Frame"] == "in-frame"
-if(any(fltr, na.rm = T)){
- out[fltr, "VDJ Frame"] = "In-frame"
-}
-
-fltr = out[,"VDJ Frame"] == "null"
-if(any(fltr, na.rm = T)){
- out[fltr, "VDJ Frame"] = "Out-of-frame"
-}
-
-fltr = out[,"VDJ Frame"] == "out-of-frame"
-if(any(fltr, na.rm = T)){
- out[fltr, "VDJ Frame"] = "Out-of-frame"
-}
-
-fltr = out[,"VDJ Frame"] == ""
-if(any(fltr, na.rm = T)){
- out[fltr, "VDJ Frame"] = "Out-of-frame"
-}
-
-for(col in c('Top V Gene','Top D Gene','Top J Gene')){
- out[,col] = as.character(out[,col])
- fltr = out[,col] == ""
- if(any(fltr, na.rm = T)){
- out[fltr,col] = "NA"
- }
-}
-
-write.table(out, out.file, sep="\t", quote=F, row.names=F, col.names=T)
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/merge.r
--- a/shm_csr/merge.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,27 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-input.1 = args[1]
-input.2 = args[2]
-
-fields.1 = args[3]
-fields.2 = args[4]
-
-field.1 = args[5]
-field.2 = args[6]
-
-output = args[7]
-
-dat1 = read.table(input.1, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL)
-if(fields.1 != "all"){
- fields.1 = unlist(strsplit(fields.1, ","))
- dat1 = dat1[,fields.1]
-}
-dat2 = read.table(input.2, header=T, sep="\t", quote="", stringsAsFactors=F, fill=T, row.names=NULL)
-if(fields.2 != "all"){
- fields.2 = unlist(strsplit(fields.2, ","))
- dat2 = dat2[,fields.2]
-}
-
-dat3 = merge(dat1, dat2, by.x=field.1, by.y=field.2)
-
-write.table(dat3, output, sep="\t",quote=F,row.names=F,col.names=T)
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/merge_and_filter.r
--- a/shm_csr/merge_and_filter.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,304 +0,0 @@\n-args <- commandArgs(trailingOnly = TRUE)\r\n-\r\n-\r\n-summaryfile = args[1]\r\n-sequencesfile = args[2]\r\n-mutationanalysisfile = args[3]\r\n-mutationstatsfile = args[4]\r\n-hotspotsfile = args[5]\r\n-aafile = args[6]\r\n-gene_identification_file= args[7]\r\n-output = args[8]\r\n-before.unique.file = args[9]\r\n-unmatchedfile = args[10]\r\n-method=args[11]\r\n-functionality=args[12]\r\n-unique.type=args[13]\r\n-filter.unique=args[14]\r\n-filter.unique.count=as.numeric(args[15])\r\n-class.filter=args[16]\r\n-empty.region.filter=args[17]\r\n-\r\n-print(paste("filter.unique.count:", filter.unique.count))\r\n-\r\n-summ = read.table(summaryfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-sequences = read.table(sequencesfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-mutationstats = read.table(mutationstatsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-hotspots = read.table(hotspotsfile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-AAs = read.table(aafile, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-gene_identification = read.table(gene_identification_file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\r\n-\r\n-fix_column_names = function(df){\r\n-    if("V.DOMAIN.Functionality" %in% names(df)){\r\n-        names(df)[names(df) == "V.DOMAIN.Functionality"] = "Functionality"\r\n-        print("found V.DOMAIN.Functionality, changed")\r\n-    }\r\n-    if("V.DOMAIN.Functionality.comment" %in% names(df)){\r\n-        names(df)[names(df) == "V.DOMAIN.Functionality.comment"] = "Functionality.comment"\r\n-        print("found V.DOMAIN.Functionality.comment, changed")\r\n-    }\r\n-    return(df)\r\n-}\r\n-\r\n-fix_non_unique_ids = function(df){\r\n-\tdf$Sequence.ID = paste(df$Sequence.ID, 1:nrow(df))\r\n-\treturn(df)\r\n-}\r\n-\r\n-summ = fix_column_names(summ)\r\n-sequences = fix_column_names(sequences)\r\n-mutationanalysis = fix_column_names(mutationanalysis)\r\n-mutationstats = fix_column_names(mutationstats)\r\n-hotspots = fix_column_names(hotspots)\r\n-AAs = fix_column_names(AAs)\r\n-\r\n-if(method == "blastn"){\r\n-\t#"qseqid\\tsseqid\\tpident\\tlength\\tmismatch\\tgapopen\\tqstart\\tqend\\tsstart\\tsend\\tevalue\\tbitscore"\r\n-\tgene_identification = gene_identification[!duplicated(gene_identification$qseqid),]\r\n-\tref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52))\r\n-\tgene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T)\r\n-\tgene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100\r\n-\tgene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")]\r\n-\tcolnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")\r\n-}\r\n-\r\n-#print("Summary analysis files columns")\r\n-#print(names(summ))\r\n-\r\n-\r\n-\r\n-input.sequence.count = nrow(summ)\r\n-print(paste("Number of sequences in summary file:", input.sequence.count))\r\n-\r\n-filtering.steps = data.frame(character(0), numeric(0))\r\n-\r\n-filtering.steps = rbind(filtering.steps, c("Input", input.sequence.count))\r\n-\r\n-filtering.steps[,1] = as.character(filtering.steps[,1])\r\n-filtering.steps[,2] = 
as.character(filtering.steps[,2])\r\n-#filtering.steps[,3] = as.numeric(filtering.steps[,3])\r\n-\r\n-#print("summary files columns")\r\n-#print(names(summ))\r\n-\r\n-summ = merge(summ, gene_identification, by="Sequence.ID")\r\n-\r\n-print(paste("Number of sequences after merging with gene identification:", nrow(summ)))\r\n-\r\n-summ = summ[summ$Functionality != "No results",]\r\n-\r\n-print(paste("Number of sequences after \'No results\' filter:", nrow(summ)))\r\n-\r\n-filtering.steps = rbind(filtering.steps, c("After \'No results\' filter", nrow(summ)))\r\n-\r\n-if(functionality == "productive"){\r\n-\tsumm = summ[summ$Functionality == "productive (see comment)" | summ$Functionality'..b'ion.filter == "leader"){\r\n-\t\tresult$unique.def = paste(result$FR1.IMGT.seq, result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n-\t} else if(empty.region.filter == "FR1"){\r\n-\t\tresult$unique.def = paste(result$CDR1.IMGT.seq, result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n-\t} else if(empty.region.filter == "CDR1"){\r\n-\t\tresult$unique.def = paste(result$FR2.IMGT.seq, result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n-\t} else if(empty.region.filter == "FR2"){\r\n-\t\tresult$unique.def = paste(result$CDR2.IMGT.seq, result$FR3.IMGT.seq, result$CDR3.IMGT.seq)\r\n-\t}\r\n-\t\r\n-\tif(grepl("remove", filter.unique)){\r\n-\t\tresult = result[duplicated(result$unique.def) | duplicated(result$unique.def, fromLast=T),]\r\n-\t\tunique.defs = data.frame(table(result$unique.def))\r\n-\t\tunique.defs = unique.defs[unique.defs$Freq >= filter.unique.count,]\r\n-\t\tresult = result[result$unique.def %in% unique.defs$Var1,]\r\n-\t}\r\n-\r\n-\tif(filter.unique != "remove_vjaa"){\r\n-\t\tresult$unique.def = paste(result$unique.def, gsub(",.*", "", result$best_match)) #keep the unique sequences that are in multiple classes, gsub so the unmatched don\'t have a class after it\r\n-\t}\r\n-\r\n-\tresult = result[!duplicated(result$unique.def),]\r\n-}\r\n-\r\n-write.table(result, gsub("before_unique_filter.txt", "after_unique_filter.txt", before.unique.file), sep="\\t", quote=F,row.names=F,col.names=T)\r\n-\r\n-filtering.steps = rbind(filtering.steps, c("After filter unique sequences", nrow(result)))\r\n-\r\n-print(paste("Number of sequences in result after unique filtering:", nrow(result)))\r\n-\r\n-if(nrow(summ) == 0){\r\n-\tstop("No data remaining after filter")\r\n-}\r\n-\r\n-result$best_match_class = gsub(",.*", "", result$best_match) #gsub so the unmatched don\'t have a class after it\r\n-\r\n-#result$past = ""\r\n-#cls = unlist(strsplit(unique.type, ","))\r\n-#for (i in 1:nrow(result)){\r\n-#\tresult[i,"past"] = paste(result[i,cls], collapse=":")\r\n-#}\r\n-\r\n-\r\n-\r\n-result$past = do.call(paste, c(result[unlist(strsplit(unique.type, ","))], sep = ":"))\r\n-\r\n-result.matched = result[!grepl("unmatched", result$best_match),]\r\n-result.unmatched = result[grepl("unmatched", result$best_match),]\r\n-\r\n-result = rbind(result.matched, result.unmatched)\r\n-\r\n-result = result[!(duplicated(result$past)), ]\r\n-\r\n-result = result[,!(names(result) %in% c("past", "best_match_class"))]\r\n-\r\n-print(paste("Number of sequences in result after", unique.type, "filtering:", nrow(result)))\r\n-\r\n-filtering.steps = rbind(filtering.steps, c("After remove duplicates based on filter", nrow(result)))\r\n-\r\n-unmatched = result[grepl("^unmatched", result$best_match),c("Sequence.ID", 
"chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")]\r\n-\r\n-print(paste("Number of rows in result:", nrow(result)))\r\n-print(paste("Number of rows in unmatched:", nrow(unmatched)))\r\n-\r\n-matched.sequences = result[!grepl("^unmatched", result$best_match),]\r\n-\r\n-write.table(x=matched.sequences, file=gsub("merged.txt$", "filtered.txt", output), sep="\\t",quote=F,row.names=F,col.names=T)\r\n-\r\n-matched.sequences.count = nrow(matched.sequences)\r\n-unmatched.sequences.count = sum(grepl("^unmatched", result$best_match))\r\n-if(matched.sequences.count <= unmatched.sequences.count){\r\n-\tprint("WARNING NO MATCHED (SUB)CLASS SEQUENCES!!")\r\n-}\r\n-\r\n-filtering.steps = rbind(filtering.steps, c("Number of matched sequences", matched.sequences.count))\r\n-filtering.steps = rbind(filtering.steps, c("Number of unmatched sequences", unmatched.sequences.count))\r\n-filtering.steps[,2] = as.numeric(filtering.steps[,2])\r\n-filtering.steps$perc = round(filtering.steps[,2] / input.sequence.count * 100, 2)\r\n-\r\n-write.table(x=filtering.steps, file=gsub("unmatched", "filtering_steps", unmatchedfile), sep="\\t",quote=F,row.names=F,col.names=F)\r\n-\r\n-write.table(x=result, file=output, sep="\\t",quote=F,row.names=F,col.names=T)\r\n-write.table(x=unmatched, file=unmatchedfile, sep="\\t",quote=F,row.names=F,col.names=T)\r\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/mutation_column_checker.py
--- a/shm_csr/mutation_column_checker.py Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,27 +0,0 @@
-import re
-
-mutationMatcher = re.compile(r"^([nactg])(\d+).([nactg]),?[ ]?([A-Z])?(\d+)?[>]?([A-Z;])?(.*)?")
-
-with open("7_V-REGION-mutation-and-AA-change-table.txt", 'r') as file_handle:
-    first = True
-    fr3_index = -1
-    for i, line in enumerate(file_handle):
-        line_split = line.split("\t")
-        if first:
-            fr3_index = line_split.index("FR3-IMGT")
-            first = False
-            continue
-
-        if len(line_split) <= fr3_index:
-            continue
-        
-        fr3_data = line_split[fr3_index]
-        if len(fr3_data) > 5:
-            try:
-                test = [mutationMatcher.match(x).groups() for x in fr3_data.split("|") if x]
-            except:
-                print(line_split[1])
-                print("Something went wrong at line {line} with:".format(line=line_split[0]))
-                #print([x for x in fr3_data.split("|") if not mutationMatcher.match(x)])
-        if i % 100000 == 0:
-            print(i)
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/naive_output.r
--- a/shm_csr/naive_output.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,45 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-naive.file = args[1]
-shm.file = args[2]
-output.file.ca = args[3]
-output.file.cg = args[4]
-output.file.cm = args[5]
-
-naive = read.table(naive.file, sep="\t", header=T, quote="", fill=T)
-shm.merge = read.table(shm.file, sep="\t", header=T, quote="", fill=T)
-
-
-final = merge(naive, shm.merge[,c("Sequence.ID", "best_match")], by.x="ID", by.y="Sequence.ID")
-print(paste("nrow final:", nrow(final)))
-names(final)[names(final) == "best_match"] = "Sample"
-final.numeric = final[,sapply(final, is.numeric)]
-final.numeric[is.na(final.numeric)] = 0
-final[,sapply(final, is.numeric)] = final.numeric
-
-final.ca = final[grepl("^ca", final$Sample),]
-final.cg = final[grepl("^cg", final$Sample),]
-final.cm = final[grepl("^cm", final$Sample),]
-
-if(nrow(final.ca) > 0){
- final.ca$Replicate = 1
-}
-
-if(nrow(final.cg) > 0){
- final.cg$Replicate = 1
-}
-
-if(nrow(final.cm) > 0){
- final.cm$Replicate = 1
-}
-
-#print(paste("nrow final:", nrow(final)))
-#final2 = final
-#final2$Sample = gsub("[0-9]", "", final2$Sample)
-#final = rbind(final, final2)
-#final$Replicate = 1
-
-write.table(final.ca, output.file.ca, quote=F, sep="\t", row.names=F, col.names=T)
-write.table(final.cg, output.file.cg, quote=F, sep="\t", row.names=F, col.names=T)
-write.table(final.cm, output.file.cm, quote=F, sep="\t", row.names=F, col.names=T)
-
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/new_imgt.r
--- a/shm_csr/new_imgt.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,40 +0,0 @@
-args <- commandArgs(trailingOnly = TRUE)
-
-imgt.dir = args[1]
-merged.file = args[2]
-gene = args[3]
-
-merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, comment.char="", quote="")
-
-if(!("Sequence.ID" %in% names(merged))){ #change-o db
- print("Change-O DB changing 'SEQUENCE_ID' to 'Sequence.ID'")
- names(merged)[which(names(merged) == "SEQUENCE_ID")] = "Sequence.ID"
-}
-
-if(gene != "-"){
- merged = merged[grepl(paste("^", gene, sep=""), merged$best_match),]
-}
-
-if("best_match" %in% names(merged)){
- merged = merged[!grepl("unmatched", merged$best_match),]
-}
-
-nrow_dat = 0
-
-for(f in list.files(imgt.dir, pattern="*.txt$")){
- #print(paste("filtering", f))
- path = file.path(imgt.dir, f)
- dat = read.table(path, header=T, sep="\t", fill=T, quote="", stringsAsFactors=F, check.names=FALSE, comment.char="")
-
- dat = dat[dat[,"Sequence ID"] %in% merged$Sequence.ID,]
-
- nrow_dat = nrow(dat)
-
- if(nrow(dat) > 0 & grepl("^8_", f)){ #change the FR1 columns to 0 in the "8_..." file
- dat[,grepl("^FR1", names(dat))] = 0
- }
-
- write.table(dat, path, quote=F, sep="\t", row.names=F, col.names=T, na="")
-}
-
-print(paste("Creating new zip for ", gene, "with", nrow_dat, "sequences"))
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/pattern_plots.r
--- a/shm_csr/pattern_plots.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,178 +0,0 @@
-library(ggplot2)
-library(reshape2)
-library(scales)
-
-args <- commandArgs(trailingOnly = TRUE)
-
-input.file = args[1] #the data that gets turned into the "SHM overview" table in the html report ("data_sum.txt")
-
-plot1.path = args[2]
-plot1.png = paste(plot1.path, ".png", sep="")
-plot1.txt = paste(plot1.path, ".txt", sep="")
-plot1.pdf = paste(plot1.path, ".pdf", sep="")
-
-plot2.path = args[3]
-plot2.png = paste(plot2.path, ".png", sep="")
-plot2.txt = paste(plot2.path, ".txt", sep="")
-plot2.pdf = paste(plot2.path, ".pdf", sep="")
-
-plot3.path = args[4]
-plot3.png = paste(plot3.path, ".png", sep="")
-plot3.txt = paste(plot3.path, ".txt", sep="")
-plot3.pdf = paste(plot3.path, ".pdf", sep="")
-
-clean.output = args[5]
-
-dat = read.table(input.file, header=F, sep=",", quote="", stringsAsFactors=F, fill=T, row.names=1)
-
-classes = c("IGA", "IGA1", "IGA2", "IGG", "IGG1", "IGG2", "IGG3", "IGG4", "IGM", "IGE")
-xyz = c("x", "y", "z")
-new.names = c(paste(rep(classes, each=3), xyz, sep="."), paste("un", xyz, sep="."), paste("all", xyz, sep="."))
-
-names(dat) = new.names
-
-clean.dat = dat
-clean.dat = clean.dat[,c(paste(rep(classes, each=3), xyz, sep="."), paste("all", xyz, sep="."), paste("un", xyz, sep="."))]
-
-write.table(clean.dat, clean.output, quote=F, sep="\t", na="", row.names=T, col.names=NA)
-
-dat["RGYW.WRCY",] = colSums(dat[c(14,15),], na.rm=T)
-dat["TW.WA",] = colSums(dat[c(16,17),], na.rm=T)
-
-data1 = dat[c("RGYW.WRCY", "TW.WA"),]
-
-data1 = data1[,names(data1)[grepl(".z", names(data1))]]
-names(data1) = gsub("\\..*", "", names(data1))
-
-data1 = melt(t(data1))
-
-names(data1) = c("Class", "Type", "value")
-
-chk = is.na(data1$value)
-if(any(chk)){
- data1[chk, "value"] = 0
-}
-
-data1 = data1[order(data1$Type),]
-
-write.table(data1, plot1.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
-
-p = ggplot(data1, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of mutations") + guides(fill=guide_legend(title=NULL)) + ggtitle("Percentage of mutations in AID and pol eta motifs")
-p = p + theme(panel.background = element_rect(fill = "white", colour="black"),text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("RGYW.WRCY" = "white", "TW.WA" = "blue4"))
-#p = p + scale_colour_manual(values=c("RGYW.WRCY" = "black", "TW.WA" = "blue4"))
-png(filename=plot1.png, width=510, height=300)
-print(p)
-dev.off()
-
-ggsave(plot1.pdf, p)
-
-data2 = dat[c(1, 5:8),]
-
-data2 = data2[,names(data2)[grepl("\\.x", names(data2))]]
-names(data2) = gsub(".x", "", names(data2))
-
-data2["A/T",] = dat["Targeting of A T (%)",names(dat)[grepl("\\.z", names(dat))]]
-
-data2["G/C transitions",] = round(data2["Transitions at G C (%)",] / data2["Number of Mutations (%)",] * 100, 1)
-
-data2["mutation.at.gc",] = dat["Transitions at G C (%)",names(dat)[grepl("\\.y", names(dat))]]
-data2["G/C transversions",] = round((data2["mutation.at.gc",] - data2["Transitions at G C (%)",]) / data2["Number of Mutations (%)",] * 100, 1)
-
-data2["G/C transversions",is.nan(unlist(data2["G/C transversions",]))] = 0
-data2["G/C transversions",is.infinite(unlist(data2["G/C transversions",]))] = 0
-data2["G/C transitions",is.nan(unlist(data2["G/C transitions",]))] = 0
-data2["G/C transitions",is.infinite(unlist(data2["G/C transitions",]))] = 0
-
-data2 = melt(t(data2[c("A/T","G/C transitions","G/C transversions"),]))
-
-names(data2) = c("Class", "Type", "value")
-
-chk = is.na(data2$value)
-if(any(chk)){
- data2[chk, "value"] = 0
-}
-
-data2 = data2[order(data2$Type),]
-
-write.table(data2, plot2.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
-
-p = ggplot(data2, aes(x=Class, y=value, fill=Type)) + geom_bar(position="fill", stat="identity", colour = "black") + scale_y_continuous(labels=percent_format()) + guides(fill=guide_legend(title=NULL)) + ylab("% of mutations") + ggtitle("Relative mutation patterns")
-p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
-#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
-png(filename=plot2.png, width=480, height=300)
-print(p)
-dev.off()
-
-ggsave(plot2.pdf, p)
-
-data3 = dat[c(5, 6, 8, 18:21),]
-data3 = data3[,names(data3)[grepl("\\.x", names(data3))]]
-names(data3) = gsub(".x", "", names(data3))
-
-data3["G/C transitions",] = round(data3["Transitions at G C (%)",] / (data3["C",] + data3["G",]) * 100, 1)
-
-data3["G/C transversions",] = round((data3["Targeting of G C (%)",] - data3["Transitions at G C (%)",]) / (data3["C",] + data3["G",]) * 100, 1)
-
-data3["A/T",] = round(data3["Targeting of A T (%)",] / (data3["A",] + data3["T",]) * 100, 1)
-
-data3["G/C transitions",is.nan(unlist(data3["G/C transitions",]))] = 0
-data3["G/C transitions",is.infinite(unlist(data3["G/C transitions",]))] = 0
-
-data3["G/C transversions",is.nan(unlist(data3["G/C transversions",]))] = 0
-data3["G/C transversions",is.infinite(unlist(data3["G/C transversions",]))] = 0
-
-data3["A/T",is.nan(unlist(data3["A/T",]))] = 0
-data3["A/T",is.infinite(unlist(data3["A/T",]))] = 0
-
-data3 = melt(t(data3[8:10,]))
-names(data3) = c("Class", "Type", "value")
-
-chk = is.na(data3$value)
-if(any(chk)){
- data3[chk, "value"] = 0
-}
-
-data3 = data3[order(data3$Type),]
-
-write.table(data3, plot3.txt, quote=F, sep="\t", na="", row.names=F, col.names=T)
-
-p = ggplot(data3, aes(Class, value)) + geom_bar(aes(fill=Type), stat="identity", position="dodge", colour = "black") + ylab("% of nucleotides") + guides(fill=guide_legend(title=NULL)) + ggtitle("Absolute mutation patterns")
-p = p + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=15, colour="black"), axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "white"))
-#p = p + scale_colour_manual(values=c("A/T" = "blue4", "G/C transversions" = "gray74", "G/C transitions" = "black"))
-png(filename=plot3.png, width=480, height=300)
-print(p)
-dev.off()
-
-ggsave(plot3.pdf, p)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/plot_pdf.r
--- a/shm_csr/plot_pdf.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,17 +0,0 @@
-library(ggplot2)
-
-args <- commandArgs(trailingOnly = TRUE)
-print(args)
-
-input = args[1]
-outputdir = args[2]
-setwd(outputdir)
-
-load(input)
-
-print(names(pdfplots))
-
-for(n in names(pdfplots)){
-    print(paste("n:", n))
-    ggsave(pdfplots[[n]], file=n)
-}
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/sequence_overview.r
--- a/shm_csr/sequence_overview.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,363 +0,0 @@\n-library(reshape2)\n-\n-args <- commandArgs(trailingOnly = TRUE)\n-\n-before.unique.file = args[1]\n-merged.file = args[2]\n-outputdir = args[3]\n-gene.classes = unlist(strsplit(args[4], ","))\n-hotspot.analysis.sum.file = args[5]\n-NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")\n-NTsum.file = paste(outputdir, "ntsum.txt", sep="/")\n-main.html = "index.html"\n-empty.region.filter = args[6]\n-\n-\n-setwd(outputdir)\n-\n-before.unique = read.table(before.unique.file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\n-merged = read.table(merged.file, header=T, sep="\\t", fill=T, stringsAsFactors=F, quote="")\n-hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")\n-\n-#before.unique = before.unique[!grepl("unmatched", before.unique$best_match),]\n-\n-if(empty.region.filter == "leader"){\n-\tbefore.unique$seq_conc = paste(before.unique$FR1.IMGT.seq, before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n-} else if(empty.region.filter == "FR1"){\n-\tbefore.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n-} else if(empty.region.filter == "CDR1"){\n-\tbefore.unique$seq_conc = paste(before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n-} else if(empty.region.filter == "FR2"){\n-\tbefore.unique$seq_conc = paste(before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq)\n-}\n-\n-IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")]\n-IDs$best_match = as.character(IDs$best_match)\n-\n-dat = data.frame(table(before.unique$seq_conc))\n-\n-names(dat) = c("seq_conc", "Freq")\n-\n-dat$seq_conc = factor(dat$seq_conc)\n-\n-dat = dat[order(as.character(dat$seq_conc)),]\n-\n-#writing html from R...\n-get.bg.color = function(val){\n-\tif(val %in% c("TRUE", "FALSE", "T", "F")){ #if its a logical value, give the background a green/red color\n-\t\treturn(ifelse(val,"#eafaf1","#f9ebea"))\n-\t} else if (!is.na(as.numeric(val))) { #if its a numerical value, give it a grey tint if its >0\n-\t\treturn(ifelse(val > 0,"#eaecee","white"))\n-\t} else {\n-\t\treturn("white")\n-\t}\n-}\n-td = function(val) {\n-  return(paste("<td bgcolor=\'", get.bg.color(val), "\'>", val, "</td>", sep=""))\n-}\n-tr = function(val) { \n-\treturn(paste(c("<tr>", sapply(val, td), "</tr>"), collapse="")) \n-}\n-\n-make.link = function(id, clss, val) { \n-\tpaste("<a href=\'", clss, "_", id, ".html\'>", val, "</a>", sep="") \n-}\n-tbl = function(df) {\n-\tres = "<table border=\'1\'>"\n-\tfor(i in 1:nrow(df)){ \n-\t\tres = paste(res, tr(df[i,]), sep="")\n-\t}\n-\tres = paste(res, "</table>")\n-}\n-\n-cat("<center><img src=\'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQVQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8XlAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3Xn+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgLRLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fLorZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5VfdATueO1fdLdI0AAAAAElFTkSuQmCC\'> Please note that this tab is based on all sequences before filter unique sequences and the remove duplicates based on filters are applied. In this table only sequences occuring more than once are included. 
</center>", file=main.html, append=F)\n-cat("<table border=\'1\' class=\'pure-table pure-table-striped\'>", file=main.html, append=T)\n-\n-if(empty.region.filter == "leader"){\n-\tcat("<caption>FR1+CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)\n-} else if(empty.region.filter == "FR1"){\n-\tcat("<caption>CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file=main.html, append=T)\n-} else if(empty.region.filter == "CDR1"){\n-\tcat("<caption>FR2+CDR2+FR3+CDR3 sequences that show up more than once</caption>", file='..b' & cg3.n > 0 & cg4.n > 0)\n-\t\n-\tin.cg.all = (cg1.n > 0 & cg2.n > 0 & cg3.n > 0 & cg4.n > 0)\n-\t\n-\t#rw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, un.html)\n-\trw = c(as.character(dat[i,"seq_conc"]), functionality, ca1.html, ca2.html, cg1.html, cg2.html, cg3.html, cg4.html, cm.html, ce.html, un.html)\n-\trw = c(rw, ca.n, cg.n, cm.n, ce.n, in.classes, in.ca.cg, in.ca.cg.cm, in.ca.cg.ce, in.ca.cg.cm.ce, in.ca1.ca2, in.cg1.cg2, in.cg1.cg3, in.cg1.cg4, in.cg2.cg3, in.cg2.cg4, in.cg3.cg4, in.cg1.cg2.cg3, in.cg2.cg3.cg4, in.cg1.cg2.cg4, in.cg1.cg3.cg4, in.cg.all)\n-\t\n-\t\n-\n-\tcat(tr(rw), file=main.html, append=T)\n-\t\n-\t\n-\tfor(i in 1:nrow(allc)){ #generate html by id\n-\t\thtml = make.link(id, allc[i,"best_match"], allc[i,"Sequence.ID"])\n-\t\tcat(paste(html, "<br />"), file=sequence.id.page, append=T)\n-\t}\n-}\n-\n-cat("</table>", file=main.html, append=T)\n-\n-print(paste("Single sequences:", single.sequences))\n-print(paste("Sequences in multiple subclasses:", in.multiple))\n-print(paste("Multiple sequences in one subclass:", multiple.in.one))\n-print(paste("Matched with unmatched:", some.unmatched))\n-print(paste("Count that should match \'matched\' sequences:", matched))\n-\n-#ACGT overview\n-\n-#NToverview = merged[!grepl("^unmatched", merged$best_match),]\n-NToverview = merged\n-\n-if(empty.region.filter == "leader"){\n-\tNToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n-} else if(empty.region.filter == "FR1"){\n-\tNToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n-} else if(empty.region.filter == "CDR1"){\n-\tNToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n-} else if(empty.region.filter == "FR2"){\n-\tNToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)\n-}\n-\n-NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))\n-NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))\n-NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))\n-NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))\n-\n-#Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))\n-\n-#NToverview = rbind(NToverview, NTsum)\n-\n-NTresult = data.frame(nt=c("A", "C", "T", "G"))\n-\n-for(clazz in gene.classes){\n-\tprint(paste("class:", clazz))\n-\tNToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]\n-\tprint(paste("nrow:", nrow(NToverview.sub)))\n-\tnew.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))\n-\tnew.col.y = sum(new.col.x)\n-\tnew.col.z = round(new.col.x / new.col.y * 100, 2)\n-\t\n-\ttmp = 
names(NTresult)\n-\tNTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))\n-\tnames(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))\n-}\n-\n-NToverview.tmp = NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")]\n-\n-names(NToverview.tmp) = c("Sequence.ID", "best_match", "Sequence of the analysed region", "A", "C", "G", "T")\n-\n-write.table(NToverview.tmp, NToverview.file, quote=F, sep="\\t", row.names=F, col.names=T)\n-\n-NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]\n-\n-new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))\n-new.col.y = sum(new.col.x)\n-new.col.z = round(new.col.x / new.col.y * 100, 2)\n-\n-tmp = names(NTresult)\n-NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))\n-names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))\n-\n-names(hotspot.analysis.sum) = names(NTresult)\n-\n-hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)\n-\n-write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_clonality.htm
--- a/shm_csr/shm_clonality.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,144 +0,0 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-<!--
- /* Font Definitions */
- @font-face
- {font-family:Calibri;
- panose-1:2 15 5 2 2 2 4 3 2 4;}
-@font-face
- {font-family:Tahoma;
- panose-1:2 11 6 4 3 5 4 4 2 4;}
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {margin-top:0in;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:11.0pt;
- font-family:"Calibri","sans-serif";}
-a:link, span.MsoHyperlink
- {color:blue;
- text-decoration:underline;}
-a:visited, span.MsoHyperlinkFollowed
- {color:purple;
- text-decoration:underline;}
-p
- {margin-right:0in;
- margin-left:0in;
- font-size:12.0pt;
- font-family:"Times New Roman","serif";}
-p.MsoAcetate, li.MsoAcetate, div.MsoAcetate
- {mso-style-link:"Balloon Text Char";
- margin:0in;
- margin-bottom:.0001pt;
- font-size:8.0pt;
- font-family:"Tahoma","sans-serif";}
-p.msochpdefault, li.msochpdefault, div.msochpdefault
- {mso-style-name:msochpdefault;
- margin-right:0in;
- margin-left:0in;
- font-size:12.0pt;
- font-family:"Calibri","sans-serif";}
-p.msopapdefault, li.msopapdefault, div.msopapdefault
- {mso-style-name:msopapdefault;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:12.0pt;
- font-family:"Times New Roman","serif";}
-span.apple-converted-space
- {mso-style-name:apple-converted-space;}
-span.BalloonTextChar
- {mso-style-name:"Balloon Text Char";
- mso-style-link:"Balloon Text";
- font-family:"Tahoma","sans-serif";}
-.MsoChpDefault
- {font-size:10.0pt;
- font-family:"Calibri","sans-serif";}
-.MsoPapDefault
- {margin-bottom:10.0pt;
- line-height:115%;}
-@page WordSection1
- {size:8.5in 11.0in;
- margin:1.0in 1.0in 1.0in 1.0in;}
-div.WordSection1
- {page:WordSection1;}
--->
-</style>
-
-</head>
-
-<body lang=EN-US link=blue vlink=purple>
-
-<div class=WordSection1>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><b><span lang=EN-GB style='color:black'>References</span></b></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><span lang=EN-GB style='color:black'>Gupta,
-Namita T. and Vander Heiden, Jason A. and Uduman, Mohamed and Gadala-Maria,
-Daniel and Yaari, Gur and Kleinstein, Steven H. (2015). <a name="OLE_LINK106"></a><a
-name="OLE_LINK107"></a>Change-O: a toolkit for analyzing large-scale B cell
-immunoglobulin repertoire sequencing data: Table 1. In<span
-class=apple-converted-space>&nbsp;</span><em>Bioinformatics, 31 (20), pp.
-3356–3358.</em><span class=apple-converted-space><i>&nbsp;</i></span>[</span><a
-href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span
-lang=EN-GB style='color:#303030'>doi:10.1093/bioinformatics/btv359</span></a><span
-lang=EN-GB style='color:black'>][</span><a
-href="http://dx.doi.org/10.1093/bioinformatics/btv359" target="_blank"><span
-lang=EN-GB style='color:#303030'>Link</span></a><span lang=EN-GB
-style='color:black'>]</span></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><span lang=EN-GB style='color:black'>&nbsp;</span></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><a name="OLE_LINK110"><u><span lang=EN-GB
-style='color:black'>All, IGA, IGG, IGM and IGE tabs</span></u></a></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><span lang=EN-GB style='color:black'>In
-these tabs information on the clonal relation of transcripts can be found. To
-calculate clonal relation Change-O is used (Gupta et al, PMID: 26069265).
-Transcripts are considered clonally related if they have maximal three nucleotides
-difference in their CDR3 sequence and the same first V segment (as assigned by
-IMGT). Results are represented in a table format showing the clone size and the
-number of clones or sequences with this clone size. Change-O settings used are
-the </span><span lang=EN-GB>nucleotide hamming distance substitution model with
-a complete distance of maximal three. For clonal assignment the first gene
-segments were used, and the distances were not normalized. In case of
-asymmetric distances, the minimal distance was used.<span style='color:black'> </span></span></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><span lang=EN-GB style='color:black'>&nbsp;</span></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><u><span lang=EN-GB style='color:black'>Overlap
-tab</span></u><span lang=EN-GB style='color:black'> </span></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><span lang=EN-GB style='color:black'>This
-tab gives information on which (sub)class(es) each unique analyzed region
-(based on the exact nucleotide sequence of the analyzed region and the CDR3
-nucleotide sequence) is found in. This shows whether the combination
-of the exact same nucleotide sequence of the analyzed region and the CDR3
-sequence can be found in multiple (sub)classes.</span></p>
-
-<p style='margin-top:0in;margin-right:0in;margin-bottom:6.4pt;margin-left:0in;
-text-align:justify;background:white'><span style='color:black'><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA8AAAAPCAYAAAA71pVKAAAAzElEQVQoka2TwQ2CQBBFpwTshw4ImW8ogJMlUIMmhNCDxgasAi50oSXA8XlAjCG7aqKTzGX/vsnM31mzR0gk7tTudO5MEizpzvQ4ryUSe408J3Xn+grE0p1rnpOamVmWsZG4rS+dzzAMsN8Hi9yyjI1JNGtxu4VxBJgLRLpoTKIPiW0LlwtUVRTubW2OBGUJu92cZRmdfbKQMAw8o+vi5v0fLorZ7Y9waGYJjsf38DJz0O1PsEQffOcv4Sa6YYfDDJ5Obzbsp93+5VfdATueO1fdLdI0AAAAAElFTkSuQmCC"> Please note that this tab is based on all
-sequences before the 'filter unique sequences' and 'remove duplicates based on
-filter' steps are applied. In this table only sequences occurring more than once are
-included. </span></p>
-
-</div>
-
-</body>
-
-</html>
b
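The clonal grouping rule described in shm_clonality.htm above (same first V segment, CDR3 nucleotide sequences at most three mismatches apart, un-normalized distance) can be illustrated with a small sketch. This is not the Change-O implementation invoked by change_o/define_clones.sh, only a minimal illustration of the rule; the record fields ("v_gene", "cdr3_nt"), the function names and the example data are hypothetical.

from collections import defaultdict

def hamming(a, b):
    # Un-normalized hamming distance; unequal CDR3 lengths are treated as "not clonal".
    if len(a) != len(b):
        return float("inf")
    return sum(x != y for x, y in zip(a, b))

def group_clones(records, max_dist=3):
    # Partition by (first V segment, CDR3 length), then greedily link records
    # whose CDR3 sequences are at most max_dist mismatches apart.
    by_key = defaultdict(list)
    for rec in records:
        by_key[(rec["v_gene"], len(rec["cdr3_nt"]))].append(rec)
    clones = []
    for members in by_key.values():
        groups = []
        for rec in members:
            for grp in groups:
                if any(hamming(rec["cdr3_nt"], other["cdr3_nt"]) <= max_dist
                       for other in grp):
                    grp.append(rec)
                    break
            else:
                groups.append([rec])
        clones.extend(groups)
    return clones

# Hypothetical usage: the first two records differ by one CDR3 nucleotide and
# share a V gene, so they form one clone; the third record forms its own clone.
example = [
    {"v_gene": "IGHV3-23", "cdr3_nt": "TGTGCGAGAGATTACTGG"},
    {"v_gene": "IGHV3-23", "cdr3_nt": "TGTGCGAGAGATTACTGC"},
    {"v_gene": "IGHV1-2",  "cdr3_nt": "TGTGCGAGAGATTACTGG"},
]
print(len(group_clones(example)))  # prints 2 under these assumptions

Partitioning by V gene and CDR3 length first keeps the pairwise comparisons small and keeps the distance un-normalized, matching the settings quoted in the help text.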
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_csr.htm
--- a/shm_csr/shm_csr.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,95 +0,0 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-<!--
- /* Font Definitions */
- @font-face
- {font-family:Calibri;
- panose-1:2 15 5 2 2 2 4 3 2 4;}
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {margin-top:0in;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:11.0pt;
- font-family:"Calibri","sans-serif";}
-a:link, span.MsoHyperlink
- {color:blue;
- text-decoration:underline;}
-a:visited, span.MsoHyperlinkFollowed
- {color:purple;
- text-decoration:underline;}
-span.apple-converted-space
- {mso-style-name:apple-converted-space;}
-.MsoChpDefault
- {font-family:"Calibri","sans-serif";}
-.MsoPapDefault
- {margin-bottom:10.0pt;
- line-height:115%;}
-@page WordSection1
- {size:8.5in 11.0in;
- margin:1.0in 1.0in 1.0in 1.0in;}
-div.WordSection1
- {page:WordSection1;}
--->
-</style>
-
-</head>
-
-<body lang=EN-US link=blue vlink=purple>
-
-<div class=WordSection1>
-
-<p class=MsoNormalCxSpFirst style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>The
-graphs in this tab give insight into the subclass distribution of IGG and IGA
-transcripts. </span><span lang=EN-GB style='font-size:12.0pt;line-height:115%;
-font-family:"Times New Roman","serif"'>Human C�, C&#945;, C&#947; and C&#949;
-constant genes are assigned using a </span><span lang=EN-GB style='font-size:
-12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>custom script
-specifically designed for human (sub)class assignment in repertoire data as
-described in van Schouwenburg and IJspeert et al, submitted for publication. In
-this script the reference sequences for the subclasses are divided into
-8-nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are
-then individually aligned in the right order to each input sequence. The
-percentage of the chunks identified in each rearrangement is calculated in the
-'chunk hit percentage'. </span><span lang=EN-GB style='font-size:12.0pt;
-line-height:115%;font-family:"Times New Roman","serif"'>C&#945; and C&#947;
-subclasses are very homologous and only differ in a few nucleotides. To assign
-subclasses the </span><span lang=EN-GB style='font-size:12.0pt;line-height:
-115%;font-family:"Times New Roman","serif"'>�nt hit percentage� is calculated.
-This percentage indicates how well the chunks covering the subclass specific
-nucleotide match with the different subclasses. </span><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Information
-on normal distribution of subclasses in healthy individuals of different ages
-can be found in IJspeert and van Schouwenburg et al, PMID: 27799928.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK100"></a><a
-name="OLE_LINK99"></a><a name="OLE_LINK25"><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>IGA
-subclass distribution</span></u></a></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Pie
-chart showing the relative distribution of IGA1 and IGA2 transcripts in the
-sample.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>IGG
-subclass distribution</span></u></p>
-
-<p class=MsoNormalCxSpLast style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Pie
-chart showing the relative distribution of IGG1, IGG2, IGG3 and IGG4
-transcripts in the sample.</span></p>
-
-</div>
-
-</body>
-
-</html>
b
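The chunk-based (sub)class assignment described in shm_csr.htm above can be sketched as follows. This is a simplified illustration, not the actual gene_identification.py (which pre-compiles the chunks as regular expressions, keeps them in order along the sequence, and additionally scores subclass-specific positions for the 'nt hit percentage'); the helper names and the toy reference/read sequences are made up, and chunk order is ignored here.

def make_chunks(reference, chunklength=8, step=4):
    # 8-nt chunks shifted by half the chunk length, as described in the help text.
    return [reference[i:i + chunklength]
            for i in range(0, len(reference) - chunklength + 1, step)]

def chunk_hit_percentage(sequence, reference, chunklength=8, step=4):
    # Fraction of reference chunks that can be found anywhere in the input sequence.
    chunks = make_chunks(reference, chunklength, step)
    hits = sum(1 for chunk in chunks if chunk in sequence)
    return 100.0 * hits / len(chunks) if chunks else 0.0

# Toy data, illustrative only: a short constant-region snippet and a read covering part of it.
ref = "GCATCCCCGACCAGCCCCAAGGTCTTCCCGCTGAGCCTCTGC"
read = "NNNNGACCAGCCCCAAGGTCTTCCCGCTGAGCNNNN"
print(round(chunk_hit_percentage(read, ref)))

A read that covers only part of the constant region still scores on the chunks it contains, which is why the help text reports a percentage rather than an all-or-nothing match.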
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_csr.py
--- a/shm_csr/shm_csr.py Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,508 +0,0 @@\n-import argparse\n-import logging\n-import sys\n-import os\n-import re\n-\n-from collections import defaultdict\n-\n-def main():\n-\tparser = argparse.ArgumentParser()\n-\tparser.add_argument("--input", help="The \'7_V-REGION-mutation-and-AA-change-table\' and \'10_V-REGION-mutation-hotspots\' merged together, with an added \'best_match\' annotation")\n-\tparser.add_argument("--genes", help="The genes available in the \'best_match\' column")\n-\tparser.add_argument("--empty_region_filter", help="Where does the sequence start?", choices=[\'leader\', \'FR1\', \'CDR1\', \'FR2\'])\n-\tparser.add_argument("--output", help="Output file")\n-\n-\targs = parser.parse_args()\n-\n-\tinfile = args.input\n-\tgenes = str(args.genes).split(",")\n-\tempty_region_filter = args.empty_region_filter\n-\toutfile = args.output\n-\n-\tgenedic = dict()\n-\n-\tmutationdic = dict()\n-\tmutationMatcher = re.compile("^(.)(\\d+).(.),?[ ]?(.)?(\\d+)?.?(.)?(.?.?.?.?.?)?")\n-\tmutationMatcher = re.compile("^([actg])(\\d+).([actg]),?[ ]?([A-Z])?(\\d+)?.?([A-Z])?(.*)?")\n-\tmutationMatcher = re.compile("^([actg])(\\d+).([actg]),?[ ]?([A-Z])?(\\d+)?[>]?([A-Z;])?(.*)?")\n-\tmutationMatcher = re.compile("^([nactg])(\\d+).([nactg]),?[ ]?([A-Z])?(\\d+)?[>]?([A-Z;])?(.*)?")\n-\tNAMatchResult = (None, None, None, None, None, None, \'\')\n-\tgeneMatchers = {gene: re.compile("^" + gene + ".*") for gene in genes}\n-\tlinecount = 0\n-\n-\tIDIndex = 0\n-\tbest_matchIndex = 0\n-\tfr1Index = 0\n-\tcdr1Index = 0\n-\tfr2Index = 0\n-\tcdr2Index = 0\n-\tfr3Index = 0\n-\tfirst = True\n-\tIDlist = []\n-\tmutationList = []\n-\tmutationListByID = {}\n-\tcdr1LengthDic = {}\n-\tcdr2LengthDic = {}\n-\n-\tfr1LengthDict = {}\n-\tfr2LengthDict = {}\n-\tfr3LengthDict = {}\n-\n-\tcdr1LengthIndex = 0\n-\tcdr2LengthIndex = 0\n-\n-\tfr1SeqIndex = 0\n-\tfr2SeqIndex = 0\n-\tfr3SeqIndex = 0\n-\n-\ttandem_sum_by_class = defaultdict(int)\n-\texpected_tandem_sum_by_class = defaultdict(float)\n-\n-\twith open(infile, \'ru\') as i:\n-\t\tfor line in i:\n-\t\t\tif first:\n-\t\t\t\tlinesplt = line.split("\\t")\n-\t\t\t\tIDIndex = linesplt.index("Sequence.ID")\n-\t\t\t\tbest_matchIndex = linesplt.index("best_match")\n-\t\t\t\tfr1Index = linesplt.index("FR1.IMGT")\n-\t\t\t\tcdr1Index = linesplt.index("CDR1.IMGT")\n-\t\t\t\tfr2Index = linesplt.index("FR2.IMGT")\n-\t\t\t\tcdr2Index = linesplt.index("CDR2.IMGT")\n-\t\t\t\tfr3Index = linesplt.index("FR3.IMGT")\n-\t\t\t\tcdr1LengthIndex = linesplt.index("CDR1.IMGT.length")\n-\t\t\t\tcdr2LengthIndex = linesplt.index("CDR2.IMGT.length")\n-\t\t\t\tfr1SeqIndex = linesplt.index("FR1.IMGT.seq")\n-\t\t\t\tfr2SeqIndex = linesplt.index("FR2.IMGT.seq")\n-\t\t\t\tfr3SeqIndex = linesplt.index("FR3.IMGT.seq")\n-\t\t\t\tfirst = False\n-\t\t\t\tcontinue\n-\t\t\tlinecount += 1\n-\t\t\tlinesplt = line.split("\\t")\n-\t\t\tID = linesplt[IDIndex]\n-\t\t\tgenedic[ID] = linesplt[best_matchIndex]\n-\t\t\t\n-\t\t\tmutationdic[ID + "_FR1"] = []\n-\t\t\tif len(linesplt[fr1Index]) > 5 and empty_region_filter == "leader":\n-\t\t\t\tmutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[fr1Index].split("|") if x]\n-\n-\t\t\tmutationdic[ID + "_CDR1"] = []\n-\t\t\tif len(linesplt[cdr1Index]) > 5 and empty_region_filter in ["leader", "FR1"]:\n-\t\t\t\tmutationdic[ID + "_CDR1"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr1Index].split("|") if x]\n-\n-\t\t\tmutationdic[ID + "_FR2"] = []\n-\t\t\tif len(linesplt[fr2Index]) > 5 and empty_region_filter in ["leader", "FR1", 
"CDR1"]:\n-\t\t\t\tmutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[fr2Index].split("|") if x]\n-\n-\t\t\tmutationdic[ID + "_CDR2"] = []\n-\t\t\tif len(linesplt[cdr2Index]) > 5:\n-\t\t\t\tmutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[cdr2Index].split("|") if x]\n-\t\t\t\n-\t\t\tmutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"]\n-\n-\t\t\tmutationdic[ID + "_FR3"] = []\n-\t\t\tif len(linesplt[fr3Index]) > 5:\n-\t\t\t\tmutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[fr3Index].split("|") if x]\n-\t\t\t\t\n-\t\t\tmutationList += mutationdic[ID + "_FR1"] + mutationdic[ID + "_CDR1"] + mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"'..b'utation_in_TW])\n-\n-\t\t\t\tif in_how_many_motifs > 0:\n-\t\t\t\t\tRGYWCount[ID] += (1.0 * int(mutation_in_RGYW)) / in_how_many_motifs\n-\t\t\t\t\tWRCYCount[ID] += (1.0 * int(mutation_in_WRCY)) / in_how_many_motifs\n-\t\t\t\t\tWACount[ID] += (1.0 * int(mutation_in_WA)) / in_how_many_motifs\n-\t\t\t\t\tTWCount[ID] += (1.0 * int(mutation_in_TW)) / in_how_many_motifs\n-\t\t\t\n-\t\t\tmutations_in_motifs_file = os.path.join(os.path.dirname(os.path.abspath(infile)), "mutation_in_motifs.txt")\n-\t\t\tif not os.path.exists(mutation_by_id_file):\n-\t\t\t\twith open(mutations_in_motifs_file, \'w\') as out_handle:\n-\t\t\t\t\tout_handle.write("{0}\\n".format("\\t".join([\n-\t\t\t\t\t\t"Sequence.ID",\n-\t\t\t\t\t\t"mutation_position",\n-\t\t\t\t\t\t"region",\n-\t\t\t\t\t\t"from_nt",\n-\t\t\t\t\t\t"to_nt",\n-\t\t\t\t\t\t"mutation_position_AA",\n-\t\t\t\t\t\t"from_AA",\n-\t\t\t\t\t\t"to_AA",\n-\t\t\t\t\t\t"motif",\n-\t\t\t\t\t\t"motif_start_nt",\n-\t\t\t\t\t\t"motif_end_nt",\n-\t\t\t\t\t\t"rest"\n-\t\t\t\t\t])))\n-\n-\t\t\twith open(mutations_in_motifs_file, \'a\') as out_handle:\n-\t\t\t\tmotif_dic = {"RGYW": RGYW, "WRCY": WRCY, "WA": WA, "TW": TW}\n-\t\t\t\tfor mutation in mutationList:\n-\t\t\t\t\tfrm, where, to, AAfrm, AAwhere, AAto, junk = mutation\n-\t\t\t\t\tfor motif in motif_dic.keys():\n-\t\t\t\t\t\t\t\n-\t\t\t\t\t\tfor start, end, region in motif_dic[motif]:\n-\t\t\t\t\t\t\tif start <= int(where) <= end:\n-\t\t\t\t\t\t\t\tout_handle.write("{0}\\n".format(\n-\t\t\t\t\t\t\t\t\t"\\t".join([\n-\t\t\t\t\t\t\t\t\t\tID,\n-\t\t\t\t\t\t\t\t\t\twhere,\n-\t\t\t\t\t\t\t\t\t\tregion,\n-\t\t\t\t\t\t\t\t\t\tfrm,\n-\t\t\t\t\t\t\t\t\t\tto,\n-\t\t\t\t\t\t\t\t\t\tstr(AAwhere),\n-\t\t\t\t\t\t\t\t\t\tstr(AAfrm),\n-\t\t\t\t\t\t\t\t\t\tstr(AAto),\n-\t\t\t\t\t\t\t\t\t\tmotif,\n-\t\t\t\t\t\t\t\t\t\tstr(start),\n-\t\t\t\t\t\t\t\t\t\tstr(end),\n-\t\t\t\t\t\t\t\t\t\tstr(junk)\n-\t\t\t\t\t\t\t\t\t])\n-\t\t\t\t\t\t\t\t))\n-\n-\n-\n-\tdef mean(lst):\n-\t\treturn (float(sum(lst)) / len(lst)) if len(lst) > 0 else 0.0\n-\n-\n-\tdef median(lst):\n-\t\tlst = sorted(lst)\n-\t\tl = len(lst)\n-\t\tif l == 0:\n-\t\t\treturn 0\n-\t\tif l == 1:\n-\t\t\treturn lst[0]\n-\t\t\t\n-\t\tl = int(l / 2)\n-\t\t\n-\t\tif len(lst) % 2 == 0:\n-\t\t\treturn float(lst[l] + lst[(l - 1)]) / 2.0\n-\t\telse:\n-\t\t\treturn lst[l]\n-\n-\tfuncs = {"mean": mean, "median": median, "sum": sum}\n-\n-\tdirectory = outfile[:outfile.rfind("/") + 1]\n-\tvalue = 0\n-\tvaluedic = dict()\n-\n-\tfor fname in funcs.keys():\n-\t\tfor gene in genes:\n-\t\t\twith open(directory + gene + "_" + fname + "_value.txt", \'r\') as v:\n-\t\t\t\tvaluedic[gene + "_" + fname] = float(v.readlines()[0].rstrip())\n-\t\twith open(directory + "all_" + fname + "_value.txt", \'r\') as 
v:\n-\t\t\tvaluedic["total_" + fname] = float(v.readlines()[0].rstrip())\n-\t\t\n-\n-\tdef get_xyz(lst, gene, f, fname):\n-\t\tx = round(round(f(lst), 1))\n-\t\ty = valuedic[gene + "_" + fname]\n-\t\tz = str(round(x / float(y) * 100, 1)) if y != 0 else "0"\n-\t\treturn (str(x), str(y), z)\n-\n-\tdic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount}\n-\tarr = ["RGYW", "WRCY", "WA", "TW"]\n-\n-\tfor fname in funcs.keys():\n-\t\tfunc = funcs[fname]\n-\t\tfoutfile = outfile[:outfile.rindex("/")] + "/hotspot_analysis_" + fname + ".txt"\n-\t\twith open(foutfile, \'w\') as o:\n-\t\t\tfor typ in arr:\n-\t\t\t\to.write(typ + " (%)")\n-\t\t\t\tcurr = dic[typ]\n-\t\t\t\tfor gene in genes:\n-\t\t\t\t\tgeneMatcher = geneMatchers[gene]\n-\t\t\t\t\tif valuedic[gene + "_" + fname] is 0:\n-\t\t\t\t\t\to.write(",0,0,0")\n-\t\t\t\t\telse:\n-\t\t\t\t\t\tx, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname)\n-\t\t\t\t\t\to.write("," + x + "," + y + "," + z)\n-\t\t\t\tx, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname)\n-\t\t\t\t#x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname)\n-\t\t\t\to.write("," + x + "," + y + "," + z + "\\n")\n-\n-\n-\t# for testing\n-\tseq_motif_file = outfile[:outfile.rindex("/")] + "/motif_per_seq.txt"\n-\twith open(seq_motif_file, \'w\') as o:\n-\t\to.write("ID\\tRGYW\\tWRCY\\tWA\\tTW\\n")\n-\t\tfor ID in IDlist:\n-\t\t\t#o.write(ID + "\\t" + str(round(RGYWCount[ID], 2)) + "\\t" + str(round(WRCYCount[ID], 2)) + "\\t" + str(round(WACount[ID], 2)) + "\\t" + str(round(TWCount[ID], 2)) + "\\n")\n-\t\t\to.write(ID + "\\t" + str(RGYWCount[ID]) + "\\t" + str(WRCYCount[ID]) + "\\t" + str(WACount[ID]) + "\\t" + str(TWCount[ID]) + "\\n")\n-\n-if __name__ == "__main__":\n-\tmain()\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_csr.r
--- a/shm_csr/shm_csr.r Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,561 +0,0 @@\n-library(data.table)\n-library(ggplot2)\n-library(reshape2)\n-\n-args <- commandArgs(trailingOnly = TRUE)\n-\n-input = args[1]\n-genes = unlist(strsplit(args[2], ","))\n-outputdir = args[3]\n-empty.region.filter = args[4]\n-setwd(outputdir)\n-\n-#dat = read.table(input, header=T, sep="\\t", fill=T, stringsAsFactors=F)\n-\n-dat = data.frame(fread(input, sep="\\t", header=T, stringsAsFactors=F)) #fread because read.table suddenly skips certain rows...\n-\n-if(length(dat$Sequence.ID) == 0){\n-  setwd(outputdir)\n-  result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5))\n-  row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of G C (%)")\n-  write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F)\n-  transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4))\n-  row.names(transitionTable) = c("A", "C", "G", "T")\n-  transitionTable["A","A"] = NA\n-  transitionTable["C","C"] = NA\n-  transitionTable["G","G"] = NA\n-  transitionTable["T","T"] = NA\n-\n-  write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA)\n-  cat("0", file="n.txt")\n-  stop("No data")\n-}\n-\n-cleanup_columns = c("FR1.IMGT.c.a",\n-\t\t\t\t\t"FR2.IMGT.g.t",\n-\t\t\t\t\t"CDR1.IMGT.Nb.of.nucleotides",\n-\t\t\t\t\t"CDR2.IMGT.t.a",\n-\t\t\t\t\t"FR1.IMGT.c.g",\n-\t\t\t\t\t"CDR1.IMGT.c.t",\n-\t\t\t\t\t"FR2.IMGT.a.c",\n-\t\t\t\t\t"FR2.IMGT.Nb.of.mutations",\n-\t\t\t\t\t"FR2.IMGT.g.c",\n-\t\t\t\t\t"FR2.IMGT.a.g",\n-\t\t\t\t\t"FR3.IMGT.t.a",\n-\t\t\t\t\t"FR3.IMGT.t.c",\n-\t\t\t\t\t"FR2.IMGT.g.a",\n-\t\t\t\t\t"FR3.IMGT.c.g",\n-\t\t\t\t\t"FR1.IMGT.Nb.of.mutations",\n-\t\t\t\t\t"CDR1.IMGT.g.a",\n-\t\t\t\t\t"CDR1.IMGT.t.g",\n-\t\t\t\t\t"CDR1.IMGT.g.c",\n-\t\t\t\t\t"CDR2.IMGT.Nb.of.nucleotides",\n-\t\t\t\t\t"FR2.IMGT.a.t",\n-\t\t\t\t\t"CDR1.IMGT.Nb.of.mutations",\n-\t\t\t\t\t"CDR3.IMGT.Nb.of.nucleotides",\n-\t\t\t\t\t"CDR1.IMGT.a.g",\n-\t\t\t\t\t"FR3.IMGT.a.c",\n-\t\t\t\t\t"FR1.IMGT.g.a",\n-\t\t\t\t\t"FR3.IMGT.a.g",\n-\t\t\t\t\t"FR1.IMGT.a.t",\n-\t\t\t\t\t"CDR2.IMGT.a.g",\n-\t\t\t\t\t"CDR2.IMGT.Nb.of.mutations",\n-\t\t\t\t\t"CDR2.IMGT.g.t",\n-\t\t\t\t\t"CDR2.IMGT.a.c",\n-\t\t\t\t\t"CDR1.IMGT.t.c",\n-\t\t\t\t\t"FR3.IMGT.g.c",\n-\t\t\t\t\t"FR1.IMGT.g.t",\n-\t\t\t\t\t"FR3.IMGT.g.t",\n-\t\t\t\t\t"CDR1.IMGT.a.t",\n-\t\t\t\t\t"FR1.IMGT.a.g",\n-\t\t\t\t\t"FR3.IMGT.a.t",\n-\t\t\t\t\t"FR3.IMGT.Nb.of.nucleotides",\n-\t\t\t\t\t"FR2.IMGT.t.c",\n-\t\t\t\t\t"CDR2.IMGT.g.a",\n-\t\t\t\t\t"FR2.IMGT.t.a",\n-\t\t\t\t\t"CDR1.IMGT.t.a",\n-\t\t\t\t\t"FR2.IMGT.t.g",\n-\t\t\t\t\t"FR3.IMGT.t.g",\n-\t\t\t\t\t"FR2.IMGT.Nb.of.nucleotides",\n-\t\t\t\t\t"FR1.IMGT.t.a",\n-\t\t\t\t\t"FR1.IMGT.t.g",\n-\t\t\t\t\t"FR3.IMGT.c.t",\n-\t\t\t\t\t"FR1.IMGT.t.c",\n-\t\t\t\t\t"CDR2.IMGT.a.t",\n-\t\t\t\t\t"FR2.IMGT.c.t",\n-\t\t\t\t\t"CDR1.IMGT.g.t",\n-\t\t\t\t\t"CDR2.IMGT.t.g",\n-\t\t\t\t\t"FR1.IMGT.Nb.of.nucleotides",\n-\t\t\t\t\t"CDR1.IMGT.c.g",\n-\t\t\t\t\t"CDR2.IMGT.t.c",\n-\t\t\t\t\t"FR3.IMGT.g.a",\n-\t\t\t\t\t"CDR1.IMGT.a.c",\n-\t\t\t\t\t"FR2.IMGT.c.a",\n-\t\t\t\t\t"FR3.IMGT.Nb.of.mutations",\n-\t\t\t\t\t"FR2.IMGT.c.g",\n-\t\t\t\t\t"CDR2.IMGT.g.c",\n-\t\t\t\t\t"FR1.IMGT.g.c",\n-\t\t\t\t\t"CDR2.IMGT.c.t",\n-\t\t\t\t\t"FR3.IMGT.c.a",\n-\t\t\t\t\t"CDR1.IMGT.c.a",\n-\t\t\t\t\t"CDR2.IMGT.c.g",\n-\t\t\t\t\t"CDR2.IMGT.c.a",\n-\t\t\t\t\t"FR1.IMGT.c.t",\n-\t\t\t\t\t"FR1.IMGT.Nb.of.silent.mutations",\n-\t\t\t\t\t"FR2.IMGT.Nb.of.silent.mutations",\n-\t\t\t\t\t"FR3.IMGT.Nb.of.silent.mutations",\n-\t
\t\t\t\t"FR1.IMGT.Nb.of.nonsilent.mutations",\n-\t\t\t\t\t"FR2.IMGT.Nb.of.nonsilent.mutations",\n-\t\t\t\t\t"FR3.IMGT.Nb.of.nonsilent.mutations")\n-\n-print("Cleaning up columns")\n-\n-for(col in cleanup_columns){\n-  dat[,col] = gsub("\\\\(.*\\\\)", "", dat[,col])\n-  #dat[dat[,col] == "",] = "0"\n-  dat[,col] = as.numeric(dat[,col])\n-  dat[is.na(dat[,col]),col] = 0\n-}\n-\n-regions = c("FR1", "CDR1", "FR2", "CDR2", "FR3")\n-if(empty.region.filter == "FR1") {\n-\tregions = c("CDR1", "FR2", "CDR2", "FR3")\n-} else if (empty.region.filter == "CDR1") {\n-\tregions = c("FR2", "CDR2", "FR3")\n-} else if (empty.region.filter == "FR2") {\n-\tregions = c("CDR2", "FR3")\n-}\n-\n-pdfplots = list() #save() this later to create the pdf plots in another script (maybe avoids the "address (nil), cause memory not mapped")\n-\n-sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) }\n-\n-print("aggregating data into new columns")\n-\n-VRegionMutat'..b'ur="black"))\n-p = p + scale_fill_manual(values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))\n-p = p + scale_colour_manual(guide = guide_legend(title = "Subclass"), values=c("IGA" = "blue4", "IGA1" = "lightblue1", "IGA2" = "blue4", "IGG" = "olivedrab3", "IGG1" = "olivedrab3", "IGG2" = "red", "IGG3" = "gold", "IGG4" = "darkred", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))\n-\n-png(filename="scatter.png")\n-print(p)\n-dev.off()\n-\n-pdfplots[["scatter.pdf"]] <- p\n-\n-write.table(dat[,c("Sequence.ID", "best_match", "VRegionMutations", "VRegionNucleotides", "percentage_mutations")], "scatter.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n-\n-print("Plotting frequency ranges plot")\n-\n-dat$best_match_class = substr(dat$best_match, 0, 3)\n-freq_labels = c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")\n-dat$frequency_bins = cut(dat$percentage_mutations, breaks=c(-Inf, 0, 2,5,10,15,20, Inf), labels=freq_labels)\n-\n-frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match_class")])\n-\n-frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match_class", "frequency_bins")])\n-\n-frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match_class")\n-\n-frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n-\n-p = ggplot(frequency_bins_data, aes(frequency_bins, frequency))\n-p = p + geom_bar(aes(fill=best_match_class), stat="identity", position="dodge") + theme(panel.background = element_rect(fill = "white", colour="black"), text = element_text(size=16, colour="black"))\n-p = p + xlab("Frequency ranges") + ylab("Frequency") + ggtitle("Mutation Frequencies by class") + scale_fill_manual(guide = guide_legend(title = "Class"), values=c("IGA" = "blue4", "IGG" = "olivedrab3", "IGM" = "darkviolet", "IGE" = "darkorange", "all" = "blue4"))\n-\n-png(filename="frequency_ranges.png")\n-print(p)\n-dev.off()\n-\n-pdfplots[["frequency_ranges.pdf"]] <- p\n-\n-save(pdfplots, file="pdfplots.RData")\n-\n-frequency_bins_data_by_class = frequency_bins_data\n-\n-frequency_bins_data_by_class = frequency_bins_data_by_class[order(frequency_bins_data_by_class$best_match_class, frequency_bins_data_by_class$frequency_bins),]\n-\n-frequency_bins_data_by_class$frequency_bins = gsub("-", " to ", 
frequency_bins_data_by_class$frequency_bins)\n-frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "20", c("frequency_bins")] = "20 or higher"\n-frequency_bins_data_by_class[frequency_bins_data_by_class$frequency_bins == "0", c("frequency_bins")] = "0 or lower"\n-\n-write.table(frequency_bins_data_by_class, "frequency_ranges_classes.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n-\n-frequency_bins_data = data.frame(data.table(dat)[, list(frequency_count=.N), by=c("best_match", "best_match_class", "frequency_bins")])\n-\n-frequency_bins_sum = data.frame(data.table(dat)[, list(class_sum=sum(.N)), by=c("best_match")])\n-\n-frequency_bins_data = merge(frequency_bins_data, frequency_bins_sum, by="best_match")\n-\n-frequency_bins_data$frequency = round(frequency_bins_data$frequency_count / frequency_bins_data$class_sum * 100, 2)\n-\n-frequency_bins_data = frequency_bins_data[order(frequency_bins_data$best_match, frequency_bins_data$frequency_bins),]\n-frequency_bins_data$frequency_bins = gsub("-", " to ", frequency_bins_data$frequency_bins)\n-frequency_bins_data[frequency_bins_data$frequency_bins == "20", c("frequency_bins")] = "20 or higher"\n-frequency_bins_data[frequency_bins_data$frequency_bins == "0", c("frequency_bins")] = "0 or lower"\n-\n-write.table(frequency_bins_data, "frequency_ranges_subclasses.txt", sep="\\t",quote=F,row.names=F,col.names=T)\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_csr.xml
--- a/shm_csr/shm_csr.xml Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,240 +0,0 @@\n-<tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0">\r\n-\t<description></description>\r\n-\t<requirements>\r\n-\t\t<requirement type="package" version="2.7">python</requirement>\r\n-\t\t<requirement type="package" version="1.16.0">numpy</requirement>\r\n-\t\t<requirement type="package" version="1.2.0">xlrd</requirement>\r\n-\t\t<requirement type="package" version="3.0.0">r-ggplot2</requirement>\r\n-\t\t<requirement type="package" version="1.4.3">r-reshape2</requirement>\r\n-\t\t<requirement type="package" version="0.5.0">r-scales</requirement>\r\n-\t\t<requirement type="package" version="3.4_5">r-seqinr</requirement>\r\n-\t\t<requirement type="package" version="1.11.4">r-data.table</requirement>\r\n-\t</requirements>\r\n-\t<command interpreter="bash">\r\n-\t\t#if str ( $filter_unique.filter_unique_select ) == "remove":\r\n-\t\t\twrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast\r\n-\t\t#else:\r\n-\t\t\twrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast\r\n-\t\t#end if\r\n-\t</command>\r\n-\t<inputs>\r\n-\t\t<param name="in_file" type="data" format="data" label="IMGT zip file to be analysed" />\r\n-\t\t<param name="empty_region_filter" type="select" label="Sequence starts at" help="" >\r\n-\t\t\t<option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>\r\n-\t\t\t<option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option>\r\n-\t\t\t<option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option>\r\n-\t\t\t<option value="FR2">FR2: include CDR2,FR3 in filters</option>\r\n-\t\t</param>\r\n-\t\t<param name="functionality" type="select" label="Functionality filter" help="" >\r\n-\t\t\t<option value="productive" selected="true">Productive (Productive and Productive see comment)</option>\r\n-\t\t\t<option value="unproductive">Unproductive (Unproductive and Unproductive see comment)</option>\r\n-\t\t\t<option value="remove_unknown">Productive and Unproductive (Productive, Productive see comment, Unproductive, Unproductive and Unproductive see comment)</option>\r\n-\t\t</param>\r\n-\t\t<conditional name="filter_unique">\r\n-\t\t\t<param name="filter_unique_select" type="select" label="Filter unique sequences" help="See below for an example.">\r\n-\t\t\t\t<option value="remove" selected="true">Remove uniques (Based on nucleotide sequence + C)</option>\r\n-\t\t\t\t<option value="remove_vjaa">Remove uniques (Based on V+J+CDR3 (AA))</option>\r\n-\t\t\t\t<option value="keep">Keep uniques (Based on nucleotide sequence + C)</option>\r\n-\t\t\t\t<option value="no">No</option>\r\n-\t\t\t</param>\r\n-\t\t\t<when value="remove">\r\n-\t\t\t\t<param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/>\r\n-\t\t\t</when>\r\n-\t\t\t<when value="keep"></when>\r\n-\t\t\t<when value="no"></when>\r\n-\t\t</conditional>\r\n-\t\t<param name="unique" type="select" 
label="Remove duplicates based on" help="" >\r\n-\t\t\t<option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>\r\n-\t\t\t<option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>\r\n-\t\t\t<option value="CDR3.IMGT.AA,best_match_class">CDR3 (AA), C region</option>\r\n-\t\t\t<option value="CDR3.IMGT.AA">CDR3 (AA)</option>\r\n-\t\t\t\r\n-\t\t\t<option value="VGene,CDR3.IMGT.seq,best_match_class">Top.V.Gene, CDR3 (nt), C region</option>\r\n-\t\t\t<option value="VGene,CDR3.IMGT.seq">Top.V.Gene, CDR3 (nt)</option>\r\n-\t\t\t<option value="CDR3.IMGT.seq,best_match_class">CDR3 (nt), C region</option>\r\n-\t\t\t<option value="CDR3.IMGT.seq">CDR3 (nt)'..b'either the \xe2\x80\x9cremove unique filter\xe2\x80\x9d or the \xe2\x80\x9ckeep unique filter\xe2\x80\x9d\r\n-\r\n-+--------------------------+\r\n-|       unique filter      |\r\n-+--------+--------+--------+\r\n-| values | remove | keep   |\r\n-+--------+--------+--------+\r\n-|   A    |   A    |   A    |\r\n-+--------+--------+--------+\r\n-|   A    |   B    |   B    |\r\n-+--------+--------+--------+\r\n-|   B    |   D    |   C    |\r\n-+--------+--------+--------+\r\n-|   B    |        |   D    |\r\n-+--------+--------+--------+\r\n-|   C    |        |        |\r\n-+--------+--------+--------+\r\n-|   D    |        |        |\r\n-+--------+--------+--------+\r\n-|   D    |        |        |\r\n-+--------+--------+--------+\r\n-\r\n------\r\n- \r\n-**Remove duplicates based on**\r\n-\r\n-Allows the selection of a single sequence per clone. Different definitions of a clone can be chosen. \r\n-\r\n-.. class:: infomark\r\n-\r\n-Note: The first sequence (in the data set) of each clone is always included in the analysis. When the first matched sequence is unmatched (no subclass assigned) the first matched sequence will be included. This means that altering the data order (by for instance sorting) can change the sequence which is included in the analysis and therefore slightly influences the results. \r\n-\r\n------\r\n-\r\n-**Human Class/Subclass filter**\r\n-\r\n-.. class:: warningmark\r\n-\r\n-Note: This filter should only be applied when analysing human IGH data in which a (sub)class specific sequence is present. Otherwise please select the do not assign (sub)class option to prevent errors when running the pipeline. \r\n-\r\n-The class percentage is based on the \xe2\x80\x98chunk hit percentage\xe2\x80\x99 (see below). The subclass percentage is based on the \xe2\x80\x98nt hit percentage\xe2\x80\x99 (see below).\r\n-\r\n-The SHM & CSR pipeline identifies human C\xc2\xb5, C\xce\xb1, C\xce\xb3 and C\xce\xb5 constant genes by dividing the reference sequences for the subclasses (NG_001019) in 8 nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are then individually aligned in the right order to each input sequence. This alignment is used to calculate the chunck hit percentage and the nt hit percentage. \r\n-\r\n-*Chunk hit percentage*: The percentage of the chunks that is aligned \r\n-\r\n-*Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% \xe2\x80\x98nt hit percentage\xe2\x80\x99 which means that 5 out of 7 subclass specific nucleotides for C\xce\xb1 or 6 out of 8 subclass specific nucleotides of C\xce\xb3 should match with the specific subclass. 
\r\n-The option \xe2\x80\x9c>25% class\xe2\x80\x9d can be chosen when you only are interested in the class (C\xce\xb1/C\xce\xb3/C\xc2\xb5/C\xc9\x9b) of  your sequences and the length of your sequence is not long enough to assign the subclasses.\r\n-\r\n------\r\n-\r\n-**Output new IMGT archives per class into your history?**\r\n-\r\n-If yes is selected, additional output files (one for each class) will be added to the history which contain information of the sequences that passed the selected filtering criteria. These files are in the same format as the IMGT/HighV-QUEST output files and therefore are also compatible with many other analysis programs, such as the Immune repertoire pipeline.  \r\n-\r\n------\r\n-\r\n-**Execute**\r\n-\r\n-Upon pressing execute a new analysis is added to your history (right side of the page). Initially this analysis will be grey, after initiating the analysis colour of the analysis in the history will change to yellow. When the analysis is finished it will turn green in the history. Now the analysis can be opened by clicking on the eye icon on the analysis of interest. When an analysis turns red an error has occurred when running the analysis. If you click on the analysis title additional information can be found on the analysis. In addition a bug icon appears. Here more information on the error can be found.\r\n-\r\n-]]>\r\n-\t</help>\r\n-\t<citations>\r\n-\t\t<citation type="doi">10.1093/nar/gks457</citation>\r\n-\t\t<citation type="doi">10.1093/bioinformatics/btv359</citation>\r\n-\t</citations>\r\n-</tool>\r\n'
b
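The unique-filter example in the tool help above (input values A, A, B, B, C, D, D; "remove" keeps one of A, B and D, while "keep" also retains the singleton C) can be sketched in a few lines of R with data.table, which the tool already lists as a dependency. This is only an illustration of the behaviour described in the help text, not the pipeline's own implementation; the column names seq_id and sequence and the helper filter_unique are invented for the example.

library(data.table)

# Toy input: the "values" column of the unique-filter example table above.
dt <- data.table(seq_id = 1:7,
                 sequence = c("A", "A", "B", "B", "C", "D", "D"))

filter_unique <- function(dt, mode = c("remove", "keep"), min_group = 2) {
  mode <- match.arg(mode)
  counts <- dt[, .N, by = sequence]
  if (mode == "remove") {
    # "Remove uniques": keep one representative per group, but only for groups
    # with at least `min_group` identical sequences (C is dropped in the example).
    keep <- counts[N >= min_group, sequence]
  } else {
    # "Keep uniques": keep one representative per group, singletons included.
    keep <- counts[, sequence]
  }
  dt[sequence %in% keep, .SD[1], by = sequence]
}

filter_unique(dt, "remove")   # keeps one A, one B, one D
filter_unique(dt, "keep")     # keeps one A, one B, one C, one D

In the real tool the grouping key is the nucleotide sequence plus C region (or V+J+CDR3 (AA)), and the group-size threshold corresponds to the "How many sequences should be in a group to keep 1 of them" parameter.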
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_downloads.htm
--- a/shm_csr/shm_downloads.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
b'@@ -1,538 +0,0 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-<!--\r\n- /* Font Definitions */\r\n- @font-face\r\n-\t{font-family:Calibri;\r\n-\tpanose-1:2 15 5 2 2 2 4 3 2 4;}\r\n- /* Style Definitions */\r\n- p.MsoNormal, li.MsoNormal, div.MsoNormal\r\n-\t{margin-top:0in;\r\n-\tmargin-right:0in;\r\n-\tmargin-bottom:10.0pt;\r\n-\tmargin-left:0in;\r\n-\tline-height:115%;\r\n-\tfont-size:11.0pt;\r\n-\tfont-family:"Calibri","sans-serif";}\r\n-a:link, span.MsoHyperlink\r\n-\t{color:blue;\r\n-\ttext-decoration:underline;}\r\n-a:visited, span.MsoHyperlinkFollowed\r\n-\t{color:purple;\r\n-\ttext-decoration:underline;}\r\n-p.MsoNoSpacing, li.MsoNoSpacing, div.MsoNoSpacing\r\n-\t{margin:0in;\r\n-\tmargin-bottom:.0001pt;\r\n-\tfont-size:11.0pt;\r\n-\tfont-family:"Calibri","sans-serif";}\r\n-.MsoChpDefault\r\n-\t{font-family:"Calibri","sans-serif";}\r\n-.MsoPapDefault\r\n-\t{margin-bottom:10.0pt;\r\n-\tline-height:115%;}\r\n-@page WordSection1\r\n-\t{size:8.5in 11.0in;\r\n-\tmargin:1.0in 1.0in 1.0in 1.0in;}\r\n-div.WordSection1\r\n-\t{page:WordSection1;}\r\n--->\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US link=blue vlink=purple>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><b><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>Info</span></b></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The complete\r\n-dataset:</span></u><span lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>\r\n-Allows downloading of the complete parsed data set.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The filtered\r\n-dataset:</span></u><span lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>\r\n-Allows downloading of all parsed IMGT information of all transcripts that\r\n-passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The alignment\r\n-info on the unmatched sequences:</span></u><span lang=EN-GB style=\'font-size:\r\n-12.0pt;font-family:"Times New Roman","serif"\'> Provides information of the subclass\r\n-alignment of all unmatched sequences. 
For each sequence the chunck hit\r\n-percentage and the nt hit percentage is shown together with the best matched\r\n-subclass.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><b><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>SHM Overview</span></b></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The SHM Overview\r\n-table as a dataset:</span></u><span lang=EN-GB style=\'font-size:12.0pt;\r\n-font-family:"Times New Roman","serif"\'> Allows downloading of the SHM Overview\r\n-table as a data set.\xa0 </span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>Motif data per\r\n-sequence ID:</span></u><span lang=EN-GB style=\'font-size:12.0pt;font-family:\r\n-"Times New Roman","serif"\'> Provides a file that contains information for each\r\n-transcript on the number of mutations present in WA/TW and RGYW/WRCY motives.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>Mutation data\r\n-per sequence ID: </span></u><span lang=EN-GB style=\'font-size:12.0pt;\r\n-font-family:"Times New Roman","serif"\'>Provides a file containing information\r\n-on the number of sequences bases, the number and location of mutations and the\r\n-type of mutations found in each transcript. </span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'f'..b'IGA1 sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n-.txz file with the same format as downloaded IMGT files that contains all IGA1\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n-with just the matched and filtered IGA2 sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n-file with the same format as downloaded IMGT files that contains all IGA2\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n-with just the matched and filtered IGG sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n-file with the same format as downloaded IMGT files that contains all IGG\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n-with just the matched and filtered IGG1 sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n-.txz file with the same format as downloaded IMGT files that contains all IGG1\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT 
archive\r\n-with just the matched and filtered IGG2 sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n-.txz file with the same format as downloaded IMGT files that contains all IGG2\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n-with just the matched and filtered IGG3 sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n-file with the same format as downloaded IMGT files that contains all IGG3\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n-with just the matched and filtered IGG4 sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n-.txz file with the same format as downloaded IMGT files that contains all IGG4\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n-with just the matched and filtered IGM sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n-file with the same format as downloaded IMGT files that contains all IGM\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n-with just the matched and filtered IGE sequences:</span></u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n-.txz file with the same format as downloaded IMGT files that contains all IGE\r\n-sequences that have passed the chosen filter settings.</span></p>\r\n-\r\n-</div>\r\n-\r\n-</body>\r\n-\r\n-</html>\r\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_first.htm
--- a/shm_csr/shm_first.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,127 +0,0 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-<!--
- /* Font Definitions */
- @font-face
- {font-family:Calibri;
- panose-1:2 15 5 2 2 2 4 3 2 4;}
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {margin-top:0in;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:11.0pt;
- font-family:"Calibri","sans-serif";}
-.MsoChpDefault
- {font-family:"Calibri","sans-serif";}
-.MsoPapDefault
- {margin-bottom:10.0pt;
- line-height:115%;}
-@page WordSection1
- {size:8.5in 11.0in;
- margin:1.0in 1.0in 1.0in 1.0in;}
-div.WordSection1
- {page:WordSection1;}
--->
-</style>
-
-</head>
-
-<body lang=EN-US>
-
-<div class=WordSection1>
-
-<p class=MsoNormalCxSpFirst style='margin-bottom:0in;margin-bottom:.0001pt;
-text-align:justify;line-height:normal'><span lang=EN-GB style='font-size:12.0pt;
-font-family:"Times New Roman","serif"'>Table showing the order of each
-filtering step and the number and percentage of sequences after each filtering
-step. </span></p>
-
-<p class=MsoNormalCxSpMiddle style='margin-bottom:0in;margin-bottom:.0001pt;
-text-align:justify;line-height:normal'><u><span lang=EN-GB style='font-size:
-12.0pt;font-family:"Times New Roman","serif"'>Input:</span></u><span
-lang=EN-GB style='font-size:12.0pt;font-family:"Times New Roman","serif"'> The
-number of sequences in the original IMGT file. This is always 100% of the
-sequences.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='margin-bottom:0in;margin-bottom:.0001pt;
-text-align:justify;line-height:normal'><u><span lang=EN-GB style='font-size:
-12.0pt;font-family:"Times New Roman","serif"'>After &quot;no results&quot; filter: </span></u><span
-lang=EN-GB style='font-size:12.0pt;font-family:"Times New Roman","serif"'>IMGT
-classifies sequences either as &quot;productive&quot;, &quot;unproductive&quot;, &quot;unknown&quot;, or &quot;no
-results&quot;. Here, the number and percentage of sequences that are not classified
-as &quot;no results&quot; are reported.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='margin-bottom:0in;margin-bottom:.0001pt;
-text-align:justify;line-height:normal'><u><span lang=EN-GB style='font-size:
-12.0pt;font-family:"Times New Roman","serif"'>After functionality filter:</span></u><span
-lang=EN-GB style='font-size:12.0pt;font-family:"Times New Roman","serif"'> The
-number and percentage of sequences that have passed the functionality filter. The
-filtering performed is dependent on the settings of the functionality filter.
-Details on the functionality filter <a name="OLE_LINK12"></a><a
-name="OLE_LINK11"></a><a name="OLE_LINK10">can be found on the start page of
-the SHM&amp;CSR pipeline</a>.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
-removal of sequences that are missing a gene region:</span></u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>
-In this step all sequences that are missing a gene region (FR1, CDR1, FR2,
-CDR2, FR3) that should be present are removed from analysis. The sequence
-regions that should be present are dependent on the settings of the sequence
-starts at filter. <a name="OLE_LINK9"></a><a name="OLE_LINK8">The number and
-percentage of sequences that pass this filter step are reported.</a> </span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
-N filter:</span></u><span lang=EN-GB style='font-size:12.0pt;line-height:115%;
-font-family:"Times New Roman","serif"'> In this step all sequences that contain
-an ambiguous base (n) in the analysed region or the CDR3 are removed from the
-analysis. The analysed region is determined by the setting of the sequence
-starts at filter. The number and percentage of sequences that pass this filter
-step are reported.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
-filter unique sequences</span></u><span lang=EN-GB style='font-size:12.0pt;
-line-height:115%;font-family:"Times New Roman","serif"'>: The number and
-percentage of sequences that pass the &quot;filter unique sequences&quot; filter. Details
-on this filter </span><span lang=EN-GB style='font-size:12.0pt;line-height:
-115%;font-family:"Times New Roman","serif"'>can be found on the start page of
-the SHM&amp;CSR pipeline.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
-remove duplicates based on filter:</span></u><span lang=EN-GB style='font-size:
-12.0pt;line-height:115%;font-family:"Times New Roman","serif"'> The number and
-percentage of sequences that passed the remove duplicates filter. Details on the
-&quot;remove duplicates based on&quot; filter can be found on the start page of the
-SHM&amp;CSR pipeline.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK17"></a><a
-name="OLE_LINK16"><u><span lang=EN-GB style='font-size:12.0pt;line-height:115%;
-font-family:"Times New Roman","serif"'>Number of matches sequences:</span></u></a><span
-lang=EN-GB style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>
-The number and percentage of sequences that passed all the filters described
-above and have a (sub)class assigned.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Number
-of unmatched sequences</span></u><span lang=EN-GB style='font-size:12.0pt;
-line-height:115%;font-family:"Times New Roman","serif"'>: The number and percentage
-of sequences that passed all the filters described above and do not have a
-(sub)class assigned.</span></p>
-
-<p class=MsoNormal><span lang=EN-GB>&nbsp;</span></p>
-
-</div>
-
-</body>
-
-</html>
b
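The filtering-steps table described in shm_first.htm above reports, for every step, the number of remaining sequences and their percentage of the input. A minimal R sketch of that bookkeeping follows; the helper name filtering_steps_table and the counts are placeholders, not pipeline output (the real table is rendered by wrapper.sh from filtering_steps.txt).

# Illustration of the filtering-steps table: each step's sequence count is
# reported as a percentage of the input (the first step is always 100%).
filtering_steps_table <- function(step_names, counts) {
  data.frame(step = step_names,
             sequences = counts,
             percentage = round(counts / counts[1] * 100, 1))
}

# Placeholder counts, in the order of the steps described above.
filtering_steps_table(
  c("Input", "After \"no results\" filter", "After functionality filter",
    "After removal of sequences missing a gene region", "After N filter",
    "After filter unique sequences", "After remove duplicates based on filter"),
  c(10000, 9800, 9100, 8900, 8500, 8200, 6400))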
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_frequency.htm
--- a/shm_csr/shm_frequency.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,87 +0,0 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-<!--
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {margin-top:0in;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:11.0pt;
- font-family:"Calibri","sans-serif";}
-.MsoChpDefault
- {font-family:"Calibri","sans-serif";}
-.MsoPapDefault
- {margin-bottom:10.0pt;
- line-height:115%;}
-@page WordSection1
- {size:8.5in 11.0in;
- margin:1.0in 1.0in 1.0in 1.0in;}
-div.WordSection1
- {page:WordSection1;}
--->
-</style>
-
-</head>
-
-<body lang=EN-US>
-
-<div class=WordSection1>
-
-<p class=MsoNormalCxSpFirst style='text-align:justify'><b><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>SHM
-frequency tab</span></u></b></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Graphs</span></b></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>These
-graphs give insight into the level of SHM. The data represented in these graphs
-can be downloaded in the download tab. <a name="OLE_LINK24"></a><a
-name="OLE_LINK23"></a><a name="OLE_LINK90"></a><a name="OLE_LINK89">More
-information on the values found in healthy individuals of different ages can be
-found in IJspeert and van Schouwenburg et al, PMID: 27799928. </a></span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Frequency
-scatter plot</span></u></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>A
-dot plot showing the percentage of SHM in each transcript divided into the
-different (sub)classes. </span><span lang=NL style='font-size:12.0pt;
-line-height:115%;font-family:"Times New Roman","serif"'>In the graph each dot
-represents an individual transcript.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Mutation
-frequency by class</span></u></p>
-
-<p class=MsoNormalCxSpLast style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>A
-bar graph showing the percentage of transcripts that contain 0%, 0-2%, 2-5%,
-5-10%, 10-15%, 15-20%, or more than 20% SHM for each subclass. </span></p>
-
-<p class=MsoNormal><span lang=NL style='font-size:12.0pt;line-height:115%;
-font-family:"Times New Roman","serif"'>Hanna IJspeert, Pauline A. van
-Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
-Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Evaluation
-of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
-Adults. In <i>Frontiers in Immunology, 7, pp. e410-410. </i>[<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
-style='color:windowtext'>doi:10.3389/fimmu.2016.00410</span></a>][<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
-style='color:windowtext'>Link</span></a>]</span></p>
-
-</div>
-
-</body>
-
-</html>
b
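The "Mutation frequency by class" bar graph described above bins each transcript's SHM percentage into fixed ranges (0, 0-2, 2-5, 5-10, 10-15, 15-20, more than 20). The pipeline's own R code does this with cut() over the breaks -Inf, 0, 2, 5, 10, 15, 20, Inf; the sketch below repeats that call on a few placeholder percentages.

# Bin per-transcript SHM percentages into the ranges used by the
# "Mutation frequency by class" graph.
percentage_mutations <- c(0, 1.3, 2.8, 7.5, 12.1, 18.4, 23.9)   # placeholder values
freq_labels <- c("0", "0-2", "2-5", "5-10", "10-15", "15-20", "20")
frequency_bins <- cut(percentage_mutations,
                      breaks = c(-Inf, 0, 2, 5, 10, 15, 20, Inf),
                      labels = freq_labels)
table(frequency_bins)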
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_overview.htm
--- a/shm_csr/shm_overview.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,332 +0,0 @@\n-<html>\r\n-\r\n-<head>\r\n-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n-<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n-<style>\r\n-<!--\r\n- /* Font Definitions */\r\n- @font-face\r\n-\t{font-family:Calibri;\r\n-\tpanose-1:2 15 5 2 2 2 4 3 2 4;}\r\n- /* Style Definitions */\r\n- p.MsoNormal, li.MsoNormal, div.MsoNormal\r\n-\t{margin-top:0in;\r\n-\tmargin-right:0in;\r\n-\tmargin-bottom:10.0pt;\r\n-\tmargin-left:0in;\r\n-\tline-height:115%;\r\n-\tfont-size:11.0pt;\r\n-\tfont-family:"Calibri","sans-serif";}\r\n-.MsoChpDefault\r\n-\t{font-family:"Calibri","sans-serif";}\r\n-.MsoPapDefault\r\n-\t{margin-bottom:10.0pt;\r\n-\tline-height:115%;}\r\n-@page WordSection1\r\n-\t{size:8.5in 11.0in;\r\n-\tmargin:1.0in 1.0in 1.0in 1.0in;}\r\n-div.WordSection1\r\n-\t{page:WordSection1;}\r\n--->\r\n-</style>\r\n-\r\n-</head>\r\n-\r\n-<body lang=EN-US>\r\n-\r\n-<div class=WordSection1>\r\n-\r\n-<p class=MsoNormalCxSpFirst style=\'text-align:justify\'><b><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Info\r\n-table</span></b></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>This\r\n-table contains information on different characteristics of SHM. For all\r\n-characteristics information can be found for all sequences or only sequences of\r\n-a certain (sub)class. All results are based on the sequences that passed the filter\r\n-settings chosen on the start page of the SHM &amp; CSR pipeline and only\r\n-include details on the analysed region as determined by the setting of the\r\n-sequence starts at filter. All data in this table can be downloaded via the\r\n-\x93downloads\x94 tab.</span></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Mutation\r\n-frequency:</span></u></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK83"></a><a\r\n-name="OLE_LINK82"></a><a name="OLE_LINK81"><span lang=EN-GB style=\'font-size:\r\n-12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>These values\r\n-give information on the level of SHM. 
</span></a><a name="OLE_LINK22"></a><a\r\n-name="OLE_LINK21"></a><a name="OLE_LINK20"><span lang=EN-GB style=\'font-size:\r\n-12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>More information\r\n-on the values found in healthy individuals of different ages can be found in </span></a><a\r\n-name="OLE_LINK15"></a><a name="OLE_LINK14"></a><a name="OLE_LINK13"><span\r\n-lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>IJspeert\r\n-and van Schouwenburg et al, PMID: 27799928</span></a></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><i><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Number\r\n-of mutations:</span></i><span lang=EN-GB style=\'font-size:12.0pt;line-height:\r\n-115%;font-family:"Times New Roman","serif"\'> Shows the number of total\r\n-mutations / the number of sequenced bases (the % of mutated bases).</span></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><i><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Median\r\n-number of mutations:</span></i><span lang=EN-GB style=\'font-size:12.0pt;\r\n-line-height:115%;font-family:"Times New Roman","serif"\'> Shows the median % of\r\n-SHM of all sequences.</span></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Patterns\r\n-of SHM:</span></u></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK72"></a><a\r\n-name="OLE_LINK71"></a><a name="OLE_LINK70"><span lang=EN-GB style=\'font-size:\r\n-12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>These values\r\n-give insights into the targeting and patterns of SH'..b'if"\'>Shows the total number of sequenced <a\r\n-name="OLE_LINK59"></a><a name="OLE_LINK58">guanine</a>s / The total number of\r\n-sequenced bases (the percentage of sequenced bases that were guanines).</span></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK69"><b><span\r\n-lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Graphs</span></b></a></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK75"></a><a\r\n-name="OLE_LINK74"></a><a name="OLE_LINK73"><span lang=EN-GB style=\'font-size:\r\n-12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>These graphs visualize\r\n-information on the patterns and targeting of SHM and thereby give information\r\n-into the repair pathways used to repair the U:G mismatches introduced by AID. The\r\n-data represented in these graphs can be downloaded in the download tab. 
More\r\n-information on the values found in healthy individuals of different ages can be\r\n-found in IJspeert and van Schouwenburg et al, PMID: 27799928</span></a><span\r\n-lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>.\r\n-<a name="OLE_LINK85"></a><a name="OLE_LINK84"></a></span></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Percentage\r\n-of mutations in AID and pol eta motives</span></u></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Visualizes\r\n-<a name="OLE_LINK80"></a><a name="OLE_LINK79"></a><a name="OLE_LINK78">for each\r\n-(sub)class </a>the percentage of mutations that are present in AID (RGYW or\r\n-WRCY) or polymerase eta motives (WA or TW) in the different subclasses </span><span\r\n-lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>(R=Purine,\r\n-Y=pyrimidine, W = A or T).</span></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=NL\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Relative\r\n-mutation patterns</span></u></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Visualizes\r\n-for each (sub)class the distribution of mutations between mutations at AT\r\n-locations and transitions or transversions at GC locations. </span></p>\r\n-\r\n-<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=NL\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Absolute\r\n-mutation patterns</span></u></p>\r\n-\r\n-<p class=MsoNormalCxSpLast style=\'text-align:justify\'><span lang=EN-GB\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Visualized\r\n-for each (sub)class the percentage of sequenced AT and GC bases that are\r\n-mutated. The mutations at GC bases are divided into transition and transversion\r\n-mutations<a name="OLE_LINK77"></a><a name="OLE_LINK76">. </a></span></p>\r\n-\r\n-<p class=MsoNormal><span lang=NL style=\'font-size:12.0pt;line-height:115%;\r\n-font-family:"Times New Roman","serif"\'>Hanna IJspeert, Pauline A. van\r\n-Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,\r\n-Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span\r\n-style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Evaluation\r\n-of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and\r\n-Adults. In <i>Frontiers in Immunolog, 7, pp. e410-410. </i>[<a\r\n-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\r\n-style=\'color:windowtext\'>doi:10.3389/fimmu.2016.00410</span></a>][<a\r\n-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\r\n-style=\'color:windowtext\'>Link</span></a>]</span></p>\r\n-\r\n-</div>\r\n-\r\n-</body>\r\n-\r\n-</html>\r\n'
b
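The "Number of mutations" row of the info table described above is reported as the total number of mutations over the number of sequenced bases, together with the percentage of mutated bases; "Median number of mutations" is the median per-sequence SHM percentage. A minimal sketch with placeholder counts:

# "Number of mutations": total mutations / sequenced bases (% of mutated bases).
total_mutations <- 350L     # placeholder count
sequenced_bases <- 14000L   # placeholder count
sprintf("%d/%d (%.1f%%)", total_mutations, sequenced_bases,
        100 * total_mutations / sequenced_bases)

# "Median number of mutations": median per-sequence SHM percentage (placeholders).
median(c(1.2, 2.5, 3.0, 4.8))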
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_selection.htm
--- a/shm_csr/shm_selection.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,128 +0,0 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-<!--
- /* Font Definitions */
- @font-face
- {font-family:Calibri;
- panose-1:2 15 5 2 2 2 4 3 2 4;}
-@font-face
- {font-family:UICTFontTextStyleBody;}
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {margin-top:0in;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:11.0pt;
- font-family:"Calibri","sans-serif";}
-a:link, span.MsoHyperlink
- {color:blue;
- text-decoration:underline;}
-a:visited, span.MsoHyperlinkFollowed
- {color:purple;
- text-decoration:underline;}
-span.apple-converted-space
- {mso-style-name:apple-converted-space;}
-.MsoChpDefault
- {font-family:"Calibri","sans-serif";}
-.MsoPapDefault
- {margin-bottom:10.0pt;
- line-height:115%;}
-@page WordSection1
- {size:8.5in 11.0in;
- margin:1.0in 1.0in 1.0in 1.0in;}
-div.WordSection1
- {page:WordSection1;}
--->
-</style>
-
-</head>
-
-<body lang=EN-US link=blue vlink=purple>
-
-<div class=WordSection1>
-
-<p class=MsoNormalCxSpFirst style='text-align:justify'><b><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>References</span></b></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
-color:black'>Yaari, G. and Uduman, M. and Kleinstein, S. H. (2012). Quantifying
-selection in high-throughput Immunoglobulin sequencing data sets. In<span
-class=apple-converted-space>&nbsp;</span><em>Nucleic Acids Research, 40 (17),
-pp. e134-e134.</em><span class=apple-converted-space><i>&nbsp;</i></span>[</span><span
-lang=EN-GB><a href="http://dx.doi.org/10.1093/nar/gks457" target="_blank"><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
-color:#303030'>doi:10.1093/nar/gks457</span></a></span><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
-color:black'>][</span><span lang=EN-GB><a
-href="http://dx.doi.org/10.1093/nar/gks457" target="_blank"><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
-color:#303030'>Link</span></a></span><span lang=EN-GB style='font-size:12.0pt;
-line-height:115%;font-family:"Times New Roman","serif";color:black'>]</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Graphs</span></b></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>AA
-mutation frequency</span></u></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>For
-each class, the frequency of replacement mutations at each amino acid position
-is shown. This is calculated by dividing the number of replacement mutations
-at a particular amino acid position by the number of sequences that have an amino
-acid at that particular position. Since the length of the CDR1 and CDR2 regions
-is not the same for every VH gene, some amino acid positions are absent.
-Therefore the frequency is calculated using the number of amino acids present at
-that particular location. </span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Antigen
-selection (BASELINe)</span></u></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Shows
-the results of the analysis of antigen selection as performed using BASELINe.
-Details on the analysis performed by BASELINe can be found in Yaari et al,
-PMID: 22641856. The settings used for the analysis are</span><span lang=EN-GB
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>:
-focused, SHM targeting model: human Tri-nucleotide, custom boundaries. The
-custom boundaries are dependent on the &quot;sequence starts at&quot; filter. </span></p>
-
-<p class=MsoNormalCxSpMiddle style='line-height:normal'><span lang=NL
-style='font-family:UICTFontTextStyleBody;color:black'>Leader:
-1:26:38:55:65:104:-</span></p>
-
-<p class=MsoNormalCxSpMiddle style='line-height:normal'><span lang=NL
-style='font-family:UICTFontTextStyleBody;color:black'>FR1: 27:27:38:55:65:104:-</span></p>
-
-<p class=MsoNormalCxSpMiddle style='line-height:normal'><span lang=NL
-style='font-family:UICTFontTextStyleBody;color:black'>CDR1:&nbsp;27:27:38:55:65:104:-</span></p>
-
-<p class=MsoNormalCxSpLast style='line-height:normal'><span lang=NL
-style='font-family:UICTFontTextStyleBody;color:black'>FR2:&nbsp;27:27:38:55:65:104:-</span></p>
-
-<p class=MsoNormal><span lang=NL style='font-size:12.0pt;line-height:115%;
-font-family:"Times New Roman","serif"'>Hanna IJspeert, Pauline A. van
-Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
-Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Evaluation
-of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
-Adults. In <i>Frontiers in Immunology, 7, pp. e410-410. </i>[<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
-style='color:windowtext'>doi:10.3389/fimmu.2016.00410</span></a>][<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
-style='color:windowtext'>Link</span></a>]</span></p>
-
-</div>
-
-</body>
-
-</html>
b
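The AA mutation frequency described above is, per amino acid position, the number of replacement mutations divided by the number of sequences that actually carry an amino acid at that position, so positions absent from shorter CDR1/CDR2 regions do not dilute the frequency. A small R sketch of that calculation; the matrices aa_present and replacement are random placeholders (sequences in rows, positions in columns), not pipeline data.

# Illustrative AA mutation frequency per position: replacement mutations at a
# position divided by the number of sequences with an amino acid at that position.
set.seed(1)
aa_present  <- matrix(rbinom(5 * 10, 1, 0.9), nrow = 5)               # 1 = amino acid present
replacement <- matrix(rbinom(5 * 10, 1, 0.2), nrow = 5) * aa_present  # 1 = replacement mutation
aa_mutation_frequency <- colSums(replacement) / pmax(colSums(aa_present), 1)
round(aa_mutation_frequency, 2)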
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/shm_transition.htm
--- a/shm_csr/shm_transition.htm Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,120 +0,0 @@
-<html>
-
-<head>
-<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
-<meta name=Generator content="Microsoft Word 14 (filtered)">
-<style>
-<!--
- /* Font Definitions */
- @font-face
- {font-family:Calibri;
- panose-1:2 15 5 2 2 2 4 3 2 4;}
- /* Style Definitions */
- p.MsoNormal, li.MsoNormal, div.MsoNormal
- {margin-top:0in;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:11.0pt;
- font-family:"Calibri","sans-serif";}
-a:link, span.MsoHyperlink
- {color:blue;
- text-decoration:underline;}
-a:visited, span.MsoHyperlinkFollowed
- {color:purple;
- text-decoration:underline;}
-p.msochpdefault, li.msochpdefault, div.msochpdefault
- {mso-style-name:msochpdefault;
- margin-right:0in;
- margin-left:0in;
- font-size:12.0pt;
- font-family:"Calibri","sans-serif";}
-p.msopapdefault, li.msopapdefault, div.msopapdefault
- {mso-style-name:msopapdefault;
- margin-right:0in;
- margin-bottom:10.0pt;
- margin-left:0in;
- line-height:115%;
- font-size:12.0pt;
- font-family:"Times New Roman","serif";}
-span.apple-converted-space
- {mso-style-name:apple-converted-space;}
-.MsoChpDefault
- {font-size:10.0pt;
- font-family:"Calibri","sans-serif";}
-.MsoPapDefault
- {margin-bottom:10.0pt;
- line-height:115%;}
-@page WordSection1
- {size:8.5in 11.0in;
- margin:1.0in 1.0in 1.0in 1.0in;}
-div.WordSection1
- {page:WordSection1;}
--->
-</style>
-
-</head>
-
-<body lang=EN-US link=blue vlink=purple>
-
-<div class=WordSection1>
-
-<p class=MsoNormalCxSpFirst style='text-align:justify'><span style='font-size:
-12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>These graphs and
-tables give insight into the targeting and patterns of SHM, and thereby into
-the DNA repair pathways used to repair the U:G mismatches
-introduced by AID. More information on the values found in healthy individuals
-of different ages can be found in IJspeert and van Schouwenburg et al, PMID:
-27799928.</span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Graphs
-</span></b></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK93"></a><a
-name="OLE_LINK92"></a><a name="OLE_LINK91"><u><span style='font-size:12.0pt;
-line-height:115%;font-family:"Times New Roman","serif"'>Heatmap transition
-information</span></u></a></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK98"></a><a
-name="OLE_LINK97"><span style='font-size:12.0pt;line-height:115%;font-family:
-"Times New Roman","serif"'>Heatmaps visualizing for each subclass the frequency
-of all possible substitutions. On the x-axis the original base is shown, while
-the y-axis shows the new base. The darker the shade of blue, the more frequently
-this type of substitution occurs. </span></a></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Bargraph
-transition information</span></u></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span style='font-size:
-12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Bar graph
-visualizing for each original base the distribution of substitutions into the other
-bases. A graph is included for each (sub)class. </span></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Tables</span></b></p>
-
-<p class=MsoNormalCxSpMiddle style='text-align:justify'><span style='font-size:
-12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Transition
-tables are shown for each (sub)class. All the original bases are listed
-horizontally, while the new bases are listed vertically. </span></p>
-
-<p class=MsoNormal><span lang=NL style='font-size:12.0pt;line-height:115%;
-font-family:"Times New Roman","serif"'>Hanna IJspeert, Pauline A. van
-Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
-Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span
-style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Evaluation
-of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
-Adults. In <i>Frontiers in Immunology, 7, pp. e410-410. </i>[<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
-style='color:windowtext'>doi:10.3389/fimmu.2016.00410</span></a>][<a
-href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
-style='color:windowtext'>Link</span></a>]</span></p>
-
-</div>
-
-</body>
-
-</html>
b
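The transition tables described above list the original bases horizontally and the new bases vertically, one table per (sub)class; the pipeline's R script writes the same 4x4 A/C/G/T layout to transitions.txt. The sketch below builds such a table from a toy list of mutations; the mutations data frame is a placeholder, not pipeline output.

# Illustrative transition table: original bases in the columns, new bases in the rows.
bases <- c("A", "C", "G", "T")
mutations <- data.frame(from = c("G", "G", "A", "C", "G", "T"),
                        to   = c("A", "T", "G", "T", "A", "C"))
transition_table <- table(new      = factor(mutations$to,   levels = bases),
                          original = factor(mutations$from, levels = bases))
transition_table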
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/style.tar.gz
b
Binary file shm_csr/style.tar.gz has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/subclass_definition.db.nhr
b
Binary file shm_csr/subclass_definition.db.nhr has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/subclass_definition.db.nin
b
Binary file shm_csr/subclass_definition.db.nin has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/subclass_definition.db.nsq
b
Binary file shm_csr/subclass_definition.db.nsq has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/summary_to_fasta.py
--- a/shm_csr/summary_to_fasta.py Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,42 +0,0 @@
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file")
-parser.add_argument("--fasta", help="The output fasta file")
-
-args = parser.parse_args()
-
-infile = args.input
-fasta = args.fasta
-
-with open(infile, 'r') as i, open(fasta, 'w') as o:
-    first = True
-    id_col = 0
-    seq_col = 0
-    no_results = 0
-    no_seqs = 0
-    passed = 0
-    for line in i:
-        splt = line.split("\t")
-        if first:
-            id_col = splt.index("Sequence ID")
-            seq_col = splt.index("Sequence")
-            first = False
-            continue
-        if len(splt) < 5:
-            no_results += 1
-            continue
-
-        ID = splt[id_col]
-        seq = splt[seq_col]
-
-        if not len(seq) > 0:
-            no_seqs += 1
-            continue
-
-        o.write(">" + ID + "\n" + seq + "\n")
-        passed += 1
-
-    print "No results:", no_results
-    print "No sequences:", no_seqs
-    print "Written to fasta file:", passed
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_csr/wrapper.sh
--- a/shm_csr/wrapper.sh Fri Feb 19 15:08:51 2021 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,913 +0,0 @@\n-#!/bin/bash\n-#set -e\n-dir="$(cd "$(dirname "$0")" && pwd)"\n-input=$1\n-method=$2\n-log=$3 #becomes the main html page at the end\n-outdir=$4\n-output="$outdir/index.html" #copied to $log location at the end\n-title="$5"\n-include_fr1=$6\n-functionality=$7\n-unique=$8\n-naive_output=$9\n-naive_output_ca=${10}\n-naive_output_cg=${11}\n-naive_output_cm=${12}\n-naive_output_ce=${13}\n-naive_output_all=${14}\n-filter_unique=${15}\n-filter_unique_count=${16}\n-class_filter=${17}\n-empty_region_filter=${18}\n-fast=${19}\n-\n-mkdir $outdir\n-\n-tar -xzf $dir/style.tar.gz -C $outdir\n-\n-echo "---------------- read parameters ----------------"\n-echo "---------------- read parameters ----------------<br />" > $log\n-\n-echo "unpacking IMGT file"\n-\n-type="`file $input`"\n-if [[ "$type" == *"Zip archive"* ]] ; then\n-\techo "Zip archive"\n-\techo "unzip $input -d $PWD/files/"\n-\tunzip $input -d $PWD/files/\n-elif [[ "$type" == *"XZ compressed data"* ]] ; then\n-\techo "ZX archive"\n-\techo "tar -xJf $input -C $PWD/files/"\n-\tmkdir -p "$PWD/files/$title"\n-\ttar -xJf $input -C "$PWD/files/$title"\n-else\n-\techo "Unrecognized format $type"\n-\techo "Unrecognized format $type" > $log\n-\texit 1\n-fi\n-\n-cat "`find $PWD/files/ -name "1_*"`" > $PWD/summary.txt\n-cat "`find $PWD/files/ -name "2_*"`" > $PWD/gapped_nt.txt\n-cat "`find $PWD/files/ -name "3_*"`" > $PWD/sequences.txt\n-cat "`find $PWD/files/ -name "4_*"`" > $PWD/gapped_aa.txt\n-cat "`find $PWD/files/ -name "5_*"`" > $PWD/aa.txt\n-cat "`find $PWD/files/ -name "6_*"`" > $PWD/junction.txt\n-cat "`find $PWD/files/ -name "7_*"`" > $PWD/mutationanalysis.txt\n-cat "`find $PWD/files/ -name "8_*"`" > $PWD/mutationstats.txt\n-cat "`find $PWD/files/ -name "9_*"`" > $PWD/aa_change_stats.txt\n-cat "`find $PWD/files/ -name "10_*"`" > $PWD/hotspots.txt\n-\n-echo "---------------- unique id check ----------------"\n-\n-Rscript $dir/check_unique_id.r $PWD/summary.txt $PWD/gapped_nt.txt $PWD/sequences.txt $PWD/gapped_aa.txt $PWD/aa.txt $PWD/junction.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/aa_change_stats.txt $PWD/hotspots.txt\n-\n-if [[ ${#BLASTN_DIR} -ge 5 ]] ; then\n-\techo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"\n-else\n-\tBLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin"\n-\techo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}"\n-fi\n-\n-echo "---------------- class identification ----------------"\n-echo "---------------- class identification ----------------<br />" >> $log\n-\n-python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt\n-\n-echo "---------------- merge_and_filter.r ----------------"\n-echo "---------------- merge_and_filter.r ----------------<br />" >> $log\n-\n-Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${filter_unique_count} ${class_filter} ${empty_region_filter} 2>&1\n-\n-if [[ "${naive_output}" == "yes" ]] || [[ "$fast" == "no" ]] ; then\n-\n-\techo "---------------- creating new IMGT zips ----------------"\n-\techo "---------------- creating new IMGT zips ----------------<br />" >> $log\n-\n-\tmkdir $outdir/new_IMGT\n-\n-\tcp $PWD/summary.txt "$outdir/new_IMGT/1_Summary.txt"\n-\tcp $PWD/gapped_nt.txt "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"\n-\tcp $PWD/sequences.txt 
"$outdir/new_IMGT/3_Nt-sequences.txt"\n-\tcp $PWD/gapped_aa.txt "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"\n-\tcp $PWD/aa.txt "$outdir/new_IMGT/5_AA-sequences.txt"\n-\tcp $PWD/junction.txt "$outdir/new_IMGT/6_Junction.txt"\n-\tcp $PWD/mutationanalysis.txt "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"\n-\tcp $PWD/mutationstats.txt "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"\n-\tcp $PWD/aa_change_stats.txt "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"\n-\tcp $PWD/hotspots.txt "$outdir/new_IMGT/10_V-REGION-mutation-hotspot'..b'equence of a clone (IGE)</td><td><a href=\'new_IMGT_IGE_first_seq_of_clone.txz\' download=\'new_IMGT_IGE_first_seq_of_clone.txz\' >Download</a></td></tr>" >> $output\n-\n-echo "<tr><td colspan=\'2\' style=\'background-color:#E0E0E0;\'>Filtered IMGT output files</td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered sequences</td><td><a href=\'new_IMGT.txz\' download=\'new_IMGT.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGA sequences</td><td><a href=\'new_IMGT_IGA.txz\' download=\'new_IMGT_IGA.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGA1 sequences</td><td><a href=\'new_IMGT_IGA1.txz\' download=\'new_IMGT_IGA1.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGA2 sequences</td><td><a href=\'new_IMGT_IGA2.txz\' download=\'new_IMGT_IGA2.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGG sequences</td><td><a href=\'new_IMGT_IGG.txz\' download=\'new_IMGT_IGG.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGG1 sequences</td><td><a href=\'new_IMGT_IGG1.txz\' download=\'new_IMGT_IGG1.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGG2 sequences</td><td><a href=\'new_IMGT_IGG2.txz\' download=\'new_IMGT_IGG2.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGG3 sequences</td><td><a href=\'new_IMGT_IGG3.txz\' download=\'new_IMGT_IGG3.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGG4 sequences</td><td><a href=\'new_IMGT_IGG4.txz\' download=\'new_IMGT_IGG4.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGM sequences</td><td><a href=\'new_IMGT_IGM.txz\' download=\'new_IMGT_IGM.txz\' >Download</a></td></tr>" >> $output\n-echo "<tr><td>An IMGT archive with just the matched and filtered IGE sequences</td><td><a href=\'new_IMGT_IGE.txz\' download=\'new_IMGT_IGE.txz\' >Download</a></td></tr>" >> $output\n-\n-echo "</table>" >> $output\n-\n-echo "<br />" >> $output\n-cat $dir/shm_downloads.htm >> $output\n-\n-echo "</div>" >> $output #downloads tab end\n-\n-echo "</div>" >> $output #tabs end \n-\n-echo "</html>" >> $output\n-\n-\n-echo "---------------- naive_output.r ----------------"\n-echo "---------------- naive_output.r ----------------<br />" >> $log\n-\n-if [[ "$naive_output" == "yes" ]]\n-then\n-\techo "output naive output"\n-\tif [[ "${class_filter}" == "101_101" ]]\n-\tthen\n-\t\techo "copy new_IMGT.txz to ${naive_output_all}"\n-\t\tcp $outdir/new_IMGT.txz ${naive_output_all}\n-\telse\n-\t\techo "copy for classes"\n-\t\tcp 
$outdir/new_IMGT_IGA.txz ${naive_output_ca}\n-\t\tcp $outdir/new_IMGT_IGG.txz ${naive_output_cg}\n-\t\tcp $outdir/new_IMGT_IGM.txz ${naive_output_cm}\n-\t\tcp $outdir/new_IMGT_IGE.txz ${naive_output_ce}\n-\tfi\n-fi\n-\n-echo "</table>" >> $outdir/base_overview.html\n-\n-mv $log $outdir/log.html\n-\n-echo "<html><center><h1><a href=\'index.html\'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> \'open in new tab\' on the link above)<br />" > $log\n-echo "<table border = 1>" >> $log\n-echo "<thead><tr><th>Info</th><th>Sequences</th><th>Percentage</th></tr></thead>" >> $log\n-tIFS="$TMP"\n-IFS=$\'\\t\'\n-while read step seq perc\n-\tdo\n-\t\techo "<tr>" >> $log\n-\t\techo "<td>$step</td>" >> $log\n-\t\techo "<td>$seq</td>" >> $log\n-\t\techo "<td>${perc}%</td>" >> $log\n-\t\techo "</tr>" >> $log\n-done < $outdir/filtering_steps.txt\n-echo "</table>" >> $log\n-echo "<br />" >> $log\n-cat $dir/shm_first.htm >> $log\n-echo "</center></html>" >> $log\n-\n-IFS="$tIFS"\n-\n-\n-echo "---------------- Done! ----------------"\n-echo "---------------- Done! ----------------<br />" >> $outdir/log.html\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_downloads.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_downloads.htm Fri Feb 19 15:10:54 2021 +0000
b
b'@@ -0,0 +1,538 @@\n+<html>\r\n+\r\n+<head>\r\n+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n+<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n+<style>\r\n+<!--\r\n+ /* Font Definitions */\r\n+ @font-face\r\n+\t{font-family:Calibri;\r\n+\tpanose-1:2 15 5 2 2 2 4 3 2 4;}\r\n+ /* Style Definitions */\r\n+ p.MsoNormal, li.MsoNormal, div.MsoNormal\r\n+\t{margin-top:0in;\r\n+\tmargin-right:0in;\r\n+\tmargin-bottom:10.0pt;\r\n+\tmargin-left:0in;\r\n+\tline-height:115%;\r\n+\tfont-size:11.0pt;\r\n+\tfont-family:"Calibri","sans-serif";}\r\n+a:link, span.MsoHyperlink\r\n+\t{color:blue;\r\n+\ttext-decoration:underline;}\r\n+a:visited, span.MsoHyperlinkFollowed\r\n+\t{color:purple;\r\n+\ttext-decoration:underline;}\r\n+p.MsoNoSpacing, li.MsoNoSpacing, div.MsoNoSpacing\r\n+\t{margin:0in;\r\n+\tmargin-bottom:.0001pt;\r\n+\tfont-size:11.0pt;\r\n+\tfont-family:"Calibri","sans-serif";}\r\n+.MsoChpDefault\r\n+\t{font-family:"Calibri","sans-serif";}\r\n+.MsoPapDefault\r\n+\t{margin-bottom:10.0pt;\r\n+\tline-height:115%;}\r\n+@page WordSection1\r\n+\t{size:8.5in 11.0in;\r\n+\tmargin:1.0in 1.0in 1.0in 1.0in;}\r\n+div.WordSection1\r\n+\t{page:WordSection1;}\r\n+-->\r\n+</style>\r\n+\r\n+</head>\r\n+\r\n+<body lang=EN-US link=blue vlink=purple>\r\n+\r\n+<div class=WordSection1>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><b><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>Info</span></b></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The complete\r\n+dataset:</span></u><span lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>\r\n+Allows downloading of the complete parsed data set.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The filtered\r\n+dataset:</span></u><span lang=EN-GB style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>\r\n+Allows downloading of all parsed IMGT information of all transcripts that\r\n+passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The alignment\r\n+info on the unmatched sequences:</span></u><span lang=EN-GB style=\'font-size:\r\n+12.0pt;font-family:"Times New Roman","serif"\'> Provides information of the subclass\r\n+alignment of all unmatched sequences. 
For each sequence the chunck hit\r\n+percentage and the nt hit percentage is shown together with the best matched\r\n+subclass.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><b><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>SHM Overview</span></b></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>The SHM Overview\r\n+table as a dataset:</span></u><span lang=EN-GB style=\'font-size:12.0pt;\r\n+font-family:"Times New Roman","serif"\'> Allows downloading of the SHM Overview\r\n+table as a data set.\xa0 </span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>Motif data per\r\n+sequence ID:</span></u><span lang=EN-GB style=\'font-size:12.0pt;font-family:\r\n+"Times New Roman","serif"\'> Provides a file that contains information for each\r\n+transcript on the number of mutations present in WA/TW and RGYW/WRCY motives.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>Mutation data\r\n+per sequence ID: </span></u><span lang=EN-GB style=\'font-size:12.0pt;\r\n+font-family:"Times New Roman","serif"\'>Provides a file containing information\r\n+on the number of sequences bases, the number and location of mutations and the\r\n+type of mutations found in each transcript. </span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'f'..b'IGA1 sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n+.txz file with the same format as downloaded IMGT files that contains all IGA1\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n+with just the matched and filtered IGA2 sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n+file with the same format as downloaded IMGT files that contains all IGA2\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n+with just the matched and filtered IGG sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n+file with the same format as downloaded IMGT files that contains all IGG\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n+with just the matched and filtered IGG1 sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n+.txz file with the same format as downloaded IMGT files that contains all IGG1\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT 
archive\r\n+with just the matched and filtered IGG2 sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n+.txz file with the same format as downloaded IMGT files that contains all IGG2\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n+with just the matched and filtered IGG3 sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n+file with the same format as downloaded IMGT files that contains all IGG3\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n+with just the matched and filtered IGG4 sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n+.txz file with the same format as downloaded IMGT files that contains all IGG4\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n+with just the matched and filtered IGM sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a .txz\r\n+file with the same format as downloaded IMGT files that contains all IGM\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+<p class=MsoNoSpacing style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'>An IMGT archive\r\n+with just the matched and filtered IGE sequences:</span></u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;font-family:"Times New Roman","serif"\'> Downloads a\r\n+.txz file with the same format as downloaded IMGT files that contains all IGE\r\n+sequences that have passed the chosen filter settings.</span></p>\r\n+\r\n+</div>\r\n+\r\n+</body>\r\n+\r\n+</html>\r\n'
b
diff -r a4617f1d1d89 -r b6f9a640e098 shm_first.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_first.htm Fri Feb 19 15:10:54 2021 +0000
b
@@ -0,0 +1,127 @@
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+<!--
+ /* Font Definitions */
+ @font-face
+ {font-family:Calibri;
+ panose-1:2 15 5 2 2 2 4 3 2 4;}
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {margin-top:0in;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";}
+.MsoChpDefault
+ {font-family:"Calibri","sans-serif";}
+.MsoPapDefault
+ {margin-bottom:10.0pt;
+ line-height:115%;}
+@page WordSection1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+ {page:WordSection1;}
+-->
+</style>
+
+</head>
+
+<body lang=EN-US>
+
+<div class=WordSection1>
+
+<p class=MsoNormalCxSpFirst style='margin-bottom:0in;margin-bottom:.0001pt;
+text-align:justify;line-height:normal'><span lang=EN-GB style='font-size:12.0pt;
+font-family:"Times New Roman","serif"'>Table showing the order of each
+filtering step and the number and percentage of sequences after each filtering
+step. </span></p>
+
+<p class=MsoNormalCxSpMiddle style='margin-bottom:0in;margin-bottom:.0001pt;
+text-align:justify;line-height:normal'><u><span lang=EN-GB style='font-size:
+12.0pt;font-family:"Times New Roman","serif"'>Input:</span></u><span
+lang=EN-GB style='font-size:12.0pt;font-family:"Times New Roman","serif"'> The
+number of sequences in the original IMGT file. This is always 100% of the
+sequences.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='margin-bottom:0in;margin-bottom:.0001pt;
+text-align:justify;line-height:normal'><u><span lang=EN-GB style='font-size:
+12.0pt;font-family:"Times New Roman","serif"'>After &quot;no results&quot; filter: </span></u><span
+lang=EN-GB style='font-size:12.0pt;font-family:"Times New Roman","serif"'>IMGT
+classifies sequences either as &quot;productive&quot;, &quot;unproductive&quot;, &quot;unknown&quot;, or &quot;no
+results&quot;. Here, the number and percentages of sequences that are not classified
+as &quot;no results&quot; are reported.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='margin-bottom:0in;margin-bottom:.0001pt;
+text-align:justify;line-height:normal'><u><span lang=EN-GB style='font-size:
+12.0pt;font-family:"Times New Roman","serif"'>After functionality filter:</span></u><span
+lang=EN-GB style='font-size:12.0pt;font-family:"Times New Roman","serif"'> The
+number and percentages of sequences that have passed the functionality filter. The
+filtering performed is dependent on the settings of the functionality filter.
+Details on the functionality filter <a name="OLE_LINK12"></a><a
+name="OLE_LINK11"></a><a name="OLE_LINK10">can be found on the start page of
+the SHM&amp;CSR pipeline</a>.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
+removal of sequences that are missing a gene region:</span></u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>
+In this step all sequences that are missing a gene region (FR1, CDR1, FR2,
+CDR2, FR3) that should be present are removed from analysis. The sequence
+regions that should be present are dependent on the settings of the sequence
+starts at filter. <a name="OLE_LINK9"></a><a name="OLE_LINK8">The number and
+percentage of sequences that pass this filter step are reported.</a> </span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
+N filter:</span></u><span lang=EN-GB style='font-size:12.0pt;line-height:115%;
+font-family:"Times New Roman","serif"'> In this step all sequences that contain
+an ambiguous base (n) in the analysed region or the CDR3 are removed from the
+analysis. The analysed region is determined by the setting of the sequence
+starts at filter. The number and percentage of sequences that pass this filter
+step are reported.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
+filter unique sequences</span></u><span lang=EN-GB style='font-size:12.0pt;
+line-height:115%;font-family:"Times New Roman","serif"'>: The number and
+percentage of sequences that pass the &quot;filter unique sequences&quot; filter. Details
+on this filter </span><span lang=EN-GB style='font-size:12.0pt;line-height:
+115%;font-family:"Times New Roman","serif"'>can be found on the start page of
+the SHM&amp;CSR pipeline.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>After
+remove duplicate based on filter:</span></u><span lang=EN-GB style='font-size:
+12.0pt;line-height:115%;font-family:"Times New Roman","serif"'> The number and
+percentage of sequences that passed the remove duplicate filter. Details on the
+&quot;remove duplicate filter based on filter&quot; can be found on the start page of the
+SHM&amp;CSR pipeline.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK17"></a><a
+name="OLE_LINK16"><u><span lang=EN-GB style='font-size:12.0pt;line-height:115%;
+font-family:"Times New Roman","serif"'>Number of matches sequences:</span></u></a><span
+lang=EN-GB style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>
+The number and percentage of sequences that passed all the filters described
+above and have a (sub)class assigned.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Number
+of unmatched sequences</span></u><span lang=EN-GB style='font-size:12.0pt;
+line-height:115%;font-family:"Times New Roman","serif"'>: The number and percentage
+of sequences that passed all the filters described above and do not have a
+(sub)class assigned.</span></p>
+
+<p class=MsoNormal><span lang=EN-GB>&nbsp;</span></p>
+
+</div>
+
+</body>
+
+</html>
b
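The shm_first.htm help text above describes the filtering-steps table that wrapper.sh renders from a tab-separated filtering_steps.txt (step, sequence count, percentage). A minimal Python sketch of that rendering step follows; the three-column layout mirrors the while-read loop in wrapper.sh elsewhere in this changeset, while the function name and markup details are illustrative rather than the pipeline's exact code.

import csv

# Illustrative sketch: turn filtering_steps.txt
# (step <TAB> sequences <TAB> percentage) into HTML table rows,
# mirroring the while-read loop in wrapper.sh.
def filtering_steps_to_html(path):
    rows = ["<table border='1'>",
            "<thead><tr><th>Info</th><th>Sequences</th><th>Percentage</th></tr></thead>"]
    with open(path) as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if len(row) < 3:
                continue
            step, seqs, perc = row[:3]
            rows.append("<tr><td>%s</td><td>%s</td><td>%s%%</td></tr>" % (step, seqs, perc))
    rows.append("</table>")
    return "\n".join(rows)

if __name__ == "__main__":
    print(filtering_steps_to_html("filtering_steps.txt"))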
diff -r a4617f1d1d89 -r b6f9a640e098 shm_frequency.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_frequency.htm Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,87 @@
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+<!--
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {margin-top:0in;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";}
+.MsoChpDefault
+ {font-family:"Calibri","sans-serif";}
+.MsoPapDefault
+ {margin-bottom:10.0pt;
+ line-height:115%;}
+@page WordSection1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+ {page:WordSection1;}
+-->
+</style>
+
+</head>
+
+<body lang=EN-US>
+
+<div class=WordSection1>
+
+<p class=MsoNormalCxSpFirst style='text-align:justify'><b><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>SHM
+frequency tab</span></u></b></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Graphs</span></b></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>These
+graphs give insight into the level of SHM. The data represented in these graphs
+can be downloaded in the download tab. <a name="OLE_LINK24"></a><a
+name="OLE_LINK23"></a><a name="OLE_LINK90"></a><a name="OLE_LINK89">More
+information on the values found in healthy individuals of different ages can be
+found in IJspeert and van Schouwenburg et al, PMID: 27799928. </a></span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Frequency
+scatter plot</span></u></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>A
+dot plot showing the percentage of SHM in each transcript divided into the
+different (sub)classes. </span><span lang=NL style='font-size:12.0pt;
+line-height:115%;font-family:"Times New Roman","serif"'>In the graph each dot
+represents an individual transcript.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Mutation
+frequency by class</span></u></p>
+
+<p class=MsoNormalCxSpLast style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>A
+bar graph showing the percentage of transcripts that contain 0%, 0-2%, 2-5%,
+5-10%, 10-15%, 15-20%, or more than 20% SHM for each subclass. </span></p>
+
+<p class=MsoNormal><span lang=NL style='font-size:12.0pt;line-height:115%;
+font-family:"Times New Roman","serif"'>Hanna IJspeert, Pauline A. van
+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
+Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Evaluation
+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
+Adults. In <i>Frontiers in Immunology, 7, pp. e410-410. </i>[<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
+style='color:windowtext'>doi:10.3389/fimmu.2016.00410</span></a>][<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
+style='color:windowtext'>Link</span></a>]</span></p>
+
+</div>
+
+</body>
+
+</html>
b
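The shm_frequency.htm text above defines per-transcript SHM as mutated bases over sequenced bases and bins transcripts into 0%, 0-2%, 2-5%, 5-10%, 10-15%, 15-20% and >20% classes. The short sketch below illustrates that binning; which side of each boundary is inclusive is not stated in the help text, so the <= comparisons are an assumption, and the function name is made up for the example.

def shm_bin(mutations, sequenced_bases):
    # Illustrative binning for the "Mutation frequency by class" bar graph;
    # boundary inclusivity (<=) is an assumption, not taken from the pipeline.
    pct = 100.0 * mutations / sequenced_bases
    if pct == 0:
        return "0%"
    if pct <= 2:
        return "0-2%"
    if pct <= 5:
        return "2-5%"
    if pct <= 10:
        return "5-10%"
    if pct <= 15:
        return "10-15%"
    if pct <= 20:
        return "15-20%"
    return ">20%"

# Example: 9 mutations over 270 sequenced V-region bases is ~3.3%, class "2-5%".
print(shm_bin(9, 270))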
diff -r a4617f1d1d89 -r b6f9a640e098 shm_overview.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_overview.htm Fri Feb 19 15:10:54 2021 +0000
[
b'@@ -0,0 +1,332 @@\n+<html>\r\n+\r\n+<head>\r\n+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">\r\n+<meta name=Generator content="Microsoft Word 14 (filtered)">\r\n+<style>\r\n+<!--\r\n+ /* Font Definitions */\r\n+ @font-face\r\n+\t{font-family:Calibri;\r\n+\tpanose-1:2 15 5 2 2 2 4 3 2 4;}\r\n+ /* Style Definitions */\r\n+ p.MsoNormal, li.MsoNormal, div.MsoNormal\r\n+\t{margin-top:0in;\r\n+\tmargin-right:0in;\r\n+\tmargin-bottom:10.0pt;\r\n+\tmargin-left:0in;\r\n+\tline-height:115%;\r\n+\tfont-size:11.0pt;\r\n+\tfont-family:"Calibri","sans-serif";}\r\n+.MsoChpDefault\r\n+\t{font-family:"Calibri","sans-serif";}\r\n+.MsoPapDefault\r\n+\t{margin-bottom:10.0pt;\r\n+\tline-height:115%;}\r\n+@page WordSection1\r\n+\t{size:8.5in 11.0in;\r\n+\tmargin:1.0in 1.0in 1.0in 1.0in;}\r\n+div.WordSection1\r\n+\t{page:WordSection1;}\r\n+-->\r\n+</style>\r\n+\r\n+</head>\r\n+\r\n+<body lang=EN-US>\r\n+\r\n+<div class=WordSection1>\r\n+\r\n+<p class=MsoNormalCxSpFirst style=\'text-align:justify\'><b><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Info\r\n+table</span></b></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>This\r\n+table contains information on different characteristics of SHM. For all\r\n+characteristics information can be found for all sequences or only sequences of\r\n+a certain (sub)class. All results are based on the sequences that passed the filter\r\n+settings chosen on the start page of the SHM &amp; CSR pipeline and only\r\n+include details on the analysed region as determined by the setting of the\r\n+sequence starts at filter. All data in this table can be downloaded via the\r\n+\x93downloads\x94 tab.</span></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Mutation\r\n+frequency:</span></u></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK83"></a><a\r\n+name="OLE_LINK82"></a><a name="OLE_LINK81"><span lang=EN-GB style=\'font-size:\r\n+12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>These values\r\n+give information on the level of SHM. 
</span></a><a name="OLE_LINK22"></a><a\r\n+name="OLE_LINK21"></a><a name="OLE_LINK20"><span lang=EN-GB style=\'font-size:\r\n+12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>More information\r\n+on the values found in healthy individuals of different ages can be found in </span></a><a\r\n+name="OLE_LINK15"></a><a name="OLE_LINK14"></a><a name="OLE_LINK13"><span\r\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>IJspeert\r\n+and van Schouwenburg et al, PMID: 27799928</span></a></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><i><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Number\r\n+of mutations:</span></i><span lang=EN-GB style=\'font-size:12.0pt;line-height:\r\n+115%;font-family:"Times New Roman","serif"\'> Shows the number of total\r\n+mutations / the number of sequenced bases (the % of mutated bases).</span></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><i><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Median\r\n+number of mutations:</span></i><span lang=EN-GB style=\'font-size:12.0pt;\r\n+line-height:115%;font-family:"Times New Roman","serif"\'> Shows the median % of\r\n+SHM of all sequences.</span></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Patterns\r\n+of SHM:</span></u></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK72"></a><a\r\n+name="OLE_LINK71"></a><a name="OLE_LINK70"><span lang=EN-GB style=\'font-size:\r\n+12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>These values\r\n+give insights into the targeting and patterns of SH'..b'if"\'>Shows the total number of sequenced <a\r\n+name="OLE_LINK59"></a><a name="OLE_LINK58">guanine</a>s / The total number of\r\n+sequenced bases (the percentage of sequenced bases that were guanines).</span></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK69"><b><span\r\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Graphs</span></b></a></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><a name="OLE_LINK75"></a><a\r\n+name="OLE_LINK74"></a><a name="OLE_LINK73"><span lang=EN-GB style=\'font-size:\r\n+12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>These graphs visualize\r\n+information on the patterns and targeting of SHM and thereby give information\r\n+into the repair pathways used to repair the U:G mismatches introduced by AID. The\r\n+data represented in these graphs can be downloaded in the download tab. 
More\r\n+information on the values found in healthy individuals of different ages can be\r\n+found in IJspeert and van Schouwenburg et al, PMID: 27799928</span></a><span\r\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>.\r\n+<a name="OLE_LINK85"></a><a name="OLE_LINK84"></a></span></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Percentage\r\n+of mutations in AID and pol eta motives</span></u></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Visualizes\r\n+<a name="OLE_LINK80"></a><a name="OLE_LINK79"></a><a name="OLE_LINK78">for each\r\n+(sub)class </a>the percentage of mutations that are present in AID (RGYW or\r\n+WRCY) or polymerase eta motives (WA or TW) in the different subclasses </span><span\r\n+lang=EN-GB style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>(R=Purine,\r\n+Y=pyrimidine, W = A or T).</span></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=NL\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Relative\r\n+mutation patterns</span></u></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Visualizes\r\n+for each (sub)class the distribution of mutations between mutations at AT\r\n+locations and transitions or transversions at GC locations. </span></p>\r\n+\r\n+<p class=MsoNormalCxSpMiddle style=\'text-align:justify\'><u><span lang=NL\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Absolute\r\n+mutation patterns</span></u></p>\r\n+\r\n+<p class=MsoNormalCxSpLast style=\'text-align:justify\'><span lang=EN-GB\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Visualized\r\n+for each (sub)class the percentage of sequenced AT and GC bases that are\r\n+mutated. The mutations at GC bases are divided into transition and transversion\r\n+mutations<a name="OLE_LINK77"></a><a name="OLE_LINK76">. </a></span></p>\r\n+\r\n+<p class=MsoNormal><span lang=NL style=\'font-size:12.0pt;line-height:115%;\r\n+font-family:"Times New Roman","serif"\'>Hanna IJspeert, Pauline A. van\r\n+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,\r\n+Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span\r\n+style=\'font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"\'>Evaluation\r\n+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and\r\n+Adults. In <i>Frontiers in Immunolog, 7, pp. e410-410. </i>[<a\r\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\r\n+style=\'color:windowtext\'>doi:10.3389/fimmu.2016.00410</span></a>][<a\r\n+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span\r\n+style=\'color:windowtext\'>Link</span></a>]</span></p>\r\n+\r\n+</div>\r\n+\r\n+</body>\r\n+\r\n+</html>\r\n'
b
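The shm_overview.htm table above reports the number of mutations as a ratio (total mutations over sequenced bases) and the share of mutations that fall in AID (RGYW/WRCY) or polymerase eta (WA/TW) motifs. The sketch below counts mutations inside AID hotspot motifs on a plain nucleotide string; the regular expression encodes the IUPAC codes given in the help text (R=A/G, Y=C/T, W=A/T), while the function name and 0-based mutation positions are assumptions made for the example.

import re

# Illustrative sketch: fraction of mutated positions that fall inside
# AID hotspot motifs (RGYW or WRCY; R=A/G, Y=C/T, W=A/T).
# The lookahead lets overlapping motif occurrences all be counted.
AID_MOTIFS = re.compile(r"(?=[AG]G[CT][AT]|[AT][AG]C[CT])")

def aid_motif_fraction(germline_seq, mutated_positions):
    in_motif = set()
    for m in AID_MOTIFS.finditer(germline_seq.upper()):
        in_motif.update(range(m.start(), m.start() + 4))  # motifs span 4 nt
    if not mutated_positions:
        return 0.0
    hits = sum(1 for p in mutated_positions if p in in_motif)
    return hits / float(len(mutated_positions))

# Example: two mutations, one inside the AGCT motif, gives 0.5.
print(aid_motif_fraction("TTAGCTTT", [3, 7]))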
diff -r a4617f1d1d89 -r b6f9a640e098 shm_selection.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_selection.htm Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,128 @@
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+<!--
+ /* Font Definitions */
+ @font-face
+ {font-family:Calibri;
+ panose-1:2 15 5 2 2 2 4 3 2 4;}
+@font-face
+ {font-family:UICTFontTextStyleBody;}
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {margin-top:0in;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";}
+a:link, span.MsoHyperlink
+ {color:blue;
+ text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+ {color:purple;
+ text-decoration:underline;}
+span.apple-converted-space
+ {mso-style-name:apple-converted-space;}
+.MsoChpDefault
+ {font-family:"Calibri","sans-serif";}
+.MsoPapDefault
+ {margin-bottom:10.0pt;
+ line-height:115%;}
+@page WordSection1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+ {page:WordSection1;}
+-->
+</style>
+
+</head>
+
+<body lang=EN-US link=blue vlink=purple>
+
+<div class=WordSection1>
+
+<p class=MsoNormalCxSpFirst style='text-align:justify'><b><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>References</span></b></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
+color:black'>Yaari, G. and Uduman, M. and Kleinstein, S. H. (2012). Quantifying
+selection in high-throughput Immunoglobulin sequencing data sets. In<span
+class=apple-converted-space>&nbsp;</span><em>Nucleic Acids Research, 40 (17),
+pp. e134-e134.</em><span class=apple-converted-space><i>&nbsp;</i></span>[</span><span
+lang=EN-GB><a href="http://dx.doi.org/10.1093/nar/gks457" target="_blank"><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
+color:#303030'>doi:10.1093/nar/gks457</span></a></span><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
+color:black'>][</span><span lang=EN-GB><a
+href="http://dx.doi.org/10.1093/nar/gks457" target="_blank"><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif";
+color:#303030'>Link</span></a></span><span lang=EN-GB style='font-size:12.0pt;
+line-height:115%;font-family:"Times New Roman","serif";color:black'>]</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Graphs</span></b></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>AA
+mutation frequency</span></u></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>For
+each class, the frequency of replacement mutations at each amino acid position
+is shown, which is calculated by dividing the number of replacement mutations
+at a particular amino acid position by the number of sequences that have an amino
+acid at that particular position. Since the length of the CDR1 and CDR2 region
+is not the same for every VH gene, some amino acid positions are absent.
+Therefore we calculate the frequency using the number of amino acids present at
+that particular location. </span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Antigen
+selection (BASELINe)</span></u></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Shows
+the results of the analysis of antigen selection as performed using BASELINe.
+Details on the analysis performed by BASELINe can be found in Yaari et al,
+PMID: 22641856. The settings used for the analysis are</span><span lang=EN-GB
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>:
+focused, SHM targeting model: human Tri-nucleotide, custom boundaries. The
+custom boundaries are dependent on the &quot;sequence starts at&quot; filter. </span></p>
+
+<p class=MsoNormalCxSpMiddle style='line-height:normal'><span lang=NL
+style='font-family:UICTFontTextStyleBody;color:black'>Leader:
+1:26:38:55:65:104:-</span></p>
+
+<p class=MsoNormalCxSpMiddle style='line-height:normal'><span lang=NL
+style='font-family:UICTFontTextStyleBody;color:black'>FR1: 27:27:38:55:65:104:-</span></p>
+
+<p class=MsoNormalCxSpMiddle style='line-height:normal'><span lang=NL
+style='font-family:UICTFontTextStyleBody;color:black'>CDR1:&nbsp;27:27:38:55:65:104:-</span></p>
+
+<p class=MsoNormalCxSpLast style='line-height:normal'><span lang=NL
+style='font-family:UICTFontTextStyleBody;color:black'>FR2:&nbsp;27:27:38:55:65:104:-</span></p>
+
+<p class=MsoNormal><span lang=NL style='font-size:12.0pt;line-height:115%;
+font-family:"Times New Roman","serif"'>Hanna IJspeert, Pauline A. van
+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
+Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Evaluation
+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
+Adults. In <i>Frontiers in Immunology, 7, pp. e410-410. </i>[<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
+style='color:windowtext'>doi:10.3389/fimmu.2016.00410</span></a>][<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
+style='color:windowtext'>Link</span></a>]</span></p>
+
+</div>
+
+</body>
+
+</html>
b
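The shm_selection.htm text above computes the AA mutation frequency per position as replacement mutations divided by the number of sequences that actually carry an amino acid at that position (CDR1/CDR2 lengths differ between VH genes). The sketch below shows that normalisation, assuming IMGT-gapped amino-acid strings where '.' marks a missing position and replacement mutations given as (sequence id, 0-based position) pairs; all names are illustrative.

from collections import Counter

# Illustrative sketch: per-position replacement-mutation frequency,
# normalised by how many sequences have a residue at that position.
def aa_mutation_frequency(gapped_aa_seqs, replacement_mutations):
    coverage = Counter()
    for seq in gapped_aa_seqs.values():
        for pos, aa in enumerate(seq):
            if aa not in ".-":
                coverage[pos] += 1
    mutated = Counter(pos for _, pos in replacement_mutations)
    return {pos: mutated[pos] / float(coverage[pos]) for pos in coverage}

seqs = {"s1": "QVQL..SS", "s2": "QVQLVESS"}
print(aa_mutation_frequency(seqs, [("s1", 0), ("s2", 5)]))
# Position 0 is present in both sequences (frequency 0.5);
# position 5 is present only in s2 (frequency 1.0).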
diff -r a4617f1d1d89 -r b6f9a640e098 shm_transition.htm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/shm_transition.htm Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,120 @@
+<html>
+
+<head>
+<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
+<meta name=Generator content="Microsoft Word 14 (filtered)">
+<style>
+<!--
+ /* Font Definitions */
+ @font-face
+ {font-family:Calibri;
+ panose-1:2 15 5 2 2 2 4 3 2 4;}
+ /* Style Definitions */
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
+ {margin-top:0in;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:11.0pt;
+ font-family:"Calibri","sans-serif";}
+a:link, span.MsoHyperlink
+ {color:blue;
+ text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+ {color:purple;
+ text-decoration:underline;}
+p.msochpdefault, li.msochpdefault, div.msochpdefault
+ {mso-style-name:msochpdefault;
+ margin-right:0in;
+ margin-left:0in;
+ font-size:12.0pt;
+ font-family:"Calibri","sans-serif";}
+p.msopapdefault, li.msopapdefault, div.msopapdefault
+ {mso-style-name:msopapdefault;
+ margin-right:0in;
+ margin-bottom:10.0pt;
+ margin-left:0in;
+ line-height:115%;
+ font-size:12.0pt;
+ font-family:"Times New Roman","serif";}
+span.apple-converted-space
+ {mso-style-name:apple-converted-space;}
+.MsoChpDefault
+ {font-size:10.0pt;
+ font-family:"Calibri","sans-serif";}
+.MsoPapDefault
+ {margin-bottom:10.0pt;
+ line-height:115%;}
+@page WordSection1
+ {size:8.5in 11.0in;
+ margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+ {page:WordSection1;}
+-->
+</style>
+
+</head>
+
+<body lang=EN-US link=blue vlink=purple>
+
+<div class=WordSection1>
+
+<p class=MsoNormalCxSpFirst style='text-align:justify'><span style='font-size:
+12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>These graphs and
+tables give insight into the targeting and patterns of SHM. This can give
+insight into the DNA repair pathways used to solve the U:G mismatches
+introduced by AID. More information on the values found in healthy individuals
+of different ages can be found in IJspeert and van Schouwenburg et al, PMID:
+27799928.</span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Graphs
+</span></b></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK93"></a><a
+name="OLE_LINK92"></a><a name="OLE_LINK91"><u><span style='font-size:12.0pt;
+line-height:115%;font-family:"Times New Roman","serif"'>Heatmap transition
+information</span></u></a></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><a name="OLE_LINK98"></a><a
+name="OLE_LINK97"><span style='font-size:12.0pt;line-height:115%;font-family:
+"Times New Roman","serif"'>Heatmaps visualizing for each subclass the frequency
+of all possible substitutions. On the x-axis the original base is shown, while
+the y-axis shows the new base. The darker the shade of blue, the more frequently
+this type of substitution occurs. </span></a></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><u><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Bargraph
+transition information</span></u></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span style='font-size:
+12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Bar graph
+visualizing for each original base the distribution of substitutions into the other
+bases. A graph is included for each (sub)class. </span></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><b><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Tables</span></b></p>
+
+<p class=MsoNormalCxSpMiddle style='text-align:justify'><span style='font-size:
+12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Transition
+tables are shown for each (sub)class. All the original bases are listed
+horizontally, while the new bases are listed vertically. </span></p>
+
+<p class=MsoNormal><span lang=NL style='font-size:12.0pt;line-height:115%;
+font-family:"Times New Roman","serif"'>Hanna IJspeert, Pauline A. van
+Schouwenburg, David van Zessen, Ingrid Pico-Knijnenburg, Gertjan J. Driessen,
+Andrew P. Stubbs, and Mirjam van der Burg (2016). </span><span
+style='font-size:12.0pt;line-height:115%;font-family:"Times New Roman","serif"'>Evaluation
+of the Antigen-Experienced B-Cell Receptor Repertoire in Healthy Children and
+Adults. In <i>Frontiers in Immunology, 7, pp. e410-410. </i>[<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
+style='color:windowtext'>doi:10.3389/fimmu.2016.00410</span></a>][<a
+href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5066086/"><span
+style='color:windowtext'>Link</span></a>]</span></p>
+
+</div>
+
+</body>
+
+</html>
b
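The shm_transition.htm text above describes heatmaps and tables that count, for each subclass, how often every original base is substituted by every other base. The sketch below builds such a 4x4 table from (original base, new base) pairs; the input format and names are assumptions made for the example.

# Illustrative sketch: the 4x4 count table behind the transition heatmap
# (rows = original base, columns = new base).
BASES = "ACGT"

def substitution_table(mutations):
    table = {frm: {to: 0 for to in BASES} for frm in BASES}
    for frm, to in mutations:
        frm, to = frm.upper(), to.upper()
        if frm in BASES and to in BASES and frm != to:
            table[frm][to] += 1
    return table

muts = [("a", "g"), ("A", "G"), ("G", "A"), ("C", "T"), ("T", "A")]
tbl = substitution_table(muts)
print("   " + "  ".join(BASES))
for frm in BASES:
    print(frm + "  " + "  ".join(str(tbl[frm][to]) for to in BASES))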
diff -r a4617f1d1d89 -r b6f9a640e098 style.tar.gz
b
Binary file style.tar.gz has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 subclass_definition.db.nhr
b
Binary file subclass_definition.db.nhr has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 subclass_definition.db.nin
b
Binary file subclass_definition.db.nin has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 subclass_definition.db.nsq
b
Binary file subclass_definition.db.nsq has changed
b
diff -r a4617f1d1d89 -r b6f9a640e098 summary_to_fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/summary_to_fasta.py Fri Feb 19 15:10:54 2021 +0000
[
@@ -0,0 +1,42 @@
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--input", help="The 1_Summary file of an IMGT zip file")
+parser.add_argument("--fasta", help="The output fasta file")
+
+args = parser.parse_args()
+
+infile = args.input
+fasta = args.fasta
+
+with open(infile, 'r') as i, open(fasta, 'w') as o:
+    first = True
+    id_col = 0
+    seq_col = 0
+    no_results = 0
+    no_seqs = 0
+    passed = 0
+    for line in i:
+        splt = line.split("\t")
+        if first:
+            id_col = splt.index("Sequence ID")
+            seq_col = splt.index("Sequence")
+            first = False
+            continue
+        if len(splt) < 5:
+            no_results += 1
+            continue
+
+        ID = splt[id_col]
+        seq = splt[seq_col]
+
+        if not len(seq) > 0:
+            no_seqs += 1
+            continue
+
+        o.write(">" + ID + "\n" + seq + "\n")
+        passed += 1
+
+print "No results:", no_results
+print "No sequences:", no_seqs
+print "Written to fasta file:", passed
b
diff -r a4617f1d1d89 -r b6f9a640e098 wrapper.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/wrapper.sh Fri Feb 19 15:10:54 2021 +0000
[
b'@@ -0,0 +1,913 @@\n+#!/bin/bash\n+#set -e\n+dir="$(cd "$(dirname "$0")" && pwd)"\n+input=$1\n+method=$2\n+log=$3 #becomes the main html page at the end\n+outdir=$4\n+output="$outdir/index.html" #copied to $log location at the end\n+title="$5"\n+include_fr1=$6\n+functionality=$7\n+unique=$8\n+naive_output=$9\n+naive_output_ca=${10}\n+naive_output_cg=${11}\n+naive_output_cm=${12}\n+naive_output_ce=${13}\n+naive_output_all=${14}\n+filter_unique=${15}\n+filter_unique_count=${16}\n+class_filter=${17}\n+empty_region_filter=${18}\n+fast=${19}\n+\n+mkdir $outdir\n+\n+tar -xzf $dir/style.tar.gz -C $outdir\n+\n+echo "---------------- read parameters ----------------"\n+echo "---------------- read parameters ----------------<br />" > $log\n+\n+echo "unpacking IMGT file"\n+\n+type="`file $input`"\n+if [[ "$type" == *"Zip archive"* ]] ; then\n+\techo "Zip archive"\n+\techo "unzip $input -d $PWD/files/"\n+\tunzip $input -d $PWD/files/\n+elif [[ "$type" == *"XZ compressed data"* ]] ; then\n+\techo "ZX archive"\n+\techo "tar -xJf $input -C $PWD/files/"\n+\tmkdir -p "$PWD/files/$title"\n+\ttar -xJf $input -C "$PWD/files/$title"\n+else\n+\techo "Unrecognized format $type"\n+\techo "Unrecognized format $type" > $log\n+\texit 1\n+fi\n+\n+cat "`find $PWD/files/ -name "1_*"`" > $PWD/summary.txt\n+cat "`find $PWD/files/ -name "2_*"`" > $PWD/gapped_nt.txt\n+cat "`find $PWD/files/ -name "3_*"`" > $PWD/sequences.txt\n+cat "`find $PWD/files/ -name "4_*"`" > $PWD/gapped_aa.txt\n+cat "`find $PWD/files/ -name "5_*"`" > $PWD/aa.txt\n+cat "`find $PWD/files/ -name "6_*"`" > $PWD/junction.txt\n+cat "`find $PWD/files/ -name "7_*"`" > $PWD/mutationanalysis.txt\n+cat "`find $PWD/files/ -name "8_*"`" > $PWD/mutationstats.txt\n+cat "`find $PWD/files/ -name "9_*"`" > $PWD/aa_change_stats.txt\n+cat "`find $PWD/files/ -name "10_*"`" > $PWD/hotspots.txt\n+\n+echo "---------------- unique id check ----------------"\n+\n+Rscript $dir/check_unique_id.r $PWD/summary.txt $PWD/gapped_nt.txt $PWD/sequences.txt $PWD/gapped_aa.txt $PWD/aa.txt $PWD/junction.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/aa_change_stats.txt $PWD/hotspots.txt\n+\n+if [[ ${#BLASTN_DIR} -ge 5 ]] ; then\n+\techo "On server, using BLASTN_DIR env: ${BLASTN_DIR}"\n+else\n+\tBLASTN_DIR="/home/galaxy/Downloads/ncbi-blast-2.4.0+/bin"\n+\techo "Dev Galaxy set BLASTN_DIR to: ${BLASTN_DIR}"\n+fi\n+\n+echo "---------------- class identification ----------------"\n+echo "---------------- class identification ----------------<br />" >> $log\n+\n+python $dir/gene_identification.py --input $PWD/summary.txt --output $outdir/identified_genes.txt\n+\n+echo "---------------- merge_and_filter.r ----------------"\n+echo "---------------- merge_and_filter.r ----------------<br />" >> $log\n+\n+Rscript $dir/merge_and_filter.r $PWD/summary.txt $PWD/sequences.txt $PWD/mutationanalysis.txt $PWD/mutationstats.txt $PWD/hotspots.txt "$PWD/gapped_aa.txt" $outdir/identified_genes.txt $outdir/merged.txt $outdir/before_unique_filter.txt $outdir/unmatched.txt $method $functionality $unique ${filter_unique} ${filter_unique_count} ${class_filter} ${empty_region_filter} 2>&1\n+\n+if [[ "${naive_output}" == "yes" ]] || [[ "$fast" == "no" ]] ; then\n+\n+\techo "---------------- creating new IMGT zips ----------------"\n+\techo "---------------- creating new IMGT zips ----------------<br />" >> $log\n+\n+\tmkdir $outdir/new_IMGT\n+\n+\tcp $PWD/summary.txt "$outdir/new_IMGT/1_Summary.txt"\n+\tcp $PWD/gapped_nt.txt "$outdir/new_IMGT/2_IMGT-gapped-nt-sequences.txt"\n+\tcp $PWD/sequences.txt 
"$outdir/new_IMGT/3_Nt-sequences.txt"\n+\tcp $PWD/gapped_aa.txt "$outdir/new_IMGT/4_IMGT-gapped-AA-sequences.txt"\n+\tcp $PWD/aa.txt "$outdir/new_IMGT/5_AA-sequences.txt"\n+\tcp $PWD/junction.txt "$outdir/new_IMGT/6_Junction.txt"\n+\tcp $PWD/mutationanalysis.txt "$outdir/new_IMGT/7_V-REGION-mutation-and-AA-change-table.txt"\n+\tcp $PWD/mutationstats.txt "$outdir/new_IMGT/8_V-REGION-nt-mutation-statistics.txt"\n+\tcp $PWD/aa_change_stats.txt "$outdir/new_IMGT/9_V-REGION-AA-change-statistics.txt"\n+\tcp $PWD/hotspots.txt "$outdir/new_IMGT/10_V-REGION-mutation-hotspot'..b'equence of a clone (IGE)</td><td><a href=\'new_IMGT_IGE_first_seq_of_clone.txz\' download=\'new_IMGT_IGE_first_seq_of_clone.txz\' >Download</a></td></tr>" >> $output\n+\n+echo "<tr><td colspan=\'2\' style=\'background-color:#E0E0E0;\'>Filtered IMGT output files</td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered sequences</td><td><a href=\'new_IMGT.txz\' download=\'new_IMGT.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGA sequences</td><td><a href=\'new_IMGT_IGA.txz\' download=\'new_IMGT_IGA.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGA1 sequences</td><td><a href=\'new_IMGT_IGA1.txz\' download=\'new_IMGT_IGA1.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGA2 sequences</td><td><a href=\'new_IMGT_IGA2.txz\' download=\'new_IMGT_IGA2.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGG sequences</td><td><a href=\'new_IMGT_IGG.txz\' download=\'new_IMGT_IGG.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGG1 sequences</td><td><a href=\'new_IMGT_IGG1.txz\' download=\'new_IMGT_IGG1.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGG2 sequences</td><td><a href=\'new_IMGT_IGG2.txz\' download=\'new_IMGT_IGG2.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGG3 sequences</td><td><a href=\'new_IMGT_IGG3.txz\' download=\'new_IMGT_IGG3.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGG4 sequences</td><td><a href=\'new_IMGT_IGG4.txz\' download=\'new_IMGT_IGG4.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGM sequences</td><td><a href=\'new_IMGT_IGM.txz\' download=\'new_IMGT_IGM.txz\' >Download</a></td></tr>" >> $output\n+echo "<tr><td>An IMGT archive with just the matched and filtered IGE sequences</td><td><a href=\'new_IMGT_IGE.txz\' download=\'new_IMGT_IGE.txz\' >Download</a></td></tr>" >> $output\n+\n+echo "</table>" >> $output\n+\n+echo "<br />" >> $output\n+cat $dir/shm_downloads.htm >> $output\n+\n+echo "</div>" >> $output #downloads tab end\n+\n+echo "</div>" >> $output #tabs end \n+\n+echo "</html>" >> $output\n+\n+\n+echo "---------------- naive_output.r ----------------"\n+echo "---------------- naive_output.r ----------------<br />" >> $log\n+\n+if [[ "$naive_output" == "yes" ]]\n+then\n+\techo "output naive output"\n+\tif [[ "${class_filter}" == "101_101" ]]\n+\tthen\n+\t\techo "copy new_IMGT.txz to ${naive_output_all}"\n+\t\tcp $outdir/new_IMGT.txz ${naive_output_all}\n+\telse\n+\t\techo "copy for classes"\n+\t\tcp 
$outdir/new_IMGT_IGA.txz ${naive_output_ca}\n+\t\tcp $outdir/new_IMGT_IGG.txz ${naive_output_cg}\n+\t\tcp $outdir/new_IMGT_IGM.txz ${naive_output_cm}\n+\t\tcp $outdir/new_IMGT_IGE.txz ${naive_output_ce}\n+\tfi\n+fi\n+\n+echo "</table>" >> $outdir/base_overview.html\n+\n+mv $log $outdir/log.html\n+\n+echo "<html><center><h1><a href=\'index.html\'>Click here for the results</a></h1>Tip: Open it in a new tab (middle mouse button or right mouse button -> \'open in new tab\' on the link above)<br />" > $log\n+echo "<table border = 1>" >> $log\n+echo "<thead><tr><th>Info</th><th>Sequences</th><th>Percentage</th></tr></thead>" >> $log\n+tIFS="$TMP"\n+IFS=$\'\\t\'\n+while read step seq perc\n+\tdo\n+\t\techo "<tr>" >> $log\n+\t\techo "<td>$step</td>" >> $log\n+\t\techo "<td>$seq</td>" >> $log\n+\t\techo "<td>${perc}%</td>" >> $log\n+\t\techo "</tr>" >> $log\n+done < $outdir/filtering_steps.txt\n+echo "</table>" >> $log\n+echo "<br />" >> $log\n+cat $dir/shm_first.htm >> $log\n+echo "</center></html>" >> $log\n+\n+IFS="$tIFS"\n+\n+\n+echo "---------------- Done! ----------------"\n+echo "---------------- Done! ----------------<br />" >> $outdir/log.html\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n+\n'
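The wrapper.sh added in this changeset first detects whether the uploaded IMGT archive is a Zip file or an XZ tarball, extracts it, and then concatenates the numbered IMGT output files (1_* through 10_*) into single working files (summary.txt, gapped_nt.txt, and so on). Below is a hedged Python sketch of that unpack-and-merge step under those assumptions; the directory layout, helper name and glob patterns are illustrative, not the tool's actual implementation.

import glob, os, tarfile, zipfile

# Illustrative sketch of the wrapper.sh unpack step: extract a Zip or .txz
# IMGT archive, then merge the numbered output files into working files.
MERGED = {"1_": "summary.txt", "2_": "gapped_nt.txt", "3_": "sequences.txt",
          "4_": "gapped_aa.txt", "5_": "aa.txt", "6_": "junction.txt",
          "7_": "mutationanalysis.txt", "8_": "mutationstats.txt",
          "9_": "aa_change_stats.txt", "10_": "hotspots.txt"}

def unpack_imgt(archive, workdir):
    files_dir = os.path.join(workdir, "files")
    os.makedirs(files_dir, exist_ok=True)
    if zipfile.is_zipfile(archive):
        with zipfile.ZipFile(archive) as z:
            z.extractall(files_dir)
    elif tarfile.is_tarfile(archive):  # covers the XZ-compressed .txz case
        with tarfile.open(archive) as t:
            t.extractall(files_dir)
    else:
        raise ValueError("Unrecognized archive format: %s" % archive)
    for prefix, merged_name in MERGED.items():
        pattern = os.path.join(files_dir, "**", prefix + "*")
        with open(os.path.join(workdir, merged_name), "w") as out:
            for path in sorted(glob.glob(pattern, recursive=True)):
                with open(path) as part:
                    out.write(part.read())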