# HG changeset patch # User davidvanzessen # Date 1478505847 18000 # Node ID ad9be244b1048cd8502bb9d669fa5fd4f0f8ba6d # Parent 2ddb9a21f6352ec861bb55dd0b26946929042c74 Uploaded diff -r 2ddb9a21f635 -r ad9be244b104 sequence_overview.r --- a/sequence_overview.r Tue Nov 01 10:48:38 2016 -0400 +++ b/sequence_overview.r Mon Nov 07 03:04:07 2016 -0500 @@ -10,6 +10,8 @@ NToverview.file = paste(outputdir, "ntoverview.txt", sep="/") NTsum.file = paste(outputdir, "ntsum.txt", sep="/") main.html = "index.html" +empty.region.filter = args[6] + setwd(outputdir) @@ -19,19 +21,21 @@ #before.unique = before.unique[!grepl("unmatched", before.unique$best_match),] -before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq, before.unique$CDR3.IMGT.seq) +if(empty.region.filter == "leader"){ + before.unique$seq_conc = paste(before.unique$FR1.IMGT.seq, before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq) +} else if(empty.region.filter == "FR1"){ + before.unique$seq_conc = paste(before.unique$CDR1.IMGT.seq, before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq) +} else if(empty.region.filter == "CDR1"){ + before.unique$seq_conc = paste(before.unique$FR2.IMGT.seq, before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq) +} else if(empty.region.filter == "FR2"){ + before.unique$seq_conc = paste(before.unique$CDR2.IMGT.seq, before.unique$FR3.IMGT.seq) +} IDs = before.unique[,c("Sequence.ID", "seq_conc", "best_match", "Functionality")] IDs$best_match = as.character(IDs$best_match) -#dat = data.frame(data.table(dat)[, list(freq=.N), by=c("best_match", "seq_conc")]) +dat = data.frame(table(before.unique$seq_conc)) -dat = data.frame(table(before.unique$seq_conc)) -#dat = data.frame(table(merged$seq_conc, merged$Functionality)) - -#dat = dat[dat$Freq > 1,] - -#names(dat) = c("seq_conc", "Functionality", "Freq") names(dat) = c("seq_conc", "Freq") dat$seq_conc = factor(dat$seq_conc) @@ -67,7 +71,17 @@ } cat("", file=main.html, append=F) -#cat("", file=main.html, append=T) + +if(empty.region.filter == "leader"){ + cat("", file=main.html, append=T) +} else if(empty.region.filter == "FR1"){ + cat("", file=main.html, append=T) +} else if(empty.region.filter == "CDR1"){ + cat("", file=main.html, append=T) +} else if(empty.region.filter == "FR2"){ + cat("", file=main.html, append=T) +} + cat("", file=main.html, append=T) cat("", file=main.html, append=T) cat("", file=main.html, append=T) @@ -240,9 +254,18 @@ #ACGT overview -NToverview = merged[!grepl("^unmatched", merged$best_match),] +#NToverview = merged[!grepl("^unmatched", merged$best_match),] +NToverview = merged -NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq, sep="_") +if(empty.region.filter == "leader"){ + NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq) +} else if(empty.region.filter == "FR1"){ + NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq) +} else if(empty.region.filter == "CDR1"){ + NToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq) +} else if(empty.region.filter == "FR2"){ + NToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq) +} NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq)) NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq)) diff -r 2ddb9a21f635 -r ad9be244b104 shm_csr.py --- a/shm_csr.py Tue Nov 01 10:48:38 2016 -0400 +++ b/shm_csr.py Mon Nov 07 03:04:07 2016 -0500 @@ -247,9 +247,12 @@ def get_xyz(lst, gene, f, fname): + x = int(round(f(lst))) y = valuedic[gene + "_" + fname] z = str(round(x / float(y) * 100, 1)) if y != 0 else "0" + if gene == "unmatched": + print x, y, z return (str(x), str(y), z) dic = {"RGYW": RGYWCount, "WRCY": WRCYCount, "WA": WACount, "TW": TWCount} @@ -271,8 +274,8 @@ else: x, y, z = get_xyz([curr[x] for x in [y for y, z in genedic.iteritems() if geneMatcher.match(z)]], gene, func, fname) o.write("," + x + "," + y + "," + z) - x, y, z = get_xyz([y for x, y in curr.iteritems() if not genedic[x].startswith("unmatched")], "total", func, fname) + #x, y, z = get_xyz([y for x, y in curr.iteritems()], "total", func, fname) o.write("," + x + "," + y + "," + z + "\n") diff -r 2ddb9a21f635 -r ad9be244b104 shm_csr.r --- a/shm_csr.r Tue Nov 01 10:48:38 2016 -0400 +++ b/shm_csr.r Mon Nov 07 03:04:07 2016 -0500 @@ -119,9 +119,9 @@ if(empty.region.filter == "FR1") { regions = c("CDR1", "FR2", "CDR2", "FR3") } else if (empty.region.filter == "CDR1") { - regions = c("FR2", "CDR2", "FR3", "CDR3") + regions = c("FR2", "CDR2", "FR3") } else if (empty.region.filter == "FR2") { - regions = c("CDR2", "FR3", "CDR3") + regions = c("CDR2", "FR3") } sum_by_row = function(x, columns) { sum(as.numeric(x[columns]), na.rm=T) } @@ -229,11 +229,25 @@ matrx[9,z] = round(matrx[9,x] / matrx[9,y], digits=1) if(fname == "sum"){ - matrx[10,x] = round(f(rowSums(tmp[,c("FR2.IMGT.Nb.of.nucleotides", "FR3.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) + + regions.fr = regions[grepl("FR", regions)] + regions.fr = paste(regions.fr, ".IMGT.Nb.of.nucleotides", sep="") + regions.cdr = regions[grepl("CDR", regions)] + regions.cdr = paste(regions.cdr, ".IMGT.Nb.of.nucleotides", sep="") + + if(length(regions.fr) > 1){ #in case there is only on FR region (rowSums needs >1 column) + matrx[10,x] = round(f(rowSums(tmp[,regions.fr], na.rm=T)), digits=1) + } else { + matrx[10,x] = round(f(tmp[,regions.fr], na.rm=T), digits=1) + } matrx[10,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) matrx[10,z] = round(matrx[10,x] / matrx[10,y] * 100, digits=1) - matrx[11,x] = round(f(rowSums(tmp[,c("CDR1.IMGT.Nb.of.nucleotides", "CDR2.IMGT.Nb.of.nucleotides")], na.rm=T)), digits=1) + if(length(regions.cdr) > 1){ #in case there is only on CDR region + matrx[11,x] = round(f(rowSums(tmp[,regions.cdr], na.rm=T)), digits=1) + } else { + matrx[11,x] = round(f(tmp[,regions.cdr], na.rm=T), digits=1) + } matrx[11,y] = round(f(tmp$VRegionNucleotides, na.rm=T), digits=1) matrx[11,z] = round(matrx[11,x] / matrx[11,y] * 100, digits=1) } diff -r 2ddb9a21f635 -r ad9be244b104 wrapper.sh --- a/wrapper.sh Tue Nov 01 10:48:38 2016 -0400 +++ b/wrapper.sh Mon Nov 07 03:04:07 2016 -0500 @@ -197,7 +197,7 @@ mkdir $outdir/sequence_overview -Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt 2>&1 +Rscript $dir/sequence_overview.r $outdir/before_unique_filter.txt $outdir/merged.txt $outdir/sequence_overview $classes $outdir/hotspot_analysis_sum.txt ${empty_region_filter} 2>&1 echo "
CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than onceFR1+CDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than onceCDR1+FR2+CDR2+FR3+CDR3 sequences that show up more than onceFR2+CDR2+FR3+CDR3 sequences that show up more than onceCDR2+FR3+CDR3 sequences that show up more than once
SequenceFunctionalityca1ca2cg1cg2cg3cg4cmuntotal CAtotal CGnumber of subclassespresent in both Ca and CgCa1+Ca2
" > $outdir/base_overview.html