fsd_bvsa: fsd_beforevsafter.py comparison

comparison fsd_beforevsafter.py @ 11:fca9b000b8d8 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_beforevsafter commit 82b53a26581a2296ad272bbba1e80934864dfa58

author	mheinzl
date	Mon, 26 Nov 2018 04:38:21 -0500
parents	238a71241876
children

comparison

equal deleted inserted replaced

-:e80557c091e9
+:fca9b000b8d8
 #!/usr/bin/env python
 # Family size distribution of DCS from various steps of the Galaxy pipeline
 #
-# Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
+# Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
 # Contact: monika.heinzl@edumail.at
 #
 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional),
 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and
 # a FASTA file with tags after trimming as input (optional).
 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and
 # a CSV file with the data of the plot.
-# USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome
+# USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming --alignedTags DCSbamFile
-# --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
+# --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
 import argparse
+import re
 import sys
 from collections import Counter
 import matplotlib.pyplot as plt
 import numpy
+import pysam
 from Bio import SeqIO
 from matplotlib.backends.backend_pdf import PdfPages
 plt.switch_backend('agg')
 parser.add_argument('--inputName1')
 parser.add_argument('--makeDCS',
 help='FASTA File with information about tag and family size in the header.')
 parser.add_argument('--afterTrimming', default=None,
 help='FASTA File with information about tag and family size in the header.')
-parser.add_argument('--alignedTags', default=None,
+parser.add_argument('--bamFile',
-help=' TXT file with tags aligned to the reference genome and family size.')
+help='BAM file with aligned reads.')
 parser.add_argument('--output_pdf', default="data.pdf", type=str,
 help='Name of the pdf and tabular file.')
 parser.add_argument('--output_tabular', default="data.tabular", type=str,
 help='Name of the pdf and tabular file.')
 return parser
 def compare_read_families_read_loss(argv):
 parser = make_argparser()
 args = parser.parse_args(argv[1:])
+#
 SSCS_file = args.inputFile_SSCS
 SSCS_file_name = args.inputName1
 makeConsensus = args.makeDCS
 afterTrimming = args.afterTrimming
-ref_genome = args.alignedTags
+ref_genome = args.bamFile
 title_file = args.output_tabular
 title_file2 = args.output_pdf
 sep = "\t"
 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure)
 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure)
 # data of tags aligned to reference genome
 if ref_genome != str(None):
-mut_array = readFileReferenceFree(ref_genome, " ")
+pysam.index(ref_genome)
+bam = pysam.AlignmentFile(ref_genome, "rb")
+seq_mut = []
+for read in bam.fetch():
+if not read.is_unmapped:
+if re.search('_', read.query_name):
+tags = re.split('_', read.query_name)[0]
+else:
+tags = read.query_name
+seq_mut.append(tags)
 # use only unique tags that were alignment to the reference genome
-seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
+seq_mut = numpy.array(seq_mut)
+seq_mut, seqMut_index = numpy.unique(seq_mut, return_index=True)
-# get family sizes
+# get family sizes for each tag in the BAM file
 quant_ab = []
+quant_ba = []
 for i in seq_mut:
 quant_ab.append(seqDic_ab.get(i))
-quant_ba = []
-for i in seq_mut:
 quant_ba.append(seqDic_ba.get(i))
 quant_ab_ref = numpy.array(quant_ab)
 quant_ba_ref = numpy.array(quant_ba)
 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref))
 bigFamilies = numpy.where(quant_all_ref > 20)[0]  # group large family sizes
 quant_all_ref[bigFamilies] = 22
 list1.append(quant_all_ref)
 colors.append("#04cec7")
 labels.append("after alignment\nto reference")
 legend7 = "after alignment to reference:"
 length_DCS_ref = len(quant_ba_ref)  # count of duplex tags that were aligned to reference genome
 ticks = numpy.arange(0, maximumX, 1)
 ticks1 = map(str, ticks)
 ticks1[len(ticks1) - 1] = ">20"
 plt.xticks(numpy.array(ticks), ticks1)
 if ref_genome != str(None):
-count = numpy.array(
+count = numpy.array([v for k, v in sorted(Counter(quant_ab_ref).iteritems())])  # count all family sizes from all ab strands
-[v for k, v in sorted(Counter(quant_ab_ref).iteritems())])  # count all family sizes from all ab strands
+legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
-legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads (before)"
+plt.text(0.1, 0.085, legend, size=11, transform=plt.gcf().transFigure)
-plt.text(0.1, 0.105, legend, size=11, transform=plt.gcf().transFigure)
 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \
 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
 sum(numpy.array(data_array[:, 0]).astype(int)))
-plt.text(0.3, 0.105, legend, size=11, transform=plt.gcf().transFigure)
+plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
 count2 = numpy.array(
 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())])  # count all family sizes from all ba strands
 legend = "BA\n{}\n{}\n{:.5f}" \
 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
-plt.text(0.4, 0.15, legend, size=11, transform=plt.gcf().transFigure)
+plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
 legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags."
 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
 "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
 output_file.write(
 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep,
 float(count2[len(count2) - 1]) / sum(count2)))
-output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
+output_file.write("\ntotal nr. of reads before SSCS building{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
 output_file.write("\n\nValues from family size distribution\n")
 if afterTrimming == str(None) and ref_genome == str(None):
 if afterTrimming == str(None):
-output_file.write("{}before SSCS buidling{}after DCS building\n".format(sep, sep))
+output_file.write("{}before SSCS building{}after DCS building\n".format(sep, sep))
 elif ref_genome == str(None):
 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep))
 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]):
 if fs == 21:
 output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
 output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
 elif afterTrimming == str(None) or ref_genome == str(None):
 if afterTrimming == str(None):
-output_file.write("{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
+output_file.write("{}before SSCS building{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
 elif ref_genome == str(None):
 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep))
 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]):
 if fs == 21:

Mercurial > repos > mheinzl > fsd_bvsa

comparison fsd_beforevsafter.py @ 11:fca9b000b8d8 draft