comparison fsd_beforevsafter.py @ 11:fca9b000b8d8 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_beforevsafter commit 82b53a26581a2296ad272bbba1e80934864dfa58
author mheinzl
date Mon, 26 Nov 2018 04:38:21 -0500
parents 238a71241876
children
comparison
equal deleted inserted replaced
10:e80557c091e9 11:fca9b000b8d8
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Family size distribution of DCS from various steps of the Galaxy pipeline 2 # Family size distribution of DCS from various steps of the Galaxy pipeline
3 # 3 #
4 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) 4 # Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria)
5 # Contact: monika.heinzl@edumail.at 5 # Contact: monika.heinzl@edumail.at
6 # 6 #
7 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), 7 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional),
8 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and 8 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and
9 # a FASTA file with tags after trimming as input (optional). 9 # a FASTA file with tags after trimming as input (optional).
10 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and 10 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and
11 # a CSV file with the data of the plot. 11 # a CSV file with the data of the plot.
12 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome 12 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming --alignedTags DCSbamFile
13 # --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf 13 # --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf
14 14
15 import argparse 15 import argparse
16 import re
16 import sys 17 import sys
17 from collections import Counter 18 from collections import Counter
18 19
19 import matplotlib.pyplot as plt 20 import matplotlib.pyplot as plt
20 import numpy 21 import numpy
22 import pysam
21 from Bio import SeqIO 23 from Bio import SeqIO
22 from matplotlib.backends.backend_pdf import PdfPages 24 from matplotlib.backends.backend_pdf import PdfPages
23 25
24 plt.switch_backend('agg') 26 plt.switch_backend('agg')
25 27
51 parser.add_argument('--inputName1') 53 parser.add_argument('--inputName1')
52 parser.add_argument('--makeDCS', 54 parser.add_argument('--makeDCS',
53 help='FASTA File with information about tag and family size in the header.') 55 help='FASTA File with information about tag and family size in the header.')
54 parser.add_argument('--afterTrimming', default=None, 56 parser.add_argument('--afterTrimming', default=None,
55 help='FASTA File with information about tag and family size in the header.') 57 help='FASTA File with information about tag and family size in the header.')
56 parser.add_argument('--alignedTags', default=None, 58 parser.add_argument('--bamFile',
57 help=' TXT file with tags aligned to the reference genome and family size.') 59 help='BAM file with aligned reads.')
58 parser.add_argument('--output_pdf', default="data.pdf", type=str, 60 parser.add_argument('--output_pdf', default="data.pdf", type=str,
59 help='Name of the pdf and tabular file.') 61 help='Name of the pdf and tabular file.')
60 parser.add_argument('--output_tabular', default="data.tabular", type=str, 62 parser.add_argument('--output_tabular', default="data.tabular", type=str,
61 help='Name of the pdf and tabular file.') 63 help='Name of the pdf and tabular file.')
62 return parser 64 return parser
63 65
64 66
65 def compare_read_families_read_loss(argv): 67 def compare_read_families_read_loss(argv):
66 parser = make_argparser() 68 parser = make_argparser()
67 args = parser.parse_args(argv[1:]) 69 args = parser.parse_args(argv[1:])
68 70 #
69 SSCS_file = args.inputFile_SSCS 71 SSCS_file = args.inputFile_SSCS
70 SSCS_file_name = args.inputName1 72 SSCS_file_name = args.inputName1
71 makeConsensus = args.makeDCS 73 makeConsensus = args.makeDCS
72 afterTrimming = args.afterTrimming 74 afterTrimming = args.afterTrimming
73 ref_genome = args.alignedTags 75 ref_genome = args.bamFile
74 title_file = args.output_tabular 76 title_file = args.output_tabular
75 title_file2 = args.output_pdf 77 title_file2 = args.output_pdf
76 sep = "\t" 78 sep = "\t"
77 79
78 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: 80 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
162 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) 164 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure)
163 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) 165 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure)
164 166
165 # data of tags aligned to reference genome 167 # data of tags aligned to reference genome
166 if ref_genome != str(None): 168 if ref_genome != str(None):
167 mut_array = readFileReferenceFree(ref_genome, " ") 169 pysam.index(ref_genome)
170 bam = pysam.AlignmentFile(ref_genome, "rb")
171 seq_mut = []
172 for read in bam.fetch():
173 if not read.is_unmapped:
174 if re.search('_', read.query_name):
175 tags = re.split('_', read.query_name)[0]
176 else:
177 tags = read.query_name
178 seq_mut.append(tags)
179
168 # use only unique tags that were alignment to the reference genome 180 # use only unique tags that were alignment to the reference genome
169 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) 181 seq_mut = numpy.array(seq_mut)
170 182 seq_mut, seqMut_index = numpy.unique(seq_mut, return_index=True)
171 # get family sizes 183 # get family sizes for each tag in the BAM file
172 quant_ab = [] 184 quant_ab = []
185 quant_ba = []
173 for i in seq_mut: 186 for i in seq_mut:
174 quant_ab.append(seqDic_ab.get(i)) 187 quant_ab.append(seqDic_ab.get(i))
175
176 quant_ba = []
177 for i in seq_mut:
178 quant_ba.append(seqDic_ba.get(i)) 188 quant_ba.append(seqDic_ba.get(i))
179 189
180 quant_ab_ref = numpy.array(quant_ab) 190 quant_ab_ref = numpy.array(quant_ab)
181 quant_ba_ref = numpy.array(quant_ba) 191 quant_ba_ref = numpy.array(quant_ba)
182 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref)) 192 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref))
183 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes 193 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes
184 quant_all_ref[bigFamilies] = 22 194 quant_all_ref[bigFamilies] = 22
195
185 list1.append(quant_all_ref) 196 list1.append(quant_all_ref)
186 colors.append("#04cec7") 197 colors.append("#04cec7")
187 labels.append("after alignment\nto reference") 198 labels.append("after alignment\nto reference")
188 legend7 = "after alignment to reference:" 199 legend7 = "after alignment to reference:"
189 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome 200 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome
196 ticks = numpy.arange(0, maximumX, 1) 207 ticks = numpy.arange(0, maximumX, 1)
197 ticks1 = map(str, ticks) 208 ticks1 = map(str, ticks)
198 ticks1[len(ticks1) - 1] = ">20" 209 ticks1[len(ticks1) - 1] = ">20"
199 plt.xticks(numpy.array(ticks), ticks1) 210 plt.xticks(numpy.array(ticks), ticks1)
200 if ref_genome != str(None): 211 if ref_genome != str(None):
201 count = numpy.array( 212 count = numpy.array([v for k, v in sorted(Counter(quant_ab_ref).iteritems())]) # count all family sizes from all ab strands
202 [v for k, v in sorted(Counter(quant_ab_ref).iteritems())]) # count all family sizes from all ab strands 213
203 214 legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)"
204 legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads (before)" 215 plt.text(0.1, 0.085, legend, size=11, transform=plt.gcf().transFigure)
205 plt.text(0.1, 0.105, legend, size=11, transform=plt.gcf().transFigure)
206 216
207 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \ 217 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \
208 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), 218 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
209 sum(numpy.array(data_array[:, 0]).astype(int))) 219 sum(numpy.array(data_array[:, 0]).astype(int)))
210 plt.text(0.3, 0.105, legend, size=11, transform=plt.gcf().transFigure) 220 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure)
211 221
212 count2 = numpy.array( 222 count2 = numpy.array(
213 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())]) # count all family sizes from all ba strands 223 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())]) # count all family sizes from all ba strands
214 legend = "BA\n{}\n{}\n{:.5f}" \ 224 legend = "BA\n{}\n{}\n{:.5f}" \
215 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) 225 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
216 plt.text(0.4, 0.15, legend, size=11, transform=plt.gcf().transFigure) 226 plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure)
217 227
218 legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags." 228 legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags."
219 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) 229 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
220 230
221 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) 231 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
237 "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) 247 "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
238 output_file.write( 248 output_file.write(
239 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, 249 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep,
240 float(count2[len(count2) - 1]) / sum(count2))) 250 float(count2[len(count2) - 1]) / sum(count2)))
241 251
242 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) 252 output_file.write("\ntotal nr. of reads before SSCS building{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
243 output_file.write("\n\nValues from family size distribution\n") 253 output_file.write("\n\nValues from family size distribution\n")
244 254
245 if afterTrimming == str(None) and ref_genome == str(None): 255 if afterTrimming == str(None) and ref_genome == str(None):
246 if afterTrimming == str(None): 256 if afterTrimming == str(None):
247 output_file.write("{}before SSCS buidling{}after DCS building\n".format(sep, sep)) 257 output_file.write("{}before SSCS building{}after DCS building\n".format(sep, sep))
248 elif ref_genome == str(None): 258 elif ref_genome == str(None):
249 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep)) 259 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep))
250 260
251 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]): 261 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]):
252 if fs == 21: 262 if fs == 21:
256 output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs))) 266 output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
257 output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])))) 267 output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
258 268
259 elif afterTrimming == str(None) or ref_genome == str(None): 269 elif afterTrimming == str(None) or ref_genome == str(None):
260 if afterTrimming == str(None): 270 if afterTrimming == str(None):
261 output_file.write("{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep)) 271 output_file.write("{}before SSCS building{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
262 elif ref_genome == str(None): 272 elif ref_genome == str(None):
263 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep)) 273 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep))
264 274
265 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]): 275 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]):
266 if fs == 21: 276 if fs == 21: