Mercurial > repos > mheinzl > fsd_bvsa
comparison fsd_beforevsafter.py @ 11:fca9b000b8d8 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_beforevsafter commit 82b53a26581a2296ad272bbba1e80934864dfa58
author | mheinzl |
---|---|
date | Mon, 26 Nov 2018 04:38:21 -0500 |
parents | 238a71241876 |
children |
comparison
equal
deleted
inserted
replaced
10:e80557c091e9 | 11:fca9b000b8d8 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Family size distribution of DCS from various steps of the Galaxy pipeline | 2 # Family size distribution of DCS from various steps of the Galaxy pipeline |
3 # | 3 # |
4 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | 4 # Author: Monika Heinzl & Gundula Povysil, Johannes-Kepler University Linz (Austria) |
5 # Contact: monika.heinzl@edumail.at | 5 # Contact: monika.heinzl@edumail.at |
6 # | 6 # |
7 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), | 7 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), |
8 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and | 8 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and |
9 # a FASTA file with tags after trimming as input (optional). | 9 # a FASTA file with tags after trimming as input (optional). |
10 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and | 10 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and |
11 # a CSV file with the data of the plot. | 11 # a CSV file with the data of the plot. |
12 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome | 12 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming --alignedTags DCSbamFile |
13 # --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf | 13 # --output_tabular outputfile_name_tabular --output_pdf outputfile_name_pdf |
14 | 14 |
15 import argparse | 15 import argparse |
16 import re | |
16 import sys | 17 import sys |
17 from collections import Counter | 18 from collections import Counter |
18 | 19 |
19 import matplotlib.pyplot as plt | 20 import matplotlib.pyplot as plt |
20 import numpy | 21 import numpy |
22 import pysam | |
21 from Bio import SeqIO | 23 from Bio import SeqIO |
22 from matplotlib.backends.backend_pdf import PdfPages | 24 from matplotlib.backends.backend_pdf import PdfPages |
23 | 25 |
24 plt.switch_backend('agg') | 26 plt.switch_backend('agg') |
25 | 27 |
51 parser.add_argument('--inputName1') | 53 parser.add_argument('--inputName1') |
52 parser.add_argument('--makeDCS', | 54 parser.add_argument('--makeDCS', |
53 help='FASTA File with information about tag and family size in the header.') | 55 help='FASTA File with information about tag and family size in the header.') |
54 parser.add_argument('--afterTrimming', default=None, | 56 parser.add_argument('--afterTrimming', default=None, |
55 help='FASTA File with information about tag and family size in the header.') | 57 help='FASTA File with information about tag and family size in the header.') |
56 parser.add_argument('--alignedTags', default=None, | 58 parser.add_argument('--bamFile', |
57 help=' TXT file with tags aligned to the reference genome and family size.') | 59 help='BAM file with aligned reads.') |
58 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 60 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
59 help='Name of the pdf and tabular file.') | 61 help='Name of the pdf and tabular file.') |
60 parser.add_argument('--output_tabular', default="data.tabular", type=str, | 62 parser.add_argument('--output_tabular', default="data.tabular", type=str, |
61 help='Name of the pdf and tabular file.') | 63 help='Name of the pdf and tabular file.') |
62 return parser | 64 return parser |
63 | 65 |
64 | 66 |
65 def compare_read_families_read_loss(argv): | 67 def compare_read_families_read_loss(argv): |
66 parser = make_argparser() | 68 parser = make_argparser() |
67 args = parser.parse_args(argv[1:]) | 69 args = parser.parse_args(argv[1:]) |
68 | 70 # |
69 SSCS_file = args.inputFile_SSCS | 71 SSCS_file = args.inputFile_SSCS |
70 SSCS_file_name = args.inputName1 | 72 SSCS_file_name = args.inputName1 |
71 makeConsensus = args.makeDCS | 73 makeConsensus = args.makeDCS |
72 afterTrimming = args.afterTrimming | 74 afterTrimming = args.afterTrimming |
73 ref_genome = args.alignedTags | 75 ref_genome = args.bamFile |
74 title_file = args.output_tabular | 76 title_file = args.output_tabular |
75 title_file2 = args.output_pdf | 77 title_file2 = args.output_pdf |
76 sep = "\t" | 78 sep = "\t" |
77 | 79 |
78 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: | 80 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: |
162 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) | 164 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) |
163 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) | 165 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) |
164 | 166 |
165 # data of tags aligned to reference genome | 167 # data of tags aligned to reference genome |
166 if ref_genome != str(None): | 168 if ref_genome != str(None): |
167 mut_array = readFileReferenceFree(ref_genome, " ") | 169 pysam.index(ref_genome) |
170 bam = pysam.AlignmentFile(ref_genome, "rb") | |
171 seq_mut = [] | |
172 for read in bam.fetch(): | |
173 if not read.is_unmapped: | |
174 if re.search('_', read.query_name): | |
175 tags = re.split('_', read.query_name)[0] | |
176 else: | |
177 tags = read.query_name | |
178 seq_mut.append(tags) | |
179 | |
168 # use only unique tags that were alignment to the reference genome | 180 # use only unique tags that were alignment to the reference genome |
169 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) | 181 seq_mut = numpy.array(seq_mut) |
170 | 182 seq_mut, seqMut_index = numpy.unique(seq_mut, return_index=True) |
171 # get family sizes | 183 # get family sizes for each tag in the BAM file |
172 quant_ab = [] | 184 quant_ab = [] |
185 quant_ba = [] | |
173 for i in seq_mut: | 186 for i in seq_mut: |
174 quant_ab.append(seqDic_ab.get(i)) | 187 quant_ab.append(seqDic_ab.get(i)) |
175 | |
176 quant_ba = [] | |
177 for i in seq_mut: | |
178 quant_ba.append(seqDic_ba.get(i)) | 188 quant_ba.append(seqDic_ba.get(i)) |
179 | 189 |
180 quant_ab_ref = numpy.array(quant_ab) | 190 quant_ab_ref = numpy.array(quant_ab) |
181 quant_ba_ref = numpy.array(quant_ba) | 191 quant_ba_ref = numpy.array(quant_ba) |
182 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref)) | 192 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref)) |
183 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes | 193 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes |
184 quant_all_ref[bigFamilies] = 22 | 194 quant_all_ref[bigFamilies] = 22 |
195 | |
185 list1.append(quant_all_ref) | 196 list1.append(quant_all_ref) |
186 colors.append("#04cec7") | 197 colors.append("#04cec7") |
187 labels.append("after alignment\nto reference") | 198 labels.append("after alignment\nto reference") |
188 legend7 = "after alignment to reference:" | 199 legend7 = "after alignment to reference:" |
189 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome | 200 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome |
196 ticks = numpy.arange(0, maximumX, 1) | 207 ticks = numpy.arange(0, maximumX, 1) |
197 ticks1 = map(str, ticks) | 208 ticks1 = map(str, ticks) |
198 ticks1[len(ticks1) - 1] = ">20" | 209 ticks1[len(ticks1) - 1] = ">20" |
199 plt.xticks(numpy.array(ticks), ticks1) | 210 plt.xticks(numpy.array(ticks), ticks1) |
200 if ref_genome != str(None): | 211 if ref_genome != str(None): |
201 count = numpy.array( | 212 count = numpy.array([v for k, v in sorted(Counter(quant_ab_ref).iteritems())]) # count all family sizes from all ab strands |
202 [v for k, v in sorted(Counter(quant_ab_ref).iteritems())]) # count all family sizes from all ab strands | 213 |
203 | 214 legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)" |
204 legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads (before)" | 215 plt.text(0.1, 0.085, legend, size=11, transform=plt.gcf().transFigure) |
205 plt.text(0.1, 0.105, legend, size=11, transform=plt.gcf().transFigure) | |
206 | 216 |
207 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \ | 217 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \ |
208 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), | 218 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), |
209 sum(numpy.array(data_array[:, 0]).astype(int))) | 219 sum(numpy.array(data_array[:, 0]).astype(int))) |
210 plt.text(0.3, 0.105, legend, size=11, transform=plt.gcf().transFigure) | 220 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) |
211 | 221 |
212 count2 = numpy.array( | 222 count2 = numpy.array( |
213 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())]) # count all family sizes from all ba strands | 223 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())]) # count all family sizes from all ba strands |
214 legend = "BA\n{}\n{}\n{:.5f}" \ | 224 legend = "BA\n{}\n{}\n{:.5f}" \ |
215 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) | 225 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) |
216 plt.text(0.4, 0.15, legend, size=11, transform=plt.gcf().transFigure) | 226 plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure) |
217 | 227 |
218 legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags." | 228 legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags." |
219 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) | 229 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) |
220 | 230 |
221 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 231 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
237 "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) | 247 "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) |
238 output_file.write( | 248 output_file.write( |
239 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, | 249 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, |
240 float(count2[len(count2) - 1]) / sum(count2))) | 250 float(count2[len(count2) - 1]) / sum(count2))) |
241 | 251 |
242 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) | 252 output_file.write("\ntotal nr. of reads before SSCS building{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) |
243 output_file.write("\n\nValues from family size distribution\n") | 253 output_file.write("\n\nValues from family size distribution\n") |
244 | 254 |
245 if afterTrimming == str(None) and ref_genome == str(None): | 255 if afterTrimming == str(None) and ref_genome == str(None): |
246 if afterTrimming == str(None): | 256 if afterTrimming == str(None): |
247 output_file.write("{}before SSCS buidling{}after DCS building\n".format(sep, sep)) | 257 output_file.write("{}before SSCS building{}after DCS building\n".format(sep, sep)) |
248 elif ref_genome == str(None): | 258 elif ref_genome == str(None): |
249 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep)) | 259 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep)) |
250 | 260 |
251 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]): | 261 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]): |
252 if fs == 21: | 262 if fs == 21: |
256 output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs))) | 266 output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs))) |
257 output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])))) | 267 output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])))) |
258 | 268 |
259 elif afterTrimming == str(None) or ref_genome == str(None): | 269 elif afterTrimming == str(None) or ref_genome == str(None): |
260 if afterTrimming == str(None): | 270 if afterTrimming == str(None): |
261 output_file.write("{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep)) | 271 output_file.write("{}before SSCS building{}after DCS building{}after alignment to reference\n".format(sep, sep, sep)) |
262 elif ref_genome == str(None): | 272 elif ref_genome == str(None): |
263 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep)) | 273 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep)) |
264 | 274 |
265 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]): | 275 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]): |
266 if fs == 21: | 276 if fs == 21: |