Mercurial > repos > mheinzl > fsd_bvsa
comparison fsd_beforevsafter.py @ 0:6716b1cddf3e draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd_beforevsafter commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author | mheinzl |
---|---|
date | Thu, 10 May 2018 07:29:24 -0400 |
parents | |
children | e8115b71edbd |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6716b1cddf3e |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Family size distribution of DCS from various steps of the Galaxy pipeline | |
4 # | |
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | |
6 # Contact: monika.heinzl@edumail.at | |
7 # | |
8 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), | |
9 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and | |
10 # a FASTA file with tags after trimming as input (optional). | |
11 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and | |
12 # a CSV file with the data of the plot. | |
13 | |
14 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome | |
15 # --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf | |
16 | |
17 | |
18 import numpy | |
19 import matplotlib.pyplot as plt | |
20 from collections import Counter | |
21 from Bio import SeqIO | |
22 import argparse | |
23 import sys | |
24 import os | |
25 from matplotlib.backends.backend_pdf import PdfPages | |
26 | |
27 def readFileReferenceFree(file, delim): | |
28 with open(file, 'r') as dest_f: | |
29 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') | |
30 return(data_array) | |
31 | |
32 def readFasta(file): | |
33 tag_consensus = [] | |
34 fs_consensus = [] | |
35 with open(file, "r") as consFile: | |
36 for record in SeqIO.parse(consFile, "fasta"): | |
37 tag_consensus.append(record.id) | |
38 line = record.description | |
39 a, b = line.split(" ") | |
40 fs1, fs2 = b.split("-") | |
41 fs_consensus.extend([fs1,fs2]) | |
42 fs_consensus = numpy.array(fs_consensus).astype(int) | |
43 return(tag_consensus, fs_consensus) | |
44 | |
45 def make_argparser(): | |
46 parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data') | |
47 parser.add_argument('--inputFile_SSCS', | |
48 help='Tabular File with three columns: ab or ba, tag and family size.') | |
49 parser.add_argument('--makeDCS', | |
50 help='FASTA File with information about tag and family size in the header.') | |
51 parser.add_argument('--afterTrimming',default=None, | |
52 help='FASTA File with information about tag and family size in the header.') | |
53 parser.add_argument('--alignedTags',default=None, | |
54 help=' TXT file with tags aligned to the reference genome and family size.') | |
55 parser.add_argument('--output_csv', default="data.csv", type=str, | |
56 help='Name of the pdf and csv file.') | |
57 parser.add_argument('--output_pdf', default="data.pdf", type=str, | |
58 help='Name of the pdf and csv file.') | |
59 parser.add_argument('--sep', default=",", | |
60 help='Separator in the csv file.') | |
61 return parser | |
62 | |
63 def compare_read_families_read_loss(argv): | |
64 parser = make_argparser() | |
65 args = parser.parse_args(argv[1:]) | |
66 | |
67 SSCS_file = args.inputFile_SSCS | |
68 makeConsensus = args.makeDCS | |
69 afterTrimming = args.afterTrimming | |
70 ref_genome = args.alignedTags | |
71 title_file = args.output_csv | |
72 title_file2 = args.output_pdf | |
73 sep = args.sep | |
74 | |
75 if type(sep) is not str or len(sep)>1: | |
76 print("Error: --sep must be a single character.") | |
77 exit(4) | |
78 | |
79 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: | |
80 ### PLOT ### | |
81 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | |
82 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color | |
83 plt.rcParams['xtick.labelsize'] = 12 | |
84 plt.rcParams['ytick.labelsize'] = 12 | |
85 plt.rcParams['patch.edgecolor'] = "black" | |
86 fig = plt.figure() | |
87 plt.subplots_adjust(bottom=0.3) | |
88 | |
89 list1 = [] | |
90 colors = [] | |
91 labels = [] | |
92 | |
93 ### data with tags of SSCS | |
94 data_array = readFileReferenceFree(SSCS_file, "\t") | |
95 seq = numpy.array(data_array[:, 1]) | |
96 tags = numpy.array(data_array[:, 2]) | |
97 quant = numpy.array(data_array[:, 0]).astype(int) | |
98 | |
99 # split data with all tags of SSCS after ab and ba strands | |
100 all_ab = seq[numpy.where(tags == "ab")[0]] | |
101 all_ba = seq[numpy.where(tags == "ba")[0]] | |
102 quant_ab_sscs = quant[numpy.where(tags == "ab")[0]] | |
103 quant_ba_sscs = quant[numpy.where(tags == "ba")[0]] | |
104 | |
105 seqDic_ab = dict(zip(all_ab, quant_ab_sscs)) | |
106 seqDic_ba = dict(zip(all_ba, quant_ba_sscs)) | |
107 | |
108 ### get tags of the SSCS which form a DCS | |
109 # group large family sizes | |
110 bigFamilies = numpy.where(quant > 20)[0] | |
111 quant[bigFamilies] = 22 | |
112 maximumX = numpy.amax(quant) | |
113 | |
114 # find all unique tags and get the indices for ALL tags (ab AND ba) | |
115 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True) | |
116 d = u[c > 1] | |
117 | |
118 # get family sizes, tag for the duplicates | |
119 duplTags_double = quant[numpy.in1d(seq, d)] | |
120 list1.append(duplTags_double) | |
121 colors.append("#0000FF") | |
122 labels.append("before alignment\nto SSCS") | |
123 | |
124 duplTags = duplTags_double[0::2] # ab of DCS | |
125 duplTagsBA = duplTags_double[1::2] # ba of DCS | |
126 | |
127 d2 = d[(duplTags >= 3) & (duplTagsBA >= 3)] # ab and ba FS>=3 | |
128 | |
129 # all SSCSs FS>=3 | |
130 seq_unique, seqUnique_index = numpy.unique(seq, return_index=True) | |
131 seq_unique_FS = quant[seqUnique_index] | |
132 seq_unique_FS3 = seq_unique_FS[seq_unique_FS >= 3] | |
133 | |
134 legend1 = "\ntotal nr. of tags (unique, FS>=1):\nDCS (before alignment to SSCS, FS>=1):\ntotal nr. of tags (unique, FS>=3):\nDCS (before alignment to SSCS, FS>=3):" | |
135 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags), | |
136 len(seq_unique_FS3), len(d2)) | |
137 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure) | |
138 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure) | |
139 | |
140 ## data make DCS | |
141 tag_consensus, fs_consensus = readFasta(makeConsensus) | |
142 ### group large family sizes in the plot of fasta files | |
143 bigFamilies = numpy.where(fs_consensus > 20)[0] | |
144 fs_consensus[bigFamilies] = 22 | |
145 list1.append(fs_consensus) | |
146 colors.append("#298A08") | |
147 labels.append("make DCS") | |
148 legend3 = "make DCS:" | |
149 legend4 = "{:,}".format(len(tag_consensus)) | |
150 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure) | |
151 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure) | |
152 | |
153 ### data after trimming | |
154 if afterTrimming != str(None): | |
155 tag_trimming, fs_trimming = readFasta(afterTrimming) | |
156 bigFamilies = numpy.where(fs_trimming > 20)[0] | |
157 fs_trimming[bigFamilies] = 22 | |
158 list1.append(fs_trimming) | |
159 colors.append("#DF0101") | |
160 labels.append("after trimming") | |
161 legend5 = "after trimming:" | |
162 legend6 = "{:,}".format(len(tag_trimming)) | |
163 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) | |
164 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) | |
165 | |
166 ### data of tags aligned to reference genome | |
167 if ref_genome != str(None): | |
168 mut_array = readFileReferenceFree(ref_genome, " ") | |
169 ### use only unique tags that were alignment to the reference genome | |
170 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) | |
171 | |
172 # get family sizes | |
173 quant_ab = [] | |
174 for i in seq_mut: | |
175 quant_ab.append(seqDic_ab.get(i)) | |
176 | |
177 quant_ba = [] | |
178 for i in seq_mut: | |
179 quant_ba.append(seqDic_ba.get(i)) | |
180 | |
181 quant_ab_ref = numpy.array(quant_ab) | |
182 quant_ba_ref = numpy.array(quant_ba) | |
183 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref)) | |
184 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes | |
185 quant_all_ref[bigFamilies] = 22 | |
186 list1.append(quant_all_ref) | |
187 colors.append("#04cec7") | |
188 labels.append("after alignment\nto reference genome") | |
189 legend7 = "after alignment to reference genome:" | |
190 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome | |
191 legend8 = "{:,}".format(length_DCS_ref) | |
192 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure) | |
193 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure) | |
194 | |
195 | |
196 counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors, | |
197 align="left", alpha=1, edgecolor="black", linewidth=1) | |
198 ticks = numpy.arange(0, maximumX, 1) | |
199 ticks1 = map(str, ticks) | |
200 ticks1[len(ticks1) - 1] = ">20" | |
201 plt.xticks(numpy.array(ticks), ticks1) | |
202 if ref_genome != str(None): | |
203 count = numpy.array( | |
204 [v for k, v in sorted(Counter(quant_ab_ref).iteritems())]) # count all family sizes from all ab strands | |
205 | |
206 legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads (before)" | |
207 plt.text(0.1, 0.105, legend, size=11, transform=plt.gcf().transFigure) | |
208 | |
209 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \ | |
210 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), | |
211 sum(numpy.array(data_array[:, 0]).astype(int))) | |
212 plt.text(0.3, 0.105, legend, size=11, transform=plt.gcf().transFigure) | |
213 | |
214 count2 = numpy.array( | |
215 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())]) # count all family sizes from all ba strands | |
216 legend = "BA\n{}\n{}\n{:.5f}" \ | |
217 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) | |
218 plt.text(0.4, 0.15, legend, size=11, transform=plt.gcf().transFigure) | |
219 | |
220 legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags." | |
221 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) | |
222 | |
223 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | |
224 plt.title("Family Size Distribution of Tags from various Steps of the Galaxy Pipeline", fontsize=14) | |
225 plt.xlabel("No. of Family Members", fontsize=12) | |
226 plt.ylabel("Absolute Frequency", fontsize=12) | |
227 plt.grid(b=True, which="major", color="#424242", linestyle=":") | |
228 plt.margins(0.01, None) | |
229 | |
230 pdf.savefig(fig, bbox_inch="tight") | |
231 plt.close() | |
232 | |
233 # write information about plot into a csv file | |
234 output_file.write("Dataset:{}{}\n".format(sep, SSCS_file)) | |
235 if ref_genome != str(None): | |
236 output_file.write("{}AB{}BA\n".format(sep, sep)) | |
237 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(quant_ab_ref), sep, max(quant_ba_ref))) | |
238 output_file.write( | |
239 "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) | |
240 output_file.write( | |
241 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, | |
242 float(count2[len(count2) - 1]) / sum(count2))) | |
243 | |
244 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) | |
245 output_file.write("\n\nValues from family size distribution\n") | |
246 | |
247 if afterTrimming == str(None) and ref_genome == str(None): | |
248 if afterTrimming == str(None): | |
249 output_file.write( | |
250 "{}before alignment to SSCS{}make DCS\n".format(sep, sep)) | |
251 elif ref_genome == str(None): | |
252 output_file.write( | |
253 "{}before alignment to SSCS{}make DCS\n".format(sep, sep)) | |
254 | |
255 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], | |
256 counts[0][1][2:len(counts[0][1])]): | |
257 if fs == 21: | |
258 fs = ">20" | |
259 else: | |
260 fs = "={}".format(fs) | |
261 output_file.write( | |
262 "FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs))) | |
263 output_file.write( | |
264 "sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])))) | |
265 | |
266 elif afterTrimming == str(None) or ref_genome == str(None): | |
267 if afterTrimming == str(None): | |
268 output_file.write( | |
269 "{}before alignment to SSCS{}make DCS{}after alignment to reference genome\n".format(sep, sep, sep)) | |
270 elif ref_genome == str(None): | |
271 output_file.write( | |
272 "{}before alignment to SSCS{}make DCS{}after trimming\n".format(sep, sep, sep)) | |
273 | |
274 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], | |
275 counts[0][1][2:len(counts[0][1])],counts[0][2][2:len(counts[0][2])]): | |
276 if fs == 21: | |
277 fs = ">20" | |
278 else: | |
279 fs = "={}".format(fs) | |
280 output_file.write( | |
281 "FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference))) | |
282 output_file.write( | |
283 "sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])))) | |
284 | |
285 else: | |
286 output_file.write( | |
287 "{}before alignment to SSCS{}make DCS{}after trimming{}after alignment to reference genome\n".format( | |
288 sep, sep, sep, sep)) | |
289 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], | |
290 counts[0][1][2:len(counts[0][1])], | |
291 counts[0][2][2:len(counts[0][2])], | |
292 counts[0][3][2:len(counts[0][3])]): | |
293 if fs == 21: | |
294 fs = ">20" | |
295 else: | |
296 fs = "={}".format(fs) | |
297 output_file.write( | |
298 "FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, | |
299 int(reference))) | |
300 output_file.write( | |
301 "sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, | |
302 int(sum(counts[0][2])), sep, int(sum(counts[0][3])))) | |
303 | |
304 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n") | |
305 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS))) | |
306 output_file.write("DCS (before alignment to SSCS, FS>=1){}{}\n".format(sep, len(duplTags))) | |
307 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3))) | |
308 output_file.write("DCS (before alignment to SSCS, FS>=3){}{}\n".format(sep, len(d2))) | |
309 output_file.write("make DCS{}{}\n".format(sep, len(tag_consensus))) | |
310 if afterTrimming != str(None): | |
311 output_file.write("after trimming{}{}\n".format(sep, len(tag_trimming))) | |
312 if ref_genome != str(None): | |
313 output_file.write("after alignment to reference genome{}{}\n".format(sep, length_DCS_ref)) | |
314 | |
315 print("Files successfully created!") | |
316 | |
317 if __name__ == '__main__': | |
318 sys.exit(compare_read_families_read_loss(sys.argv)) |