comparison fsd_beforevsafter.py @ 0:6716b1cddf3e draft

planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd_beforevsafter commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author mheinzl
date Thu, 10 May 2018 07:29:24 -0400
parents
children e8115b71edbd
comparison
equal deleted inserted replaced
-1:000000000000 0:6716b1cddf3e
1 #!/usr/bin/env python
2
3 # Family size distribution of DCS from various steps of the Galaxy pipeline
4 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at
7 #
8 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional),
9 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and
10 # a FASTA file with tags after trimming as input (optional).
11 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and
12 # a CSV file with the data of the plot.
13
14 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome
15 # --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf
16
17
18 import numpy
19 import matplotlib.pyplot as plt
20 from collections import Counter
21 from Bio import SeqIO
22 import argparse
23 import sys
24 import os
25 from matplotlib.backends.backend_pdf import PdfPages
26
27 def readFileReferenceFree(file, delim):
28 with open(file, 'r') as dest_f:
29 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
30 return(data_array)
31
32 def readFasta(file):
33 tag_consensus = []
34 fs_consensus = []
35 with open(file, "r") as consFile:
36 for record in SeqIO.parse(consFile, "fasta"):
37 tag_consensus.append(record.id)
38 line = record.description
39 a, b = line.split(" ")
40 fs1, fs2 = b.split("-")
41 fs_consensus.extend([fs1,fs2])
42 fs_consensus = numpy.array(fs_consensus).astype(int)
43 return(tag_consensus, fs_consensus)
44
45 def make_argparser():
46 parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data')
47 parser.add_argument('--inputFile_SSCS',
48 help='Tabular File with three columns: ab or ba, tag and family size.')
49 parser.add_argument('--makeDCS',
50 help='FASTA File with information about tag and family size in the header.')
51 parser.add_argument('--afterTrimming',default=None,
52 help='FASTA File with information about tag and family size in the header.')
53 parser.add_argument('--alignedTags',default=None,
54 help=' TXT file with tags aligned to the reference genome and family size.')
55 parser.add_argument('--output_csv', default="data.csv", type=str,
56 help='Name of the pdf and csv file.')
57 parser.add_argument('--output_pdf', default="data.pdf", type=str,
58 help='Name of the pdf and csv file.')
59 parser.add_argument('--sep', default=",",
60 help='Separator in the csv file.')
61 return parser
62
63 def compare_read_families_read_loss(argv):
64 parser = make_argparser()
65 args = parser.parse_args(argv[1:])
66
67 SSCS_file = args.inputFile_SSCS
68 makeConsensus = args.makeDCS
69 afterTrimming = args.afterTrimming
70 ref_genome = args.alignedTags
71 title_file = args.output_csv
72 title_file2 = args.output_pdf
73 sep = args.sep
74
75 if type(sep) is not str or len(sep)>1:
76 print("Error: --sep must be a single character.")
77 exit(4)
78
79 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
80 ### PLOT ###
81 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
82 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
83 plt.rcParams['xtick.labelsize'] = 12
84 plt.rcParams['ytick.labelsize'] = 12
85 plt.rcParams['patch.edgecolor'] = "black"
86 fig = plt.figure()
87 plt.subplots_adjust(bottom=0.3)
88
89 list1 = []
90 colors = []
91 labels = []
92
93 ### data with tags of SSCS
94 data_array = readFileReferenceFree(SSCS_file, "\t")
95 seq = numpy.array(data_array[:, 1])
96 tags = numpy.array(data_array[:, 2])
97 quant = numpy.array(data_array[:, 0]).astype(int)
98
99 # split data with all tags of SSCS after ab and ba strands
100 all_ab = seq[numpy.where(tags == "ab")[0]]
101 all_ba = seq[numpy.where(tags == "ba")[0]]
102 quant_ab_sscs = quant[numpy.where(tags == "ab")[0]]
103 quant_ba_sscs = quant[numpy.where(tags == "ba")[0]]
104
105 seqDic_ab = dict(zip(all_ab, quant_ab_sscs))
106 seqDic_ba = dict(zip(all_ba, quant_ba_sscs))
107
108 ### get tags of the SSCS which form a DCS
109 # group large family sizes
110 bigFamilies = numpy.where(quant > 20)[0]
111 quant[bigFamilies] = 22
112 maximumX = numpy.amax(quant)
113
114 # find all unique tags and get the indices for ALL tags (ab AND ba)
115 u, index_unique, c = numpy.unique(numpy.array(seq), return_counts=True, return_index=True)
116 d = u[c > 1]
117
118 # get family sizes, tag for the duplicates
119 duplTags_double = quant[numpy.in1d(seq, d)]
120 list1.append(duplTags_double)
121 colors.append("#0000FF")
122 labels.append("before alignment\nto SSCS")
123
124 duplTags = duplTags_double[0::2] # ab of DCS
125 duplTagsBA = duplTags_double[1::2] # ba of DCS
126
127 d2 = d[(duplTags >= 3) & (duplTagsBA >= 3)] # ab and ba FS>=3
128
129 # all SSCSs FS>=3
130 seq_unique, seqUnique_index = numpy.unique(seq, return_index=True)
131 seq_unique_FS = quant[seqUnique_index]
132 seq_unique_FS3 = seq_unique_FS[seq_unique_FS >= 3]
133
134 legend1 = "\ntotal nr. of tags (unique, FS>=1):\nDCS (before alignment to SSCS, FS>=1):\ntotal nr. of tags (unique, FS>=3):\nDCS (before alignment to SSCS, FS>=3):"
135 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags),
136 len(seq_unique_FS3), len(d2))
137 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure)
138 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure)
139
140 ## data make DCS
141 tag_consensus, fs_consensus = readFasta(makeConsensus)
142 ### group large family sizes in the plot of fasta files
143 bigFamilies = numpy.where(fs_consensus > 20)[0]
144 fs_consensus[bigFamilies] = 22
145 list1.append(fs_consensus)
146 colors.append("#298A08")
147 labels.append("make DCS")
148 legend3 = "make DCS:"
149 legend4 = "{:,}".format(len(tag_consensus))
150 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure)
151 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure)
152
153 ### data after trimming
154 if afterTrimming != str(None):
155 tag_trimming, fs_trimming = readFasta(afterTrimming)
156 bigFamilies = numpy.where(fs_trimming > 20)[0]
157 fs_trimming[bigFamilies] = 22
158 list1.append(fs_trimming)
159 colors.append("#DF0101")
160 labels.append("after trimming")
161 legend5 = "after trimming:"
162 legend6 = "{:,}".format(len(tag_trimming))
163 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure)
164 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure)
165
166 ### data of tags aligned to reference genome
167 if ref_genome != str(None):
168 mut_array = readFileReferenceFree(ref_genome, " ")
169 ### use only unique tags that were alignment to the reference genome
170 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
171
172 # get family sizes
173 quant_ab = []
174 for i in seq_mut:
175 quant_ab.append(seqDic_ab.get(i))
176
177 quant_ba = []
178 for i in seq_mut:
179 quant_ba.append(seqDic_ba.get(i))
180
181 quant_ab_ref = numpy.array(quant_ab)
182 quant_ba_ref = numpy.array(quant_ba)
183 quant_all_ref = numpy.concatenate((quant_ab_ref, quant_ba_ref))
184 bigFamilies = numpy.where(quant_all_ref > 20)[0] # group large family sizes
185 quant_all_ref[bigFamilies] = 22
186 list1.append(quant_all_ref)
187 colors.append("#04cec7")
188 labels.append("after alignment\nto reference genome")
189 legend7 = "after alignment to reference genome:"
190 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome
191 legend8 = "{:,}".format(length_DCS_ref)
192 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure)
193 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure)
194
195
196 counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors,
197 align="left", alpha=1, edgecolor="black", linewidth=1)
198 ticks = numpy.arange(0, maximumX, 1)
199 ticks1 = map(str, ticks)
200 ticks1[len(ticks1) - 1] = ">20"
201 plt.xticks(numpy.array(ticks), ticks1)
202 if ref_genome != str(None):
203 count = numpy.array(
204 [v for k, v in sorted(Counter(quant_ab_ref).iteritems())]) # count all family sizes from all ab strands
205
206 legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads (before)"
207 plt.text(0.1, 0.105, legend, size=11, transform=plt.gcf().transFigure)
208
209 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \
210 .format(max(quant_ab_ref), count[len(count) - 1], float(count[len(count) - 1]) / sum(count),
211 sum(numpy.array(data_array[:, 0]).astype(int)))
212 plt.text(0.3, 0.105, legend, size=11, transform=plt.gcf().transFigure)
213
214 count2 = numpy.array(
215 [v for k, v in sorted(Counter(quant_ba_ref).iteritems())]) # count all family sizes from all ba strands
216 legend = "BA\n{}\n{}\n{:.5f}" \
217 .format(max(quant_ba_ref), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
218 plt.text(0.4, 0.15, legend, size=11, transform=plt.gcf().transFigure)
219
220 legend4 = "* In the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags."
221 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
222
223 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
224 plt.title("Family Size Distribution of Tags from various Steps of the Galaxy Pipeline", fontsize=14)
225 plt.xlabel("No. of Family Members", fontsize=12)
226 plt.ylabel("Absolute Frequency", fontsize=12)
227 plt.grid(b=True, which="major", color="#424242", linestyle=":")
228 plt.margins(0.01, None)
229
230 pdf.savefig(fig, bbox_inch="tight")
231 plt.close()
232
233 # write information about plot into a csv file
234 output_file.write("Dataset:{}{}\n".format(sep, SSCS_file))
235 if ref_genome != str(None):
236 output_file.write("{}AB{}BA\n".format(sep, sep))
237 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(quant_ab_ref), sep, max(quant_ba_ref)))
238 output_file.write(
239 "absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
240 output_file.write(
241 "relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep,
242 float(count2[len(count2) - 1]) / sum(count2)))
243
244 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
245 output_file.write("\n\nValues from family size distribution\n")
246
247 if afterTrimming == str(None) and ref_genome == str(None):
248 if afterTrimming == str(None):
249 output_file.write(
250 "{}before alignment to SSCS{}make DCS\n".format(sep, sep))
251 elif ref_genome == str(None):
252 output_file.write(
253 "{}before alignment to SSCS{}make DCS\n".format(sep, sep))
254
255 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
256 counts[0][1][2:len(counts[0][1])]):
257 if fs == 21:
258 fs = ">20"
259 else:
260 fs = "={}".format(fs)
261 output_file.write(
262 "FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
263 output_file.write(
264 "sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
265
266 elif afterTrimming == str(None) or ref_genome == str(None):
267 if afterTrimming == str(None):
268 output_file.write(
269 "{}before alignment to SSCS{}make DCS{}after alignment to reference genome\n".format(sep, sep, sep))
270 elif ref_genome == str(None):
271 output_file.write(
272 "{}before alignment to SSCS{}make DCS{}after trimming\n".format(sep, sep, sep))
273
274 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
275 counts[0][1][2:len(counts[0][1])],counts[0][2][2:len(counts[0][2])]):
276 if fs == 21:
277 fs = ">20"
278 else:
279 fs = "={}".format(fs)
280 output_file.write(
281 "FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference)))
282 output_file.write(
283 "sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
284
285 else:
286 output_file.write(
287 "{}before alignment to SSCS{}make DCS{}after trimming{}after alignment to reference genome\n".format(
288 sep, sep, sep, sep))
289 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
290 counts[0][1][2:len(counts[0][1])],
291 counts[0][2][2:len(counts[0][2])],
292 counts[0][3][2:len(counts[0][3])]):
293 if fs == 21:
294 fs = ">20"
295 else:
296 fs = "={}".format(fs)
297 output_file.write(
298 "FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep,
299 int(reference)))
300 output_file.write(
301 "sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep,
302 int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
303
304 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n")
305 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS)))
306 output_file.write("DCS (before alignment to SSCS, FS>=1){}{}\n".format(sep, len(duplTags)))
307 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3)))
308 output_file.write("DCS (before alignment to SSCS, FS>=3){}{}\n".format(sep, len(d2)))
309 output_file.write("make DCS{}{}\n".format(sep, len(tag_consensus)))
310 if afterTrimming != str(None):
311 output_file.write("after trimming{}{}\n".format(sep, len(tag_trimming)))
312 if ref_genome != str(None):
313 output_file.write("after alignment to reference genome{}{}\n".format(sep, length_DCS_ref))
314
315 print("Files successfully created!")
316
317 if __name__ == '__main__':
318 sys.exit(compare_read_families_read_loss(sys.argv))