Mercurial > repos > mheinzl > fsd_bvsa
comparison fsd_beforevsafter.py @ 8:238a71241876 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_beforevsafter commit dfaab79252a858e8df16bbea3607ebf1b6962e5a-dirty
author | mheinzl |
---|---|
date | Mon, 08 Oct 2018 05:55:14 -0400 |
parents | 1eae0524b285 |
children | fca9b000b8d8 |
comparison
equal
deleted
inserted
replaced
7:c357ce2783a4 | 8:238a71241876 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | |
3 # Family size distribution of DCS from various steps of the Galaxy pipeline | 2 # Family size distribution of DCS from various steps of the Galaxy pipeline |
4 # | 3 # |
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | 4 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) |
6 # Contact: monika.heinzl@edumail.at | 5 # Contact: monika.heinzl@edumail.at |
7 # | 6 # |
8 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), | 7 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), |
9 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and | 8 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and |
10 # a FASTA file with tags after trimming as input (optional). | 9 # a FASTA file with tags after trimming as input (optional). |
11 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and | 10 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and |
12 # a CSV file with the data of the plot. | 11 # a CSV file with the data of the plot. |
13 | 12 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome |
14 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome | 13 # --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf |
15 # --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf | 14 |
16 | |
17 | |
18 import numpy | |
19 import matplotlib.pyplot as plt | |
20 from collections import Counter | |
21 from Bio import SeqIO | |
22 import argparse | 15 import argparse |
23 import sys | 16 import sys |
24 import os | 17 from collections import Counter |
18 | |
19 import matplotlib.pyplot as plt | |
20 import numpy | |
21 from Bio import SeqIO | |
25 from matplotlib.backends.backend_pdf import PdfPages | 22 from matplotlib.backends.backend_pdf import PdfPages |
23 | |
24 plt.switch_backend('agg') | |
25 | |
26 | 26 |
27 def readFileReferenceFree(file, delim): | 27 def readFileReferenceFree(file, delim): |
28 with open(file, 'r') as dest_f: | 28 with open(file, 'r') as dest_f: |
29 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') | 29 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') |
30 return(data_array) | 30 return(data_array) |
31 | |
31 | 32 |
32 def readFasta(file): | 33 def readFasta(file): |
33 tag_consensus = [] | 34 tag_consensus = [] |
34 fs_consensus = [] | 35 fs_consensus = [] |
35 with open(file, "r") as consFile: | 36 with open(file, "r") as consFile: |
36 for record in SeqIO.parse(consFile, "fasta"): | 37 for record in SeqIO.parse(consFile, "fasta"): |
37 tag_consensus.append(record.id) | 38 tag_consensus.append(record.id) |
38 line = record.description | 39 line = record.description |
39 a, b = line.split(" ") | 40 a, b = line.split(" ") |
40 fs1, fs2 = b.split("-") | 41 fs1, fs2 = b.split("-") |
41 fs_consensus.extend([fs1,fs2]) | 42 fs_consensus.extend([fs1, fs2]) |
42 fs_consensus = numpy.array(fs_consensus).astype(int) | 43 fs_consensus = numpy.array(fs_consensus).astype(int) |
43 return(tag_consensus, fs_consensus) | 44 return(tag_consensus, fs_consensus) |
45 | |
44 | 46 |
45 def make_argparser(): | 47 def make_argparser(): |
46 parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data') | 48 parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data') |
47 parser.add_argument('--inputFile_SSCS', | 49 parser.add_argument('--inputFile_SSCS', |
48 help='Tabular File with three columns: ab or ba, tag and family size.') | 50 help='Tabular File with three columns: ab or ba, tag and family size.') |
49 parser.add_argument('--inputName1') | 51 parser.add_argument('--inputName1') |
50 parser.add_argument('--makeDCS', | 52 parser.add_argument('--makeDCS', |
51 help='FASTA File with information about tag and family size in the header.') | 53 help='FASTA File with information about tag and family size in the header.') |
52 parser.add_argument('--afterTrimming',default=None, | 54 parser.add_argument('--afterTrimming', default=None, |
53 help='FASTA File with information about tag and family size in the header.') | 55 help='FASTA File with information about tag and family size in the header.') |
54 parser.add_argument('--alignedTags',default=None, | 56 parser.add_argument('--alignedTags', default=None, |
55 help=' TXT file with tags aligned to the reference genome and family size.') | 57 help=' TXT file with tags aligned to the reference genome and family size.') |
56 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 58 parser.add_argument('--output_pdf', default="data.pdf", type=str, |
57 help='Name of the pdf and csv file.') | 59 help='Name of the pdf and tabular file.') |
58 parser.add_argument('--output_csv', default="data.csv", type=str, | 60 parser.add_argument('--output_tabular', default="data.tabular", type=str, |
59 help='Name of the pdf and csv file.') | 61 help='Name of the pdf and tabular file.') |
60 parser.add_argument('--sep', default=",", | |
61 help='Separator in the csv file.') | |
62 return parser | 62 return parser |
63 | |
63 | 64 |
64 def compare_read_families_read_loss(argv): | 65 def compare_read_families_read_loss(argv): |
65 parser = make_argparser() | 66 parser = make_argparser() |
66 args = parser.parse_args(argv[1:]) | 67 args = parser.parse_args(argv[1:]) |
67 | 68 |
68 SSCS_file = args.inputFile_SSCS | 69 SSCS_file = args.inputFile_SSCS |
69 SSCS_file_name = args.inputName1 | 70 SSCS_file_name = args.inputName1 |
70 makeConsensus = args.makeDCS | 71 makeConsensus = args.makeDCS |
71 afterTrimming = args.afterTrimming | 72 afterTrimming = args.afterTrimming |
72 ref_genome = args.alignedTags | 73 ref_genome = args.alignedTags |
73 title_file = args.output_csv | 74 title_file = args.output_tabular |
74 title_file2 = args.output_pdf | 75 title_file2 = args.output_pdf |
75 sep = args.sep | 76 sep = "\t" |
76 | |
77 if type(sep) is not str or len(sep)>1: | |
78 print("Error: --sep must be a single character.") | |
79 exit(4) | |
80 | 77 |
81 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: | 78 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: |
82 ### PLOT ### | 79 # PLOT |
83 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 80 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
84 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color | 81 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color |
85 plt.rcParams['xtick.labelsize'] = 14 | 82 plt.rcParams['xtick.labelsize'] = 14 |
86 plt.rcParams['ytick.labelsize'] = 14 | 83 plt.rcParams['ytick.labelsize'] = 14 |
87 plt.rcParams['patch.edgecolor'] = "black" | 84 plt.rcParams['patch.edgecolor'] = "black" |
90 | 87 |
91 list1 = [] | 88 list1 = [] |
92 colors = [] | 89 colors = [] |
93 labels = [] | 90 labels = [] |
94 | 91 |
95 ### data with tags of SSCS | 92 # data with tags of SSCS |
96 data_array = readFileReferenceFree(SSCS_file, "\t") | 93 data_array = readFileReferenceFree(SSCS_file, "\t") |
97 seq = numpy.array(data_array[:, 1]) | 94 seq = numpy.array(data_array[:, 1]) |
98 tags = numpy.array(data_array[:, 2]) | 95 tags = numpy.array(data_array[:, 2]) |
99 quant = numpy.array(data_array[:, 0]).astype(int) | 96 quant = numpy.array(data_array[:, 0]).astype(int) |
100 | 97 |
105 quant_ba_sscs = quant[numpy.where(tags == "ba")[0]] | 102 quant_ba_sscs = quant[numpy.where(tags == "ba")[0]] |
106 | 103 |
107 seqDic_ab = dict(zip(all_ab, quant_ab_sscs)) | 104 seqDic_ab = dict(zip(all_ab, quant_ab_sscs)) |
108 seqDic_ba = dict(zip(all_ba, quant_ba_sscs)) | 105 seqDic_ba = dict(zip(all_ba, quant_ba_sscs)) |
109 | 106 |
110 ### get tags of the SSCS which form a DCS | 107 # get tags of the SSCS which form a DCS |
111 # group large family sizes | 108 # group large family sizes |
112 bigFamilies = numpy.where(quant > 20)[0] | 109 bigFamilies = numpy.where(quant > 20)[0] |
113 quant[bigFamilies] = 22 | 110 quant[bigFamilies] = 22 |
114 maximumX = numpy.amax(quant) | 111 maximumX = numpy.amax(quant) |
115 | 112 |
137 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags), | 134 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags), |
138 len(seq_unique_FS3), len(d2)) | 135 len(seq_unique_FS3), len(d2)) |
139 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure) | 136 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure) |
140 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure) | 137 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure) |
141 | 138 |
142 ## data make DCS | 139 # data make DCS |
143 tag_consensus, fs_consensus = readFasta(makeConsensus) | 140 tag_consensus, fs_consensus = readFasta(makeConsensus) |
144 ### group large family sizes in the plot of fasta files | 141 # group large family sizes in the plot of fasta files |
145 bigFamilies = numpy.where(fs_consensus > 20)[0] | 142 bigFamilies = numpy.where(fs_consensus > 20)[0] |
146 fs_consensus[bigFamilies] = 22 | 143 fs_consensus[bigFamilies] = 22 |
147 list1.append(fs_consensus) | 144 list1.append(fs_consensus) |
148 colors.append("#298A08") | 145 colors.append("#298A08") |
149 labels.append("after DCS building") | 146 labels.append("after DCS building") |
150 legend3 = "after DCS building:" | 147 legend3 = "after DCS building:" |
151 legend4 = "{:,}".format(len(tag_consensus)) | 148 legend4 = "{:,}".format(len(tag_consensus)) |
152 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure) | 149 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure) |
153 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure) | 150 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure) |
154 | 151 |
155 ### data after trimming | 152 # data after trimming |
156 if afterTrimming != str(None): | 153 if afterTrimming != str(None): |
157 tag_trimming, fs_trimming = readFasta(afterTrimming) | 154 tag_trimming, fs_trimming = readFasta(afterTrimming) |
158 bigFamilies = numpy.where(fs_trimming > 20)[0] | 155 bigFamilies = numpy.where(fs_trimming > 20)[0] |
159 fs_trimming[bigFamilies] = 22 | 156 fs_trimming[bigFamilies] = 22 |
160 list1.append(fs_trimming) | 157 list1.append(fs_trimming) |
163 legend5 = "after trimming:" | 160 legend5 = "after trimming:" |
164 legend6 = "{:,}".format(len(tag_trimming)) | 161 legend6 = "{:,}".format(len(tag_trimming)) |
165 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) | 162 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) |
166 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) | 163 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) |
167 | 164 |
168 ### data of tags aligned to reference genome | 165 # data of tags aligned to reference genome |
169 if ref_genome != str(None): | 166 if ref_genome != str(None): |
170 mut_array = readFileReferenceFree(ref_genome, " ") | 167 mut_array = readFileReferenceFree(ref_genome, " ") |
171 ### use only unique tags that were alignment to the reference genome | 168 # use only unique tags that were alignment to the reference genome |
172 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) | 169 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) |
173 | 170 |
174 # get family sizes | 171 # get family sizes |
175 quant_ab = [] | 172 quant_ab = [] |
176 for i in seq_mut: | 173 for i in seq_mut: |
192 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome | 189 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome |
193 legend8 = "{:,}".format(length_DCS_ref) | 190 legend8 = "{:,}".format(length_DCS_ref) |
194 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure) | 191 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure) |
195 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure) | 192 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure) |
196 | 193 |
197 | |
198 counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors, | 194 counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors, |
199 align="left", alpha=1, edgecolor="black", linewidth=1) | 195 align="left", alpha=1, edgecolor="black", linewidth=1) |
200 ticks = numpy.arange(0, maximumX, 1) | 196 ticks = numpy.arange(0, maximumX, 1) |
201 ticks1 = map(str, ticks) | 197 ticks1 = map(str, ticks) |
202 ticks1[len(ticks1) - 1] = ">20" | 198 ticks1[len(ticks1) - 1] = ">20" |
246 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) | 242 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) |
247 output_file.write("\n\nValues from family size distribution\n") | 243 output_file.write("\n\nValues from family size distribution\n") |
248 | 244 |
249 if afterTrimming == str(None) and ref_genome == str(None): | 245 if afterTrimming == str(None) and ref_genome == str(None): |
250 if afterTrimming == str(None): | 246 if afterTrimming == str(None): |
251 output_file.write( | 247 output_file.write("{}before SSCS buidling{}after DCS building\n".format(sep, sep)) |
252 "{}before SSCS buidling{}after DCS building\n".format(sep, sep)) | |
253 elif ref_genome == str(None): | 248 elif ref_genome == str(None): |
254 output_file.write( | 249 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep)) |
255 "{}before SSCS building{}atfer DCS building\n".format(sep, sep)) | 250 |
256 | 251 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]): |
257 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], | |
258 counts[0][1][2:len(counts[0][1])]): | |
259 if fs == 21: | 252 if fs == 21: |
260 fs = ">20" | 253 fs = ">20" |
261 else: | 254 else: |
262 fs = "={}".format(fs) | 255 fs = "={}".format(fs) |
263 output_file.write( | 256 output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs))) |
264 "FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs))) | 257 output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])))) |
265 output_file.write( | |
266 "sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])))) | |
267 | 258 |
268 elif afterTrimming == str(None) or ref_genome == str(None): | 259 elif afterTrimming == str(None) or ref_genome == str(None): |
269 if afterTrimming == str(None): | 260 if afterTrimming == str(None): |
270 output_file.write( | 261 output_file.write("{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep)) |
271 "{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep)) | |
272 elif ref_genome == str(None): | 262 elif ref_genome == str(None): |
273 output_file.write( | 263 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep)) |
274 "{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep)) | 264 |
275 | 265 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]): |
276 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], | |
277 counts[0][1][2:len(counts[0][1])],counts[0][2][2:len(counts[0][2])]): | |
278 if fs == 21: | 266 if fs == 21: |
279 fs = ">20" | 267 fs = ">20" |
280 else: | 268 else: |
281 fs = "={}".format(fs) | 269 fs = "={}".format(fs) |
282 output_file.write( | 270 output_file.write("FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference))) |
283 "FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference))) | 271 output_file.write("sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])))) |
284 output_file.write( | |
285 "sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])))) | |
286 | 272 |
287 else: | 273 else: |
288 output_file.write( | 274 output_file.write("{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format(sep, sep, sep, sep)) |
289 "{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format( | 275 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])], counts[0][3][2:len(counts[0][3])]): |
290 sep, sep, sep, sep)) | |
291 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], | |
292 counts[0][1][2:len(counts[0][1])], | |
293 counts[0][2][2:len(counts[0][2])], | |
294 counts[0][3][2:len(counts[0][3])]): | |
295 if fs == 21: | 276 if fs == 21: |
296 fs = ">20" | 277 fs = ">20" |
297 else: | 278 else: |
298 fs = "={}".format(fs) | 279 fs = "={}".format(fs) |
299 output_file.write( | 280 output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, int(reference))) |
300 "FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, | 281 output_file.write("sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])), sep, int(sum(counts[0][3])))) |
301 int(reference))) | |
302 output_file.write( | |
303 "sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, | |
304 int(sum(counts[0][2])), sep, int(sum(counts[0][3])))) | |
305 | 282 |
306 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n") | 283 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n") |
307 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS))) | 284 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS))) |
308 output_file.write("DCS (before SSCS building, FS>=1){}{}\n".format(sep, len(duplTags))) | 285 output_file.write("DCS (before SSCS building, FS>=1){}{}\n".format(sep, len(duplTags))) |
309 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3))) | 286 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3))) |
314 if ref_genome != str(None): | 291 if ref_genome != str(None): |
315 output_file.write("after alignment to reference{}{}\n".format(sep, length_DCS_ref)) | 292 output_file.write("after alignment to reference{}{}\n".format(sep, length_DCS_ref)) |
316 | 293 |
317 print("Files successfully created!") | 294 print("Files successfully created!") |
318 | 295 |
296 | |
319 if __name__ == '__main__': | 297 if __name__ == '__main__': |
320 sys.exit(compare_read_families_read_loss(sys.argv)) | 298 sys.exit(compare_read_families_read_loss(sys.argv)) |