comparison fsd_beforevsafter.py @ 8:238a71241876 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_beforevsafter commit dfaab79252a858e8df16bbea3607ebf1b6962e5a-dirty
author mheinzl
date Mon, 08 Oct 2018 05:55:14 -0400
parents 1eae0524b285
children fca9b000b8d8
comparison
equal deleted inserted replaced
7:c357ce2783a4 8:238a71241876
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2
3 # Family size distribution of DCS from various steps of the Galaxy pipeline 2 # Family size distribution of DCS from various steps of the Galaxy pipeline
4 # 3 #
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) 4 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria)
6 # Contact: monika.heinzl@edumail.at 5 # Contact: monika.heinzl@edumail.at
7 # 6 #
8 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional), 7 # Takes a TXT file with tags of reads that were aligned to certain regions of the reference genome (optional),
9 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and 8 # a TABULAR file with tags before the alignment to the SSCS, a FASTA file with reads that were part of the DCS and
10 # a FASTA file with tags after trimming as input (optional). 9 # a FASTA file with tags after trimming as input (optional).
11 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and 10 # The program produces a plot which shows the distribution of family sizes of the DCS from the input files and
12 # a CSV file with the data of the plot. 11 # a CSV file with the data of the plot.
13 12 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome
14 # USAGE: python FSD before vs after_no_refF1.3_FINAL.py --inputFile_SSCS filenameSSCS --inputName1 filenameSSCS --makeDCS filenameMakeDCS --afterTrimming filenameAfterTrimming -- alignedTags filenameTagsRefGenome 13 # --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
15 # --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf 14
16
17
18 import numpy
19 import matplotlib.pyplot as plt
20 from collections import Counter
21 from Bio import SeqIO
22 import argparse 15 import argparse
23 import sys 16 import sys
24 import os 17 from collections import Counter
18
19 import matplotlib.pyplot as plt
20 import numpy
21 from Bio import SeqIO
25 from matplotlib.backends.backend_pdf import PdfPages 22 from matplotlib.backends.backend_pdf import PdfPages
23
24 plt.switch_backend('agg')
25
26 26
27 def readFileReferenceFree(file, delim): 27 def readFileReferenceFree(file, delim):
28 with open(file, 'r') as dest_f: 28 with open(file, 'r') as dest_f:
29 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') 29 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string')
30 return(data_array) 30 return(data_array)
31
31 32
32 def readFasta(file): 33 def readFasta(file):
33 tag_consensus = [] 34 tag_consensus = []
34 fs_consensus = [] 35 fs_consensus = []
35 with open(file, "r") as consFile: 36 with open(file, "r") as consFile:
36 for record in SeqIO.parse(consFile, "fasta"): 37 for record in SeqIO.parse(consFile, "fasta"):
37 tag_consensus.append(record.id) 38 tag_consensus.append(record.id)
38 line = record.description 39 line = record.description
39 a, b = line.split(" ") 40 a, b = line.split(" ")
40 fs1, fs2 = b.split("-") 41 fs1, fs2 = b.split("-")
41 fs_consensus.extend([fs1,fs2]) 42 fs_consensus.extend([fs1, fs2])
42 fs_consensus = numpy.array(fs_consensus).astype(int) 43 fs_consensus = numpy.array(fs_consensus).astype(int)
43 return(tag_consensus, fs_consensus) 44 return(tag_consensus, fs_consensus)
45
44 46
45 def make_argparser(): 47 def make_argparser():
46 parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data') 48 parser = argparse.ArgumentParser(description='Analysis of read loss in duplex sequencing data')
47 parser.add_argument('--inputFile_SSCS', 49 parser.add_argument('--inputFile_SSCS',
48 help='Tabular File with three columns: ab or ba, tag and family size.') 50 help='Tabular File with three columns: ab or ba, tag and family size.')
49 parser.add_argument('--inputName1') 51 parser.add_argument('--inputName1')
50 parser.add_argument('--makeDCS', 52 parser.add_argument('--makeDCS',
51 help='FASTA File with information about tag and family size in the header.') 53 help='FASTA File with information about tag and family size in the header.')
52 parser.add_argument('--afterTrimming',default=None, 54 parser.add_argument('--afterTrimming', default=None,
53 help='FASTA File with information about tag and family size in the header.') 55 help='FASTA File with information about tag and family size in the header.')
54 parser.add_argument('--alignedTags',default=None, 56 parser.add_argument('--alignedTags', default=None,
55 help=' TXT file with tags aligned to the reference genome and family size.') 57 help=' TXT file with tags aligned to the reference genome and family size.')
56 parser.add_argument('--output_pdf', default="data.pdf", type=str, 58 parser.add_argument('--output_pdf', default="data.pdf", type=str,
57 help='Name of the pdf and csv file.') 59 help='Name of the pdf and tabular file.')
58 parser.add_argument('--output_csv', default="data.csv", type=str, 60 parser.add_argument('--output_tabular', default="data.tabular", type=str,
59 help='Name of the pdf and csv file.') 61 help='Name of the pdf and tabular file.')
60 parser.add_argument('--sep', default=",",
61 help='Separator in the csv file.')
62 return parser 62 return parser
63
63 64
64 def compare_read_families_read_loss(argv): 65 def compare_read_families_read_loss(argv):
65 parser = make_argparser() 66 parser = make_argparser()
66 args = parser.parse_args(argv[1:]) 67 args = parser.parse_args(argv[1:])
67 68
68 SSCS_file = args.inputFile_SSCS 69 SSCS_file = args.inputFile_SSCS
69 SSCS_file_name = args.inputName1 70 SSCS_file_name = args.inputName1
70 makeConsensus = args.makeDCS 71 makeConsensus = args.makeDCS
71 afterTrimming = args.afterTrimming 72 afterTrimming = args.afterTrimming
72 ref_genome = args.alignedTags 73 ref_genome = args.alignedTags
73 title_file = args.output_csv 74 title_file = args.output_tabular
74 title_file2 = args.output_pdf 75 title_file2 = args.output_pdf
75 sep = args.sep 76 sep = "\t"
76
77 if type(sep) is not str or len(sep)>1:
78 print("Error: --sep must be a single character.")
79 exit(4)
80 77
81 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: 78 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf:
82 ### PLOT ### 79 # PLOT
83 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 80 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
84 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color 81 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color
85 plt.rcParams['xtick.labelsize'] = 14 82 plt.rcParams['xtick.labelsize'] = 14
86 plt.rcParams['ytick.labelsize'] = 14 83 plt.rcParams['ytick.labelsize'] = 14
87 plt.rcParams['patch.edgecolor'] = "black" 84 plt.rcParams['patch.edgecolor'] = "black"
90 87
91 list1 = [] 88 list1 = []
92 colors = [] 89 colors = []
93 labels = [] 90 labels = []
94 91
95 ### data with tags of SSCS 92 # data with tags of SSCS
96 data_array = readFileReferenceFree(SSCS_file, "\t") 93 data_array = readFileReferenceFree(SSCS_file, "\t")
97 seq = numpy.array(data_array[:, 1]) 94 seq = numpy.array(data_array[:, 1])
98 tags = numpy.array(data_array[:, 2]) 95 tags = numpy.array(data_array[:, 2])
99 quant = numpy.array(data_array[:, 0]).astype(int) 96 quant = numpy.array(data_array[:, 0]).astype(int)
100 97
105 quant_ba_sscs = quant[numpy.where(tags == "ba")[0]] 102 quant_ba_sscs = quant[numpy.where(tags == "ba")[0]]
106 103
107 seqDic_ab = dict(zip(all_ab, quant_ab_sscs)) 104 seqDic_ab = dict(zip(all_ab, quant_ab_sscs))
108 seqDic_ba = dict(zip(all_ba, quant_ba_sscs)) 105 seqDic_ba = dict(zip(all_ba, quant_ba_sscs))
109 106
110 ### get tags of the SSCS which form a DCS 107 # get tags of the SSCS which form a DCS
111 # group large family sizes 108 # group large family sizes
112 bigFamilies = numpy.where(quant > 20)[0] 109 bigFamilies = numpy.where(quant > 20)[0]
113 quant[bigFamilies] = 22 110 quant[bigFamilies] = 22
114 maximumX = numpy.amax(quant) 111 maximumX = numpy.amax(quant)
115 112
137 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags), 134 legend2 = "total numbers * \n{:,}\n{:,}\n{:,}\n{:,}".format(len(seq_unique_FS), len(duplTags),
138 len(seq_unique_FS3), len(d2)) 135 len(seq_unique_FS3), len(d2))
139 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure) 136 plt.text(0.55, 0.14, legend1, size=11, transform=plt.gcf().transFigure)
140 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure) 137 plt.text(0.88, 0.14, legend2, size=11, transform=plt.gcf().transFigure)
141 138
142 ## data make DCS 139 # data make DCS
143 tag_consensus, fs_consensus = readFasta(makeConsensus) 140 tag_consensus, fs_consensus = readFasta(makeConsensus)
144 ### group large family sizes in the plot of fasta files 141 # group large family sizes in the plot of fasta files
145 bigFamilies = numpy.where(fs_consensus > 20)[0] 142 bigFamilies = numpy.where(fs_consensus > 20)[0]
146 fs_consensus[bigFamilies] = 22 143 fs_consensus[bigFamilies] = 22
147 list1.append(fs_consensus) 144 list1.append(fs_consensus)
148 colors.append("#298A08") 145 colors.append("#298A08")
149 labels.append("after DCS building") 146 labels.append("after DCS building")
150 legend3 = "after DCS building:" 147 legend3 = "after DCS building:"
151 legend4 = "{:,}".format(len(tag_consensus)) 148 legend4 = "{:,}".format(len(tag_consensus))
152 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure) 149 plt.text(0.55, 0.11, legend3, size=11, transform=plt.gcf().transFigure)
153 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure) 150 plt.text(0.88, 0.11, legend4, size=11, transform=plt.gcf().transFigure)
154 151
155 ### data after trimming 152 # data after trimming
156 if afterTrimming != str(None): 153 if afterTrimming != str(None):
157 tag_trimming, fs_trimming = readFasta(afterTrimming) 154 tag_trimming, fs_trimming = readFasta(afterTrimming)
158 bigFamilies = numpy.where(fs_trimming > 20)[0] 155 bigFamilies = numpy.where(fs_trimming > 20)[0]
159 fs_trimming[bigFamilies] = 22 156 fs_trimming[bigFamilies] = 22
160 list1.append(fs_trimming) 157 list1.append(fs_trimming)
163 legend5 = "after trimming:" 160 legend5 = "after trimming:"
164 legend6 = "{:,}".format(len(tag_trimming)) 161 legend6 = "{:,}".format(len(tag_trimming))
165 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure) 162 plt.text(0.55, 0.09, legend5, size=11, transform=plt.gcf().transFigure)
166 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure) 163 plt.text(0.88, 0.09, legend6, size=11, transform=plt.gcf().transFigure)
167 164
168 ### data of tags aligned to reference genome 165 # data of tags aligned to reference genome
169 if ref_genome != str(None): 166 if ref_genome != str(None):
170 mut_array = readFileReferenceFree(ref_genome, " ") 167 mut_array = readFileReferenceFree(ref_genome, " ")
171 ### use only unique tags that were alignment to the reference genome 168 # use only unique tags that were alignment to the reference genome
172 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) 169 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
173 170
174 # get family sizes 171 # get family sizes
175 quant_ab = [] 172 quant_ab = []
176 for i in seq_mut: 173 for i in seq_mut:
192 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome 189 length_DCS_ref = len(quant_ba_ref) # count of duplex tags that were aligned to reference genome
193 legend8 = "{:,}".format(length_DCS_ref) 190 legend8 = "{:,}".format(length_DCS_ref)
194 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure) 191 plt.text(0.55, 0.07, legend7, size=11, transform=plt.gcf().transFigure)
195 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure) 192 plt.text(0.88, 0.07, legend8, size=11, transform=plt.gcf().transFigure)
196 193
197
198 counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors, 194 counts = plt.hist(list1, bins=range(-1, maximumX + 1), stacked=False, label=labels, color=colors,
199 align="left", alpha=1, edgecolor="black", linewidth=1) 195 align="left", alpha=1, edgecolor="black", linewidth=1)
200 ticks = numpy.arange(0, maximumX, 1) 196 ticks = numpy.arange(0, maximumX, 1)
201 ticks1 = map(str, ticks) 197 ticks1 = map(str, ticks)
202 ticks1[len(ticks1) - 1] = ">20" 198 ticks1[len(ticks1) - 1] = ">20"
246 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) 242 output_file.write("\n\ntotal nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
247 output_file.write("\n\nValues from family size distribution\n") 243 output_file.write("\n\nValues from family size distribution\n")
248 244
249 if afterTrimming == str(None) and ref_genome == str(None): 245 if afterTrimming == str(None) and ref_genome == str(None):
250 if afterTrimming == str(None): 246 if afterTrimming == str(None):
251 output_file.write( 247 output_file.write("{}before SSCS buidling{}after DCS building\n".format(sep, sep))
252 "{}before SSCS buidling{}after DCS building\n".format(sep, sep))
253 elif ref_genome == str(None): 248 elif ref_genome == str(None):
254 output_file.write( 249 output_file.write("{}before SSCS building{}atfer DCS building\n".format(sep, sep))
255 "{}before SSCS building{}atfer DCS building\n".format(sep, sep)) 250
256 251 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])]):
257 for fs, sscs, dcs in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
258 counts[0][1][2:len(counts[0][1])]):
259 if fs == 21: 252 if fs == 21:
260 fs = ">20" 253 fs = ">20"
261 else: 254 else:
262 fs = "={}".format(fs) 255 fs = "={}".format(fs)
263 output_file.write( 256 output_file.write("FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs)))
264 "FS{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs))) 257 output_file.write("sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
265 output_file.write(
266 "sum{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1]))))
267 258
268 elif afterTrimming == str(None) or ref_genome == str(None): 259 elif afterTrimming == str(None) or ref_genome == str(None):
269 if afterTrimming == str(None): 260 if afterTrimming == str(None):
270 output_file.write( 261 output_file.write("{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
271 "{}before SSCS buidling{}after DCS building{}after alignment to reference\n".format(sep, sep, sep))
272 elif ref_genome == str(None): 262 elif ref_genome == str(None):
273 output_file.write( 263 output_file.write("{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep))
274 "{}before SSCS building{}atfer DCS building{}after trimming\n".format(sep, sep, sep)) 264
275 265 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])]):
276 for fs, sscs, dcs, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
277 counts[0][1][2:len(counts[0][1])],counts[0][2][2:len(counts[0][2])]):
278 if fs == 21: 266 if fs == 21:
279 fs = ">20" 267 fs = ">20"
280 else: 268 else:
281 fs = "={}".format(fs) 269 fs = "={}".format(fs)
282 output_file.write( 270 output_file.write("FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference)))
283 "FS{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(reference))) 271 output_file.write("sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
284 output_file.write(
285 "sum{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2]))))
286 272
287 else: 273 else:
288 output_file.write( 274 output_file.write("{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format(sep, sep, sep, sep))
289 "{}before SSCS building{}after DCS building{}after trimming{}after alignment to reference\n".format( 275 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])], counts[0][1][2:len(counts[0][1])], counts[0][2][2:len(counts[0][2])], counts[0][3][2:len(counts[0][3])]):
290 sep, sep, sep, sep))
291 for fs, sscs, dcs, trim, reference in zip(counts[1][2:len(counts[1])], counts[0][0][2:len(counts[0][0])],
292 counts[0][1][2:len(counts[0][1])],
293 counts[0][2][2:len(counts[0][2])],
294 counts[0][3][2:len(counts[0][3])]):
295 if fs == 21: 276 if fs == 21:
296 fs = ">20" 277 fs = ">20"
297 else: 278 else:
298 fs = "={}".format(fs) 279 fs = "={}".format(fs)
299 output_file.write( 280 output_file.write("FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, int(reference)))
300 "FS{}{}{}{}{}{}{}{}{}\n".format(fs, sep, int(sscs), sep, int(dcs), sep, int(trim), sep, 281 output_file.write("sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep, int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
301 int(reference)))
302 output_file.write(
303 "sum{}{}{}{}{}{}{}{}\n".format(sep, int(sum(counts[0][0])), sep, int(sum(counts[0][1])), sep,
304 int(sum(counts[0][2])), sep, int(sum(counts[0][3]))))
305 282
306 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n") 283 output_file.write("\n\nIn the plot, the family sizes of ab and ba strands and of both duplex tags were used.\nWhereas the total numbers indicate only the single count of the formed duplex tags.\n")
307 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS))) 284 output_file.write("total nr. of tags (unique, FS>=1){}{}\n".format(sep, len(seq_unique_FS)))
308 output_file.write("DCS (before SSCS building, FS>=1){}{}\n".format(sep, len(duplTags))) 285 output_file.write("DCS (before SSCS building, FS>=1){}{}\n".format(sep, len(duplTags)))
309 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3))) 286 output_file.write("total nr. of tags (unique, FS>=3){}{}\n".format(sep, len(seq_unique_FS3)))
314 if ref_genome != str(None): 291 if ref_genome != str(None):
315 output_file.write("after alignment to reference{}{}\n".format(sep, length_DCS_ref)) 292 output_file.write("after alignment to reference{}{}\n".format(sep, length_DCS_ref))
316 293
317 print("Files successfully created!") 294 print("Files successfully created!")
318 295
296
319 if __name__ == '__main__': 297 if __name__ == '__main__':
320 sys.exit(compare_read_families_read_loss(sys.argv)) 298 sys.exit(compare_read_families_read_loss(sys.argv))