Mercurial > repos > mheinzl > fsd
comparison fsd.py @ 18:c825a29a7d9f draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author | mheinzl |
---|---|
date | Wed, 08 May 2019 07:03:39 -0400 |
parents | 2e517a54eedc |
children | b7bccbbee4a7 |
comparison
equal
deleted
inserted
replaced
17:2e517a54eedc | 18:c825a29a7d9f |
---|---|
9 # The program produces a plot which shows the distribution of family sizes of the all SSCSs from the input files and | 9 # The program produces a plot which shows the distribution of family sizes of the all SSCSs from the input files and |
10 # a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes. | 10 # a tabular file with the data of the plot, as well as a TXT file with all tags of the DCS and their family sizes. |
11 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. | 11 # If only one file is provided, then a family size distribution, which is separated after SSCSs without a partner and DCSs, is produced. |
12 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. | 12 # Whereas a family size distribution with multiple data in one plot is produced, when more than one file (up to 4) is given. |
13 | 13 |
14 # USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf | 14 # USAGE: python FSD_Galaxy_1.4_commandLine_FINAL.py --inputFile1 filename --inputName1 filename --inputFile2 filename2 --inputName2 filename2 --inputFile3 filename3 --inputName3 filename3 --inputFile4 filename4 --inputName4 filename4 --log_axis --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf |
15 | 15 |
16 import argparse | 16 import argparse |
17 import sys | 17 import sys |
18 import os | |
18 | 19 |
19 import matplotlib.pyplot as plt | 20 import matplotlib.pyplot as plt |
20 import numpy | 21 import numpy |
21 from matplotlib.backends.backend_pdf import PdfPages | 22 from matplotlib.backends.backend_pdf import PdfPages |
22 | 23 |
37 parser.add_argument('--inputName2') | 38 parser.add_argument('--inputName2') |
38 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') | 39 parser.add_argument('--inputFile3', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') |
39 parser.add_argument('--inputName3') | 40 parser.add_argument('--inputName3') |
40 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') | 41 parser.add_argument('--inputFile4', default=None, help='Tabular File with three columns: ab or ba, tag and family size.') |
41 parser.add_argument('--inputName4') | 42 parser.add_argument('--inputName4') |
43 parser.add_argument('--log_axis', action="store_false", help='Transform y axis in log scale.') | |
42 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') | 44 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf file.') |
43 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') | 45 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the tabular file.') |
44 return parser | 46 return parser |
45 | 47 |
46 | 48 |
47 def compare_read_families(argv): | 49 def compare_read_families(argv): |
48 | 50 |
49 parser = make_argparser() | 51 parser = make_argparser() |
50 args = parser.parse_args(argv[1:]) | 52 args = parser.parse_args(argv[1:]) |
53 | |
51 firstFile = args.inputFile1 | 54 firstFile = args.inputFile1 |
52 name1 = args.inputName1 | 55 name1 = args.inputName1 |
56 | |
53 secondFile = args.inputFile2 | 57 secondFile = args.inputFile2 |
54 name2 = args.inputName2 | 58 name2 = args.inputName2 |
55 thirdFile = args.inputFile3 | 59 thirdFile = args.inputFile3 |
56 name3 = args.inputName3 | 60 name3 = args.inputName3 |
57 fourthFile = args.inputFile4 | 61 fourthFile = args.inputFile4 |
58 name4 = args.inputName4 | 62 name4 = args.inputName4 |
63 log_axis = args.log_axis | |
64 | |
59 title_file = args.output_tabular | 65 title_file = args.output_tabular |
60 title_file2 = args.output_pdf | 66 title_file2 = args.output_pdf |
61 | 67 |
62 sep = "\t" | 68 sep = "\t" |
63 | 69 |
68 plt.rcParams['ytick.labelsize'] = 14 | 74 plt.rcParams['ytick.labelsize'] = 14 |
69 | 75 |
70 list_to_plot = [] | 76 list_to_plot = [] |
71 label = [] | 77 label = [] |
72 data_array_list = [] | 78 data_array_list = [] |
79 list_to_plot_original = [] | |
80 colors = [] | |
81 | |
73 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: | 82 with open(title_file, "w") as output_file, PdfPages(title_file2) as pdf: |
74 fig = plt.figure() | 83 fig = plt.figure() |
75 plt.subplots_adjust(bottom=0.25) | 84 fig.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) |
85 fig2 = plt.figure() | |
86 fig2.subplots_adjust(left=0.12, right=0.97, bottom=0.23, top=0.94, hspace=0) | |
87 | |
88 # plt.subplots_adjust(bottom=0.25) | |
76 if firstFile != str(None): | 89 if firstFile != str(None): |
77 file1 = readFileReferenceFree(firstFile) | 90 file1 = readFileReferenceFree(firstFile) |
78 integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes | 91 integers = numpy.array(file1[:, 0]).astype(int) # keep original family sizes |
92 list_to_plot_original.append(integers) | |
93 colors.append("#0000FF") | |
79 | 94 |
80 # for plot: replace all big family sizes by 22 | 95 # for plot: replace all big family sizes by 22 |
81 data1 = numpy.array(file1[:, 0]).astype(int) | 96 # data1 = numpy.array(file1[:, 0]).astype(int) |
82 bigFamilies = numpy.where(data1 > 20)[0] | 97 # bigFamilies = numpy.where(data1 > 20)[0] |
83 data1[bigFamilies] = 22 | 98 # data1[bigFamilies] = 22 |
84 | 99 if numpy.amax(integers) > 20: |
100 bins = numpy.arange(numpy.amin(integers), numpy.amax(integers) + 1) | |
101 data1 = numpy.clip(integers, bins[0], bins[-1]) | |
102 else: | |
103 data1 = integers | |
85 name1 = name1.split(".tabular")[0] | 104 name1 = name1.split(".tabular")[0] |
86 list_to_plot.append(data1) | 105 list_to_plot.append(data1) |
87 label.append(name1) | 106 label.append(name1) |
88 data_array_list.append(file1) | 107 data_array_list.append(file1) |
89 | 108 |
90 legend = "\n\n\n{}".format(name1) | 109 legend = "\n\n\n{}".format(name1) |
91 plt.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) | 110 fig.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) |
92 legend1 = "singletons:\nnr. of tags\n{:,}".format(numpy.bincount(data1)[1]) | 111 fig2.text(0.05, 0.11, legend, size=10, transform=plt.gcf().transFigure) |
93 plt.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) | 112 |
94 | 113 legend1 = "singletons:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / len(data1)) |
95 legend3 = "freq. of tags\n{:.3f}".format(float(numpy.bincount(data1)[1]) / len(data1)) | 114 fig.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) |
96 plt.text(0.41, 0.11, legend3, size=10, transform=plt.gcf().transFigure) | 115 fig2.text(0.32, 0.11, legend1, size=10, transform=plt.gcf().transFigure) |
97 | 116 |
98 legend3b = "PE reads\n{:.3f}".format(float(numpy.bincount(data1)[1]) / sum(integers)) | 117 legend3b = "PE reads\n{:,} ({:.3f})".format(numpy.bincount(data1)[1], float(numpy.bincount(data1)[1]) / sum(integers)) |
99 plt.text(0.5, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) | 118 fig.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) |
119 fig2.text(0.45, 0.11, legend3b, size=10, transform=plt.gcf().transFigure) | |
100 | 120 |
101 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) | 121 legend4 = "family size > 20:\nnr. of tags\n{:,} ({:.3f})".format(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1].astype(int), float(numpy.bincount(data1)[len(numpy.bincount(data1)) - 1]) / len(data1)) |
102 plt.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) | 122 fig.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) |
123 fig2.text(0.58, 0.11, legend4, size=10, transform=plt.gcf().transFigure) | |
103 | 124 |
104 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) | 125 legend5 = "PE reads\n{:,} ({:.3f})".format(sum(integers[integers > 20]), float(sum(integers[integers > 20])) / sum(integers)) |
105 plt.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) | 126 fig.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) |
127 fig2.text(0.70, 0.11, legend5, size=10, transform=plt.gcf().transFigure) | |
106 | 128 |
107 legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) | 129 legend6 = "total nr. of\ntags\n{:,}".format(len(data1)) |
108 plt.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) | 130 fig.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) |
131 fig2.text(0.82, 0.11, legend6, size=10, transform=plt.gcf().transFigure) | |
109 | 132 |
110 legend6b = "PE reads\n{:,}".format(sum(integers)) | 133 legend6b = "PE reads\n{:,}".format(sum(integers)) |
111 plt.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) | 134 fig.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) |
135 fig2.text(0.89, 0.11, legend6b, size=10, transform=plt.gcf().transFigure) | |
112 | 136 |
113 if secondFile != str(None): | 137 if secondFile != str(None): |
114 file2 = readFileReferenceFree(secondFile) | 138 file2 = readFileReferenceFree(secondFile) |
115 integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes | 139 integers2 = numpy.array(file2[:, 0]).astype(int) # keep original family sizes |
116 | 140 list_to_plot_original.append(integers2) |
117 data2 = numpy.asarray(file2[:, 0]).astype(int) | 141 colors.append("#298A08") |
118 bigFamilies2 = numpy.where(data2 > 20)[0] | 142 |
119 data2[bigFamilies2] = 22 | 143 # data2 = numpy.asarray(file2[:, 0]).astype(int) |
120 | 144 # bigFamilies2 = numpy.where(data2 > 20)[0] |
145 # data2[bigFamilies2] = 22 | |
146 | |
147 if numpy.amax(integers) > 20: | |
148 bins = numpy.arange(numpy.amin(integers2), numpy.amax(integers2) + 1) | |
149 data2 = numpy.clip(integers2, bins[0], bins[-1]) | |
150 else: | |
151 data2 = integers2 | |
121 list_to_plot.append(data2) | 152 list_to_plot.append(data2) |
122 name2 = name2.split(".tabular")[0] | 153 name2 = name2.split(".tabular")[0] |
123 label.append(name2) | 154 label.append(name2) |
124 data_array_list.append(file2) | 155 data_array_list.append(file2) |
125 | 156 |
126 plt.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) | 157 fig.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) |
127 | 158 fig2.text(0.05, 0.09, name2, size=10, transform=plt.gcf().transFigure) |
128 legend1 = "{:,}".format(numpy.bincount(data2)[1]) | 159 |
129 plt.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) | 160 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / len(data2)) |
130 | 161 fig.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) |
131 legend3 = "{:.3f}".format(float(numpy.bincount(data2)[1]) / len(data2)) | 162 fig2.text(0.32, 0.09, legend1, size=10, transform=plt.gcf().transFigure) |
132 plt.text(0.41, 0.09, legend3, size=10, transform=plt.gcf().transFigure) | 163 |
133 | 164 legend3 = "{:,} ({:.3f})".format(numpy.bincount(data2)[1], float(numpy.bincount(data2)[1]) / sum(integers2)) |
134 legend3b = "{:.3f}".format(float(numpy.bincount(data2)[1]) / sum(integers2)) | 165 fig.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) |
135 plt.text(0.5, 0.09, legend3b, size=10, transform=plt.gcf().transFigure) | 166 fig2.text(0.45, 0.09, legend3, size=10, transform=plt.gcf().transFigure) |
136 | 167 |
137 legend4 = "{:,} ({:.3f})".format( | 168 legend4 = "{:,} ({:.3f})".format( |
138 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), | 169 numpy.bincount(data2)[len(numpy.bincount(data2)) - 1].astype(int), |
139 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) | 170 float(numpy.bincount(data2)[len(numpy.bincount(data2)) - 1]) / len(data2)) |
140 plt.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) | 171 fig.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) |
172 fig2.text(0.58, 0.09, legend4, size=10, transform=plt.gcf().transFigure) | |
141 | 173 |
142 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) | 174 legend5 = "{:,} ({:.3f})".format(sum(integers2[integers2 > 20]), float(sum(integers2[integers2 > 20])) / sum(integers2)) |
143 plt.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | 175 fig.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) |
176 fig2.text(0.70, 0.09, legend5, size=10, transform=plt.gcf().transFigure) | |
144 | 177 |
145 legend6 = "{:,}".format(len(data2)) | 178 legend6 = "{:,}".format(len(data2)) |
146 plt.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) | 179 fig.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) |
180 fig2.text(0.82, 0.09, legend6, size=10, transform=plt.gcf().transFigure) | |
147 | 181 |
148 legend6b = "{:,}".format(sum(integers2)) | 182 legend6b = "{:,}".format(sum(integers2)) |
149 plt.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) | 183 fig.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) |
184 fig2.text(0.89, 0.09, legend6b, size=10, transform=plt.gcf().transFigure) | |
150 | 185 |
151 if thirdFile != str(None): | 186 if thirdFile != str(None): |
152 file3 = readFileReferenceFree(thirdFile) | 187 file3 = readFileReferenceFree(thirdFile) |
153 integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes | 188 integers3 = numpy.array(file3[:, 0]).astype(int) # keep original family sizes |
154 | 189 list_to_plot_original.append(integers3) |
155 data3 = numpy.asarray(file3[:, 0]).astype(int) | 190 colors.append("#DF0101") |
156 bigFamilies3 = numpy.where(data3 > 20)[0] | 191 |
157 data3[bigFamilies3] = 22 | 192 # data3 = numpy.asarray(file3[:, 0]).astype(int) |
158 | 193 # bigFamilies3 = numpy.where(data3 > 20)[0] |
194 # data3[bigFamilies3] = 22 | |
195 | |
196 if numpy.amax(integers3) > 20: | |
197 bins = numpy.arange(numpy.amin(integers3), numpy.amax(integers3) + 1) | |
198 data3 = numpy.clip(integers3, bins[0], bins[-1]) | |
199 else: | |
200 data3 = integers3 | |
159 list_to_plot.append(data3) | 201 list_to_plot.append(data3) |
160 name3 = name3.split(".tabular")[0] | 202 name3 = name3.split(".tabular")[0] |
161 label.append(name3) | 203 label.append(name3) |
162 data_array_list.append(file3) | 204 data_array_list.append(file3) |
163 | 205 |
164 plt.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) | 206 fig.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) |
165 | 207 fig2.text(0.05, 0.07, name3, size=10, transform=plt.gcf().transFigure) |
166 legend1 = "{:,}".format(numpy.bincount(data3)[1]) | 208 |
167 plt.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) | 209 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / len(data3)) |
168 | 210 fig.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) |
169 legend3 = "{:.3f}".format(float(numpy.bincount(data3)[1]) / len(data3)) | 211 fig2.text(0.32, 0.07, legend1, size=10, transform=plt.gcf().transFigure) |
170 plt.text(0.41, 0.07, legend3, size=10, transform=plt.gcf().transFigure) | 212 |
171 | 213 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data3)[1], float(numpy.bincount(data3)[1]) / sum(integers3)) |
172 legend3b = "{:.3f}".format(float(numpy.bincount(data3)[1]) / sum(integers3)) | 214 fig.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) |
173 plt.text(0.5, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) | 215 fig2.text(0.45, 0.07, legend3b, size=10, transform=plt.gcf().transFigure) |
174 | 216 |
175 legend4 = "{:,} ({:.3f})".format( | 217 legend4 = "{:,} ({:.3f})".format( |
176 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), | 218 numpy.bincount(data3)[len(numpy.bincount(data3)) - 1].astype(int), |
177 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) | 219 float(numpy.bincount(data3)[len(numpy.bincount(data3)) - 1]) / len(data3)) |
178 plt.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) | 220 fig.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) |
221 fig2.text(0.58, 0.07, legend4, size=10, transform=plt.gcf().transFigure) | |
179 | 222 |
180 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), | 223 legend5 = "{:,} ({:.3f})".format(sum(integers3[integers3 > 20]), |
181 float(sum(integers3[integers3 > 20])) / sum(integers3)) | 224 float(sum(integers3[integers3 > 20])) / sum(integers3)) |
182 plt.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) | 225 fig.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) |
226 fig2.text(0.70, 0.07, legend5, size=10, transform=plt.gcf().transFigure) | |
183 | 227 |
184 legend6 = "{:,}".format(len(data3)) | 228 legend6 = "{:,}".format(len(data3)) |
185 plt.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) | 229 fig.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) |
230 fig2.text(0.82, 0.07, legend6, size=10, transform=plt.gcf().transFigure) | |
186 | 231 |
187 legend6b = "{:,}".format(sum(integers3)) | 232 legend6b = "{:,}".format(sum(integers3)) |
188 plt.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) | 233 fig.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) |
234 fig2.text(0.89, 0.07, legend6b, size=10, transform=plt.gcf().transFigure) | |
189 | 235 |
190 if fourthFile != str(None): | 236 if fourthFile != str(None): |
191 file4 = readFileReferenceFree(fourthFile) | 237 file4 = readFileReferenceFree(fourthFile) |
192 integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes | 238 integers4 = numpy.array(file4[:, 0]).astype(int) # keep original family sizes |
193 | 239 list_to_plot_original.append(integers4) |
194 data4 = numpy.asarray(file4[:, 0]).astype(int) | 240 colors.append("#04cec7") |
195 | 241 |
196 bigFamilies4 = numpy.where(data4 > 20)[0] | 242 # data4 = numpy.asarray(file4[:, 0]).astype(int) |
197 data4[bigFamilies4] = 22 | 243 # bigFamilies4 = numpy.where(data4 > 20)[0] |
198 | 244 # data4[bigFamilies4] = 22 |
245 if numpy.amax(integers4) > 20: | |
246 bins = numpy.arange(numpy.amin(integers4), numpy.amax(integers4) + 1) | |
247 data4 = numpy.clip(integers4, bins[0], bins[-1]) | |
248 else: | |
249 data4 = integers4 | |
199 list_to_plot.append(data4) | 250 list_to_plot.append(data4) |
200 name4 = name4.split(".tabular")[0] | 251 name4 = name4.split(".tabular")[0] |
201 label.append(name4) | 252 label.append(name4) |
202 data_array_list.append(file4) | 253 data_array_list.append(file4) |
203 | 254 |
204 plt.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) | 255 fig.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) |
205 | 256 fig2.text(0.05, 0.05, name4, size=10, transform=plt.gcf().transFigure) |
206 legend1 = "{:,}".format(numpy.bincount(data4)[1]) | 257 |
207 plt.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) | 258 legend1 = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / len(data4)) |
208 | 259 fig.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) |
209 legend3 = "{:.3f}".format(float(numpy.bincount(data4)[1]) / len(data4)) | 260 fig2.text(0.32, 0.05, legend1, size=10, transform=plt.gcf().transFigure) |
210 plt.text(0.41, 0.05, legend3, size=10, transform=plt.gcf().transFigure) | 261 |
211 | 262 legend3b = "{:,} ({:.3f})".format(numpy.bincount(data4)[1], float(numpy.bincount(data4)[1]) / sum(integers4)) |
212 legend3b = "{:.3f}".format(float(numpy.bincount(data4)[1]) / sum(integers4)) | 263 fig.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) |
213 plt.text(0.5, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) | 264 fig2.text(0.45, 0.05, legend3b, size=10, transform=plt.gcf().transFigure) |
214 | 265 |
215 legend4 = "{:,} ({:.3f})".format( | 266 legend4 = "{:,} ({:.3f})".format( |
216 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), | 267 numpy.bincount(data4)[len(numpy.bincount(data4)) - 1].astype(int), |
217 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) | 268 float(numpy.bincount(data4)[len(numpy.bincount(data4)) - 1]) / len(data4)) |
218 plt.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) | 269 fig.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) |
270 fig2.text(0.58, 0.05, legend4, size=10, transform=plt.gcf().transFigure) | |
219 | 271 |
220 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), | 272 legend5 = "{:,} ({:.3f})".format(sum(integers4[integers4 > 20]), |
221 float(sum(integers4[integers4 > 20])) / sum(integers4)) | 273 float(sum(integers4[integers4 > 20])) / sum(integers4)) |
222 plt.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) | 274 fig.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) |
275 fig2.text(0.70, 0.05, legend5, size=10, transform=plt.gcf().transFigure) | |
223 | 276 |
224 legend6 = "{:,}".format(len(data4)) | 277 legend6 = "{:,}".format(len(data4)) |
225 plt.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) | 278 fig.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) |
279 fig2.text(0.82, 0.05, legend6, size=10, transform=plt.gcf().transFigure) | |
226 | 280 |
227 legend6b = "{:,}".format(sum(integers4)) | 281 legend6b = "{:,}".format(sum(integers4)) |
228 plt.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) | 282 fig.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) |
283 fig2.text(0.89, 0.05, legend6b, size=10, transform=plt.gcf().transFigure) | |
229 | 284 |
230 maximumX = numpy.amax(numpy.concatenate(list_to_plot)) | 285 maximumX = numpy.amax(numpy.concatenate(list_to_plot)) |
231 minimumX = numpy.amin(numpy.concatenate(list_to_plot)) | 286 minimumX = numpy.amin(numpy.concatenate(list_to_plot)) |
232 | 287 bins = numpy.arange(minimumX, maximumX + 1) |
233 counts = plt.hist(list_to_plot, bins=range(minimumX, maximumX + 1), stacked=False, edgecolor="black", | 288 list_to_plot2 = list_to_plot |
234 linewidth=1, label=label, align="left", rwidth=0.8, alpha=0.7) | 289 to_plot = ["Absolute frequencies", "Relative frequencies"] |
235 | 290 plt.xticks([], []) |
236 ticks = numpy.arange(minimumX - 1, maximumX, 1) | 291 plt.yticks([], []) |
237 ticks1 = map(str, ticks) | 292 fig.suptitle('Family Size Distribution (tags)', fontsize=14) |
238 ticks1[len(ticks1) - 1] = ">20" | 293 |
239 plt.xticks(numpy.array(ticks), ticks1) | 294 for l in range(len(to_plot)): |
240 | 295 ax = fig.add_subplot(2, 1, l+1) |
241 plt.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | 296 ticks = numpy.arange(1, 22, 1) |
242 # plt.title("Family Size Distribution", fontsize=14) | 297 ticks1 = map(str, ticks) |
243 plt.xlabel("Family size", fontsize=14) | 298 if maximumX > 20: |
244 plt.ylabel("Absolute Frequency", fontsize=14) | 299 ticks1[len(ticks1) - 1] = ">20" |
245 plt.margins(0.01, None) | 300 |
246 plt.grid(b=True, which="major", color="#424242", linestyle=":") | 301 if to_plot[l] == "Relative frequencies": |
302 counts_rel = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8, normed=True) | |
303 else: | |
304 counts = ax.hist(list_to_plot2, bins=numpy.arange(minimumX, maximumX + 2), stacked=False, edgecolor="black", linewidth=1, label=label, align="left", alpha=1, rwidth=0.8) | |
305 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | |
306 | |
307 ax.set_xticks(numpy.array(ticks)) | |
308 ax.set_xticklabels(ticks1) | |
309 | |
310 ax.set_ylabel(to_plot[l], fontsize=14) | |
311 ax.set_xlabel("Family size", fontsize=14) | |
312 if log_axis: | |
313 ax.set_yscale('log') | |
314 ax.grid(b=True, which="major", color="#424242", linestyle=":") | |
315 ax.margins(0.01, None) | |
247 pdf.savefig(fig) | 316 pdf.savefig(fig) |
248 plt.close() | 317 plt.close() |
249 | 318 |
250 # write data to CSV file | 319 fig2.suptitle('Family Size Distribution (PE reads)', fontsize=14) |
251 output_file.write("Values from family size distribution with all datasets\n") | 320 for l in range(len(to_plot)): |
321 ax = fig2.add_subplot(2, 1, l + 1) | |
322 ticks = numpy.arange(minimumX, maximumX + 1) | |
323 ticks1 = map(str, ticks) | |
324 if maximumX > 20: | |
325 ticks1[len(ticks1) - 1] = ">20" | |
326 reads = [] | |
327 reads_rel = [] | |
328 | |
329 barWidth = 0 - (len(list_to_plot)+1)/2 * 1./(len(list_to_plot) + 1) | |
330 | |
331 for i in range(len(list_to_plot2)): | |
332 unique, c = numpy.unique(list_to_plot2[i], return_counts=True) | |
333 new_c = [] | |
334 new_unique = [] | |
335 | |
336 for t in ticks: | |
337 if t not in unique: | |
338 new_c.append(0) # add zero count of not occuring | |
339 new_unique.append(t) | |
340 else: | |
341 c_idx = numpy.where(t == unique)[0] | |
342 new_c.append(c[c_idx]) | |
343 new_unique.append(unique[c_idx]) | |
344 y = numpy.array(new_unique) * numpy.array(new_c) | |
345 if len([list_to_plot_original > 20]) > 0: | |
346 y[len(y) - 1] = sum(list_to_plot_original[i][list_to_plot_original[i] > 20]) | |
347 reads.append(y) | |
348 reads_rel.append(list(numpy.float_(y)) / sum(y)) | |
349 | |
350 x = list(numpy.arange(numpy.amin(unique), numpy.amax(unique) + 1).astype(float)) | |
351 x = [xi + barWidth for xi in x] | |
352 | |
353 if to_plot[l] == "Relative frequencies": | |
354 counts2_rel = ax.bar(x, list(numpy.float_(y)) / sum(y), align="edge", width=1./(len(list_to_plot) + 1), | |
355 edgecolor="black", label=label[i], alpha=1, linewidth=1, color=colors[i]) | |
356 else: | |
357 counts2 = ax.bar(x, y, align="edge", width=1./len(list_to_plot), edgecolor="black", label=label[i], | |
358 alpha=1, linewidth=1, color=colors[i]) | |
359 if i == len(list_to_plot2): | |
360 barWidth += 1. / (len(list_to_plot) + 1) + 1. / (len(list_to_plot) + 1) | |
361 else: | |
362 barWidth += 1. / (len(list_to_plot) + 1) | |
363 | |
364 if to_plot[l] == "Absolute frequencies": | |
365 ax.legend(loc='upper right', fontsize=14, frameon=True, bbox_to_anchor=(0.9, 1)) | |
366 else: | |
367 ax.set_xlabel("Family size", fontsize=14) | |
368 | |
369 ax.set_xticks(numpy.array(ticks)) | |
370 ax.set_xticklabels(ticks1) | |
371 ax.set_ylabel(to_plot[l], fontsize=14) | |
372 if log_axis: | |
373 ax.set_yscale('log') | |
374 ax.grid(b=True, which="major", color="#424242", linestyle=":") | |
375 ax.margins(0.01, None) | |
376 | |
377 pdf.savefig(fig2) | |
378 plt.close() | |
379 | |
380 # write data to CSV file tags | |
381 output_file.write("Values from family size distribution with all datasets (tags)\n") | |
252 output_file.write("\nFamily size") | 382 output_file.write("\nFamily size") |
253 for i in label: | 383 for i in label: |
254 output_file.write("{}{}".format(sep, i)) | 384 output_file.write("{}{}".format(sep, i)) |
255 # output_file.write("{}sum".format(sep)) | 385 # output_file.write("{}sum".format(sep)) |
256 output_file.write("\n") | 386 output_file.write("\n") |
273 output_file.write("{}{}".format(int(sum(counts[0])), sep)) | 403 output_file.write("{}{}".format(int(sum(counts[0])), sep)) |
274 else: | 404 else: |
275 for i in counts[0]: | 405 for i in counts[0]: |
276 output_file.write("{}{}".format(int(sum(i)), sep)) | 406 output_file.write("{}{}".format(int(sum(i)), sep)) |
277 | 407 |
408 # write data to CSV file PE reads | |
409 output_file.write("\n\nValues from family size distribution with all datasets (PE reads)\n") | |
410 output_file.write("\nFamily size") | |
411 for i in label: | |
412 output_file.write("{}{}".format(sep, i)) | |
413 # output_file.write("{}sum".format(sep)) | |
414 output_file.write("\n") | |
415 j = 0 | |
416 for fs in bins: | |
417 if fs == 21: | |
418 fs = ">20" | |
419 else: | |
420 fs = "={}".format(fs) | |
421 output_file.write("FS{}{}".format(fs, sep)) | |
422 if len(label) == 1: | |
423 output_file.write("{}{}".format(int(reads[0][j]), sep)) | |
424 else: | |
425 for n in range(len(label)): | |
426 output_file.write("{}{}".format(int(reads[n][j]), sep)) | |
427 output_file.write("\n") | |
428 j += 1 | |
429 output_file.write("sum{}".format(sep)) | |
430 if len(label) == 1: | |
431 output_file.write("{}{}".format(int(sum(reads)), sep)) | |
432 else: | |
433 for i in reads: | |
434 output_file.write("{}{}".format(int(sum(i)), sep)) | |
435 output_file.write("\n") | |
436 | |
278 # Family size distribution after DCS and SSCS | 437 # Family size distribution after DCS and SSCS |
279 for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): | 438 for dataset, data_o, name_file in zip(list_to_plot, data_array_list, label): |
280 maximumX = numpy.amax(dataset) | 439 maximumX = numpy.amax(dataset) |
281 minimumX = numpy.amin(dataset) | 440 minimumX = numpy.amin(dataset) |
282 | 441 |
296 duplTags_o = duplTags_double_o[0::2] # ab of DCS | 455 duplTags_o = duplTags_double_o[0::2] # ab of DCS |
297 | 456 |
298 duplTagsBA = duplTags_double[1::2] # ba of DCS | 457 duplTagsBA = duplTags_double[1::2] # ba of DCS |
299 duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS | 458 duplTagsBA_o = duplTags_double_o[1::2] # ba of DCS |
300 | 459 |
460 # duplTags_double_tag = tags[numpy.in1d(seq, d)] | |
461 # duplTags_double_seq = seq[numpy.in1d(seq, d)] | |
462 | |
301 # get family sizes for SSCS with no partner | 463 # get family sizes for SSCS with no partner |
302 ab = numpy.where(tags == "ab")[0] | 464 ab = numpy.where(tags == "ab")[0] |
303 abSeq = seq[ab] | 465 abSeq = seq[ab] |
304 ab_o = data_o[ab] | 466 ab_o = data_o[ab] |
305 ab = data[ab] | 467 ab = data[ab] |
320 # information for family size >= 3 | 482 # information for family size >= 3 |
321 dataAB_FS3 = dataAB[dataAB >= 3] | 483 dataAB_FS3 = dataAB[dataAB >= 3] |
322 dataAB_FS3_o = dataAB_o[dataAB_o >= 3] | 484 dataAB_FS3_o = dataAB_o[dataAB_o >= 3] |
323 dataBA_FS3 = dataBA[dataBA >= 3] | 485 dataBA_FS3 = dataBA[dataBA >= 3] |
324 dataBA_FS3_o = dataBA_o[dataBA_o >= 3] | 486 dataBA_FS3_o = dataBA_o[dataBA_o >= 3] |
325 ab_FS3 = ab[ab >= 3] | 487 # ab_FS3 = ab[ab >= 3] |
326 ba_FS3 = ba[ba >= 3] | 488 # ba_FS3 = ba[ba >= 3] |
327 ab_FS3_o = ab_o[ab_o >= 3] | 489 # ab_FS3_o = ab_o[ab_o >= 3] |
328 ba_FS3_o = ba_o[ba_o >= 3] | 490 # ba_FS3_o = ba_o[ba_o >= 3] |
329 | 491 |
330 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 | 492 duplTags_FS3 = duplTags[(duplTags >= 3) & (duplTagsBA >= 3)] # ab+ba with FS>=3 |
331 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 | 493 duplTags_FS3_BA = duplTagsBA[(duplTags >= 3) & (duplTagsBA >= 3)] # ba+ab with FS>=3 |
332 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3 | 494 duplTags_double_FS3 = len(duplTags_FS3) + len(duplTags_FS3_BA) # both ab and ba strands with FS>=3 |
333 | 495 |
335 duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3 | 497 duplTags_FS3_o = duplTags_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ab+ba with FS>=3 |
336 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 | 498 duplTags_FS3_BA_o = duplTagsBA_o[(duplTags_o >= 3) & (duplTagsBA_o >= 3)] # ba+ab with FS>=3 |
337 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 | 499 duplTags_double_FS3_o = sum(duplTags_FS3_o) + sum(duplTags_FS3_BA_o) # both ab and ba strands with FS>=3 |
338 | 500 |
339 fig = plt.figure() | 501 fig = plt.figure() |
340 plt.subplots_adjust(bottom=0.3) | 502 plt.subplots_adjust(left=0.12, right=0.97, bottom=0.3, top=0.94, hspace=0) |
341 counts = plt.hist(list1, bins=range(minimumX, maximumX + 1), stacked=True, label=["duplex", "ab", "ba"], | 503 counts = plt.hist(list1, bins=numpy.arange(minimumX, maximumX + 2), stacked=True, label=["duplex", "ab", "ba"], |
342 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], | 504 edgecolor="black", linewidth=1, align="left", color=["#FF0000", "#5FB404", "#FFBF00"], |
343 rwidth=0.8) | 505 rwidth=0.8) |
344 # tick labels of x axis | 506 # tick labels of x axis |
345 ticks = numpy.arange(minimumX - 1, maximumX, 1) | 507 ticks = numpy.arange(1, 22, 1) |
346 ticks1 = map(str, ticks) | 508 ticks1 = map(str, ticks) |
347 ticks1[len(ticks1) - 1] = ">20" | 509 if maximumX > 20: |
510 ticks1[len(ticks1) - 1] = ">20" | |
348 plt.xticks(numpy.array(ticks), ticks1) | 511 plt.xticks(numpy.array(ticks), ticks1) |
349 singl = counts[0][2][0] # singletons | 512 singl = counts[0][2][0] # singletons |
350 last = counts[0][2][len(counts[0][0]) - 1] # large families | 513 last = counts[0][2][len(counts[0][0]) - 1] # large families |
351 | 514 if log_axis: |
515 plt.yscale('log') | |
352 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 516 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
353 plt.title(name_file, fontsize=14) | 517 plt.title(name_file, fontsize=14) |
354 plt.xlabel("Family size", fontsize=14) | 518 plt.xlabel("Family size", fontsize=14) |
355 plt.ylabel("Absolute Frequency", fontsize=14) | 519 plt.ylabel("Absolute Frequency", fontsize=14) |
356 plt.margins(0.01, None) | 520 plt.margins(0.01, None) |
409 output_file.write("\nDataset:{}{}\n".format(sep, name_file)) | 573 output_file.write("\nDataset:{}{}\n".format(sep, name_file)) |
410 output_file.write("max. family size:{}{}\n".format(sep, max(integers))) | 574 output_file.write("max. family size:{}{}\n".format(sep, max(integers))) |
411 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) | 575 output_file.write("absolute frequency:{}{}\n".format(sep, count[len(count) - 1])) |
412 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) | 576 output_file.write("relative frequency:{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count))) |
413 | 577 |
414 output_file.write("{}singletons:{}{}{}family size > 20:\n".format(sep, sep, sep, sep)) | 578 output_file.write("{}singletons:{}{}{}family size > 20:{}{}{}{}length of dataset:\n".format(sep, sep, sep, sep, sep, sep, sep, sep)) |
415 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) | 579 output_file.write("{}nr. of tags{}rel. freq of tags{}rel.freq of PE reads{}nr. of tags{}rel. freq of tags{}nr. of PE reads{}rel. freq of PE reads{}total nr. of tags{}total nr. of PE reads\n".format(sep, sep, sep, sep, sep, sep, sep, sep, sep)) |
416 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( | 580 output_file.write("{}{}{}{}{:.3f}{}{:.3f}{}{}{}{:.3f}{}{}{}{:.3f}{}{}{}{}\n\n".format( |
417 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, | 581 name_file, sep, singl.astype(int), sep, singl / len(data), sep, float(singl)/sum(data_o), sep, |
418 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o))) | 582 last.astype(int), sep, last / len(data), sep, sum(data_o[data_o > 20]), sep, float(sum(data_o[data_o > 20])) / sum(data_o), sep, len(data), sep, sum(data_o))) |
419 | 583 |