Mercurial > repos > mheinzl > fsd_regions
comparison fsd_regions.py @ 5:52454637bc45 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
author | mheinzl |
---|---|
date | Wed, 17 Oct 2018 05:23:33 -0400 |
parents | b202c97deabe |
children | 26014c24323a |
comparison
equal
deleted
inserted
replaced
4:b202c97deabe | 5:52454637bc45 |
---|---|
11 # a tabular file with the data of the plot. | 11 # a tabular file with the data of the plot. |
12 | 12 |
13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf | 13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf |
14 | 14 |
15 import argparse | 15 import argparse |
16 import re | |
16 import sys | 17 import sys |
18 from collections import OrderedDict | |
17 | 19 |
18 import matplotlib.pyplot as plt | 20 import matplotlib.pyplot as plt |
19 import numpy | 21 import numpy |
20 from matplotlib.backends.backend_pdf import PdfPages | 22 from matplotlib.backends.backend_pdf import PdfPages |
21 | 23 |
53 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: | 55 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: |
54 data_array = readFileReferenceFree(firstFile, "\t") | 56 data_array = readFileReferenceFree(firstFile, "\t") |
55 | 57 |
56 mut_array = readFileReferenceFree(refGenome, " ") | 58 mut_array = readFileReferenceFree(refGenome, " ") |
57 length_regions = len(mut_array) | 59 length_regions = len(mut_array) |
60 group = numpy.array(mut_array[:, 0]) | |
61 seq_mut = numpy.array(mut_array[:, 1]) | |
62 alt_group = numpy.array(mut_array[:, 2]) | |
58 | 63 |
59 seq = numpy.array(data_array[:, 1]) | 64 seq = numpy.array(data_array[:, 1]) |
60 tags = numpy.array(data_array[:, 2]) | 65 tags = numpy.array(data_array[:, 2]) |
61 quant = numpy.array(data_array[:, 0]).astype(int) | 66 quant = numpy.array(data_array[:, 0]).astype(int) |
62 group = numpy.array(mut_array[:, 0]) | |
63 | 67 |
64 all_ab = seq[numpy.where(tags == "ab")[0]] | 68 all_ab = seq[numpy.where(tags == "ab")[0]] |
65 all_ba = seq[numpy.where(tags == "ba")[0]] | 69 all_ba = seq[numpy.where(tags == "ba")[0]] |
66 quant_ab = quant[numpy.where(tags == "ab")[0]] | 70 quant_ab = quant[numpy.where(tags == "ab")[0]] |
67 quant_ba = quant[numpy.where(tags == "ba")[0]] | 71 quant_ba = quant[numpy.where(tags == "ba")[0]] |
68 | 72 |
69 seqDic_ab = dict(zip(all_ab, quant_ab)) | 73 seqDic_ab = dict(zip(all_ab, quant_ab)) |
70 seqDic_ba = dict(zip(all_ba, quant_ba)) | 74 seqDic_ba = dict(zip(all_ba, quant_ba)) |
71 | 75 |
72 seq_mut = numpy.array(mut_array[:, 1]) | 76 if re.search('^(\d)+_(\d)+', str(mut_array[0,0])) is None: |
77 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) | |
78 group = mut_array[seqMut_index,0] | |
79 alt_group = mut_array[seqMut_index,2] | |
80 mut_array = mut_array[seqMut_index,:] | |
73 | 81 |
74 groupUnique, group_index = numpy.unique(group, return_index=True) | 82 groupUnique, group_index = numpy.unique(group, return_index=True) |
75 groupUnique = groupUnique[numpy.argsort(group_index)] | 83 groupUnique = groupUnique[numpy.argsort(group_index)] |
76 | |
77 lst_ab = [] | 84 lst_ab = [] |
85 lst_ba = [] | |
78 for i in seq_mut: | 86 for i in seq_mut: |
79 lst_ab.append(seqDic_ab.get(i)) | 87 lst_ab.append(seqDic_ab.get(i)) |
80 | |
81 lst_ba = [] | |
82 for i in seq_mut: | |
83 lst_ba.append(seqDic_ba.get(i)) | 88 lst_ba.append(seqDic_ba.get(i)) |
84 | 89 |
85 quant_ab = numpy.array(lst_ab) | 90 quant_ab = numpy.array(lst_ab) |
86 quant_ba = numpy.array(lst_ba) | 91 quant_ba = numpy.array(lst_ba) |
87 | 92 |
88 quantAfterRegion = [] | 93 quantAfterRegion = OrderedDict() |
94 for key in groupUnique: | |
95 quantAfterRegion[key] = [] | |
96 | |
89 for i in groupUnique: | 97 for i in groupUnique: |
90 dataAB = quant_ab[numpy.where(group == i)[0]] | 98 index_of_current_region = numpy.where(group == i)[0] |
91 dataBA = quant_ba[numpy.where(group == i)[0]] | 99 quant_ba_i = quant_ba[index_of_current_region] |
100 alt_group_i = alt_group[index_of_current_region] | |
101 index_alternative_refs = numpy.where(alt_group_i != "=")[0] | |
102 | |
103 dataAB = quant_ab[index_of_current_region] | |
92 bigFamilies = numpy.where(dataAB > 20)[0] | 104 bigFamilies = numpy.where(dataAB > 20)[0] |
93 dataAB[bigFamilies] = 22 | 105 dataAB[bigFamilies] = 22 |
94 bigFamilies = numpy.where(dataBA > 20)[0] | 106 for el in dataAB: |
95 dataBA[bigFamilies] = 22 | 107 quantAfterRegion[i].append(el) |
96 | 108 |
97 quantAll = numpy.concatenate((dataAB, dataBA)) | 109 if len(index_alternative_refs) == 0: |
98 quantAfterRegion.append(quantAll) | 110 dataBA = quant_ba_i |
99 | 111 bigFamilies = numpy.where(dataBA > 20)[0] |
112 dataBA[bigFamilies] = 22 | |
113 for el2 in dataBA: | |
114 quantAfterRegion[i].append(el2) | |
115 else: # get tags where 2nd mate is aligned to a different ref genome | |
116 unique_alt = numpy.unique(alt_group_i[index_alternative_refs]) | |
117 for alt in unique_alt: | |
118 ind_alt_tags = numpy.where(alt_group_i == alt)[0] | |
119 dataBA = quant_ba_i[ind_alt_tags] | |
120 | |
121 bigFamilies = numpy.where(dataBA > 20)[0] | |
122 if len(bigFamilies) != 0: | |
123 if len(bigFamilies) == 1 and type(dataBA) != list: | |
124 dataBA = 22 | |
125 quantAfterRegion[alt].append(dataBA) | |
126 else: | |
127 dataBA[bigFamilies] = 22 | |
128 for el3 in dataBA: | |
129 quantAfterRegion[alt].append(el3) | |
130 | |
131 index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs] | |
132 data_BA_other = quant_ba_i[index_inverse] | |
133 bigFamilies_other = numpy.where(data_BA_other > 20)[0] | |
134 | |
135 if len(bigFamilies_other) != 0: | |
136 if len(bigFamilies_other) == 1 and type(data_BA_other) != list: | |
137 data_BA_other = 22 | |
138 quantAfterRegion[i].append(data_BA_other) | |
139 else: | |
140 data_BA_other[bigFamilies_other] = 22 | |
141 for el3 in data_BA_other: | |
142 quantAfterRegion[i].append(el3) | |
143 | |
144 quantAfterRegion = quantAfterRegion.values() | |
100 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) | 145 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) |
101 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) | 146 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) |
102 | 147 |
103 # PLOT | 148 # PLOT |
104 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 149 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
136 | 181 |
137 legend = "BA\n{}\n{}\n{:.5f}" \ | 182 legend = "BA\n{}\n{}\n{:.5f}" \ |
138 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) | 183 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) |
139 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) | 184 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) |
140 | 185 |
141 legend1 = "total nr. of tags=" | 186 plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure) |
142 legend2 = "total numbers * \n{:,}".format(length_regions) | 187 plt.text(0.7, 0.22, "{:,}".format(length_regions), size=11, transform=plt.gcf().transFigure) |
143 plt.text(0.6, 0.2, legend1, size=11, transform=plt.gcf().transFigure) | 188 |
144 plt.text(0.75, 0.2, legend2, size=11, transform=plt.gcf().transFigure) | 189 legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' |
145 legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n" | |
146 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) | 190 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) |
147 | 191 |
148 space = numpy.arange(0, len(groupUnique), 0.02) | 192 space = numpy.arange(0, len(groupUnique), 0.02) |
193 plt.text(0.7, 0.18, "total number of *\nab", size=11, transform=plt.gcf().transFigure) | |
194 plt.text(0.78, 0.18, "ba tags", size=11, transform=plt.gcf().transFigure) | |
195 lengths_array_ab = [] | |
196 lengths_array_ba = [] | |
197 | |
198 index_array = 0 | |
149 for i, s, count in zip(groupUnique, space, quantAfterRegion): | 199 for i, s, count in zip(groupUnique, space, quantAfterRegion): |
150 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) | 200 index_of_current_region = numpy.where(group == i)[0] |
151 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) | 201 |
202 plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) | |
203 if re.search('^(\d)+_(\d)+', str(mut_array[0, 0])) is None: | |
204 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1])) | |
205 else: | |
206 nr_tags_ab = len(mut_array[index_of_current_region, 1]) | |
207 | |
208 plt.text(0.7, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure) | |
209 | |
210 alt_group_i = alt_group[index_of_current_region] | |
211 alternative = numpy.where(alt_group_i != "=")[0] | |
212 unique_alt = numpy.unique(alt_group_i[alternative]) | |
213 lengths_of_alt_aligned_tags = [] | |
214 if len(alternative) != 0: | |
215 for alt in unique_alt: | |
216 ind_alt_tags = numpy.where(alt_group_i == alt)[0] | |
217 name = "{:,} to {}".format(len(ind_alt_tags), alt) | |
218 lengths_of_alt_aligned_tags.append(name) | |
219 ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0] | |
220 name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i) | |
221 lengths_of_alt_aligned_tags.append(name_inverse) | |
222 plt.text(0.78, 0.14 - s, "{}\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure) | |
223 lengths_array_ab.append(nr_tags_ab) | |
224 lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags)) | |
225 else: | |
226 plt.text(0.78, 0.14 - s, "=\n", size=11,transform=plt.gcf().transFigure) | |
227 lengths_array_ab.append(nr_tags_ab) | |
228 lengths_array_ba.append(nr_tags_ab) | |
229 index_array += 1 | |
152 | 230 |
153 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 231 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
154 # plt.title(name1, fontsize=14) | |
155 plt.xlabel("Family size", fontsize=14) | 232 plt.xlabel("Family size", fontsize=14) |
156 plt.ylabel("Absolute Frequency", fontsize=14) | 233 plt.ylabel("Absolute Frequency", fontsize=14) |
157 plt.grid(b=True, which="major", color="#424242", linestyle=":") | 234 plt.grid(b=True, which="major", color="#424242", linestyle=":") |
158 plt.margins(0.01, None) | 235 plt.margins(0.01, None) |
159 | 236 |
160 # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight") | |
161 pdf.savefig(fig, bbox_inch="tight") | 237 pdf.savefig(fig, bbox_inch="tight") |
162 plt.close() | 238 plt.close() |
163 | 239 |
164 output_file.write("Dataset:{}{}\n".format(sep, name1)) | 240 output_file.write("Dataset:{}{}\n".format(sep, name1)) |
165 output_file.write("{}AB{}BA\n".format(sep, sep)) | 241 output_file.write("{}AB{}BA\n".format(sep, sep)) |
166 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) | 242 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) |
167 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) | 243 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) |
168 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) | 244 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) |
169 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) | 245 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) |
246 output_file.write("total nr. of tags{}{}\n".format(sep, length_regions)) | |
247 | |
170 output_file.write("\n\nValues from family size distribution\n") | 248 output_file.write("\n\nValues from family size distribution\n") |
171 output_file.write("{}".format(sep)) | 249 output_file.write("{}".format(sep)) |
172 for i in groupUnique: | 250 for i in groupUnique: |
173 output_file.write("{}{}".format(i, sep)) | 251 output_file.write("{}{}".format(i, sep)) |
174 output_file.write("\n") | 252 output_file.write("\n") |
177 if fs == 21: | 255 if fs == 21: |
178 fs = ">20" | 256 fs = ">20" |
179 else: | 257 else: |
180 fs = "={}".format(fs) | 258 fs = "={}".format(fs) |
181 output_file.write("FS{}{}".format(fs, sep)) | 259 output_file.write("FS{}{}".format(fs, sep)) |
182 for n in range(len(groupUnique)): | 260 |
183 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) | 261 if len(groupUnique) == 1: |
262 output_file.write("{}{}".format(int(counts[0][j]), sep)) | |
263 else: | |
264 for n in range(len(groupUnique)): | |
265 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) | |
266 | |
184 output_file.write("\n") | 267 output_file.write("\n") |
185 j += 1 | 268 j += 1 |
186 output_file.write("sum{}".format(sep)) | 269 output_file.write("sum{}".format(sep)) |
187 for i in counts[0]: | 270 if len(groupUnique) == 1: |
188 output_file.write("{}{}".format(int(sum(i)), sep)) | 271 output_file.write("{}{}".format(int(sum(counts[0])), sep)) |
272 else: | |
273 for i in counts[0]: | |
274 output_file.write("{}{}".format(int(sum(i)), sep)) | |
189 output_file.write("\n") | 275 output_file.write("\n") |
190 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") | 276 output_file.write("\n\nRegion{}total nr. of ab{}ba tags\n".format(sep, sep)) |
191 output_file.write("Region{}total nr. of tags per region\n".format(sep)) | 277 |
192 for i, count in zip(groupUnique, quantAfterRegion): | 278 for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique): |
193 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) | 279 output_file.write("{}{}{}{}{}\n".format(i, sep, ab, sep, ba)) |
194 output_file.write("sum of tags{}{}\n".format(sep, length_regions)) | |
195 | 280 |
196 print("Files successfully created!") | 281 print("Files successfully created!") |
197 # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) | |
198 | 282 |
199 | 283 |
200 if __name__ == '__main__': | 284 if __name__ == '__main__': |
201 sys.exit(compare_read_families_refGenome(sys.argv)) | 285 sys.exit(compare_read_families_refGenome(sys.argv)) |