Mercurial > repos > mheinzl > fsd_regions
comparison fsd_regions.py @ 14:6879295d3f11 draft default tip
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit b8a2f7b7615b2bcd3b602027af31f4e677da94f6-dirty
author | mheinzl |
---|---|
date | Tue, 08 Jan 2019 09:50:01 -0500 |
parents | 63432e6f6a61 |
children |
comparison
equal
deleted
inserted
replaced
13:63432e6f6a61 | 14:6879295d3f11 |
---|---|
131 seqDic_ba = dict(zip(all_ba, quant_ba)) | 131 seqDic_ba = dict(zip(all_ba, quant_ba)) |
132 | 132 |
133 lst_ab = [] | 133 lst_ab = [] |
134 lst_ba = [] | 134 lst_ba = [] |
135 quantAfterRegion = [] | 135 quantAfterRegion = [] |
136 length_regions = 0 | |
137 for i in group: | 136 for i in group: |
138 lst_ab_r = [] | 137 lst_ab_r = [] |
139 lst_ba_r = [] | 138 lst_ba_r = [] |
140 seq_mut = qname_dict[i] | 139 seq_mut = qname_dict[i] |
141 if rangesFile == str(None): | 140 if rangesFile == str(None): |
142 seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True) | 141 seq_mut, seqMut_index = np.unique(np.array(seq_mut), return_index=True) |
143 length_regions = length_regions + len(seq_mut) * 2 | |
144 for r in seq_mut: | 142 for r in seq_mut: |
145 count_ab = seqDic_ab.get(r) | 143 if re.search('\.', r): # BAM file with SSCS tags |
146 count_ba = seqDic_ba.get(r) | 144 splitted_tag = re.split('\.', r)[0] |
147 lst_ab_r.append(count_ab) | 145 direction = re.split('\.', r)[1] |
148 lst_ab.append(count_ab) | 146 |
149 lst_ba_r.append(count_ba) | 147 if direction == "ab": |
150 lst_ba.append(count_ba) | 148 count_ab = seqDic_ab.get(splitted_tag) |
149 lst_ab_r.append(count_ab) | |
150 lst_ab.append(count_ab) | |
151 | |
152 else: | |
153 count_ba = seqDic_ba.get(splitted_tag) | |
154 lst_ba_r.append(count_ba) | |
155 lst_ba.append(count_ba) | |
156 else: # BAM file with DCS tags | |
157 count_ab = seqDic_ab.get(r) | |
158 lst_ab_r.append(count_ab) | |
159 lst_ab.append(count_ab) | |
160 count_ba = seqDic_ba.get(r) | |
161 lst_ba_r.append(count_ba) | |
162 lst_ba.append(count_ba) | |
151 | 163 |
152 dataAB = np.array(lst_ab_r) | 164 dataAB = np.array(lst_ab_r) |
153 dataBA = np.array(lst_ba_r) | 165 dataBA = np.array(lst_ba_r) |
154 bigFamilies = np.where(dataAB > 20)[0] | 166 bigFamilies = np.where(dataAB > 20)[0] |
155 dataAB[bigFamilies] = 22 | 167 dataAB[bigFamilies] = 22 |
159 quantAll = np.concatenate((dataAB, dataBA)) | 171 quantAll = np.concatenate((dataAB, dataBA)) |
160 quantAfterRegion.append(quantAll) | 172 quantAfterRegion.append(quantAll) |
161 | 173 |
162 quant_ab = np.array(lst_ab) | 174 quant_ab = np.array(lst_ab) |
163 quant_ba = np.array(lst_ba) | 175 quant_ba = np.array(lst_ba) |
176 length_regions = len(np.concatenate(quantAfterRegion)) | |
164 | 177 |
165 maximumX = np.amax(np.concatenate(quantAfterRegion)) | 178 maximumX = np.amax(np.concatenate(quantAfterRegion)) |
166 minimumX = np.amin(np.concatenate(quantAfterRegion)) | 179 minimumX = np.amin(np.concatenate(quantAfterRegion)) |
167 | 180 |
168 # PLOT | 181 # PLOT |
187 ticks1 = map(str, ticks) | 200 ticks1 = map(str, ticks) |
188 ticks1[len(ticks1) - 1] = ">20" | 201 ticks1[len(ticks1) - 1] = ">20" |
189 plt.xticks(np.array(ticks), ticks1) | 202 plt.xticks(np.array(ticks), ticks1) |
190 count = np.bincount(map(int, quant_ab)) # original counts | 203 count = np.bincount(map(int, quant_ab)) # original counts |
191 | 204 |
192 legend = "max. family size:\nabsolute frequency:\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)" | 205 legend = "max. family size:\nabsolute frequency:" \ |
206 "\nrelative frequency:\n\ntotal nr. of reads:\n(before SSCS building)" | |
193 plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure) | 207 plt.text(0.15, 0.085, legend, size=11, transform=plt.gcf().transFigure) |
194 | 208 |
195 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int))) | 209 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}".format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), sum(np.array(data_array[:, 0]).astype(int))) |
196 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) | 210 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) |
197 | 211 |
200 legend = "BA\n{}\n{}\n{:.5f}" \ | 214 legend = "BA\n{}\n{}\n{:.5f}" \ |
201 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) | 215 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) |
202 plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure) | 216 plt.text(0.45, 0.1475, legend, size=11, transform=plt.gcf().transFigure) |
203 | 217 |
204 plt.text(0.53, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure) | 218 plt.text(0.53, 0.2125, "total nr. of tags:", size=11, transform=plt.gcf().transFigure) |
205 plt.text(0.85, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11, | 219 if re.search('\.', r): # BAM file with SSCS tags |
220 plt.text(0.85, 0.2125, "{:,}".format(length_regions), size=11, | |
221 transform=plt.gcf().transFigure) | |
222 else: | |
223 plt.text(0.85, 0.2125, "{:,} ({:,})".format(length_regions, length_regions / 2), size=11, | |
206 transform=plt.gcf().transFigure) | 224 transform=plt.gcf().transFigure) |
207 | 225 legend4 = "* In the plot, both family sizes of the ab and ba strands were used." \ |
208 legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n" | 226 "\nWhereas the total numbers indicate only the single count of the tags per region.\n" |
209 plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure) | 227 plt.text(0.1, 0.01, legend4, size=11, transform=plt.gcf().transFigure) |
210 | 228 |
211 space = 0 | 229 space = 0 |
212 for i, count in zip(group, quantAfterRegion): | 230 for i, count in zip(group, quantAfterRegion): |
213 plt.text(0.53, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure) | 231 plt.text(0.53, 0.15 - space, "{}:\n".format(i), size=11, transform=plt.gcf().transFigure) |
214 plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) | 232 if re.search('\.', r): # BAM file with SSCS tags |
233 plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count)), size=11, transform=plt.gcf().transFigure) | |
234 else: | |
235 plt.text(0.85, 0.15 - space, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) | |
215 space = space + 0.02 | 236 space = space + 0.02 |
216 | 237 |
217 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 238 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
218 plt.xlabel("Family size", fontsize=14) | 239 plt.xlabel("Family size", fontsize=14) |
219 plt.ylabel("Absolute Frequency", fontsize=14) | 240 plt.ylabel("Absolute Frequency", fontsize=14) |
227 output_file.write("{}AB{}BA\n".format(sep, sep)) | 248 output_file.write("{}AB{}BA\n".format(sep, sep)) |
228 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) | 249 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) |
229 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) | 250 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) |
230 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) | 251 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) |
231 output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int)))) | 252 output_file.write("total nr. of reads{}{}\n".format(sep, sum(np.array(data_array[:, 0]).astype(int)))) |
232 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2)) | 253 if re.search('\.', r): # BAM file with SSCS tags |
254 output_file.write("total nr. of tags{}{}\n".format(sep, length_regions)) | |
255 else: | |
256 output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions / 2)) | |
233 output_file.write("\n\nValues from family size distribution\n") | 257 output_file.write("\n\nValues from family size distribution\n") |
234 output_file.write("{}".format(sep)) | 258 output_file.write("{}".format(sep)) |
235 for i in group: | 259 for i in group: |
236 output_file.write("{}{}".format(i, sep)) | 260 output_file.write("{}{}".format(i, sep)) |
237 output_file.write("\n") | 261 output_file.write("\n") |
256 output_file.write("{}{}".format(int(sum(counts[0])), sep)) | 280 output_file.write("{}{}".format(int(sum(counts[0])), sep)) |
257 else: | 281 else: |
258 for i in counts[0]: | 282 for i in counts[0]: |
259 output_file.write("{}{}".format(int(sum(i)), sep)) | 283 output_file.write("{}{}".format(int(sum(i)), sep)) |
260 output_file.write("\n") | 284 output_file.write("\n") |
261 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") | 285 if re.search('\.', r): # BAM file with SSCS tags |
286 output_file.write("\n") | |
287 else: | |
288 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used." | |
289 "\nWhereas the total numbers indicate only the single count of the tags per region.\n") | |
262 output_file.write("Region{}total nr. of tags per region\n".format(sep)) | 290 output_file.write("Region{}total nr. of tags per region\n".format(sep)) |
263 for i, count in zip(group, quantAfterRegion): | 291 for i, count in zip(group, quantAfterRegion): |
264 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) | 292 if re.search('\.', r): # BAM file with SSCS tags |
293 output_file.write("{}{}{}\n".format(i, sep, len(count))) | |
294 else: | |
295 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) | |
265 | 296 |
266 print("Files successfully created!") | 297 print("Files successfully created!") |
267 | 298 |
268 | 299 |
269 if __name__ == '__main__': | 300 if __name__ == '__main__': |