Mercurial > repos > mheinzl > fsd_regions
diff fsd_regions.py @ 5:52454637bc45 draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
author | mheinzl |
---|---|
date | Wed, 17 Oct 2018 05:23:33 -0400 |
parents | b202c97deabe |
children | 26014c24323a |
line wrap: on
line diff
--- a/fsd_regions.py Mon Oct 08 05:53:50 2018 -0400 +++ b/fsd_regions.py Wed Oct 17 05:23:33 2018 -0400 @@ -13,7 +13,9 @@ # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf import argparse +import re import sys +from collections import OrderedDict import matplotlib.pyplot as plt import numpy @@ -55,11 +57,13 @@ mut_array = readFileReferenceFree(refGenome, " ") length_regions = len(mut_array) + group = numpy.array(mut_array[:, 0]) + seq_mut = numpy.array(mut_array[:, 1]) + alt_group = numpy.array(mut_array[:, 2]) seq = numpy.array(data_array[:, 1]) tags = numpy.array(data_array[:, 2]) quant = numpy.array(data_array[:, 0]).astype(int) - group = numpy.array(mut_array[:, 0]) all_ab = seq[numpy.where(tags == "ab")[0]] all_ba = seq[numpy.where(tags == "ba")[0]] @@ -69,34 +73,75 @@ seqDic_ab = dict(zip(all_ab, quant_ab)) seqDic_ba = dict(zip(all_ba, quant_ba)) - seq_mut = numpy.array(mut_array[:, 1]) + if re.search('^(\d)+_(\d)+', str(mut_array[0,0])) is None: + seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) + group = mut_array[seqMut_index,0] + alt_group = mut_array[seqMut_index,2] + mut_array = mut_array[seqMut_index,:] groupUnique, group_index = numpy.unique(group, return_index=True) groupUnique = groupUnique[numpy.argsort(group_index)] - lst_ab = [] + lst_ba = [] for i in seq_mut: lst_ab.append(seqDic_ab.get(i)) - - lst_ba = [] - for i in seq_mut: lst_ba.append(seqDic_ba.get(i)) quant_ab = numpy.array(lst_ab) quant_ba = numpy.array(lst_ba) - quantAfterRegion = [] + quantAfterRegion = OrderedDict() + for key in groupUnique: + quantAfterRegion[key] = [] + for i in groupUnique: - dataAB = quant_ab[numpy.where(group == i)[0]] - dataBA = quant_ba[numpy.where(group == i)[0]] + index_of_current_region = numpy.where(group == i)[0] + quant_ba_i = quant_ba[index_of_current_region] + alt_group_i = alt_group[index_of_current_region] + index_alternative_refs = numpy.where(alt_group_i != "=")[0] + + dataAB = quant_ab[index_of_current_region] bigFamilies = numpy.where(dataAB > 20)[0] dataAB[bigFamilies] = 22 - bigFamilies = numpy.where(dataBA > 20)[0] - dataBA[bigFamilies] = 22 + for el in dataAB: + quantAfterRegion[i].append(el) + + if len(index_alternative_refs) == 0: + dataBA = quant_ba_i + bigFamilies = numpy.where(dataBA > 20)[0] + dataBA[bigFamilies] = 22 + for el2 in dataBA: + quantAfterRegion[i].append(el2) + else: # get tags where 2nd mate is aligned to a different ref genome + unique_alt = numpy.unique(alt_group_i[index_alternative_refs]) + for alt in unique_alt: + ind_alt_tags = numpy.where(alt_group_i == alt)[0] + dataBA = quant_ba_i[ind_alt_tags] - quantAll = numpy.concatenate((dataAB, dataBA)) - quantAfterRegion.append(quantAll) + bigFamilies = numpy.where(dataBA > 20)[0] + if len(bigFamilies) != 0: + if len(bigFamilies) == 1 and type(dataBA) != list: + dataBA = 22 + quantAfterRegion[alt].append(dataBA) + else: + dataBA[bigFamilies] = 22 + for el3 in dataBA: + quantAfterRegion[alt].append(el3) + index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs] + data_BA_other = quant_ba_i[index_inverse] + bigFamilies_other = numpy.where(data_BA_other > 20)[0] + + if len(bigFamilies_other) != 0: + if len(bigFamilies_other) == 1 and type(data_BA_other) != list: + data_BA_other = 22 + quantAfterRegion[i].append(data_BA_other) + else: + data_BA_other[bigFamilies_other] = 22 + for el3 in data_BA_other: + quantAfterRegion[i].append(el3) + + quantAfterRegion = quantAfterRegion.values() maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) @@ -138,26 +183,57 @@ .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) - legend1 = "total nr. of tags=" - legend2 = "total numbers * \n{:,}".format(length_regions) - plt.text(0.6, 0.2, legend1, size=11, transform=plt.gcf().transFigure) - plt.text(0.75, 0.2, legend2, size=11, transform=plt.gcf().transFigure) - legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n" + plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure) + plt.text(0.7, 0.22, "{:,}".format(length_regions), size=11, transform=plt.gcf().transFigure) + + legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) space = numpy.arange(0, len(groupUnique), 0.02) + plt.text(0.7, 0.18, "total number of *\nab", size=11, transform=plt.gcf().transFigure) + plt.text(0.78, 0.18, "ba tags", size=11, transform=plt.gcf().transFigure) + lengths_array_ab = [] + lengths_array_ba = [] + + index_array = 0 for i, s, count in zip(groupUnique, space, quantAfterRegion): - plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) - plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) + index_of_current_region = numpy.where(group == i)[0] + + plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) + if re.search('^(\d)+_(\d)+', str(mut_array[0, 0])) is None: + nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1])) + else: + nr_tags_ab = len(mut_array[index_of_current_region, 1]) + + plt.text(0.7, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure) + + alt_group_i = alt_group[index_of_current_region] + alternative = numpy.where(alt_group_i != "=")[0] + unique_alt = numpy.unique(alt_group_i[alternative]) + lengths_of_alt_aligned_tags = [] + if len(alternative) != 0: + for alt in unique_alt: + ind_alt_tags = numpy.where(alt_group_i == alt)[0] + name = "{:,} to {}".format(len(ind_alt_tags), alt) + lengths_of_alt_aligned_tags.append(name) + ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0] + name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i) + lengths_of_alt_aligned_tags.append(name_inverse) + plt.text(0.78, 0.14 - s, "{}\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure) + lengths_array_ab.append(nr_tags_ab) + lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags)) + else: + plt.text(0.78, 0.14 - s, "=\n", size=11,transform=plt.gcf().transFigure) + lengths_array_ab.append(nr_tags_ab) + lengths_array_ba.append(nr_tags_ab) + index_array += 1 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) - # plt.title(name1, fontsize=14) plt.xlabel("Family size", fontsize=14) plt.ylabel("Absolute Frequency", fontsize=14) plt.grid(b=True, which="major", color="#424242", linestyle=":") plt.margins(0.01, None) - # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight") pdf.savefig(fig, bbox_inch="tight") plt.close() @@ -167,6 +243,8 @@ output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) + output_file.write("total nr. of tags{}{}\n".format(sep, length_regions)) + output_file.write("\n\nValues from family size distribution\n") output_file.write("{}".format(sep)) for i in groupUnique: @@ -179,23 +257,29 @@ else: fs = "={}".format(fs) output_file.write("FS{}{}".format(fs, sep)) - for n in range(len(groupUnique)): - output_file.write("{}{}".format(int(counts[0][n][j]), sep)) + + if len(groupUnique) == 1: + output_file.write("{}{}".format(int(counts[0][j]), sep)) + else: + for n in range(len(groupUnique)): + output_file.write("{}{}".format(int(counts[0][n][j]), sep)) + output_file.write("\n") j += 1 output_file.write("sum{}".format(sep)) - for i in counts[0]: - output_file.write("{}{}".format(int(sum(i)), sep)) + if len(groupUnique) == 1: + output_file.write("{}{}".format(int(sum(counts[0])), sep)) + else: + for i in counts[0]: + output_file.write("{}{}".format(int(sum(i)), sep)) output_file.write("\n") - output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") - output_file.write("Region{}total nr. of tags per region\n".format(sep)) - for i, count in zip(groupUnique, quantAfterRegion): - output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) - output_file.write("sum of tags{}{}\n".format(sep, length_regions)) + output_file.write("\n\nRegion{}total nr. of ab{}ba tags\n".format(sep, sep)) + + for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique): + output_file.write("{}{}{}{}{}\n".format(i, sep, ab, sep, ba)) print("Files successfully created!") - # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) if __name__ == '__main__': - sys.exit(compare_read_families_refGenome(sys.argv)) + sys.exit(compare_read_families_refGenome(sys.argv))