Mercurial > repos > mheinzl > fsd_regions
changeset 6:26014c24323a draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
author | mheinzl |
---|---|
date | Fri, 26 Oct 2018 07:54:03 -0400 |
parents | 52454637bc45 |
children | 3b8a0e462021 |
files | fsd_regions.py test-data/Test_data_regions.txt test-data/output_file.pdf test-data/output_file.tabular |
diffstat | 4 files changed, 50 insertions(+), 114 deletions(-) [+] |
line wrap: on
line diff
--- a/fsd_regions.py Wed Oct 17 05:23:33 2018 -0400 +++ b/fsd_regions.py Fri Oct 26 07:54:03 2018 -0400 @@ -56,10 +56,8 @@ data_array = readFileReferenceFree(firstFile, "\t") mut_array = readFileReferenceFree(refGenome, " ") - length_regions = len(mut_array) group = numpy.array(mut_array[:, 0]) seq_mut = numpy.array(mut_array[:, 1]) - alt_group = numpy.array(mut_array[:, 2]) seq = numpy.array(data_array[:, 1]) tags = numpy.array(data_array[:, 2]) @@ -73,14 +71,15 @@ seqDic_ab = dict(zip(all_ab, quant_ab)) seqDic_ba = dict(zip(all_ba, quant_ba)) - if re.search('^(\d)+_(\d)+', str(mut_array[0,0])) is None: + if re.search('_(\d)+_(\d)+$', str(mut_array[0,0])) is None: seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True) group = mut_array[seqMut_index,0] - alt_group = mut_array[seqMut_index,2] mut_array = mut_array[seqMut_index,:] + length_regions = len(seq_mut)*2 groupUnique, group_index = numpy.unique(group, return_index=True) groupUnique = groupUnique[numpy.argsort(group_index)] + lst_ab = [] lst_ba = [] for i in seq_mut: @@ -90,58 +89,19 @@ quant_ab = numpy.array(lst_ab) quant_ba = numpy.array(lst_ba) - quantAfterRegion = OrderedDict() - for key in groupUnique: - quantAfterRegion[key] = [] + quantAfterRegion = [] for i in groupUnique: - index_of_current_region = numpy.where(group == i)[0] - quant_ba_i = quant_ba[index_of_current_region] - alt_group_i = alt_group[index_of_current_region] - index_alternative_refs = numpy.where(alt_group_i != "=")[0] - - dataAB = quant_ab[index_of_current_region] + dataAB = quant_ab[numpy.where(group == i)[0]] + dataBA = quant_ba[numpy.where(group == i)[0]] bigFamilies = numpy.where(dataAB > 20)[0] dataAB[bigFamilies] = 22 - for el in dataAB: - quantAfterRegion[i].append(el) - - if len(index_alternative_refs) == 0: - dataBA = quant_ba_i - bigFamilies = numpy.where(dataBA > 20)[0] - dataBA[bigFamilies] = 22 - for el2 in dataBA: - quantAfterRegion[i].append(el2) - else: # get tags where 2nd mate is aligned to a different ref genome - unique_alt = numpy.unique(alt_group_i[index_alternative_refs]) - for alt in unique_alt: - ind_alt_tags = numpy.where(alt_group_i == alt)[0] - dataBA = quant_ba_i[ind_alt_tags] + bigFamilies = numpy.where(dataBA > 20)[0] + dataBA[bigFamilies] = 22 - bigFamilies = numpy.where(dataBA > 20)[0] - if len(bigFamilies) != 0: - if len(bigFamilies) == 1 and type(dataBA) != list: - dataBA = 22 - quantAfterRegion[alt].append(dataBA) - else: - dataBA[bigFamilies] = 22 - for el3 in dataBA: - quantAfterRegion[alt].append(el3) + quantAll = numpy.concatenate((dataAB, dataBA)) + quantAfterRegion.append(quantAll) - index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs] - data_BA_other = quant_ba_i[index_inverse] - bigFamilies_other = numpy.where(data_BA_other > 20)[0] - - if len(bigFamilies_other) != 0: - if len(bigFamilies_other) == 1 and type(data_BA_other) != list: - data_BA_other = 22 - quantAfterRegion[i].append(data_BA_other) - else: - data_BA_other[bigFamilies_other] = 22 - for el3 in data_BA_other: - quantAfterRegion[i].append(el3) - - quantAfterRegion = quantAfterRegion.values() maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) @@ -184,49 +144,24 @@ plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure) - plt.text(0.7, 0.22, "{:,}".format(length_regions), size=11, transform=plt.gcf().transFigure) + plt.text(0.75, 0.22, "{:,} ({:,})".format(length_regions, length_regions/2), size=11, transform=plt.gcf().transFigure) - legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' - plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) + # legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.' + # plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) - space = numpy.arange(0, len(groupUnique), 0.02) - plt.text(0.7, 0.18, "total number of *\nab", size=11, transform=plt.gcf().transFigure) - plt.text(0.78, 0.18, "ba tags", size=11, transform=plt.gcf().transFigure) - lengths_array_ab = [] - lengths_array_ba = [] - + plt.text(0.75, 0.18, "total nr. of tags per region", size=11, transform=plt.gcf().transFigure) + #space = numpy.arange(0, len(groupUnique), 0.02) + s = 0 index_array = 0 - for i, s, count in zip(groupUnique, space, quantAfterRegion): + for i, count in zip(groupUnique, quantAfterRegion): index_of_current_region = numpy.where(group == i)[0] - plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) - if re.search('^(\d)+_(\d)+', str(mut_array[0, 0])) is None: + if re.search('_(\d)+_(\d)+$', str(mut_array[0, 0])) is None: nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1])) else: nr_tags_ab = len(mut_array[index_of_current_region, 1]) - - plt.text(0.7, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure) - - alt_group_i = alt_group[index_of_current_region] - alternative = numpy.where(alt_group_i != "=")[0] - unique_alt = numpy.unique(alt_group_i[alternative]) - lengths_of_alt_aligned_tags = [] - if len(alternative) != 0: - for alt in unique_alt: - ind_alt_tags = numpy.where(alt_group_i == alt)[0] - name = "{:,} to {}".format(len(ind_alt_tags), alt) - lengths_of_alt_aligned_tags.append(name) - ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0] - name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i) - lengths_of_alt_aligned_tags.append(name_inverse) - plt.text(0.78, 0.14 - s, "{}\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure) - lengths_array_ab.append(nr_tags_ab) - lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags)) - else: - plt.text(0.78, 0.14 - s, "=\n", size=11,transform=plt.gcf().transFigure) - lengths_array_ab.append(nr_tags_ab) - lengths_array_ba.append(nr_tags_ab) - index_array += 1 + plt.text(0.75, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure) + s = s + 0.02 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) plt.xlabel("Family size", fontsize=14) @@ -243,7 +178,7 @@ output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) - output_file.write("total nr. of tags{}{}\n".format(sep, length_regions)) + output_file.write("total nr. of tags{}{} ({})\n".format(sep, length_regions, length_regions/2)) output_file.write("\n\nValues from family size distribution\n") output_file.write("{}".format(sep)) @@ -273,11 +208,11 @@ for i in counts[0]: output_file.write("{}{}".format(int(sum(i)), sep)) output_file.write("\n") - output_file.write("\n\nRegion{}total nr. of ab{}ba tags\n".format(sep, sep)) + output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the count of the tags per region.\n") + output_file.write("\n\nRegion{}total nr. of tags per region\n".format(sep, sep)) - for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique): - output_file.write("{}{}{}{}{}\n".format(i, sep, ab, sep, ba)) - + for i, count in zip(groupUnique, quantAfterRegion): + output_file.write("{}{}{}\n".format(i,sep,len(count) / 2)) print("Files successfully created!")
--- a/test-data/Test_data_regions.txt Wed Oct 17 05:23:33 2018 -0400 +++ b/test-data/Test_data_regions.txt Fri Oct 26 07:54:03 2018 -0400 @@ -1,17 +1,16 @@ -87_636 AAAAAACATCCCAATAAGAAATCA -87_636 AAAAAAGTCCTTCGACTCAAGCGG -87_636 AAAAAATAGTTAAGCCGACACACT -87_636 AAAAAATGTGCCGAACCTTGGCGA -87_636 AAAAACAACATAGCTTGAAAATTT -656_1143 ATTCGGATAATTCGACGCAACATT -656_1143 ATTCGTCGACAATACAAAGGGGCC -656_1143 ATTGCCAGTGTGGGCTGGTTAGTA -656_1143 ATTTCGCGACCATCCGCCACTTTG -656_1143 CAAACTTTAGCACAGTGTGTGTCC -1141_1564 ATAAACGGCCTTCGACATTGTGAC -1141_1564 ATAAAGTCACCTGTGAATACGTTG -1141_1564 ATAAATCGAAACCGTGCCCAACAA -1892_2398 ATTTAGATATTTTCTTCTTTTTCT -1892_2398 ATTTAGTTATCCGTCGGCGACGAA -1892_2398 ATTTAGTTTGAATTGCCCTGCGTC - +ACH_87_636 AAAAAACATCCCAATAAGAAATCA +ACH_87_636 AAAAAAGTCCTTCGACTCAAGCGG +ACH_87_636 AAAAAATAGTTAAGCCGACACACT +ACH_87_636 AAAAAATGTGCCGAACCTTGGCGA +ACH_87_636 AAAAACAACATAGCTTGAAAATTT +ACH_656_1143 ATTCGGATAATTCGACGCAACATT +ACH_656_1143 ATTCGTCGACAATACAAAGGGGCC +ACH_656_1143 ATTGCCAGTGTGGGCTGGTTAGTA +ACH_656_1143 ATTTCGCGACCATCCGCCACTTTG +ACH_656_1143 CAAACTTTAGCACAGTGTGTGTCC +ACH_1141_1564 ATAAACGGCCTTCGACATTGTGAC +ACH_1141_1564 ATAAAGTCACCTGTGAATACGTTG +ACH_1141_1564 ATAAATCGAAACCGTGCCCAACAA +ACH_1892_2398 ATTTAGATATTTTCTTCTTTTTCT +ACH_1892_2398 ATTTAGTTATCCGTCGGCGACGAA +ACH_1892_2398 ATTTAGTTTGAATTGCCCTGCGTC
--- a/test-data/output_file.tabular Wed Oct 17 05:23:33 2018 -0400 +++ b/test-data/output_file.tabular Fri Oct 26 07:54:03 2018 -0400 @@ -5,10 +5,11 @@ relative frequency: 0.209 0.062 total nr. of reads 1312 +total nr. of tags 32 (16) Values from family size distribution - 87_636 656_1143 1141_1564 1892_2398 + ACH_87_636 ACH_656_1143 ACH_1141_1564 ACH_1892_2398 FS=3 0 0 0 1 FS=4 2 0 0 0 FS=5 2 0 0 0 @@ -32,10 +33,11 @@ In the plot, both family sizes of the ab and ba strands were used. -Whereas the total numbers indicate only the single count of the tags per region. +Whereas the total numbers indicate only the count of the tags per region. + + Region total nr. of tags per region -87_636 5 -656_1143 5 -1141_1564 3 -1892_2398 3 -sum of tags 16 +ACH_87_636 5 +ACH_656_1143 5 +ACH_1141_1564 3 +ACH_1892_2398 3