comparison fsd_regions.py @ 5:52454637bc45 draft

planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit 8833d1a8a49d7b6d4a9c849b0335d3260564b351-dirty
author mheinzl
date Wed, 17 Oct 2018 05:23:33 -0400
parents b202c97deabe
children 26014c24323a
comparison
equal deleted inserted replaced
4:b202c97deabe 5:52454637bc45
11 # a tabular file with the data of the plot. 11 # a tabular file with the data of the plot.
12 12
13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf 13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf
14 14
15 import argparse 15 import argparse
16 import re
16 import sys 17 import sys
18 from collections import OrderedDict
17 19
18 import matplotlib.pyplot as plt 20 import matplotlib.pyplot as plt
19 import numpy 21 import numpy
20 from matplotlib.backends.backend_pdf import PdfPages 22 from matplotlib.backends.backend_pdf import PdfPages
21 23
53 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: 55 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf:
54 data_array = readFileReferenceFree(firstFile, "\t") 56 data_array = readFileReferenceFree(firstFile, "\t")
55 57
56 mut_array = readFileReferenceFree(refGenome, " ") 58 mut_array = readFileReferenceFree(refGenome, " ")
57 length_regions = len(mut_array) 59 length_regions = len(mut_array)
60 group = numpy.array(mut_array[:, 0])
61 seq_mut = numpy.array(mut_array[:, 1])
62 alt_group = numpy.array(mut_array[:, 2])
58 63
59 seq = numpy.array(data_array[:, 1]) 64 seq = numpy.array(data_array[:, 1])
60 tags = numpy.array(data_array[:, 2]) 65 tags = numpy.array(data_array[:, 2])
61 quant = numpy.array(data_array[:, 0]).astype(int) 66 quant = numpy.array(data_array[:, 0]).astype(int)
62 group = numpy.array(mut_array[:, 0])
63 67
64 all_ab = seq[numpy.where(tags == "ab")[0]] 68 all_ab = seq[numpy.where(tags == "ab")[0]]
65 all_ba = seq[numpy.where(tags == "ba")[0]] 69 all_ba = seq[numpy.where(tags == "ba")[0]]
66 quant_ab = quant[numpy.where(tags == "ab")[0]] 70 quant_ab = quant[numpy.where(tags == "ab")[0]]
67 quant_ba = quant[numpy.where(tags == "ba")[0]] 71 quant_ba = quant[numpy.where(tags == "ba")[0]]
68 72
69 seqDic_ab = dict(zip(all_ab, quant_ab)) 73 seqDic_ab = dict(zip(all_ab, quant_ab))
70 seqDic_ba = dict(zip(all_ba, quant_ba)) 74 seqDic_ba = dict(zip(all_ba, quant_ba))
71 75
72 seq_mut = numpy.array(mut_array[:, 1]) 76 if re.search('^(\d)+_(\d)+', str(mut_array[0,0])) is None:
77 seq_mut, seqMut_index = numpy.unique(numpy.array(mut_array[:, 1]), return_index=True)
78 group = mut_array[seqMut_index,0]
79 alt_group = mut_array[seqMut_index,2]
80 mut_array = mut_array[seqMut_index,:]
73 81
74 groupUnique, group_index = numpy.unique(group, return_index=True) 82 groupUnique, group_index = numpy.unique(group, return_index=True)
75 groupUnique = groupUnique[numpy.argsort(group_index)] 83 groupUnique = groupUnique[numpy.argsort(group_index)]
76
77 lst_ab = [] 84 lst_ab = []
85 lst_ba = []
78 for i in seq_mut: 86 for i in seq_mut:
79 lst_ab.append(seqDic_ab.get(i)) 87 lst_ab.append(seqDic_ab.get(i))
80
81 lst_ba = []
82 for i in seq_mut:
83 lst_ba.append(seqDic_ba.get(i)) 88 lst_ba.append(seqDic_ba.get(i))
84 89
85 quant_ab = numpy.array(lst_ab) 90 quant_ab = numpy.array(lst_ab)
86 quant_ba = numpy.array(lst_ba) 91 quant_ba = numpy.array(lst_ba)
87 92
88 quantAfterRegion = [] 93 quantAfterRegion = OrderedDict()
94 for key in groupUnique:
95 quantAfterRegion[key] = []
96
89 for i in groupUnique: 97 for i in groupUnique:
90 dataAB = quant_ab[numpy.where(group == i)[0]] 98 index_of_current_region = numpy.where(group == i)[0]
91 dataBA = quant_ba[numpy.where(group == i)[0]] 99 quant_ba_i = quant_ba[index_of_current_region]
100 alt_group_i = alt_group[index_of_current_region]
101 index_alternative_refs = numpy.where(alt_group_i != "=")[0]
102
103 dataAB = quant_ab[index_of_current_region]
92 bigFamilies = numpy.where(dataAB > 20)[0] 104 bigFamilies = numpy.where(dataAB > 20)[0]
93 dataAB[bigFamilies] = 22 105 dataAB[bigFamilies] = 22
94 bigFamilies = numpy.where(dataBA > 20)[0] 106 for el in dataAB:
95 dataBA[bigFamilies] = 22 107 quantAfterRegion[i].append(el)
96 108
97 quantAll = numpy.concatenate((dataAB, dataBA)) 109 if len(index_alternative_refs) == 0:
98 quantAfterRegion.append(quantAll) 110 dataBA = quant_ba_i
99 111 bigFamilies = numpy.where(dataBA > 20)[0]
112 dataBA[bigFamilies] = 22
113 for el2 in dataBA:
114 quantAfterRegion[i].append(el2)
115 else: # get tags where 2nd mate is aligned to a different ref genome
116 unique_alt = numpy.unique(alt_group_i[index_alternative_refs])
117 for alt in unique_alt:
118 ind_alt_tags = numpy.where(alt_group_i == alt)[0]
119 dataBA = quant_ba_i[ind_alt_tags]
120
121 bigFamilies = numpy.where(dataBA > 20)[0]
122 if len(bigFamilies) != 0:
123 if len(bigFamilies) == 1 and type(dataBA) != list:
124 dataBA = 22
125 quantAfterRegion[alt].append(dataBA)
126 else:
127 dataBA[bigFamilies] = 22
128 for el3 in dataBA:
129 quantAfterRegion[alt].append(el3)
130
131 index_inverse = [x for x in range(0, len(index_of_current_region)) if x not in index_alternative_refs]
132 data_BA_other = quant_ba_i[index_inverse]
133 bigFamilies_other = numpy.where(data_BA_other > 20)[0]
134
135 if len(bigFamilies_other) != 0:
136 if len(bigFamilies_other) == 1 and type(data_BA_other) != list:
137 data_BA_other = 22
138 quantAfterRegion[i].append(data_BA_other)
139 else:
140 data_BA_other[bigFamilies_other] = 22
141 for el3 in data_BA_other:
142 quantAfterRegion[i].append(el3)
143
144 quantAfterRegion = quantAfterRegion.values()
100 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) 145 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion))
101 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) 146 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion))
102 147
103 # PLOT 148 # PLOT
104 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format 149 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format
136 181
137 legend = "BA\n{}\n{}\n{:.5f}" \ 182 legend = "BA\n{}\n{}\n{:.5f}" \
138 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) 183 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2))
139 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) 184 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure)
140 185
141 legend1 = "total nr. of tags=" 186 plt.text(0.55, 0.22, "total nr. of tags=", size=11, transform=plt.gcf().transFigure)
142 legend2 = "total numbers * \n{:,}".format(length_regions) 187 plt.text(0.7, 0.22, "{:,}".format(length_regions), size=11, transform=plt.gcf().transFigure)
143 plt.text(0.6, 0.2, legend1, size=11, transform=plt.gcf().transFigure) 188
144 plt.text(0.75, 0.2, legend2, size=11, transform=plt.gcf().transFigure) 189 legend4 = '* The total numbers indicate the count of the ab and ba tags per region.\nAn equal sign ("=") is used in the column ba tags, if the counts and the region are identical to the ab tags.'
145 legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n"
146 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) 190 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure)
147 191
148 space = numpy.arange(0, len(groupUnique), 0.02) 192 space = numpy.arange(0, len(groupUnique), 0.02)
193 plt.text(0.7, 0.18, "total number of *\nab", size=11, transform=plt.gcf().transFigure)
194 plt.text(0.78, 0.18, "ba tags", size=11, transform=plt.gcf().transFigure)
195 lengths_array_ab = []
196 lengths_array_ba = []
197
198 index_array = 0
149 for i, s, count in zip(groupUnique, space, quantAfterRegion): 199 for i, s, count in zip(groupUnique, space, quantAfterRegion):
150 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) 200 index_of_current_region = numpy.where(group == i)[0]
151 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) 201
202 plt.text(0.55, 0.14 - s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure)
203 if re.search('^(\d)+_(\d)+', str(mut_array[0, 0])) is None:
204 nr_tags_ab = len(numpy.unique(mut_array[index_of_current_region, 1]))
205 else:
206 nr_tags_ab = len(mut_array[index_of_current_region, 1])
207
208 plt.text(0.7, 0.14 - s, "{:,}\n".format(nr_tags_ab), size=11, transform=plt.gcf().transFigure)
209
210 alt_group_i = alt_group[index_of_current_region]
211 alternative = numpy.where(alt_group_i != "=")[0]
212 unique_alt = numpy.unique(alt_group_i[alternative])
213 lengths_of_alt_aligned_tags = []
214 if len(alternative) != 0:
215 for alt in unique_alt:
216 ind_alt_tags = numpy.where(alt_group_i == alt)[0]
217 name = "{:,} to {}".format(len(ind_alt_tags), alt)
218 lengths_of_alt_aligned_tags.append(name)
219 ind_alt_tags_inverse = numpy.where(alt_group_i != alt)[0]
220 name_inverse = "{:,} to {}".format(len(ind_alt_tags_inverse), i)
221 lengths_of_alt_aligned_tags.append(name_inverse)
222 plt.text(0.78, 0.14 - s, "{}\n".format(", ".join(lengths_of_alt_aligned_tags)), size=10, transform=plt.gcf().transFigure)
223 lengths_array_ab.append(nr_tags_ab)
224 lengths_array_ba.append(",".join(lengths_of_alt_aligned_tags))
225 else:
226 plt.text(0.78, 0.14 - s, "=\n", size=11,transform=plt.gcf().transFigure)
227 lengths_array_ab.append(nr_tags_ab)
228 lengths_array_ba.append(nr_tags_ab)
229 index_array += 1
152 230
153 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) 231 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True)
154 # plt.title(name1, fontsize=14)
155 plt.xlabel("Family size", fontsize=14) 232 plt.xlabel("Family size", fontsize=14)
156 plt.ylabel("Absolute Frequency", fontsize=14) 233 plt.ylabel("Absolute Frequency", fontsize=14)
157 plt.grid(b=True, which="major", color="#424242", linestyle=":") 234 plt.grid(b=True, which="major", color="#424242", linestyle=":")
158 plt.margins(0.01, None) 235 plt.margins(0.01, None)
159 236
160 # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight")
161 pdf.savefig(fig, bbox_inch="tight") 237 pdf.savefig(fig, bbox_inch="tight")
162 plt.close() 238 plt.close()
163 239
164 output_file.write("Dataset:{}{}\n".format(sep, name1)) 240 output_file.write("Dataset:{}{}\n".format(sep, name1))
165 output_file.write("{}AB{}BA\n".format(sep, sep)) 241 output_file.write("{}AB{}BA\n".format(sep, sep))
166 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) 242 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba))))
167 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) 243 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1]))
168 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) 244 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2)))
169 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) 245 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int))))
246 output_file.write("total nr. of tags{}{}\n".format(sep, length_regions))
247
170 output_file.write("\n\nValues from family size distribution\n") 248 output_file.write("\n\nValues from family size distribution\n")
171 output_file.write("{}".format(sep)) 249 output_file.write("{}".format(sep))
172 for i in groupUnique: 250 for i in groupUnique:
173 output_file.write("{}{}".format(i, sep)) 251 output_file.write("{}{}".format(i, sep))
174 output_file.write("\n") 252 output_file.write("\n")
177 if fs == 21: 255 if fs == 21:
178 fs = ">20" 256 fs = ">20"
179 else: 257 else:
180 fs = "={}".format(fs) 258 fs = "={}".format(fs)
181 output_file.write("FS{}{}".format(fs, sep)) 259 output_file.write("FS{}{}".format(fs, sep))
182 for n in range(len(groupUnique)): 260
183 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) 261 if len(groupUnique) == 1:
262 output_file.write("{}{}".format(int(counts[0][j]), sep))
263 else:
264 for n in range(len(groupUnique)):
265 output_file.write("{}{}".format(int(counts[0][n][j]), sep))
266
184 output_file.write("\n") 267 output_file.write("\n")
185 j += 1 268 j += 1
186 output_file.write("sum{}".format(sep)) 269 output_file.write("sum{}".format(sep))
187 for i in counts[0]: 270 if len(groupUnique) == 1:
188 output_file.write("{}{}".format(int(sum(i)), sep)) 271 output_file.write("{}{}".format(int(sum(counts[0])), sep))
272 else:
273 for i in counts[0]:
274 output_file.write("{}{}".format(int(sum(i)), sep))
189 output_file.write("\n") 275 output_file.write("\n")
190 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") 276 output_file.write("\n\nRegion{}total nr. of ab{}ba tags\n".format(sep, sep))
191 output_file.write("Region{}total nr. of tags per region\n".format(sep)) 277
192 for i, count in zip(groupUnique, quantAfterRegion): 278 for ab, ba, i in zip(lengths_array_ab, lengths_array_ba, groupUnique):
193 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) 279 output_file.write("{}{}{}{}{}\n".format(i, sep, ab, sep, ba))
194 output_file.write("sum of tags{}{}\n".format(sep, length_regions))
195 280
196 print("Files successfully created!") 281 print("Files successfully created!")
197 # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd()))
198 282
199 283
200 if __name__ == '__main__': 284 if __name__ == '__main__':
201 sys.exit(compare_read_families_refGenome(sys.argv)) 285 sys.exit(compare_read_families_refGenome(sys.argv))