Mercurial > repos > mheinzl > fsd_regions
comparison fsd_regions.py @ 0:b82fdb006304 draft
planemo upload for repository https://github.com/monikaheinzl/galaxyProject/tree/master/tools/fsd_regions commit 6055f8c5c052f528ff85fb5e0d43b4500830637a
author | mheinzl |
---|---|
date | Thu, 10 May 2018 07:28:39 -0400 |
parents | |
children | 9ce2b4089c1b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b82fdb006304 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Family size distribution of tags which were aligned to the reference genome | |
4 # | |
5 # Author: Monika Heinzl, Johannes-Kepler University Linz (Austria) | |
6 # Contact: monika.heinzl@edumail.at | |
7 # | |
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS | |
9 # and a TXT with tags of reads that overlap the regions of the reference genome as input. | |
10 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and | |
11 # a CSV file with the data of the plot. | |
12 | |
13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf | |
14 | |
15 import numpy | |
16 import matplotlib.pyplot as plt | |
17 import argparse | |
18 import sys | |
19 import os | |
20 from matplotlib.backends.backend_pdf import PdfPages | |
21 | |
22 def readFileReferenceFree(file, delim): | |
23 with open(file, 'r') as dest_f: | |
24 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') | |
25 return(data_array) | |
26 | |
27 def make_argparser(): | |
28 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome') | |
29 parser.add_argument('--inputFile', | |
30 help='Tabular File with three columns: ab or ba, tag and family size.') | |
31 parser.add_argument('--inputName1') | |
32 parser.add_argument('--ref_genome', | |
33 help='TXT File with tags of reads that overlap the region.') | |
34 parser.add_argument('--output_csv', default="data.csv", type=str, | |
35 help='Name of the pdf and csv file.') | |
36 parser.add_argument('--output_pdf', default="data.pdf", type=str, | |
37 help='Name of the pdf and csv file.') | |
38 parser.add_argument('--sep', default=",", | |
39 help='Separator in the csv file.') | |
40 return parser | |
41 | |
42 def compare_read_families_refGenome(argv): | |
43 parser = make_argparser() | |
44 args = parser.parse_args(argv[1:]) | |
45 | |
46 firstFile = args.inputFile | |
47 name1 = args.inputName1 | |
48 name1 = name1.split(".tabular")[0] | |
49 refGenome = args.ref_genome | |
50 title_file = args.output_pdf | |
51 title_file2 = args.output_csv | |
52 sep = args.sep | |
53 | |
54 if type(sep) is not str or len(sep) > 1: | |
55 print("Error: --sep must be a single character.") | |
56 exit(3) | |
57 | |
58 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: | |
59 data_array = readFileReferenceFree(firstFile, "\t") | |
60 | |
61 mut_array = readFileReferenceFree(refGenome, " ") | |
62 length_regions = len(mut_array) | |
63 | |
64 seq = numpy.array(data_array[:, 1]) | |
65 tags = numpy.array(data_array[:, 2]) | |
66 quant = numpy.array(data_array[:, 0]).astype(int) | |
67 group = numpy.array(mut_array[:, 0]) | |
68 | |
69 all_ab = seq[numpy.where(tags == "ab")[0]] | |
70 all_ba = seq[numpy.where(tags == "ba")[0]] | |
71 quant_ab = quant[numpy.where(tags == "ab")[0]] | |
72 quant_ba = quant[numpy.where(tags == "ba")[0]] | |
73 | |
74 seqDic_ab = dict(zip(all_ab, quant_ab)) | |
75 seqDic_ba = dict(zip(all_ba, quant_ba)) | |
76 | |
77 seq_mut = numpy.array(mut_array[:, 1]) | |
78 | |
79 groupUnique, group_index = numpy.unique(group, return_index=True) | |
80 groupUnique = groupUnique[numpy.argsort(group_index)] | |
81 | |
82 lst_ab = [] | |
83 for i in seq_mut: | |
84 lst_ab.append(seqDic_ab.get(i)) | |
85 | |
86 lst_ba = [] | |
87 for i in seq_mut: | |
88 lst_ba.append(seqDic_ba.get(i)) | |
89 | |
90 quant_ab = numpy.array(lst_ab) | |
91 quant_ba = numpy.array(lst_ba) | |
92 | |
93 quantAfterRegion = [] | |
94 for i in groupUnique: | |
95 dataAB = quant_ab[numpy.where(group == i)[0]] | |
96 dataBA = quant_ba[numpy.where(group == i)[0]] | |
97 bigFamilies = numpy.where(dataAB > 20)[0] | |
98 dataAB[bigFamilies] = 22 | |
99 bigFamilies = numpy.where(dataBA > 20)[0] | |
100 dataBA[bigFamilies] = 22 | |
101 | |
102 quantAll = numpy.concatenate((dataAB, dataBA)) | |
103 quantAfterRegion.append(quantAll) | |
104 | |
105 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) | |
106 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) | |
107 | |
108 ### PLOT ### | |
109 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | |
110 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color | |
111 plt.rcParams['xtick.labelsize'] = 12 | |
112 plt.rcParams['ytick.labelsize'] = 12 | |
113 plt.rcParams['patch.edgecolor'] = "black" | |
114 fig = plt.figure() | |
115 plt.subplots_adjust(bottom=0.3) | |
116 | |
117 colors = ["#6E6E6E", "#0431B4", "#5FB404", "#B40431", "#F4FA58", "#DF7401", "#81DAF5"] | |
118 | |
119 col = [] | |
120 for i in range(0, len(groupUnique)): | |
121 col.append(colors[i]) | |
122 | |
123 counts = plt.hist(quantAfterRegion, bins=range(minimumX, maximumX + 1), stacked=False, label=groupUnique, | |
124 align="left", alpha=1, color=col, edgecolor="black", linewidth=1) | |
125 ticks = numpy.arange(minimumX - 1, maximumX, 1) | |
126 | |
127 ticks1 = map(str, ticks) | |
128 ticks1[len(ticks1) - 1] = ">20" | |
129 plt.xticks(numpy.array(ticks), ticks1) | |
130 count = numpy.bincount(map(int, quant_ab)) # original counts | |
131 | |
132 legend = "max. family size =\nabsolute frequency=\nrelative frequency=\n\ntotal nr. of reads=" | |
133 plt.text(0.15, 0.105, legend, size=11, transform=plt.gcf().transFigure) | |
134 | |
135 legend = "AB\n{}\n{}\n{:.5f}\n\n{:,}" \ | |
136 .format(max(map(int, quant_ab)), count[len(count) - 1], float(count[len(count) - 1]) / sum(count), | |
137 sum(numpy.array(data_array[:, 0]).astype(int))) | |
138 plt.text(0.35, 0.105, legend, size=11, transform=plt.gcf().transFigure) | |
139 | |
140 count2 = numpy.bincount(map(int, quant_ba)) # original counts | |
141 | |
142 legend = "BA\n{}\n{}\n{:.5f}" \ | |
143 .format(max(map(int, quant_ba)), count2[len(count2) - 1], float(count2[len(count2) - 1]) / sum(count2)) | |
144 plt.text(0.45, 0.15, legend, size=11, transform=plt.gcf().transFigure) | |
145 | |
146 legend1 = "total nr. of tags=" | |
147 legend2 = "total numbers * \n{:,}".format(length_regions) | |
148 plt.text(0.6, 0.2, legend1, size=11, transform=plt.gcf().transFigure) | |
149 plt.text(0.75, 0.2, legend2, size=11, transform=plt.gcf().transFigure) | |
150 legend4 = "* In the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n" | |
151 plt.text(0.1, 0.02, legend4, size=11, transform=plt.gcf().transFigure) | |
152 | |
153 space = numpy.arange(0, len(groupUnique), 0.02) | |
154 for i, s, count in zip(groupUnique, space, quantAfterRegion): | |
155 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) | |
156 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) | |
157 | |
158 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | |
159 plt.title(name1, fontsize=14) | |
160 plt.xlabel("No. of Family Members", fontsize=12) | |
161 plt.ylabel("Absolute Frequency", fontsize=12) | |
162 plt.grid(b=True, which="major", color="#424242", linestyle=":") | |
163 plt.margins(0.01, None) | |
164 | |
165 # plt.savefig("{}_regions.pdf".format(title_file), bbox_inch="tight") | |
166 pdf.savefig(fig, bbox_inch="tight") | |
167 plt.close() | |
168 | |
169 output_file.write("Dataset:{}{}\n".format(sep, firstFile)) | |
170 output_file.write("{}AB{}BA\n".format(sep, sep)) | |
171 output_file.write("max. family size:{}{}{}{}\n".format(sep, max(map(int, quant_ab)), sep, max(map(int, quant_ba)))) | |
172 output_file.write("absolute frequency:{}{}{}{}\n".format(sep, count[len(count) - 1], sep, count2[len(count2) - 1])) | |
173 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) | |
174 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) | |
175 output_file.write("\n\nValues from family size distribution\n") | |
176 output_file.write("{}".format(sep)) | |
177 for i in groupUnique: | |
178 output_file.write("{}{}".format(i,sep)) | |
179 output_file.write("\n") | |
180 j=0 | |
181 for fs in counts[1][0:len(counts[1])-1]: | |
182 if fs == 21: | |
183 fs = ">20" | |
184 else: | |
185 fs = "={}".format(fs) | |
186 output_file.write("FS{}{}".format(fs,sep)) | |
187 for n in range(len(groupUnique)): | |
188 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) | |
189 output_file.write("\n") | |
190 j+=1 | |
191 output_file.write("sum{}".format(sep)) | |
192 for i in counts[0]: | |
193 output_file.write("{}{}".format(int(sum(i)), sep)) | |
194 output_file.write("\n") | |
195 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") | |
196 output_file.write("Region{}total nr. of tags per region\n".format(sep)) | |
197 for i, count in zip(groupUnique, quantAfterRegion): | |
198 output_file.write("{}{}{}\n".format(i,sep,len(count) / 2)) | |
199 output_file.write("sum of tags{}{}\n".format(sep,length_regions)) | |
200 | |
201 print("Files successfully created!") | |
202 #print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) | |
203 | |
204 if __name__ == '__main__': | |
205 sys.exit(compare_read_families_refGenome(sys.argv)) |