Mercurial > repos > mheinzl > fsd_regions
comparison fsd_regions.py @ 4:b202c97deabe draft
planemo upload for repository https://github.com/monikaheinzl/duplexanalysis_galaxy/tree/master/tools/fsd_regions commit dfaab79252a858e8df16bbea3607ebf1b6962e5a
author | mheinzl |
---|---|
date | Mon, 08 Oct 2018 05:53:50 -0400 |
parents | 2631864873d7 |
children | 52454637bc45 |
comparison
equal
deleted
inserted
replaced
3:85d870b8ae92 | 4:b202c97deabe |
---|---|
6 # Contact: monika.heinzl@edumail.at | 6 # Contact: monika.heinzl@edumail.at |
7 # | 7 # |
8 # Takes at least one TABULAR file with tags before the alignment to the SSCS | 8 # Takes at least one TABULAR file with tags before the alignment to the SSCS |
9 # and a TXT with tags of reads that overlap the regions of the reference genome as input. | 9 # and a TXT with tags of reads that overlap the regions of the reference genome as input. |
10 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and | 10 # The program produces a plot which shows the distribution of family sizes of the tags from the input files and |
11 # a CSV file with the data of the plot. | 11 # a tabular file with the data of the plot. |
12 | 12 |
13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --sep "characterWhichSeparatesCSVFile" --output_csv outptufile_name_csv --output_pdf outptufile_name_pdf | 13 # USAGE: python FSD_regions_1.6_FINAL.py --inputFile filenameSSCS --inputName1 filenameSSCS --ref_genome filenameRefGenome --output_tabular outptufile_name_tabular --output_pdf outptufile_name_pdf |
14 | 14 |
15 import numpy | |
16 import matplotlib.pyplot as plt | |
17 import argparse | 15 import argparse |
18 import sys | 16 import sys |
19 import os | 17 |
18 import matplotlib.pyplot as plt | |
19 import numpy | |
20 from matplotlib.backends.backend_pdf import PdfPages | 20 from matplotlib.backends.backend_pdf import PdfPages |
21 | |
22 plt.switch_backend('agg') | |
23 | |
21 | 24 |
22 def readFileReferenceFree(file, delim): | 25 def readFileReferenceFree(file, delim): |
23 with open(file, 'r') as dest_f: | 26 with open(file, 'r') as dest_f: |
24 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') | 27 data_array = numpy.genfromtxt(dest_f, skip_header=0, delimiter=delim, comments='#', dtype='string') |
25 return(data_array) | 28 return(data_array) |
26 | 29 |
30 | |
27 def make_argparser(): | 31 def make_argparser(): |
28 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome') | 32 parser = argparse.ArgumentParser(description='Family Size Distribution of tags which were aligned to regions of the reference genome') |
29 parser.add_argument('--inputFile', | 33 parser.add_argument('--inputFile', help='Tabular File with three columns: ab or ba, tag and family size.') |
30 help='Tabular File with three columns: ab or ba, tag and family size.') | |
31 parser.add_argument('--inputName1') | 34 parser.add_argument('--inputName1') |
32 parser.add_argument('--ref_genome', | 35 parser.add_argument('--ref_genome', help='TXT File with tags of reads that overlap the region.') |
33 help='TXT File with tags of reads that overlap the region.') | 36 parser.add_argument('--output_pdf', default="data.pdf", type=str, help='Name of the pdf and tabular file.') |
34 parser.add_argument('--output_pdf', default="data.pdf", type=str, | 37 parser.add_argument('--output_tabular', default="data.tabular", type=str, help='Name of the pdf and tabular file.') |
35 help='Name of the pdf and csv file.') | |
36 parser.add_argument('--output_csv', default="data.csv", type=str, | |
37 help='Name of the pdf and csv file.') | |
38 parser.add_argument('--sep', default=",", | |
39 help='Separator in the csv file.') | |
40 return parser | 38 return parser |
39 | |
41 | 40 |
42 def compare_read_families_refGenome(argv): | 41 def compare_read_families_refGenome(argv): |
43 parser = make_argparser() | 42 parser = make_argparser() |
44 args = parser.parse_args(argv[1:]) | 43 args = parser.parse_args(argv[1:]) |
45 | 44 |
46 firstFile = args.inputFile | 45 firstFile = args.inputFile |
47 name1 = args.inputName1 | 46 name1 = args.inputName1 |
48 name1 = name1.split(".tabular")[0] | 47 name1 = name1.split(".tabular")[0] |
49 refGenome = args.ref_genome | 48 refGenome = args.ref_genome |
50 title_file = args.output_pdf | 49 title_file = args.output_pdf |
51 title_file2 = args.output_csv | 50 title_file2 = args.output_tabular |
52 sep = args.sep | 51 sep = "\t" |
53 | |
54 if type(sep) is not str or len(sep) > 1: | |
55 print("Error: --sep must be a single character.") | |
56 exit(3) | |
57 | 52 |
58 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: | 53 with open(title_file2, "w") as output_file, PdfPages(title_file) as pdf: |
59 data_array = readFileReferenceFree(firstFile, "\t") | 54 data_array = readFileReferenceFree(firstFile, "\t") |
60 | 55 |
61 mut_array = readFileReferenceFree(refGenome, " ") | 56 mut_array = readFileReferenceFree(refGenome, " ") |
103 quantAfterRegion.append(quantAll) | 98 quantAfterRegion.append(quantAll) |
104 | 99 |
105 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) | 100 maximumX = numpy.amax(numpy.concatenate(quantAfterRegion)) |
106 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) | 101 minimumX = numpy.amin(numpy.concatenate(quantAfterRegion)) |
107 | 102 |
108 ### PLOT ### | 103 # PLOT |
109 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format | 104 plt.rc('figure', figsize=(11.69, 8.27)) # A4 format |
110 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color | 105 plt.rcParams['axes.facecolor'] = "E0E0E0" # grey background color |
111 plt.rcParams['xtick.labelsize'] = 14 | 106 plt.rcParams['xtick.labelsize'] = 14 |
112 plt.rcParams['ytick.labelsize'] = 14 | 107 plt.rcParams['ytick.labelsize'] = 14 |
113 plt.rcParams['patch.edgecolor'] = "black" | 108 plt.rcParams['patch.edgecolor'] = "black" |
154 for i, s, count in zip(groupUnique, space, quantAfterRegion): | 149 for i, s, count in zip(groupUnique, space, quantAfterRegion): |
155 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) | 150 plt.text(0.6, 0.05 + s, "{}=\n".format(i), size=11, transform=plt.gcf().transFigure) |
156 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) | 151 plt.text(0.75, 0.05 + s, "{:,}\n".format(len(count) / 2), size=11, transform=plt.gcf().transFigure) |
157 | 152 |
158 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) | 153 plt.legend(loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1), frameon=True) |
159 #plt.title(name1, fontsize=14) | 154 # plt.title(name1, fontsize=14) |
160 plt.xlabel("Family size", fontsize=14) | 155 plt.xlabel("Family size", fontsize=14) |
161 plt.ylabel("Absolute Frequency", fontsize=14) | 156 plt.ylabel("Absolute Frequency", fontsize=14) |
162 plt.grid(b=True, which="major", color="#424242", linestyle=":") | 157 plt.grid(b=True, which="major", color="#424242", linestyle=":") |
163 plt.margins(0.01, None) | 158 plt.margins(0.01, None) |
164 | 159 |
173 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) | 168 output_file.write("relative frequency:{}{:.3f}{}{:.3f}\n\n".format(sep, float(count[len(count) - 1]) / sum(count), sep, float(count2[len(count2) - 1]) / sum(count2))) |
174 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) | 169 output_file.write("total nr. of reads{}{}\n".format(sep, sum(numpy.array(data_array[:, 0]).astype(int)))) |
175 output_file.write("\n\nValues from family size distribution\n") | 170 output_file.write("\n\nValues from family size distribution\n") |
176 output_file.write("{}".format(sep)) | 171 output_file.write("{}".format(sep)) |
177 for i in groupUnique: | 172 for i in groupUnique: |
178 output_file.write("{}{}".format(i,sep)) | 173 output_file.write("{}{}".format(i, sep)) |
179 output_file.write("\n") | 174 output_file.write("\n") |
180 j=0 | 175 j = 0 |
181 for fs in counts[1][0:len(counts[1])-1]: | 176 for fs in counts[1][0:len(counts[1]) - 1]: |
182 if fs == 21: | 177 if fs == 21: |
183 fs = ">20" | 178 fs = ">20" |
184 else: | 179 else: |
185 fs = "={}".format(fs) | 180 fs = "={}".format(fs) |
186 output_file.write("FS{}{}".format(fs,sep)) | 181 output_file.write("FS{}{}".format(fs, sep)) |
187 for n in range(len(groupUnique)): | 182 for n in range(len(groupUnique)): |
188 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) | 183 output_file.write("{}{}".format(int(counts[0][n][j]), sep)) |
189 output_file.write("\n") | 184 output_file.write("\n") |
190 j+=1 | 185 j += 1 |
191 output_file.write("sum{}".format(sep)) | 186 output_file.write("sum{}".format(sep)) |
192 for i in counts[0]: | 187 for i in counts[0]: |
193 output_file.write("{}{}".format(int(sum(i)), sep)) | 188 output_file.write("{}{}".format(int(sum(i)), sep)) |
194 output_file.write("\n") | 189 output_file.write("\n") |
195 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") | 190 output_file.write("\n\nIn the plot, both family sizes of the ab and ba strands were used.\nWhereas the total numbers indicate only the single count of the tags per region.\n") |
196 output_file.write("Region{}total nr. of tags per region\n".format(sep)) | 191 output_file.write("Region{}total nr. of tags per region\n".format(sep)) |
197 for i, count in zip(groupUnique, quantAfterRegion): | 192 for i, count in zip(groupUnique, quantAfterRegion): |
198 output_file.write("{}{}{}\n".format(i,sep,len(count) / 2)) | 193 output_file.write("{}{}{}\n".format(i, sep, len(count) / 2)) |
199 output_file.write("sum of tags{}{}\n".format(sep,length_regions)) | 194 output_file.write("sum of tags{}{}\n".format(sep, length_regions)) |
200 | 195 |
201 print("Files successfully created!") | 196 print("Files successfully created!") |
202 #print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) | 197 # print("Files saved under {}.pdf and {}.csv in {}!".format(title_file, title_file, os.getcwd())) |
198 | |
203 | 199 |
204 if __name__ == '__main__': | 200 if __name__ == '__main__': |
205 sys.exit(compare_read_families_refGenome(sys.argv)) | 201 sys.exit(compare_read_families_refGenome(sys.argv)) |