Mercurial > repos > biocomp-ibens > alfa
comparison alfa/ALFA.py @ 0:e360f840a92e draft default tip
Uploaded
author | biocomp-ibens |
---|---|
date | Wed, 16 May 2018 09:49:18 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e360f840a92e |
---|---|
1 #!/usr/bin/python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 __author__ = "noel & bahin" | |
5 """ ALFA provides a global overview of features distribution composing NGS dataset(s). """ | |
6 | |
7 import argparse | |
8 import os | |
9 import numpy | |
10 import copy | |
11 import sys | |
12 import subprocess | |
13 import matplotlib.pyplot as plt | |
14 import matplotlib.cm as cmx | |
15 import matplotlib.colors as colors | |
16 import matplotlib.patheffects as PathEffects | |
17 import re | |
18 from matplotlib.backends.backend_pdf import PdfPages | |
19 # To correctly embed the texts when saving plots in svg format | |
20 import matplotlib | |
21 # import progressbar | |
22 import collections | |
23 import matplotlib as mpl | |
24 import numpy as np | |
25 | |
26 matplotlib.rcParams["svg.fonttype"] = "none" | |
27 | |
28 | |
29 ########################################################################## | |
30 # FUNCTIONS # | |
31 ########################################################################## | |
32 | |
def init_dict(d, key, init):
    """ Register d[key] with the value init unless the key already exists. """
    d.setdefault(key, init)
36 | |
37 | |
def tryint(s):
    """ Function called by "alphanum_key" function to sort the chromosome names. """
    try:
        value = int(s)
    except ValueError:
        # Non-numeric chunks are kept as strings
        return s
    return value
44 | |
45 | |
def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
    "z23a" -> ["z", 23, "a"]
    """
    # The capturing group in the regex keeps the digit runs in the split result
    return list(map(tryint, re.split("([0-9]+)", s)))
51 | |
52 | |
def required_arg(arg, aliases):
    """ Display an error, print the usage and exit if a required argument is missing.

    arg: the parsed argument value (any falsy value counts as missing).
    aliases: the argument name(s) shown in the error message.
    Relies on the module-level "parser" (argparse.ArgumentParser).
    """
    if not arg:
        # sys.stderr.write replaces the Python 2-only "print >>" statement so
        # the function also runs under Python 3 (same bytes are emitted: the
        # print statement appended one extra newline).
        sys.stderr.write("\nError: %s argument is missing.\n\n" % aliases)
        parser.print_usage()
        sys.exit(1)
59 | |
60 | |
def existing_file(filename):
    """ Checks if filename already exists and exit if so. """
    if not os.path.isfile(filename):
        return
    sys.exit("Error: The file '" + filename + "' is about to be produced but already exists in the directory. \n### End of program")
65 | |
66 | |
def get_chromosome_lengths():
    """
    Parse the file containing the chromosome lengths.
    If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes.

    Returns a dict mapping chromosome name -> length (int).
    Relies on the module-level "options" namespace (options.chr_len, options.annotation).
    """
    lengths = {}
    gtf_chrom_names = set()
    # If the user provided a chromosome length file
    if options.chr_len:
        # Getting the chromosome lengths from the chromosome lengths file
        with open(options.chr_len, "r") as chr_len_fh:
            for line in chr_len_fh:
                try:
                    lengths[line.split("\t")[0]] = int(line.rstrip().split("\t")[1])
                except IndexError:
                    sys.exit("Error: The chromosome lengths file is not correctly formed. It is supposed to be tabulated file with two fields per line.")
        # Getting the chromosome names from the GTF file (first field of every non-comment line)
        with open(options.annotation, "r") as gtf_fh:
            for line in gtf_fh:
                if not line.startswith("#"):
                    gtf_chrom_names.add(line.split("\t")[0])
        # Checking if the chromosomes from the chromosome lengths file are present in the GTF file
        for chrom in lengths:
            if chrom not in gtf_chrom_names:
                print >> sys.stderr, "Warning: chromosome '" + chrom + "' of the chromosome lengths file does not match any chromosome name in the GTF file provided and was ignored."
        # Checking if the chromosomes from the GTF file are present in the lengths file
        for chrom in gtf_chrom_names:
            if chrom not in lengths:
                print >> sys.stderr, "Warning: at least one chromosome ('" + chrom + "') was found in the GTF file and does not match any chromosome provided in the lengths file."
                print >> sys.stderr, "\t=> All the chromosome lengths will be approximated using annotations in the GTF file."
                break
        else:
            # for/else: no "break" occurred, i.e. every GTF chromosome has a
            # user-provided length, so the parsed lengths are returned as is.
            return lengths
    # If no chromosome lengths file was provided or if at least one chromosome was missing in the file, the end of the last annotation of the chromosome in the GTF file is considered as the chromosome length
    with open(options.annotation, "r") as gtf_fh:
        for line in gtf_fh:
            if not line.startswith("#"):
                chrom = line.split("\t")[0]
                end = int(line.split("\t")[4])  # GTF "end" field (5th column)
                init_dict(lengths, chrom, 0)
                lengths[chrom] = max(lengths[chrom], end)
    print "The chromosome lengths have been approximated using the GTF file annotations (the stop position of the last annotation of each chromosome is considered as its length)."
    return lengths
110 | |
111 | |
def write_feature_on_index(feat, chrom, start, stop, sign, stranded_genome_index):
    """ Write one new line in the stranded index file and, if necessary, the unstranded index file.

    feat: dict mapping biotype -> list of categories for the interval.
    chrom/start/stop: interval coordinates (already formatted as strings).
    sign: strand sign written in the stranded index ("+" or "-").
    stranded_genome_index: open file handle of the stranded index.
    Also writes to the module-level "unstranded_genome_index" handle when it is set.
    """
    grouped_by_biotype_features = []
    # .items() instead of the Python 2-only .iteritems(); duplicated categories
    # are collapsed with set() (the intermediate copy loop was redundant).
    for biotype, categs in feat.items():
        grouped_by_biotype_features.append(":".join((str(biotype), ",".join(set(categs)))))
    stranded_genome_index.write(
        '\t'.join((chrom, start, stop, sign)) + '\t' + '\t'.join(grouped_by_biotype_features) + '\n')
    if unstranded_genome_index:  ## MB: Why? Not always unstranded and stranded??
        unstranded_genome_index.write(
            '\t'.join((chrom, start, stop, '.')) + '\t' + '\t'.join(grouped_by_biotype_features) + '\n')
128 | |
129 | |
def write_index_line(feat, chrom, start, stop, sign, fh):
    """ Write a new line in an index file.

    feat: dict mapping biotype -> iterable of categories.
    chrom/start/stop: interval coordinates, already formatted as strings.
    sign: strand ("+", "-" or "." for the unstranded index).
    fh: open file handle of the index to write to.
    """
    # Formatting the features info as "biotype:cat1,cat2" fields
    # (.items() instead of the Python 2-only .iteritems(); duplicates collapsed with set())
    feat_by_biotype = []
    for biot, cat in feat.items():
        feat_by_biotype.append(":".join((str(biot), ",".join(set(cat)))))
    # Writing the features info in the index file
    fh.write("\t".join((chrom, start, stop, sign)) + "\t" + "\t".join(feat_by_biotype) + "\n")
139 | |
140 | |
def write_index(feat_values, chrom, start, stop, stranded_genome_index, unstranded_genome_index):
    """ Writing the features info in the proper index files. """
    plus_feat, minus_feat = feat_values[0], feat_values[1]
    # Stranded index: a strand with no feature is recorded as "antisense"
    for sign, features in (("+", plus_feat), ("-", minus_feat)):
        if features:
            write_index_line(features, chrom, start, stop, sign, stranded_genome_index)
        else:
            stranded_genome_index.write("\t".join((chrom, start, stop, sign, "antisense\n")))
    # Unstranded index: union of both strands; category lists of biotypes
    # present on both strands are concatenated
    unstranded_feat = dict(plus_feat, **minus_feat)
    for name in set(plus_feat) & set(minus_feat):
        unstranded_feat[name] += plus_feat[name]
    write_index_line(unstranded_feat, chrom, start, stop, ".", unstranded_genome_index)
157 | |
158 | |
def count_genome_features(cpt, features, start, stop, coverage=1):
    """ Reads genome index and registers feature counts.

    cpt: counter dict mapping (category, biotype) -> weighted nucleotide count; updated in place.
    features: list of "biotype:cat1,cat2" strings from one index line (or "antisense").
    start/stop: interval boundaries (str or int; converted with int()).
    coverage: weight applied to the interval length (default 1).
    Relies on the module-level "biotype_prios", "prios" and "unknown_cat" globals.
    """
    # If no biotype priority: category with the highest priority for each found biotype has the same weight (1/n_biotypes)
    if not biotype_prios:
        nb_biot = len(features)
        # For each categ(s)/biotype pairs
        for feat in features:
            cur_prio = 0
            # Separate categorie(s) and biotype
            try:
                biot, cats = feat.split(":")
            # Error if the feature is "antisense": update the "antisense/antisense" counts
            except ValueError:
                try:
                    cpt[("antisense", "antisense")] += (int(stop) - int(start)) * coverage / float(nb_biot)
                except KeyError:
                    cpt[("antisense", "antisense")] = (int(stop) - int(start)) * coverage / float(nb_biot)
                return None
            # Browse the categories and get only the one(s) with highest priority
            for cat in cats.split(","):
                try:
                    prio = prios[cat]
                except KeyError:
                    # TODO: Find a way to add unknown categories
                    if cat not in unknown_cat:
                        print >> sys.stderr, "Warning: Unknown categorie '%s' found and ignored.\n" % cat,
                        unknown_cat.add(cat)
                    continue
                # Check if the category has a highest priority than the current one
                if prio > cur_prio:
                    cur_prio = prio
                    cur_cat = {cat}
                if prio == cur_prio:
                    # NOTE(review): if the very first known category has priority 0
                    # (== the initial cur_prio), "cur_cat" is used before assignment
                    # here -- presumably all priorities in "prios" are > 0; confirm.
                    cur_cat.add(cat)
            # Increase each counts by the coverage divided by the number of categories and biotypes
            # NOTE(review): "cur_cat" is unbound if every category of this pair was
            # unknown -- this would raise a NameError (or silently reuse the
            # previous pair's set); verify index files cannot produce that case.
            nb_cat = len(cur_cat)
            for cat in cur_cat:
                try:
                    cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
                except KeyError:
                    cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
    else:
        # TODO: Add an option to provide biotype priorities and handle it
        pass
203 | |
# NOTE: a commented-out early draft of count_genome_features ("add_info"),
# kept here as a module-level triple-quoted string, has been removed: it was
# dead code duplicating the logic of count_genome_features/write_index above
# (see the repository history to recover it if ever needed).
300 | |
def register_interval(features_dict, chrom, stranded_index_fh, unstranded_index_fh):
    """ Write the interval features info into the genome index files.

    features_dict: {"+"/"-": {boundary_position: {biotype: [categories]}}}.
    chrom: chromosome name of the intervals.
    stranded_index_fh / unstranded_index_fh: paths of the index files (opened here in append mode).
    Also appends chrom to the module-level "index_chrom_list" when it is new.
    """
    # Adding the chromosome to the list if not present
    if chrom not in index_chrom_list:
        index_chrom_list.append(chrom)
    # Writing the chromosome intervals in the index files
    with open(unstranded_index_fh, "a") as unstranded_index_fh, open(stranded_index_fh, "a") as stranded_index_fh:
        # Initializing the first interval start and features
        sorted_pos = sorted(features_dict["+"].keys())
        interval_start = sorted_pos[0]
        features_plus = features_dict["+"][interval_start]
        features_minus = features_dict["-"][interval_start]
        # Browsing the interval boundaries
        for interval_stop in sorted_pos[1:]:
            # Writing the current interval features to the indexes
            write_index([features_plus, features_minus], chrom, str(interval_start), str(interval_stop), stranded_index_fh, unstranded_index_fh)
            # Initializing the new interval start and features
            interval_start = interval_stop
            # Store current features
            prev_features_plus = features_plus
            prev_features_minus = features_minus
            # Update features
            features_plus = features_dict["+"][interval_start]
            features_minus = features_dict["-"][interval_start]
            # If an interval only carries "gene"+"transcript" annotations for a biotype
            # while the previous interval had an exon for it, flag it as an intron.
            # (.items() instead of the Python 2-only .iteritems(); the no-op
            # "else: continue" branches were dropped.)
            for prev_features, features in ((prev_features_plus, features_plus), (prev_features_minus, features_minus)):
                for biotype, categ in features.items():
                    if set(categ) == {"gene", "transcript"} and "exon" in prev_features[biotype]:
                        categ.append("intron")
338 | |
339 | |
340 | |
def generate_genome_index(annotation, unstranded_genome_index, stranded_genome_index, chrom_sizes):
    """ Create an index of the genome annotations and save it in a file.

    annotation: path of the input GTF file.
    unstranded_genome_index / stranded_genome_index: paths of the index files to create.
    chrom_sizes: dict of chromosome name -> length, written as "#" comment lines.
    The index is built by maintaining, per strand, a dict of interval boundary
    positions -> {biotype: [categories]} and flushing it (register_interval)
    whenever a new chromosome or a non-overlapping annotation is met.
    """
    # Initializations
    intervals_dict = {}
    max_value = -1  # Highest annotation stop seen so far on the current chromosome
    prev_chrom = ""
    i = 0  # Line counter
    # Write the chromosome lengths as comment lines before the genome index
    with open(unstranded_genome_index, "w") as unstranded_index_fh, open(stranded_genome_index, "w") as stranded_index_fh:
        for key, value in chrom_sizes.items():
            unstranded_index_fh.write("#%s\t%s\n" % (key, value))
            stranded_index_fh.write("#%s\t%s\n" % (key, value))
    # Progress bar to track the genome indexes creation
    nb_lines = sum(1 for _ in open(annotation))
    # pbar = progressbar.ProgressBar(widgets=["Indexing the genome ", progressbar.Percentage(), " ", progressbar.Bar(), progressbar.Timer()], maxval=nb_lines).start()
    # Browsing the GTF file and writing into genome index files
    with open(annotation, "r") as gtf_fh:
        for line in gtf_fh:
            i += 1
            # Update the progressbar every 1k lines
            # if i % 1000 == 1:
            #     pbar.update(i)
            # Processing lines except comment ones
            if not line.startswith("#"):
                # Getting the line info
                line_split = line.rstrip().split("\t")
                chrom = line_split[0]
                cat = line_split[2]  # GTF feature type (gene/transcript/exon/...)
                start = int(line_split[3]) - 1  # GTF is 1-based; converted to 0-based start
                stop = int(line_split[4])
                strand = line_split[6]
                antisense = reverse_strand[strand]
                # NOTE(review): assumes every annotation line has a "...biotype "xxx";"
                # attribute in field 9 -- an IndexError is raised otherwise; confirm
                # the expected GTF flavor (Ensembl-style "gene_biotype"/"transcript_biotype").
                biotype = line_split[8].split("biotype")[1].split(";")[0].strip('" ')
                # Registering stored features info in the genome index file(s) if the new line concerns a new chromosome or the new line concerns an annotation not overlapping previously recorded ones
                if start > max_value or chrom != prev_chrom:
                    # Write the previous features
                    if intervals_dict:
                        register_interval(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index)
                    prev_chrom = chrom
                    # (Re)Initializing the intervals info dict
                    intervals_dict = {strand: {start: {biotype: [cat]}, stop: {}}, antisense: {start: {}, stop: {}}}
                    max_value = stop

                # Update the dictionary which represents intervals for every distinct annotation
                else:
                    # Storing the intervals on the strand of the current feature
                    stranded_intervals = intervals_dict[strand]
                    added_info = False  # Variable to know if the features info were already added
                    # Browsing the existing boundaries
                    # NOTE(review): "stored_feat_strand"/"stored_feat_antisense" are only
                    # assigned when a boundary <= start is browsed first; because this
                    # branch is reached with start <= max_value and boundaries include
                    # the recorded region start, presumably one always exists -- confirm.
                    for boundary in sorted(stranded_intervals):
                        # While the GTF line start is after the browsed boundary: keep the browsed boundary features info in case the GTF line start is before the next boundary
                        if boundary < start:
                            stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]

                        # The GTF line start is already an existing boundary: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
                        elif boundary == start:
                            stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]
                            # Adding the GTF line features info to the interval
                            try:
                                stranded_intervals[boundary][biotype] = stranded_intervals[boundary][biotype] + [cat]
                            except KeyError:  # If the GTF line features info regard an unregistered biotype
                                stranded_intervals[boundary][biotype] = [cat]
                            added_info = True  # The features info were added

                        # The browsed boundary is after the GTF line start: add the GTF line features info
                        elif boundary > start:
                            # Create a new boundary for the GTF line start if necessary (if it is between 2 existing boundaries, it was not created before)
                            if not added_info:
                                stranded_intervals[start] = copy.deepcopy(stored_feat_strand)
                                #stranded_intervals[start][biotype] = [cat]
                                try:
                                    stranded_intervals[start][biotype].append(cat)
                                except KeyError:
                                    stranded_intervals[start][biotype] = [cat]
                                intervals_dict[antisense][start] = copy.deepcopy(stored_feat_antisense)
                                added_info = True  # The features info were added
                            # While the browsed boundary is before the GTF line stop: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
                            if boundary < stop:
                                stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]
                                try:
                                    stranded_intervals[boundary][biotype] = stranded_intervals[boundary][biotype] + [cat]
                                except KeyError:
                                    stranded_intervals[boundary][biotype] = [cat]
                            # The GTF line stop is already exists, nothing more to do, the GTF line features info are integrated
                            elif boundary == stop:
                                break
                            # The browsed boundary is after the GTF line stop: create a new boundary for the GTF line stop (with the stored features info)
                            else:  # boundary > stop
                                stranded_intervals[stop] = copy.deepcopy(stored_feat_strand)
                                intervals_dict[antisense][stop] = copy.deepcopy(stored_feat_antisense)
                                break  # The GTF line features info are integrated
                    # If the GTF line stop is after the last boundary, extend the dictionary
                    if stop > max_value:
                        max_value = stop
                        stranded_intervals[stop] = {}
                        intervals_dict[antisense][stop] = {}

    # Store the categories of the last chromosome
    register_interval(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index)
    # pbar.finish()
441 | |
442 | |
def generate_bedgraph_files(sample_labels, bam_files):
    """ Creates BedGraph files from BAM ones and return filenames and labels.

    sample_labels: labels used to name the output BedGraph files.
    bam_files: paths of the input BAM files (parallel to sample_labels).
    Depending on the module-level options.strandness, produces either one
    unstranded "<label>.bedgraph" or a "<label>.plus.bedgraph" /
    "<label>.minus.bedgraph" pair per sample (strands are swapped for
    reverse / fr-secondstrand protocols).
    Always returns None.
    """
    def _genomecov(bam_file, strand, out_name):
        # Run "bedtools genomecov" with an argument list and an explicit stdout
        # redirection instead of shell=True string interpolation, so labels or
        # paths containing shell metacharacters cannot be interpreted by a shell.
        cmd = ["bedtools", "genomecov", "-bg", "-split"]
        if strand:
            cmd += ["-strand", strand]
        cmd += ["-ibam", bam_file]
        with open(out_name, "w") as out_fh:
            subprocess.call(cmd, stdout=out_fh)

    for sample_label, bam_file in zip(sample_labels, bam_files):
        if options.strandness in ["forward", "fr-firststrand"]:
            _genomecov(bam_file, "+", sample_label + ".plus.bedgraph")
            _genomecov(bam_file, "-", sample_label + ".minus.bedgraph")
        elif options.strandness in ["reverse", "fr-secondstrand"]:
            # Reverse protocols: reads on "+" actually come from the "-" strand
            _genomecov(bam_file, "-", sample_label + ".plus.bedgraph")
            _genomecov(bam_file, "+", sample_label + ".minus.bedgraph")
        else:
            _genomecov(bam_file, None, sample_label + ".bedgraph")
    return None
480 | |
481 | |
def read_gtf(gtf_index_file, sign):
    """ Read genome index lines until one on strand "sign" is found.

    gtf_index_file: open file handle of the genome index, read sequentially.
    sign: strand to look for ("+", "-" or ".").
    Sets the module-level gtf_line / gtf_chrom / gtf_start / gtf_stop /
    gtf_features globals from the matching line and returns endGTF (True when
    the end of the index was reached before any matching line).
    NOTE(review): "endGTF" is assumed to be set (False) by the caller before
    the call -- confirm; otherwise the final "return endGTF" would raise a
    NameError on a fresh interpreter state.
    """
    global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
    strand = ""
    while strand != sign:
        gtf_line = gtf_index_file.readline()
        # If the GTF file is finished
        if not gtf_line:
            endGTF = True
            return endGTF
        splitline = gtf_line.rstrip().split("\t")
        try:
            strand = splitline[3]
        # strand information can not be found in the file header (comment lines)
        except IndexError:
            pass
    # Index line fields: chrom, start, stop, strand, feature fields...
    gtf_chrom = splitline[0]
    gtf_features = splitline[4:]
    gtf_start, gtf_stop = map(int, splitline[1:3])
    return endGTF
501 | |
502 | |
def read_counts(sample_labels, counts_files):
    """ Reads the counts from an input file.

    sample_labels: one label per sample (keys of the returned counter).
    counts_files: paths of the counts files (parallel to sample_labels).
    Returns (cpt, cpt_genome) where cpt maps label -> {(category, biotype): count}
    and cpt_genome maps (category, biotype) -> genome-wide count.
    """
    cpt = {}
    cpt_genome = {}
    for sample_label, filename in zip(sample_labels, counts_files):
        cpt[sample_label] = {}
        with open(filename, "r") as counts_fh:
            for line in counts_fh:
                if line.startswith("#"):
                    continue
                # Each line: "category,biotype<TAB>sample_count<TAB>genome_count"
                fields = line.rstrip().split("\t")  # split once instead of once per field
                feature = tuple(fields[0].split(","))
                cpt[sample_label][feature] = float(fields[1])
                cpt_genome[feature] = float(fields[2])
    return cpt, cpt_genome
524 | |
525 | |
def get_chromosome_names_in_index(genome_index):
    """ Returns the chromosome names in a genome index file. """
    with open(genome_index, "r") as index_fh:
        for line in index_fh:
            # Comment lines ("#...") hold chromosome lengths, not intervals
            if line.startswith("#"):
                continue
            chrom = line.split("\t")[0]
            if chrom not in index_chrom_list:
                index_chrom_list.append(chrom)
    return index_chrom_list
533 | |
534 | |
def read_index():
    """ Parse index files info (chromosomes list, lengths and genome features). """
    with open(genome_index, "r") as index_fh:
        for line in index_fh:
            fields = line.split("\t")
            if line.startswith("#"):
                # Comment lines hold "#<chrom><TAB><length>"
                lengths[fields[0][1:]] = int(fields[1])
                continue
            if fields[0] not in index_chrom_list:
                index_chrom_list.append(fields[0])
            count_genome_features(cpt_genome, line.rstrip().split("\t")[4:], fields[1], fields[2])
546 | |
547 | |
548 #def intersect_bedgraphs_and_index_to_counts_categories(sample_files, sample_labels, prios, genome_index, biotype_prios=None): ## MB: To review | |
549 def intersect_bedgraphs_and_index_to_counts_categories(sample_labels, bedgraph_files, biotype_prios=None): ## MB: To review | |
550 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF | |
551 unknown_chrom = [] | |
552 cpt = {} # Counter for the nucleotides in the BAM input file(s) | |
553 #for n in range(len(sample_files)): | |
554 if bedgraph_files == []: | |
555 bedgraph_files = sample_labels | |
556 for sample_label, bedgraph_basename in zip(sample_labels, bedgraph_files): | |
557 #sample_file = sample_files[n] | |
558 #sample_name = sample_labels[n] | |
559 # Initializing the category counter dict for this sample and the number of lines to process for the progress bar | |
560 #init_dict(cpt, sample_name, {}) | |
561 init_dict(cpt, sample_label, {}) | |
562 bg_extension = ".bedgraph" | |
563 if options.strandness == "unstranded": | |
564 strands = [("", ".")] | |
565 #nb_lines = sum(1 for _ in open(sample_file + ".bedgraph")) | |
566 try : | |
567 nb_lines = sum(1 for _ in open(bedgraph_basename + bg_extension)) | |
568 except IOError: | |
569 bg_extension = "bg" | |
570 nb_lines = sum(1 for _ in open(bedgraph_basename + bg_extension)) | |
571 else: | |
572 strands = [(".plus", "+"), (".minus", "-")] | |
573 #nb_lines = sum(1 for _ in open(sample_file + ".plus.bedgraph")) + sum(1 for _ in open(sample_file + ".minus.bedgraph")) | |
574 try: | |
575 nb_lines = sum(1 for _ in open(bedgraph_basename + ".plus" + bg_extension)) + sum(1 for _ in open(bedgraph_basename + ".minus" + bg_extension)) | |
576 except IOError: | |
577 nb_lines = sum(1 for _ in open(bedgraph_basename + ".plus" + bg_extension)) + sum( | |
578 1 for _ in open(bedgraph_basename + ".minus" + bg_extension)) | |
579 | |
580 # Progress bar to track the BedGraph and index intersection | |
581 #pbar = progressbar.ProgressBar(widgets=["Processing " + sample_file + " ", progressbar.Percentage(), progressbar.Bar(), progressbar.Timer()], max_value=nb_lines).start() | |
582 # pbar = progressbar.ProgressBar(widgets=["Processing " + sample_label + " ", progressbar.Percentage(), progressbar.Bar(), progressbar.Timer()], maxval=nb_lines).start() | |
583 i = 0 | |
584 | |
585 # Intersecting the BedGraph and index files | |
586 for strand, sign in strands: | |
587 prev_chrom = "" | |
588 endGTF = False # Reaching the next chr or the end of the GTF index | |
589 intergenic_adds = 0.0 | |
590 #with open(sample_file + strand + ".bedgraph", "r") as bedgraph_fh: | |
591 with open(bedgraph_basename + strand + bg_extension, "r") as bedgraph_fh: | |
592 # Running through the BedGraph file | |
593 for bam_line in bedgraph_fh: | |
594 i += 1 | |
595 # if i % 10000 == 0: | |
596 # pbar.update(i) | |
597 # Getting the BAM line info | |
598 bam_chrom = bam_line.split("\t")[0] | |
599 bam_start, bam_stop, bam_cpt = map(float, bam_line.split("\t")[1:4]) | |
600 # Skip the line if the chromosome is not in the index | |
601 if bam_chrom not in index_chrom_list: | |
602 if bam_chrom not in unknown_chrom: | |
603 unknown_chrom.append(bam_chrom) | |
604 print "\r \r Chromosome '" + bam_chrom + "' not found in index." # MB: to adapt with the progress bar | |
605 continue | |
606 # If this is a new chromosome (or the first one) | |
607 if bam_chrom != prev_chrom: | |
608 intergenic_adds = 0.0 | |
609 # (Re)opening the GTF index and looking for the first line of the matching chr | |
610 try: | |
611 gtf_index_file.close() | |
612 except UnboundLocalError: | |
613 pass | |
614 gtf_index_file = open(genome_index, "r") | |
615 endGTF = False | |
616 read_gtf(gtf_index_file, sign) | |
617 while bam_chrom != gtf_chrom: | |
618 read_gtf(gtf_index_file, sign) | |
619 if endGTF: | |
620 break | |
621 prev_chrom = bam_chrom | |
622 | |
623 # Looking for the first matching annotation in the GTF index | |
624 while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop): | |
625 read_gtf(gtf_index_file, sign) | |
626 if gtf_chrom != bam_chrom: | |
627 endGTF = True | |
628 # Processing BAM lines before the first GTF annotation if there are | |
629 if bam_start < gtf_start: | |
630 # Increase the "intergenic" category counter with all or part of the BAM interval | |
631 try: | |
632 intergenic_adds += min(bam_stop, gtf_start) - bam_start | |
633 #cpt[sample_name][("intergenic", "intergenic")] += (min(bam_stop, | |
634 cpt[sample_label][("intergenic", "intergenic")] += (min(bam_stop, | |
635 gtf_start) - bam_start) * bam_cpt | |
636 except KeyError: | |
637 #cpt[sample_name][("intergenic", "intergenic")] = (min(bam_stop, | |
638 cpt[sample_label][("intergenic", "intergenic")] = (min(bam_stop, | |
639 gtf_start) - bam_start) * bam_cpt | |
640 # Go to next line if the BAM interval was totally upstream the first GTF annotation, carry on with the remaining part otherwise | |
641 if endGTF or (bam_stop <= gtf_start): | |
642 continue | |
643 else: | |
644 bam_start = gtf_start | |
645 | |
646 # We can start the crossover | |
647 while not endGTF: | |
648 # Update category counter | |
649 #add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt) | |
650 #count_genome_features(cpt[sample_name], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt) | |
651 count_genome_features(cpt[sample_label], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt) | |
652 # Read the next GTF file line if the BAM line is not entirely covered | |
653 if bam_stop > gtf_stop: | |
654 # Update the BAM start pointer | |
655 bam_start = gtf_stop | |
656 endGTF = read_gtf(gtf_index_file, sign) | |
657 # If we read a new chromosome in the GTF file then it is considered finished | |
658 if bam_chrom != gtf_chrom: | |
659 endGTF = True | |
660 if endGTF: | |
661 break | |
662 else: | |
663 # Next if the BAM line is entirely covered | |
664 bam_start = bam_stop | |
665 break | |
666 | |
667 # Processing the end of the BAM line if necessary | |
668 if endGTF and (bam_stop > bam_start): | |
669 try: | |
670 #cpt[sample_name][("intergenic", "intergenic")] += (bam_stop - bam_start) * bam_cpt | |
671 cpt[sample_label][("intergenic", "intergenic")] += (bam_stop - bam_start) * bam_cpt | |
672 except KeyError: | |
673 #cpt[sample_name][("intergenic", "intergenic")] = (bam_stop - bam_start) * bam_cpt | |
674 cpt[sample_label][("intergenic", "intergenic")] = (bam_stop - bam_start) * bam_cpt | |
675 gtf_index_file.close() | |
676 # pbar.finish() | |
677 return cpt | |
678 | |
679 | |
def write_counts_in_files(cpt, genome_counts):
    """ Write one "<label>.feature_counts.tsv" file per sample.

    Each data row holds a (category, biotype) pair, the read counts observed
    in the sample and the corresponding feature size in the genome.
    """
    header = "#Category,biotype\tCounts_in_bam\tSize_in_genome\n"
    for raw_label, sample_counters in cpt.items():
        # Sanitize the label so it can safely be used as a filename.
        safe_label = "_".join(re.findall(r"[\w\-']+", raw_label))
        with open(safe_label + ".feature_counts.tsv", "w") as out_fh:
            out_fh.write(header)
            for features_pair, nb_reads in sample_counters.items():
                out_fh.write("%s\t%s\t%s\n" % (",".join(features_pair), nb_reads, genome_counts[features_pair]))
688 | |
689 | |
def recategorize_the_counts(cpt, cpt_genome, final):
    """ Collapse fine-grained category counts into the final category groups.

    ``final`` maps each final category to the list of original categories it
    groups. For every sample of ``cpt``, the sample counts and the genome
    sizes (``cpt_genome``) of the grouped categories are summed.

    Returns a (per-sample grouped counts, grouped genome sizes) tuple.
    """
    grouped_counts = {}
    grouped_genome = {}
    for sample in cpt:
        grouped_counts[sample] = {}
        for final_cat, members in final.items():
            grouped_counts[sample][final_cat] = sum(cpt[sample][cat] for cat in members)
            grouped_genome[final_cat] = sum(cpt_genome[cat] for cat in members)
    return grouped_counts, grouped_genome
707 | |
708 | |
def group_counts_by_categ(cpt, cpt_genome, final, selected_biotype):
    """ Group the (category, biotype) counts by final category.

    For every sample, sums the read counts of all (category, biotype) pairs
    whose category belongs to each final category group of ``final``, along
    with the matching genome sizes and, when ``selected_biotype`` is set, a
    biotype-restricted count.

    Returns (grouped counts, grouped genome sizes, biotype-filtered counts).
    """
    grouped_counts = {}
    grouped_genome = {}
    filtered_counts = {}
    for sample in cpt:
        grouped_counts[sample] = {}
        filtered_counts[sample] = {}
        for final_cat in final:
            reads = 0
            reads_filtered = 0
            genome_size = 0
            for cat in final[final_cat]:
                for feature_pair, value in cpt[sample].items():
                    if feature_pair[0] == cat:
                        reads += value
                        genome_size += cpt_genome[feature_pair]
                        if feature_pair[1] == selected_biotype:
                            reads_filtered += value
            grouped_counts[sample][final_cat] = reads
            # A pseudo-size avoids a division by zero in later enrichment computations.
            grouped_genome[final_cat] = genome_size if genome_size != 0 else 1e-100
            filtered_counts[sample][final_cat] = reads_filtered
    return grouped_counts, grouped_genome, filtered_counts
737 | |
738 | |
def group_counts_by_biotype(cpt, cpt_genome, biotypes):
    """ Group the (category, biotype) counts by biotype (or biotype group).

    ``biotypes`` is either a mapping of group name -> iterable of biotypes,
    or a plain collection of biotype names (set/list); in the latter case
    each name forms its own group.

    Returns a (per-sample grouped counts, grouped genome sizes) tuple;
    biotypes with no read in a sample are omitted from that sample's counts.
    """
    final_cpt = {}
    final_genome_cpt = {}
    for sample in cpt:
        final_cpt[sample] = {}
        for biot in biotypes:
            # Resolve the biotype group *before* accumulating. The former bare
            # 'except' wrapped the whole accumulation: any unrelated error
            # (e.g. a missing cpt_genome key) silently triggered the fallback
            # and re-accumulated on top of a partial total. Only the group
            # lookup itself may legitimately fail here:
            # - KeyError: 'biotypes' is a dict without this key;
            # - TypeError: 'biotypes' is a set/list indexed by name.
            try:
                group = biotypes[biot]
            except (KeyError, TypeError):
                # Plain collection of names: the biotype is its own group.
                group = (biot,)
            tot = 0
            tot_genome = 0
            for final_biot in group:
                for key, value in cpt[sample].items():
                    if key[1] == final_biot:
                        tot += value
                        tot_genome += cpt_genome[key]
            # Only record biotypes actually observed in the sample.
            if tot != 0:
                final_cpt[sample][biot] = tot
                final_genome_cpt[biot] = tot_genome
    return final_cpt, final_genome_cpt
763 | |
764 | |
765 # def get_cmap(N): | |
766 # '''Returns a function that maps each index in 0, 1, ... N-1 to a distinct | |
767 # RGB color.''' | |
768 # color_norm = colors.Normalize(vmin=0, vmax=N-1) | |
769 # scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv') | |
770 # def map_index_to_rgb_color(index): | |
771 # return scalar_map.to_rgba(index) | |
772 # return map_index_to_rgb_color | |
773 | |
def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type,
                    title, sample_labels):
    """ Draw the single-sample figure: two barplots (category proportions and
    genome-normalized enrichment) plus the two matching piecharts.

    Returns (fig, ax1, ax2) so the caller can keep tweaking the bar axes.
    Only sample_labels[0] is used (single-sample layout).
    """
    ### Initialization: 2x4 grid, bars on the left half, pies on the right
    fig = plt.figure(figsize=(13, 9))
    ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=2)
    cmap = plt.get_cmap("Spectral")
    # One color per category, sampled evenly across the 256-entry colormap.
    # NOTE(review): '256 / n_cat' relies on Python 2 integer division.
    cols = [cmap(x) for x in xrange(0, 256, 256 / n_cat)]
    if title:
        # NOTE(review): looks like a space is missing between the custom
        # title and "in:" — confirm intended rendering before changing.
        ax1.set_title(title + "in: %s" % sample_labels[0])
    else:
        ax1.set_title(counts_type + " distribution in mapped reads in: %s" % sample_labels[0])
    ax2.set_title("Normalized counts of " + counts_type)

    ### Barplots
    # First barplot: percentage of reads in each categorie
    ax1.bar(index, percentages, bar_width,
            color=cols)
    # Second barplot: enrichment relative to the genome for each categ
    # (the reads count in a categ is divided by the categ size in the genome)
    ax2.bar(index_enrichment, enrichment, bar_width,
            color=cols, )
    ### Piecharts
    # Hide wedge labels for categories representing <= 2.5% of the total.
    pielabels = [ordered_categs[i] if percentages[i] > 0.025 else "" for i in xrange(n_cat)]
    sum_enrichment = numpy.sum(enrichment)
    pielabels_enrichment = [ordered_categs[i] if enrichment[i] / sum_enrichment > 0.025 else "" for i in xrange(n_cat)]
    # Categories piechart
    ax3 = plt.subplot2grid((2, 4), (0, 2))
    pie_wedge_collection, texts = ax3.pie(percentages, labels=pielabels, shadow=True, colors=cols)
    # Enrichment piechart
    ax4 = plt.subplot2grid((2, 4), (1, 2))
    pie_wedge_collection, texts = ax4.pie(enrichment, labels=pielabels_enrichment, shadow=True, colors=cols)
    # Add legends (append percentages to labels)
    labels = [" ".join((ordered_categs[i], "({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
    ax3.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
               bbox_to_anchor=(1.7, 0.5))
    labels = [" ".join((ordered_categs[i], "({:.1%})".format(enrichment[i] / sum_enrichment))) for i in
              range(len(ordered_categs))]  # if ordered_categs[i] != "antisense"]
    ax4.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
               bbox_to_anchor=(1.7, 0.5))
    # Set aspect ratio to be equal so that pie is drawn as a circle
    ax3.set_aspect("equal")
    ax4.set_aspect("equal")
    return fig, ax1, ax2
818 | |
819 | |
820 #def make_plot(ordered_categs, sample_names, categ_counts, genome_counts, pdf, counts_type, threshold, title=None, | |
#def make_plot(ordered_categs, sample_names, categ_counts, genome_counts, pdf, counts_type, threshold, title=None,
def make_plot(sample_labels, ordered_categs, categ_counts, genome_counts, pdf, counts_type, threshold, title=None, svg=None, png=None,
              categ_groups=[]):  # MB: to review
    """ Draw the multi-sample figure for one counts type (categories or
    biotypes): a proportions barplot (ax1) and a genome-normalized
    enrichment barplot (ax2), then save/show it.

    Output selection: saved into 'pdf' if given, else as SVG ('svg'),
    else as PNG ('png'), else displayed interactively.

    NOTE(review): 'categ_groups=[]' is a mutable default argument; it is
    only read here (passed to annotate_group), but should be confirmed safe.
    """
    # From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample.
    existing_categs = set()
    for sample in categ_counts.values():
        existing_categs |= set(sample.keys())
    # Python 2 filter() returns a list here, so len()/index() below work.
    ordered_categs = filter(existing_categs.__contains__, ordered_categs)
    # Multi-word category names are split on '_' across lines ('undescribed' abbreviated to 'und.').
    xlabels = [cat if len(cat.split("_")) == 1 else "\n".join(cat.split("_")) if cat.split("_")[0] != 'undescribed' else "\n".join(["und.",cat.split("_")[1]]) for cat in ordered_categs]
    n_cat = len(ordered_categs)
    #n_exp = len(sample_names)
    nb_samples = len(categ_counts)
    ##Initialization of the matrix of counts (nrow=nb_experiements, ncol=nb_categorie)
    #counts = numpy.matrix(numpy.zeros(shape=(n_exp, n_cat)))
    counts = numpy.matrix(numpy.zeros(shape=(nb_samples, n_cat)))
    '''
    for exp in xrange(len(sample_names)):
        for cat in xrange(len(ordered_categs)):
            try:
                counts[exp, cat] = categ_counts[sample_names[exp]][ordered_categs[cat]]
            except:
                pass
    '''
    # Fill the counts matrix; categories absent from a sample stay at 0.
    for sample_label in sample_labels:
        for cat in xrange(len(ordered_categs)):
            try:
                counts[sample_labels.index(sample_label), cat] = categ_counts[sample_label][ordered_categs[cat]]
            except:
                pass

    ##Normalize the categorie sizes by the total size to get percentages
    sizes = []
    sizes_sum = 0
    for cat in ordered_categs:
        sizes.append(genome_counts[cat])
        sizes_sum += genome_counts[cat]
    # 'antisense' is an artificial category with no genuine genome size:
    # give it a pseudo-size so the division below stays defined.
    if "antisense" in ordered_categs:
        antisense_pos = ordered_categs.index("antisense")
        sizes[antisense_pos] = 1e-100
    for cpt in xrange(len(sizes)):
        sizes[cpt] /= float(sizes_sum)

    ## Create array which contains the percentage of reads in each categ for every sample
    percentages = numpy.array(counts / numpy.sum(counts, axis=1))
    ## Create the enrichment array (counts divided by the categorie sizes in the genome)
    enrichment = numpy.array(percentages / sizes)
    if "antisense_pos" in locals():
        '''
        for i in xrange(len(sample_names)):
            enrichment[i][antisense_pos] = 0
        '''
        for n in xrange(nb_samples):
            enrichment[n][antisense_pos] = 0

    # enrichment=numpy.log(numpy.array(percentages/sizes))
    # Map the enrichment ratio onto a symmetric scale around 0:
    # ratio r>1 -> r-1 (x-fold enriched); r<1 -> -(1/r)+1 (x-fold depleted).
    #for exp in xrange(n_exp):
    for n in xrange(nb_samples):
        for i in xrange(n_cat):
            val = enrichment[n][i]
            if val > 1:
                enrichment[n][i] = val - 1
            elif val == 1 or val == 0:
                enrichment[n][i] = 0
            else:
                enrichment[n][i] = -1 / val + 1

    #### Finally, produce the plot

    ##Get the colors from the colormap
    ncolor = 16
    # Fixed 16-color palette; fall back to a generated colormap beyond that.
    cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5",
            "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
    '''
    if n_exp > ncolor:
        cmap = plt.get_cmap("Set3", n_exp)
        cmap = [cmap(i) for i in xrange(n_exp)]
    '''
    if nb_samples > ncolor:
        cmap = plt.get_cmap("Set3", nb_samples)
        cmap = [cmap(i) for i in xrange(nb_samples)]

    ## Parameters for the plot
    opacity = 1
    # Create a vector which contains the position of each bar
    index = numpy.arange(n_cat)
    # Size of the bars (depends on the categs number)
    #bar_width = 0.9 / n_exp
    bar_width = 0.9 / nb_samples

    ##Initialise the subplot
    # if there is only one sample, also plot piecharts
    # if n_exp == 1 and counts_type.lower() == 'categories':
    # fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
    ## If more than one sample
    # else:
    # NOTE(review): both branches are currently identical (figure width
    # scales with categories and samples) — kept as in the original.
    if counts_type.lower() != "categories":
        #fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
        fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * nb_samples) / 3, 10))
    else:
        #fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
        fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * nb_samples) / 3, 10))
    # Store the bars objects for enrichment plot
    rects = []
    # For each sample/experiment
    #for i in range(n_exp):
    for sample_label in sample_labels:
        # First barplot: percentage of reads in each categorie
        n = sample_labels.index(sample_label)
        #ax1.bar(index + i * bar_width, percentages[i], bar_width,
        ax1.bar(index + n * bar_width, percentages[n], bar_width,
                alpha=opacity,
                #color=cmap[i],
                color=cmap[n],
                #label=sample_names[i], edgecolor="#FFFFFF", lw=0)
                label=sample_label, edgecolor="#FFFFFF", lw=0)
        # Second barplot: enrichment relative to the genome for each categ
        # (the reads count in a categ is divided by the categ size in the genome)
        rects.append(ax2.bar(index + n * bar_width, enrichment[n], bar_width,
                             alpha=opacity,
                             #color=cmap[i],
                             color=cmap[n],
                             #label=sample_names[i], edgecolor=cmap[i], lw=0))
                             label=sample_label, edgecolor=cmap[n], lw=0))

    ## Graphical options for the plot
    # Adding of the legend (more columns as the number of samples grows)
    #if n_exp < 10:
    if nb_samples < 10:
        ax1.legend(loc="best", frameon=False)
        legend_ncol = 1
    #elif n_exp < 19:
    elif nb_samples < 19:
        legend_ncol = 2
    else:
        legend_ncol = 3
    ax1.legend(loc="best", frameon=False, ncol=legend_ncol)
    ax2.legend(loc="best", frameon=False, ncol=legend_ncol)
    # ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
    # Main titles
    if title:
        ax1.set_title(title)
    else:
        ax1.set_title(counts_type + " counts")
    ax2.set_title(counts_type + " normalized counts")

    # Adding enrichment baseline
    # ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
    # Axes limits
    ax1.set_xlim(-0.1, len(ordered_categs) + 0.1)
    if len(sizes) == 1: ax1.set_xlim(-2, 3)
    ax2.set_xlim(ax1.get_xlim())
    # Set axis limits (max/min values + 5% margin)
    ax2_ymin = []
    ax2_ymax = []
    for sample_values in enrichment:
        ax2_ymin.append(min(sample_values))
        ax2_ymax.append(max(sample_values))
    ax2_ymax = max(ax2_ymax)
    ax2_ymin = min(ax2_ymin)
    margin_top, margin_bottom = (abs(0.05 * ax2_ymax), abs(0.05 * ax2_ymin))
    ax1.set_ylim(0, ax1.get_ylim()[1] * 1.05)
    # Clip enrichment bars exceeding the user-provided (ymin, ymax) thresholds.
    if threshold:
        threshold_bottom = -abs(float(threshold[0])) + 1
        threshold_top = float(threshold[1]) - 1

        #for i in xrange(n_exp):
        for n in xrange(nb_samples):
            for y in xrange(n_cat):
                #val = enrichment[i][y]
                val = enrichment[n][y]
                if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top):
                    #rect = rects[i][y]
                    rect = rects[n][y]
                    rect_height = rect.get_height()
                    if rect.get_y() < 0:
                        diff = rect_height + threshold_bottom
                        rect.set_y(threshold_bottom)
                        ax2_ymin = threshold_bottom
                        margin_bottom = 0
                    else:
                        diff = rect_height - threshold_top
                        ax2_ymax = threshold_top
                        margin_top = 0
                    rect.set_height(rect.get_height() - diff)
    if margin_top != 0 and margin_bottom != 0:
        margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)]
    ax2.set_ylim(ax2_ymin - margin_bottom, ax2_ymax + margin_top)
    # Y axis title
    ax1.set_ylabel("Proportion of reads (%)")
    ax2.set_ylabel("Enrichment relative to genome")


    # Add the categories on the x-axis
    #ax1.set_xticks(index + bar_width * n_exp / 2)
    ax1.set_xticks(index + bar_width * nb_samples / 2)
    #ax2.set_xticks(index + bar_width * n_exp / 2)
    ax2.set_xticks(index + bar_width * nb_samples / 2)
    if counts_type.lower() != "categories":
        ax1.set_xticklabels(ordered_categs, rotation="30", ha="right")
        ax2.set_xticklabels(ordered_categs, rotation="30", ha="right")
    else:
        ax1.set_xticklabels(xlabels)
        ax2.set_xticklabels(xlabels)

    # Display fractions values in percentages
    ax1.set_yticklabels([str(int(i * 100)) for i in ax1.get_yticks()])
    # Correct y-axis ticks labels for enrichment subplot
    # ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
    yticks = list(ax2.get_yticks())
    yticks = [yticks[i] - 1 if yticks[i] > 9 else yticks[i] + 1 if yticks[i] < -9 else yticks[i] for i in
              xrange(len(yticks))]
    ax2.set_yticks(yticks)
    # Re-label the symmetric enrichment scale as x-fold ("^{+1}"/"^{-1}") values.
    ax2.set_yticklabels([str(int(i + 1)) + "$^{+1}$" if i > 0 and i % 1 == 0 else str(
        i + 1) + "$^{+1}$" if i > 0 else 1 if i == 0 else str(
        int(-(i - 1))) + "$^{-1}$" if i < 0 and i % 1 == 0 else str(-(i - 1)) + "$^{-1}$" for i in ax2.get_yticks()])
    # ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()])
    # Change appearance of 'antisense' bars on enrichment plot since we cannot calculate an enrichment for this artificial category
    if "antisense_pos" in locals():  # ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA')
        #for i in xrange(n_exp):
        for n in xrange(nb_samples):
            #rect = rects[i][antisense_pos]
            rect = rects[n][antisense_pos]
            rect.set_y(ax2.get_ylim()[0])
            rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0])
            rect.set_hatch("/")
            rect.set_fill(False)
            rect.set_linewidth(0)
            # rect.set_color('lightgrey')
            # rect.set_edgecolor('#EDEDED')
            rect.set_color("#EDEDED")
        #ax2.text(index[antisense_pos] + bar_width * n_exp / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, "NA")
        ax2.text(index[antisense_pos] + bar_width * nb_samples / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, "NA")
    # Add text for features absent in sample
    #for i in xrange(n_exp):
    for n in xrange(nb_samples):
        for y in xrange(n_cat):
            #if percentages[i][y] == 0:
            if percentages[n][y] == 0:
                txt = ax1.text(y + bar_width * (n + 0.5), 0.02, "Abs.", rotation="vertical", color=cmap[n],
                               horizontalalignment="center", verticalalignment="bottom")
                txt.set_path_effects([PathEffects.Stroke(linewidth=0.5), PathEffects.Normal()])
            #elif enrichment[i][y] == 0:
            elif enrichment[n][y] == 0:
                #rects[i][y].set_linewidth(1)
                rects[n][y].set_linewidth(1)

    # Remove top/right/bottom axes
    for ax in [ax1, ax2]:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["bottom"].set_visible(False)
        ax.tick_params(axis="x", which="both", bottom="on", top="off", labelbottom="on")
        ax.tick_params(axis="y", which="both", left="on", right="off", labelleft="on")


    ### Add second axis with categ groups
    annotate_group(categ_groups, label=None, ax=ax1)
    annotate_group(categ_groups, label=None, ax=ax2)

    ### Adjust figure margins to
    if counts_type.lower() == "categories":
        plt.tight_layout(h_pad=5.0)
        fig.subplots_adjust(bottom=0.1)
    else:
        plt.tight_layout()

    ## Showing the plot
    if pdf:  ## TODO: allow several output formats
        pdf.savefig()
        plt.close()
    elif svg:
        # 'svg'/'png' may be the bare flag (True) or a user-provided basename.
        if svg == True:
            plt.savefig(counts_type + ".svg")
        else:
            if os.path.splitext(svg)[1] == ".svg":
                plt.savefig(".".join((os.path.splitext(svg)[0], counts_type, "svg")))
            else:
                plt.savefig(".".join((svg, counts_type, "svg")))
    elif png:
        if png == True:
            plt.savefig(counts_type + ".png")
        else:
            if os.path.splitext(png)[1] == ".png":
                plt.savefig(".".join((os.path.splitext(png)[0], counts_type, "png")))
            else:
                plt.savefig(".".join((png, counts_type, "png")))
    else:
        plt.show()
    ## Save on disk it as a png image
    # fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight')
1110 | |
def annotate_group(groups, ax=None, label=None, labeloffset=30):
    """Annotates the categories with their parent group and add x-axis label.

    ``groups`` is a sequence of levels; each level is a dict mapping a group
    name to an (xmin, xmax) coordinate pair on the data axis. Each level is
    drawn a bit lower than the previous one, below the plot area.
    """

    def annotate(ax, name, left, right, y, pad):
        """Draw the group annotation (one half-bracket from 'left' to the group center)."""
        arrow = ax.annotate(name, xy=(left, y), xycoords="data",
                            xytext=(right, y - pad), textcoords="data",
                            annotation_clip=False, verticalalignment="top",
                            horizontalalignment="center", linespacing=2.0,
                            arrowprops={'arrowstyle': "-", 'shrinkA': 0, 'shrinkB': 0,
                                        'connectionstyle': "angle,angleB=90,angleA=0,rad=5"}
                            )
        return arrow

    if ax is None:
        ax = plt.gca()
    # 'level' keeps its last value after the loop to position the x-axis label.
    level = 0
    for level in range(len(groups)):
        grp = groups[level]
        for name, coord in grp.items():
            # Place each level 12% below the axis, plus 5% per extra level.
            ymin = ax.get_ylim()[0] - np.ptp(ax.get_ylim()) * 0.12 - np.ptp(ax.get_ylim()) * 0.05 * (level)
            ypad = 0.01 * np.ptp(ax.get_ylim())
            xcenter = np.mean(coord)
            # Two mirrored half-brackets form the full group bracket.
            annotate(ax, name, coord[0], xcenter, ymin, ypad)
            annotate(ax, name, coord[1], xcenter, ymin, ypad)

    if label is not None:
        # Define xlabel and position it according to the number of group levels
        ax.annotate(label,
                    xy=(0.5, 0), xycoords="axes fraction",
                    xytext=(0, -labeloffset - (level + 1) * 15), textcoords="offset points",
                    verticalalignment="top", horizontalalignment="center")

    return
1145 | |
def filter_categs_on_biotype(selected_biotype, cpt):
    """ Keep only the counts whose biotype matches ``selected_biotype``.

    Returns, per sample, a {category: count} dict restricted to the
    (category, biotype) pairs carrying the selected biotype.
    """
    return {
        sample: {feature[0]: count
                 for feature, count in counters.items()
                 if feature[1] == selected_biotype}
        for sample, counters in cpt.items()
    }
1154 | |
1155 | |
def unnecessary_param(param, message):
    """ Function to display a warning on the standard error.

    The warning ``message`` is only displayed when ``param`` was actually
    provided (i.e. is truthy).
    """
    if param:
        # sys.stderr.write is portable across Python 2 and 3, unlike the
        # previous 'print >> sys.stderr' chevron statement (Python 2-only
        # syntax); the trailing newline reproduces the print behavior.
        sys.stderr.write(message + "\n")
1160 | |
1161 | |
def usage_message():
    """ Return the multi-scenario usage string shown by argparse (-h / bad args). """
    return """
    Generate genome indexes:
         python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
                                         [--chr_len CHR_LENGTHS_FILE]
    Process BAM file(s):
         python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...]
                                         [--bedgraph] [-s STRAND]
                                         [-n] [--pdf output.pdf]
                                         [-d {1,2,3,4}] [-t YMIN YMAX]
    Index genome + process BAM:
         python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
                        -i BAM1 LABEL1 [BAM2 LABEL2 ...]
                        [--chr_len CHR_LENGTHS_FILE]
                        [--bedgraph][-s STRAND]
                        [-n] [--pdf output.pdf]
                        [-d {1,2,3,4}] [-t YMIN YMAX]

    Process previously created ALFA counts file(s):
         python ALFA.py -c COUNTS1 [COUNTS2 ...]
                        [-s STRAND]
                        [-n] [--pdf output.pdf]
                        [-d {1,2,3,4}] [-t YMIN YMAX]

    """
1187 | |
1188 ########################################################################## | |
1189 # MAIN # | |
1190 ########################################################################## | |
1191 | |
1192 | |
1193 if __name__ == "__main__": | |
1194 | |
1195 #### Parse command line arguments and store them in the variable options | |
1196 parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message()) | |
1197 parser.add_argument("--version", action="version", version="version 1.0", | |
1198 help="Show ALFA version number and exit\n\n-----------\n\n") | |
1199 # Options regarding the index | |
1200 parser.add_argument("-g", "--genome_index", | |
1201 help="Genome index files path and basename for existing index, or path and basename for new index creation\n\n") | |
1202 parser.add_argument("-a", "--annotation", metavar="GTF_FILE", help="Genomic annotations file (GTF format)\n\n") | |
1203 parser.add_argument("--chr_len", help="Tabulated file containing chromosome names and lengths\n\n-----------\n\n") | |
1204 | |
1205 # Options regarding the intersection step | |
1206 #parser.add_argument('-i', '--input', '--bam', dest='input', metavar=('BAM_FILE1 LABEL1', ""), nargs='+', | |
1207 #help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n') | |
1208 parser.add_argument("--bam", metavar=("BAM_FILE1 LABEL1", ""), nargs="+", | |
1209 help="Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n") ## MB: position AND chr?? | |
1210 # parser.add_argument('--bedgraph', action='store_const',default = False, const = True, help="Use this options if your input file(s) is(are) already in bedgraph format\n\n") | |
1211 #parser.add_argument('--bedgraph', dest='bedgraph', action='store_const', default=False, const=True, | |
1212 #help="Use this options if your input file(s) is(are) already in bedgraph format\n\n") | |
1213 parser.add_argument("--bedgraph", metavar=("BEDGRAPH_FILE1 LABEL1", ""), nargs="+", | |
1214 help="Use this options if your input is/are BedGraph file(s).\n\n") | |
1215 parser.add_argument("-c", "--counts", metavar=("COUNTS_FILE", ""), nargs="+", | |
1216 help="Use this options instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n") | |
1217 #parser.add_argument('-s', '--strandness', dest="strandness", nargs=1, action='store', default=['unstranded'], choices=['unstranded', 'forward', 'reverse', 'fr-firststrand', 'fr-secondstrand'], metavar="", help="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n") | |
1218 parser.add_argument("-s", "--strandness", default="unstranded", | |
1219 choices=["unstranded", "forward", "reverse", "fr-firststrand", "fr-secondstrand"], metavar="", | |
1220 help="Library orientation. Choose within: 'unstranded', " | |
1221 "'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. " | |
1222 "(Default: 'unstranded')\n\n-----------\n\n") | |
1223 | |
1224 # Options regarding the plot | |
1225 parser.add_argument("--biotype_filter", help=argparse.SUPPRESS) # "Make an extra plot of categories distribution using only counts of the specified biotype." ## MB: TO DISPLAY (no suppress) | |
1226 parser.add_argument("-d", "--categories_depth", type=int, default=3, choices=range(1, 5), | |
1227 help="Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n") | |
1228 parser.add_argument("--pdf", nargs="?", const="ALFA_plots.pdf", | |
1229 help="Save produced plots in PDF format at specified path ('categories_plots.pdf' if no argument provided).\n\n") | |
1230 parser.add_argument("--png", nargs="?", const="ALFA_plots.png", | |
1231 help="Save produced plots in PNG format with provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided.\n\n") | |
1232 parser.add_argument("--svg", nargs="?", const="ALFA_plots.svg", | |
1233 help="Save produced plots in SVG format with provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided.\n\n") | |
1234 parser.add_argument("-n", "--no_display", action="store_const", const=True, default=False, help="Do not display plots.\n\n") # We have to add "const=None" to avoid a bug in argparse | |
1235 parser.add_argument("-t", "--threshold", dest="threshold", nargs=2, metavar=("ymin", "ymax"), type=float, | |
1236 help="Set axis limits for enrichment plots.\n\n") | |
1237 | |
1238 if len(sys.argv) == 1: | |
1239 parser.print_usage() | |
1240 sys.exit(1) | |
1241 | |
1242 options = parser.parse_args() | |
1243 | |
# NOTE(review): the triple-quoted block below is dead code -- an earlier
# version of the argument-checking logic kept as an inert string literal
# (it is evaluated and discarded at import time, it never runs). The live
# implementation starts at the "### ALFA ###" banner further down.
# Consider deleting this block outright.
'''
# Booleans for steps to be executed
make_index = False
intersect_reads = False
process_counts = False

#### Check arguments conformity and define which steps have to be performed
print "### Checking parameters"
if options.counts:
    # Aucun autre argument requis, precise that the other won't be used (if this is true!!)
    # Vérifier extension input

    # Action: Only do the plot
    process_counts = True
else:
    if options.annotation:
        # If '-gi' parameter is present
        if options.genome_index:
            genome_index_basename = options.genome_index
        else:
            # Otherwise the GTF filename without extension will be the basename
            genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
        # Check if the indexes were already created and warn the user
        if os.path.isfile(genome_index_basename + ".stranded.index"):
            if options.input:
                print >> sys.stderr, "Warning: an index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify an other path." % (
                    genome_index_basename + ".stranded.index")
            else:
                sys.exit(
                    "Error: an index file named %s already exists. If you want to create a new index, please delete this file or specify an other path.\n" % (
                        genome_index_basename + ".stranded.index"))
        # Create them otherwise
        else:
            make_index = True
    # If the index is already done
    if options.input:
        # Required arguments are the input and the genome_index
        if 'genome_index_basename' not in locals():
            required_arg(options.genome_index, "-g/--genome_index")
            genome_index_basename = options.genome_index
        required_arg(options.input, "-i/--input/--bam")
        for i in xrange(0, len(options.input), 2):
            # Check whether the input file exists
            try:
                open(options.input[i])
            except IOError:
                sys.exit("Error: the input file " + options.input[i] + " was not found. Aborting.")
            # Check whether the input file extensions are 'bam', 'bedgraph' or 'bg' and the label extension are no
            try:
                extension = os.path.splitext(options.input[i + 1])[1]
                if extension in ('.bam', '.bedgraph', '.bg'):
                    sys.exit("Error: it seems input files and associated labels are not correctly provided.\n\
Make sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
            except:
                sys.exit(
                    "Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")

        intersect_reads = True
    # Vérifier input's extension
    # TODO
if not (options.counts or options.input or options.annotation):
    sys.exit(
        "\nError : some arguments are missing At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n")
if not options.counts:
    # Declare genome_index variables
    stranded_genome_index = genome_index_basename + ".stranded.index"
    unstranded_genome_index = genome_index_basename + ".unstranded.index"
    if options.strandness[0] == "unstranded":
        genome_index = unstranded_genome_index
    else:
        genome_index = stranded_genome_index
'''
#### Initialization of some variables

# Miscellaneous variables
# Maps a strand symbol to its opposite (used when counting antisense hits).
reverse_strand = {"+": "-", "-": "+"}
samples = collections.OrderedDict()  # Structure: {<label>: [<filename1>(, <filename2>)]} # Example: {'Toy': ['toy.bam']}

#Sample labels and file paths
labels = []
bams = []
bedgraphs = []
count_files = []


# Initializing the category priority order, coding biotypes and the final list
# A feature with a higher priority value wins when several annotations overlap
# the same genomic position.
# prios = {"start_codon": 7, "stop_codon": 7, "five_prime_utr": 6, "three_prime_utr": 6, "UTR": 6, "CDS": 5, "exon": 4,
# "transcript": 3, "gene": 2, "antisense": 1, "intergenic": 0}
prios = {"start_codon": 4, "stop_codon": 4, "five_prime_utr": 3, "three_prime_utr": 3, "UTR": 3, "CDS": 3,
         "exon": 2, "intron": 2, "transcript": 1, "gene": 1, "antisense": 0, "intergenic": -1}

# No biotype priorities by default; the commented example shows the format.
biotype_prios = None
# biotype_prios = {"protein_coding":1, "miRNA":2}

# Possibles groups of categories to plot
# categs_group1 = {"start": ["start_codon"], "5UTR": ["five_prime_utr", "UTR"], "CDS": ["CDS", "exon"],
# "3UTR": ["three_prime_utr"], "stop": ["stop_codon"], "introns": ["transcript", "gene"],
# "intergenic": ["intergenic"], "antisense": ["antisense"]}
# categs_group2 = {"5UTR": ["five_prime_utr", "UTR"], "CDS": ["CDS", "exon", "start_codon", "stop_codon"],
# "3UTR": ["three_prime_utr"], "introns": ["transcript", "gene"], "intergenic": ["intergenic"],
# "antisense": ["antisense"]}
# categs_group3 = {"exons": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon"],
# "introns": ["transcript", "gene"], "intergenic": ["intergenic"], "antisense": ["antisense"]}
# categs_group4 = {
# "gene": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon", "transcript",
# "gene"], "intergenic": ["intergenic"], "antisense": ["antisense"]}


# Category groupings, one per "-d/--categories_depth" level (1 = coarsest).
# Each maps a displayed category name to the raw GTF features merged into it.
categs_level1 = {"gene": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "intron", "start_codon",
                          "stop_codon", "transcript", "gene"],
                 "intergenic": ["intergenic"],
                 "antisense": ["antisense"]}

categs_level2 = {"exons": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon"],
                 "introns": ["intron"],
                 "undescribed_genes": ["transcript", "gene"],
                 "intergenic": ["intergenic"],
                 "antisense": ["antisense"]}

categs_level3 = {"5UTR": ["five_prime_utr", "UTR"],
                 "CDS": ["CDS", "start_codon", "stop_codon"],
                 "3UTR": ["three_prime_utr"],
                 "undescribed_exons": ["exon"],
                 "introns": ["intron"],
                 "undescribed_genes": ["transcript", "gene"],
                 "intergenic": ["intergenic"],
                 "antisense": ["antisense"]}

categs_level4 = {"5UTR": ["five_prime_utr", "UTR"],
                 "start": ["start_codon"],
                 "stop": ["stop_codon"],
                 "CDS_body": ["CDS"],
                 "undescribed_CDS": [], #TODO: implement CDS/undescribed_CDS distinction
                 "3UTR": ["three_prime_utr"],
                 "undescribed_exons": ["exon"],
                 "introns": ["intron"],
                 "undescribed_genes": ["transcript", "gene"],
                 "intergenic": ["intergenic"],
                 "antisense": ["antisense"]}

# categs_groups = [categs_group4, categs_group3, categs_group2, categs_group1] # Order and merging for the final plot
# Indexed by (options.categories_depth - 1) later on.
categs_levels = [categs_level1, categs_level2, categs_level3, categs_level4]

# Parent-category brackets drawn over the bars on each plot; the two floats
# are the x-axis span the bracket covers (presumably bar positions -- TODO confirm against make_plot).
parent_categ_level1 = []
parent_categ_level2 = [{"gene":[0.5,2.5]}]
parent_categ_level3 = [{"exon":[0.5,3.5]}, {"gene":[0.5,5.5]}]
parent_categ_level4 = [{"CDS":[1.5,4.5]}, {"exon":[0.5,6.5]},{"gene":[0.5,8.5]}]
parent_categ_groups = [parent_categ_level1, parent_categ_level2, parent_categ_level3, parent_categ_level4]

# Display order of the categories on the plots (order matters).
cat_list = ["5UTR", "start", "CDS", "CDS_body", "stop", "undescribed_CDS", "3UTR", "exons", "undescribed_exons", "introns", "gene", "undescribed_genes", "intergenic", "antisense"]

# biotypes list
biotypes = {"protein_coding", "polymorphic_pseudogene", "TR_C_gene", "TR_D_gene", "TR_J_gene", "TR_V_gene", "IG_C_gene",
            "IG_D_gene", "IG_J_gene", "IG_V_gene", "3prime_overlapping_ncrna", "lincRNA", "macro_lncRNA", "miRNA",
            "misc_RNA", "Mt_rRNA", "Mt_tRNA", "processed_transcript", "ribozyme", "rRNA", "scaRNA", "sense_intronic",
            "sense_overlapping", "snoRNA", "snRNA", "sRNA", "TEC", "vaultRNA", "antisense",
            "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene",
            "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene", "unitary_pseudogene",
            "unprocessed_pseudogene", "processed_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene",
            "pseudogene", "ncRNA", "tRNA"} # Type: set (to access quickly)

# Grouping of biotypes:
biotypes_group1 = {"protein_coding": ["protein_coding"],
                   "pseudogenes": ["polymorphic_pseudogene", "transcribed_processed_pseudogene",
                                   "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene",
                                   "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene",
                                   "unitary_pseudogene", "unprocessed_pseudogene", "processed_pseudogene",
                                   "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene", "pseudogene"],
                   "TR": ["TR_C_gene", "TR_D_gene", "TR_J_gene", "TR_V_gene"],
                   "IG": ["IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_V_gene"], \
                   "MT_RNA": ["Mt_rRNA", "Mt_tRNA"], \
                   "ncRNA": ["lincRNA", "macro_lncRNA", "3prime_overlapping_ncrna", "ncRNA"], \
                   "others": ["misc_RNA", "processed_transcript", "ribozyme", "scaRNA", "sense_intronic",
                              "sense_overlapping", "TEC", "vaultRNA"],
                   "antisense": ["antisense"]}
# These biotypes form singleton groups named after themselves.
for biot in ["miRNA", "snoRNA", "snRNA", "rRNA", "sRNA", "tRNA"]:
    biotypes_group1[biot] = [biot]

# Initializing the unknown features list
# (features/biotypes met in the data but absent from the reference sets above)
unknown_cat = set()
unknown_biot= set()

# Initializing the genome category counter dict
# Keys are (category, biotype) tuples; values are genomic base counts.
cpt_genome = {}
# NOTE(review): the triple-quoted block below is dead code -- the previous
# version of the index/intersection/plot pipeline kept as an inert string
# literal (evaluated and discarded, it never runs). The live implementation
# follows below. Consider deleting this block outright.
'''
if process_counts:
    #### If input files are the categories counts, just load them and continue to recategorization step
    cpt, cpt_genome, samples_names = read_counts_files(options.counts)
else:
    #### Create genome index if needed and get the sizes of categories
    index_chrom_list = []
    if make_index:
        #### Get the chromosome lengths
        lengths = get_chromosome_lengths(options)
        # Generating the genome index files if the user didn't provide them
        create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios,
                            biotypes, lengths)
    else:
        # Retrieving chromosome names saved in index
        index_chrom_list = get_chromosome_names_in_index(genome_index)


    # print '\nChr lengths:', lengths

if intersect_reads:
    # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths
    if not make_index:
        print "\n### Reading genome indexes\n",
        sys.stdout.flush()
        lengths = {}
        with open(genome_index, 'r') as genome_index_file:
            for line in genome_index_file:
                if line[0] == "#":
                    lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1])
                else:
                    add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2],
                             biotype_prios=None, categ_prios=prios)

    #print 'Indexed chromosomes: ' + ', '.join((sorted(index_chrom_list)))
    index_chrom_list.sort(key=alphanum_key)
    print 'Indexed chromosomes: ' + ', '.join((index_chrom_list))

    #### Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals
    cpt_genome[('intergenic', 'intergenic')] = sum(lengths.itervalues()) - sum(
        [v for x, v in cpt_genome.iteritems() if x != ('antisense', 'antisense')])
    if not make_index:
        print "Done!"
    # print '\nGenome category counts:'
    # for key,val in cpt_genome.iteritems():
    # print key,"\t",val


    #### Create the Bedgraph files if needed and get the files list

    if not options.bedgraph:
        # Generating the BEDGRAPH files is the user provided BAM file(s) and get the samples labels (this names will be used in the plot legend)
        samples_files, samples_names = create_bedgraph_files(options.input, options.strandness[0])
    else:
        # Just initialize the files list with the bedgraph paths
        # samples_files = [options.input[i] for i in range(0,len(options.input),2)]
        samples_files = [re.sub('.bedgraph$', '', options.input[i]) for i in range(0, len(options.input), 2)]
        # and get the labels
        samples_names = [options.input[i] for i in range(1, len(options.input), 2)]
    #### Retrieving chromosome names saved in index
    #chrom_list = get_chromosome_names_in_index(genome_index)
    #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and count the number of aligned positions in each category
    cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files, samples_names, prios, genome_index,
                                                             options.strandness[0], biotype_prios=None)

    #### Write the counts on disk
    write_counts_in_files(cpt, cpt_genome)

if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False):
    quit("\n### End of program")
print "\n### Generating plots"
# Updating the biotypes lists (biotypes and 'biotype_group1'): adding the 'unknow biotypes' found in gtf/index
if unknown_feature == []: # 'unknown_feature' is define only during the index generation
    # Browse the feature to determine whether some biotypes are 'unknown'
    for sample, counts in cpt.items():
        for (cat, biot) in counts:
            if biot not in biotypes and cat not in unknown_feature:
                unknown_feature.append(biot)
for new_biot in unknown_feature:
    biotypes.add(new_biot)
    biotypes_group1["others"].append(new_biot)
biotypes = sorted(biotypes)
# move antisense categ to the end of the list
biotypes.remove('antisense')
biotypes.append('antisense')
biotypes_group1 = sorted(biotypes_group1)

# print '\nCounts for every category/biotype pair: ',cpt

# Generating plots
if options.pdf != False:
    if options.pdf == None:
        options.pdf = "categories_plots.pdf"
    pdf = PdfPages(options.pdf)
else:
    pdf = False

selected_biotype = None
if options.biotype_filter:
    options.biotype_filter = options.biotype_filter[0]
    for sample in cpt:
        for feature in cpt[sample]:
            biotype = feature[1]
            if options.biotype_filter.lower() == biotype.lower():
                selected_biotype = biotype
                break
    if selected_biotype == None:
        print "\nError: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)."
        sys.exit()

# Print a warning message if the UTRs are not specified as 5' or 3' (they will be ploted as 5'UTR)
if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:
    print "\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n"
##MB
#### Make the plot by categories
#### Recategorizing with the final categories
final_cats = categs_groups[options.categories_depth - 1]
final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, selected_biotype)
#### Display the distribution of specified categories (or biotypes) in samples on a barplot
# Remove the 'antisense' category if the library type is 'unstranded'
for dic in cpt.values():
    if ('antisense', 'antisense') in dic.keys(): break
else:
    cat_list.remove('antisense')
make_plot(cat_list, samples_names, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold,
          svg=options.svg, png=options.png)
if selected_biotype:
    make_plot(cat_list, samples_names, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold,
              title="Categories distribution for '" + selected_biotype + "' biotype", svg=options.svg, png=options.png)

#### Make the plot by biotypes
#### Recategorizing with the final categories
final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)
#### Display the distribution of specified categories (or biotypes) in samples on a barplot
make_plot(biotypes, samples_names, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg,
          png=options.png)

##### Recategorizing with the final categories
# final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)
##### Display the distribution of specified categories (or biotypes) in samples on a barplot
# make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png)


if options.pdf:
    pdf.close()
    print "\n### Plots saved in pdf file: %s" % options.pdf

print "\n### End of program"
'''
1577 print "### ALFA ###" | |
1578 if not (options.annotation or options.bam or options.bedgraph or options.counts): | |
1579 print >> sys.stderr, "\nError: At least one argument among '-a/--annotation', '--bam', '--bedgraph' or " \ | |
1580 "'-c/--counts' is required. Please refer to help (-h/--help) and usage cases for more " \ | |
1581 "details.\n" | |
1582 parser.print_usage() | |
1583 sys.exit() | |
1584 | |
## Getting the steps to execute and checking parameters

# Pipeline step flags; each is switched on once its parameters are validated.
generate_indexes = False
generate_BedGraph = False
intersect_indexes_BedGraph = False
generate_plot = False

# Indexes generation step: requested by providing an annotation (GTF) file.
if options.annotation:
    generate_indexes = True
elif options.chr_len:
    # '--chr_len' only matters when indexes are (re)built from an annotation.
    unnecessary_param(options.chr_len, "Warning: the parameter '--chr_len' will not be used because the indexes generation step will not be performed.")
1598 | |
# Checking parameters for the step: BedGraph files generation
if options.bam:
    if options.bedgraph:
        sys.exit("\nError: parameters '--bam' and '--bedgraph' provided, only one should be.\n### End of program")
    else:
        # A genome index is mandatory for the downstream intersection step.
        if not generate_indexes and not options.genome_index:
            sys.exit("\nError: parameter '-g/--genome_index' should be provided.\n### End of program")
        # Checking the input BAM files; '--bam' alternates file paths and labels,
        # hence the step of 2.
        for i in xrange(0, len(options.bam), 2):
            # Check whether the BAM file exists.
            # BUG FIX: close the handle -- the bare open() leaked a file
            # descriptor per input file.
            try:
                open(options.bam[i]).close()
            except IOError:
                sys.exit("\nError: the BAM file " + options.bam[i] + " was not found.\n### End of program")
            except IndexError:
                sys.exit("\nError: BAM files and associated labels are not correctly provided.\n"
                         "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
                         "\n### End of program ###")

            # Check whether the BAM file has a '.bam' extension
            if not options.bam[i].endswith(".bam"):
                sys.exit("\nError: at least one BAM file hasn't a '.bam' extension.\n### End of program ###")
            # Check that the label (next token) hasn't a '.bam' extension, which
            # would mean a label is missing and two files were given in a row.
            try:
                if options.bam[i + 1].endswith(".bam"):
                    sys.exit("\nError: at least one label for a BAM file has a '.bam' extension.\n"
                             "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
                             "\n### End of program ###")
            except IndexError:
                sys.exit("\nError: BAM files and associated labels are not correctly provided.\n"
                         "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
                         "\n### End of program ###")
            # Get label and replace invalid characters by "_"
            label = "_".join(re.findall(r"[\w\-']+", options.bam[i + 1]))
            # Check whether the BedGraph file(s) and counts file that will be generated already exists
            if options.strandness == "unstranded":
                existing_file(label + ".bedgraph")
                existing_file(label + ".unstranded.feature_counts.tsv")
            else:
                existing_file(label + ".plus.bedgraph")
                existing_file(label + ".minus.bedgraph")
                existing_file(label + ".stranded.feature_counts.tsv")
            # Register the sample label and filename
            labels.append(label)
            bams.append(options.bam[i])
        # Set this step + the intersection one as tasks to process
        generate_BedGraph = True
        intersect_indexes_BedGraph = True
1647 | |
# Checking parameters for the step: indexes and BedGraph files intersection
if options.bedgraph:
    # A genome index is mandatory for the intersection step.
    if not generate_indexes and not options.genome_index:
        sys.exit("\nError: parameter '-g/--genome_index' should be provided.\n### End of program")
    # Each sample is 2 tokens when unstranded (file + label) and 3 when
    # stranded (plus file + minus file + label).
    if options.strandness == "unstranded":
        sample_file_nb = 2
    else:
        sample_file_nb = 3
    # Checking the input BedGraph files
    for i in xrange(0, len(options.bedgraph), sample_file_nb):
        # Check whether the BedGraph file(s) exists
        for j in xrange(0, sample_file_nb - 1):
            # BUG FIX: close the handle -- the bare open() leaked a file
            # descriptor per input file.
            try:
                open(options.bedgraph[i + j]).close()
            except IOError:
                # A token without a BedGraph extension here is most likely a
                # misplaced label rather than a missing file.
                if not (options.bedgraph[i + j].endswith(".bedgraph") or options.bedgraph[i + j].endswith(".bg")):
                    sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
                             "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                             "\n### End of program ###")
                else:
                    sys.exit("\nError: the BedGraph file " + options.bedgraph[i + j] + " was not found.\n### End of program")
            except IndexError:
                sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
                         "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                         "\n### End of program ###")
            # Check whether the BedGraph file extension is "bedgraph"/"bg"
            # BUG FIX: error message said "BedGrapg file" and was missing the
            # space before the filename.
            if not (options.bedgraph[i + j].endswith(".bedgraph") or options.bedgraph[i + j].endswith(".bg")):
                sys.exit("\nError: the BedGraph file " + options.bedgraph[i + j] + " hasn't a '.bedgraph'/'.bg' extension."
                         "\n### End of program ###")
        # Check that the label token hasn't a "bedgraph"/"bg" extension
        try:
            if options.bedgraph[i + sample_file_nb - 1].endswith('.bedgraph') or options.bedgraph[i + sample_file_nb - 1].endswith('.bg'):
                sys.exit("\nError: the label " + options.bedgraph[i + sample_file_nb - 1] + " has a '.bedgraph'/'.bg' extension.\n"
                         "Make sure to follow the expected format: "
                         "--bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                         "\n### End of program ###")
        except IndexError:
            sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
                     "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                     "\n### End of program ###")
        # Register the sample label and filename(s).
        # BUG FIX: dots are now escaped and the pattern is anchored; the
        # previous pattern ("(.(plus|minus))?.((bg)|(bedgraph))") treated '.'
        # as "any character" and could strip an unrelated part of the path.
        bedgraphs.append(re.sub(r"(\.(plus|minus))?\.((bg)|(bedgraph))$", "", options.bedgraph[i]))
        label = "_".join(re.findall(r"[\w\-']+", options.bedgraph[i + sample_file_nb - 1]))
        labels.append(label)
        # Check whether the count file(s) that will be created already exists
        if options.strandness == "unstranded":
            existing_file(label + ".unstranded.feature_counts.tsv")
        else:
            existing_file(label + ".stranded.feature_counts.tsv")
    # Set this step as a task to process
    intersect_indexes_BedGraph = True
1699 | |
# Checking parameters for the step: plot generation
if options.counts:
    # Pre-computed counts short-circuit every generation step: warn about and
    # disable all parameters/steps that would redo that work.
    unnecessary_param(options.annotation, "Warning: the parameter '-a/--annotation' will not be used because the counts are already provided.")
    generate_indexes = False
    unnecessary_param(options.genome_index, "Warning: the parameter '-g/--genome_index' will not be used because the counts are already provided.")
    unnecessary_param(options.bam, "Warning: the parameter '--bam' will not be used because the counts are already provided.")
    generate_BedGraph = False
    unnecessary_param(options.bedgraph, "Warning: the parameter '--bedgraph' will not be used because the counts are already provided.")
    intersect_indexes_BedGraph = False
    # Registering the sample labels and filenames
    for sample in options.counts:
        label = os.path.basename(sample)
        # Strip the counts-file suffix to recover the sample label.
        # BUG FIX: dots are now escaped and the pattern anchored; the previous
        # pattern ('(.(un)?stranded)?.feature_counts.tsv') treated '.' as "any
        # character" and matched anywhere in the name, so it could mangle
        # filenames that merely resemble the suffix.
        label = re.sub(r'(\.(un)?stranded)?\.feature_counts\.tsv$', '', label)
        label = "_".join(re.findall(r"[\w\-']+", label))
        count_files.append(sample)
        labels.append(label)
else:
    # Generating the genome index filenames
    index_chrom_list = []  # Not a set because we need to sort it according to the chromosome names later
    lengths = {}
    if options.genome_index:
        genome_index_basename = options.genome_index
    elif options.annotation:
        # Otherwise the GTF filename without extension will be the basename
        genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
    stranded_genome_index = genome_index_basename + ".stranded.index"
    unstranded_genome_index = genome_index_basename + ".unstranded.index"
    # Pick the index matching the library strandness.
    if options.strandness == "unstranded":
        genome_index = unstranded_genome_index
    else:
        genome_index = stranded_genome_index
# If no plot is displayed or saved, plot parameters are useless
if (options.no_display and not (options.pdf or options.png or options.svg)) or not(intersect_indexes_BedGraph or options.counts):
    # unnecessary_param(options.categories_depth, "Warning: the parameter '-d/--categories_depth' will not be used because no plots will be displayed or saved.")
    # # Cannot be tested because options.categories_depth has always a value (default value if option not specified by user)
    unnecessary_param(options.threshold, "Warning: the parameter '-t/--threshold' will not be used because no plots will be displayed or saved.")
    unnecessary_param(options.biotype_filter, "Warning: the parameter '--biotype_filter' will not be used because no plots will be displayed or saved.")
    if options.counts:
        # BUG FIX: the message was missing its closing parenthesis.
        sys.exit("Error: there is nothing to do (counts are provided and no display or plot saving is required)")
else:
    ## [BN] X server check
    # Plotting requires a graphical display; probe the DISPLAY variable.
    # NOTE(review): even pure file output (--pdf/--png/--svg) currently
    # requires DISPLAY -- a non-interactive matplotlib backend could lift
    # this restriction.
    try:
        x_server = os.environ['DISPLAY']
        generate_plot = True
    # BUG FIX: narrowed the bare 'except:' (which would also swallow
    # KeyboardInterrupt/SystemExit) to the KeyError raised by os.environ.
    except KeyError:
        x_server = False
        if options.counts:
            sys.exit("Error: your current configuration does not allow graphical interface ('DISPLAY' variable is not set on your system).\nExiting")
        else:
            print >> sys.stderr, "WARNING: your current configuration does not allow graphical interface ('DISPLAY' variable is not set on your system).\nPlotting step will not be performed."
1754 | |
1755 ## Executing the step(s) | |
1756 | |
1757 # Indexes generation | |
1758 if generate_indexes: | |
1759 # Checking if the index files already exist | |
1760 if os.path.isfile(stranded_genome_index): | |
1761 sys.exit("\nError: index files named '" + genome_index_basename + | |
1762 ".stranded.index' already exists but you provided a GTF file. If you want to generate a new index, " | |
1763 "please delete these files or specify an other path.\n### End of program") | |
1764 # Running the index generation commands | |
1765 print "# Generating the genome index files" | |
1766 # Getting chromosomes lengths | |
1767 lengths = get_chromosome_lengths() | |
1768 # Generating the index files | |
1769 generate_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, lengths) | |
1770 # Displaying the list of indexed chromosomes | |
1771 index_chrom_list.sort(key=alphanum_key) | |
1772 print "Indexed chromosomes: " + ", ".join(index_chrom_list) | |
1773 if generate_indexes or not options.counts: | |
1774 # Getting index info | |
1775 read_index() | |
1776 # Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals | |
1777 cpt_genome[("intergenic", "intergenic")] = sum(lengths.values()) - sum([v for x, v in cpt_genome.iteritems() if x != ("antisense", "antisense")]) | |
1778 | |
1779 # BedGraph files generation | |
1780 if generate_BedGraph: | |
1781 print "# Generating the BedGraph files" | |
1782 #sample_files, sample_labels = generate_bedgraph_files() | |
1783 generate_bedgraph_files(labels, bams) | |
1784 | |
1785 # Indexes and BedGraph files intersection | |
1786 if intersect_indexes_BedGraph: | |
1787 print "# Intersecting index and BedGraph files" | |
1788 cpt = intersect_bedgraphs_and_index_to_counts_categories(labels, bedgraphs) | |
1789 # Write the counts to an output file | |
1790 write_counts_in_files(cpt, cpt_genome) | |
1791 | |
1792 ## Plot generation ## MB: all the section still to review | |
1793 if generate_plot: | |
1794 print "# Generating plots" | |
1795 # If input files are the categories counts, the first step is to load them | |
1796 if options.counts: | |
1797 #cpt, cpt_genome, sample_names = read_counts(options.counts) | |
1798 cpt, cpt_genome = read_counts(labels, count_files) | |
1799 # Managing the unknown biotypes | |
1800 for sample_label, counters in cpt.items(): | |
1801 for (cat, biot) in counters: | |
1802 if biot not in biotypes: | |
1803 unknown_biot.add(biot) | |
1804 for biot in unknown_biot: | |
1805 biotypes.add(biot) | |
1806 biotypes_group1["others"].append(biot) | |
1807 biotypes = sorted(biotypes) | |
1808 # Moving antisense cat to the end of the list | |
1809 biotypes.remove("antisense") | |
1810 biotypes.append("antisense") | |
1811 biotypes_group1 = sorted(biotypes_group1) | |
1812 # Filtering biotypes if necessary | |
1813 filtered_biotype = None | |
1814 if options.biotype_filter: | |
1815 for sample_label in cpt: | |
1816 for feature in cpt[sample_label]: | |
1817 biotype = feature[1] | |
1818 if options.biotype_filter.lower() == biotype.lower(): | |
1819 selected_biotype = biotype | |
1820 break | |
1821 if filtered_biotype: | |
1822 print "\nWarning: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)." | |
1823 # Setting the plots filenames | |
1824 if options.pdf: ## MB: Do the same for svg and png?? | |
1825 pdf = PdfPages(options.pdf) | |
1826 else: | |
1827 pdf = False | |
1828 ## Generate the categories plot | |
1829 # Recategorizing within the final categories and plot generation | |
1830 final_cats = categs_levels[options.categories_depth - 1] | |
1831 parent_categs = parent_categ_groups[options.categories_depth - 1] | |
1832 final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, filtered_biotype) | |
1833 # Remove the "antisense" category if the library type is "unstranded" ## MB: if options.strandness == "unstranded": cat_list.remove("antisense")?? | |
1834 for dic in cpt.values(): | |
1835 if ("antisense", "antisense") in dic.keys(): break | |
1836 else: | |
1837 cat_list.remove("antisense") | |
1838 #make_plot(labels, cat_list, sample_labels, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, svg=options.svg, png=options.png) | |
1839 make_plot(labels, cat_list, final_cat_cpt, final_genome_cpt, pdf, "Categories", options.threshold, svg=options.svg, png=options.png, categ_groups= parent_categs) | |
1840 if filtered_biotype: | |
1841 #make_plot(labels, cat_list, sample_labels, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, title="Categories distribution for '" + filtered_biotype + "' biotype", svg=options.svg, png=options.png) | |
1842 make_plot(labels, cat_list, filtered_cat_cpt, final_genome_cpt, pdf, "Categories", options.threshold, title="Categories distribution for '" + filtered_biotype + "' biotype", svg=options.svg, png=options.png, categ_groups= parent_categs) | |
1843 ## Generate the biotypes plot | |
1844 # Recategorization within the final biotypes and plot generation | |
1845 final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes) | |
1846 #make_plot(biotypes, sample_labels, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg, png=options.png) | |
1847 make_plot(labels, biotypes, final_cat_cpt, final_genome_cpt, pdf, "Biotypes", options.threshold, svg=options.svg, png=options.png) | |
1848 | |
1849 | |
1850 print "### End of program ###" |