comparison alfa/ALFA.py @ 0:e360f840a92e draft default tip

Uploaded
author biocomp-ibens
date Wed, 16 May 2018 09:49:18 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e360f840a92e
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 __author__ = "noel & bahin"
5 """ ALFA provides a global overview of features distribution composing NGS dataset(s). """
6
7 import argparse
8 import os
9 import numpy
10 import copy
11 import sys
12 import subprocess
13 import matplotlib.pyplot as plt
14 import matplotlib.cm as cmx
15 import matplotlib.colors as colors
16 import matplotlib.patheffects as PathEffects
17 import re
18 from matplotlib.backends.backend_pdf import PdfPages
19 # To correctly embed the texts when saving plots in svg format
20 import matplotlib
21 # import progressbar
22 import collections
23 import matplotlib as mpl
24 import numpy as np
25
26 matplotlib.rcParams["svg.fonttype"] = "none"
27
28
29 ##########################################################################
30 # FUNCTIONS #
31 ##########################################################################
32
def init_dict(d, key, init):
    """ Insert key into d with value init, only if key is not already present. """
    d.setdefault(key, init)
36
37
def tryint(s):
    """ Convert s to an int when possible, otherwise return it unchanged.
    Helper for "alphanum_key" so chromosome names sort naturally. """
    try:
        converted = int(s)
    except ValueError:
        return s
    return converted


def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
    "z23a" -> ["z", 23, "a"]
    """
    chunks = re.split("([0-9]+)", s)
    return [tryint(chunk) for chunk in chunks]
51
52
def required_arg(arg, aliases):
    """ Function to display the help and quit if a required argument is missing.

    arg: the parsed value of the argument (any falsy value triggers the error path).
    aliases: the option name(s) shown to the user in the error message.
    """
    # NOTE(review): relies on a module-level "parser" (argparse) defined elsewhere in the file -- confirm it exists before this is called.
    if not arg:
        # Python 2 print-to-stderr statement
        print >> sys.stderr, "\nError: %s argument is missing.\n" % aliases
        parser.print_usage()
        sys.exit(1)
59
60
def existing_file(filename):
    """ Abort the whole program when filename already exists on disk; no-op otherwise. """
    if not os.path.isfile(filename):
        return
    sys.exit("Error: The file '" + filename + "' is about to be produced but already exists in the directory. \n### End of program")
65
66
def get_chromosome_lengths():
    """
    Parse the file containing the chromosome lengths.
    If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes.

    Returns a dict mapping chromosome name -> length (int).
    NOTE(review): reads the module-level "options" (options.chr_len, options.annotation) -- confirm it is set before this is called.
    """
    lengths = {}
    gtf_chrom_names = set()
    # If the user provided a chromosome length file
    if options.chr_len:
        # Getting the chromosome lengths from the chromosome lengths file
        with open(options.chr_len, "r") as chr_len_fh:
            for line in chr_len_fh:
                try:
                    lengths[line.split("\t")[0]] = int(line.rstrip().split("\t")[1])
                except IndexError:
                    sys.exit("Error: The chromosome lengths file is not correctly formed. It is supposed to be tabulated file with two fields per line.")
        # Getting the chromosome lengths from the GTF file
        with open(options.annotation, "r") as gtf_fh:
            for line in gtf_fh:
                if not line.startswith("#"):
                    gtf_chrom_names.add(line.split("\t")[0])
        # Checking if the chromosomes from the chromosome lengths file are present in the GTF file
        for chrom in lengths:
            if chrom not in gtf_chrom_names:
                print >> sys.stderr, "Warning: chromosome '" + chrom + "' of the chromosome lengths file does not match any chromosome name in the GTF file provided and was ignored."
        # Checking if the chromosomes from the GTF file are present in the lengths file
        for chrom in gtf_chrom_names:
            if chrom not in lengths:
                print >> sys.stderr, "Warning: at least one chromosome ('" + chrom + "') was found in the GTF file and does not match any chromosome provided in the lengths file."
                print >> sys.stderr, "\t=> All the chromosome lengths will be approximated using annotations in the GTF file."
                break
        # for/else: runs only when the loop above did NOT break, i.e. every GTF chromosome had a provided length
        else:
            return lengths
    # If no chromosome lengths file was provided or if at least one chromosome was missing in the file, the end of the last annotation of the chromosome in the GTF file is considered as the chromosome length
    with open(options.annotation, "r") as gtf_fh:
        for line in gtf_fh:
            if not line.startswith("#"):
                chrom = line.split("\t")[0]
                # GTF field 5 (index 4) is the feature end position
                end = int(line.split("\t")[4])
                init_dict(lengths, chrom, 0)
                lengths[chrom] = max(lengths[chrom], end)
    print "The chromosome lengths have been approximated using the GTF file annotations (the stop position of the last annotation of each chromosome is considered as its length)."
    return lengths
110
111
def write_feature_on_index(feat, chrom, start, stop, sign, stranded_genome_index):
    """ Write one new line in the stranded index file and, if necessary, the unstranded index file.

    feat: dict mapping biotype -> list of categories for this interval.
    chrom/start/stop/sign: interval coordinates and strand, already strings.
    stranded_genome_index: open file handle for the stranded index.

    NOTE(review): "unstranded_genome_index" below is NOT a parameter -- it is resolved as a
    module-level global. If no such global exists when this is called, this raises NameError.
    """
    grouped_by_biotype_features = []
    for biotype, categs in feat.iteritems():
        categ_list = []
        # set() deduplicates repeated categories for this biotype
        for cat in set(categs):
            categ_list.append(cat)
        grouped_by_biotype_features.append(":".join((str(biotype), ",".join(categ_list))))

    #stranded_genome_index.write('\t'.join((chrom, start, stop, sign, '')) + '\t'.join(grouped_by_biotype_features) + '\n')
    stranded_genome_index.write(
        '\t'.join((chrom, start, stop, sign)) + '\t' + '\t'.join(grouped_by_biotype_features) + '\n')
    if unstranded_genome_index: ## MB: Why? Not always unstranded and stranded??
        #unstranded_genome_index.write('\t'.join((chrom, start, stop, '.', '')) + '\t'.join(grouped_by_biotype_features) + '\n')
        unstranded_genome_index.write(
            '\t'.join((chrom, start, stop, '.')) + '\t' + '\t'.join(grouped_by_biotype_features) + '\n')
128
129
def write_index_line(feat, chrom, start, stop, sign, fh):
    """ Write a new line in an index file.

    feat: dict mapping biotype -> list of categories (duplicates are collapsed with set()).
    chrom/start/stop/sign: interval coordinates and strand, already strings.
    fh: open, writable file handle for the index.
    """
    # Formatting the features info; .items() works on both Python 2 and 3 (was the py2-only .iteritems())
    feat_by_biotype = []
    for biot, cat in feat.items():
        feat_by_biotype.append(":".join((str(biot), ",".join(set(cat)))))
    # Writing the features info in the index file
    fh.write("\t".join((chrom, start, stop, sign)) + "\t" + "\t".join(feat_by_biotype) + "\n")
139
140
def write_index(feat_values, chrom, start, stop, stranded_genome_index, unstranded_genome_index):
    """ Writing the features info in the proper index files.

    feat_values: pair (plus-strand features, minus-strand features); each is a dict
                 mapping biotype -> list of categories.
    chrom/start/stop: interval coordinates, already strings.
    stranded_genome_index / unstranded_genome_index: open file handles.
    """
    # Writing info to the stranded indexes; a strand without features is flagged "antisense"
    if feat_values[0] != {}:
        write_index_line(feat_values[0], chrom, start, stop, "+", stranded_genome_index)
    else:
        stranded_genome_index.write("\t".join((chrom, start, stop, "+", "antisense\n")))
    if feat_values[1] != {}:
        write_index_line(feat_values[1], chrom, start, stop, "-", stranded_genome_index)
    else:
        stranded_genome_index.write("\t".join((chrom, start, stop, "-", "antisense\n")))
    # Writing info to the unstranded index: the union of both strands' features
    unstranded_feat = dict(feat_values[0], **feat_values[1])
    for name in set(feat_values[0]) & set(feat_values[1]):
        # Build a NEW list for biotypes present on both strands.
        # Bug fix: the previous "unstranded_feat[name] += feat_values[0][name]" extended, in
        # place, the list object shared with feat_values[1], corrupting the caller's data.
        unstranded_feat[name] = feat_values[1][name] + feat_values[0][name]
    write_index_line(unstranded_feat, chrom, start, stop, ".", unstranded_genome_index)
157
158
def count_genome_features(cpt, features, start, stop, coverage=1):
    """ Reads genome index and registers feature counts.

    cpt: dict keyed by (category, biotype) tuples, accumulating weighted nucleotide counts.
    features: list of "biotype:cat1,cat2,..." strings from one index line (or the bare
              "antisense" marker, which has no ':' and triggers the ValueError branch).
    start/stop: interval bounds (strings or ints; converted with int() below).
    coverage: weight applied to the interval length (default 1).

    NOTE(review): reads the module-level globals "biotype_prios", "prios" and "unknown_cat"
    -- confirm they are defined before this is called.
    """
    # If no biotype priority: category with the highest priority for each found biotype has the same weight (1/n_biotypes)
    if not biotype_prios:
        nb_biot = len(features)
        # For each categ(s)/biotype pairs
        for feat in features:
            cur_prio = 0
            # Separate categorie(s) and biotype
            try:
                biot, cats = feat.split(":")
            # Error if the feature is "antisense": update the "antisense/antisense" counts
            except ValueError:
                try:
                    cpt[("antisense", "antisense")] += (int(stop) - int(start)) * coverage / float(nb_biot)
                except KeyError:
                    cpt[("antisense", "antisense")] = (int(stop) - int(start)) * coverage / float(nb_biot)
                return None
            # Browse the categories and get only the one(s) with highest priority
            for cat in cats.split(","):
                try:
                    prio = prios[cat]
                except KeyError:
                    # TODO: Find a way to add unknown categories
                    if cat not in unknown_cat:
                        print >> sys.stderr, "Warning: Unknown categorie '%s' found and ignored.\n" % cat,
                        unknown_cat.add(cat)
                    continue
                # Check if the category has a highest priority than the current one
                if prio > cur_prio:
                    cur_prio = prio
                    cur_cat = {cat}
                # Equal-priority categories are all kept (after a ">" hit this re-adds the same cat, a no-op)
                if prio == cur_prio:
                    cur_cat.add(cat)
            # Increase each counts by the coverage divided by the number of categories and biotypes
            # NOTE(review): if EVERY category of this feature was unknown, "cur_cat" is never
            # assigned and the next line raises NameError -- confirm whether that can happen.
            nb_cat = len(cur_cat)
            for cat in cur_cat:
                try:
                    cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
                except KeyError:
                    cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
    else:
        # TODO: Add an option to provide biotype priorities and handle it
        pass
203
204 '''
205 def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index=None,
206 biotype_prios=None, coverage=1, categ_prios=None):
207 """
208 From an annotated genomic interval (start/stop positions and associated feature: one or more category(ies)/biotype pair(s) )
209 - If a file is provided: write the information at the end of the index file being generated;
210 - else: browse the features and update the counts of categories/biotypes found in the genome.
211 """
212 ## Writing in the file if it was provided
213 if stranded_genome_index:
214 # If only one strand has a feature, this feature will directly be written on the unstranded index
215 if feat_values[0] == {}:
216 stranded_genome_index.write('\t'.join((chrom, start, stop, '+', 'antisense\n')))
217 elif feat_values[1] == {}:
218 stranded_genome_index.write('\t'.join((chrom, start, stop, '-', 'antisense\n')))
219
220 write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
221 write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
222 unstranded_feat = dict(feat_values[0], **feat_values[1])
223 for name in set(feat_values[0]) & set(feat_values[1]):
224 unstranded_feat[name] += feat_values[0][name]
225 write_feature_on_index(unstranded_feat, chrom, start, stop, '.', unstranded_genome_index)
226
227 if feat_values[0] == {}:
228 # An interval with no feature corresponds to a region annotated only on the reverse strand: update 'antisense' counts
229 stranded_genome_index.write('\t'.join((chrom, start, stop, '+', 'antisense\n')))
230 #write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index, unstranded_genome_index)
231 write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
232 write_feature_on_index(feat_values[1], chrom, start, stop, '.', unstranded_genome_index)
233 elif feat_values[1] == {}:
234 #write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index, unstranded_genome_index)
235 write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
236 write_feature_on_index(feat_values[0], chrom, start, stop, '.', unstranded_genome_index)
237 stranded_genome_index.write('\t'.join((chrom, start, stop, '-', 'antisense\n')))
238 # Else, the unstranded index should contain the union of plus and minus features
239 else:
240 write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
241 write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
242 unstranded_feat = dict(feat_values[0], **feat_values[1])
243 for name in set(feat_values[0]) & set(feat_values[1]):
244 unstranded_feat[name] += feat_values[0][name]
245 write_feature_on_index(unstranded_feat, chrom, start, stop, '.', unstranded_genome_index)
246
247 ## Increasing category counter(s)
248 else: ##MB Why increase if file not provided??
249 # Default behavior if no biotype priorities : category with the highest priority for each found biotype has the same weight (1/n_biotypes)
250 if not biotype_prios:
251 nb_feat = len(feat_values) ## MB: nb biotypes??
252 # For every categ(s)/biotype pairs
253 for feat in feat_values: ## MB: for each biotype
254 cur_prio = 0
255 selected_categ = []
256 # Separate categorie(s) and biotype
257 try:
258 biot, cats = feat.split(":")
259 # Error if feature equal "antisense" : update the 'antisense/antisense' counts
260 except ValueError:
261 try:
262 cpt[(feat, feat)] += (int(stop) - int(start)) * coverage / float(nb_feat) ## MB: clearly write 'antisense'??
263 except: ## MB: add a KeyError exception??
264 cpt[(feat, feat)] = (int(stop) - int(start)) * coverage / float(nb_feat)
265 return
266 # Browse the categories and get only the one(s) with highest priority
267 for cat in cats.split(','):
268 try:
269 prio = prios[cat]
270 except: ## MB: KeyError??
271 # TODO Find a way to add unknown categories
272 if cat not in unknown_feature:
273 print >> sys.stderr, "Warning: Unknown categorie %s found and ignored.\n" % cat,
274 unknown_feature.append(cat)
275 continue
276 if prio > cur_prio:
277 cur_prio = prio
278 selected_categ = [cat]
279 if prio == cur_prio:
280 if cat != selected_categ: ## MB: work with set??
281 try:
282 if cat not in selected_categ:
283 selected_categ.append(cat)
284 except TypeError: ## What TypeError??
285 selected_categ = [selected_categ, cat]
286 # Increase each counts by the coverage divided by the number of categories and biotypes
287 nb_cats = len(selected_categ)
288 for cat in selected_categ:
289 try:
290 cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
291 except KeyError:
292 cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
293 # else :
294 # cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage
295 # Else, apply biotype selection according to the priority set
296 else:
297 # TODO Add an option to pass biotype priorities and handle it
298 pass
299 '''
300
def register_interval(features_dict, chrom, stranded_index_fh, unstranded_index_fh):
    """ Write the interval features info into the genome index files.

    features_dict: {"+": {pos: {biotype: [categories]}}, "-": {...}} -- boundary positions
                   mapped to the features active from that position on, per strand.
    chrom: chromosome name.
    stranded_index_fh / unstranded_index_fh: despite the "_fh" names these are file PATHS;
    they are (re)opened in append mode below and the names are shadowed by the handles.

    NOTE(review): also appends to the module-level "index_chrom_list" global.
    """
    # Adding the chromosome to the list if not present
    if chrom not in index_chrom_list:
        index_chrom_list.append(chrom)
    # Writing the chromosome in the index file
    with open(unstranded_index_fh, "a") as unstranded_index_fh, open(stranded_index_fh, "a") as stranded_index_fh:
        # Initializing the first interval start and features
        sorted_pos = sorted(features_dict["+"].keys())
        interval_start = sorted_pos[0]
        features_plus = features_dict["+"][interval_start]
        features_minus = features_dict["-"][interval_start]
        # Browsing the interval boundaries
        for interval_stop in sorted_pos[1:]:
            # Writing the current interval features to the indexes
            write_index([features_plus, features_minus], chrom, str(interval_start), str(interval_stop), stranded_index_fh, unstranded_index_fh)
            # Initializing the new interval start and features
            interval_start = interval_stop
            # Store current features
            prev_features_plus = features_plus
            prev_features_minus = features_minus
            # Update features
            features_plus = features_dict["+"][interval_start]
            features_minus = features_dict["-"][interval_start]
            # If feature == transcript and prev interval's feature is exon => add intron feature
            # (an interval annotated only gene+transcript, right after an exon, is an intron)
            # NOTE(review): "prev_features_plus[biotype]" assumes the biotype existed in the
            # previous interval; a missing key would raise KeyError -- confirm with the index builder.
            for biotype, categ in features_plus.iteritems():
                if set(categ) == {"gene", "transcript"}:
                    if "exon" in prev_features_plus[biotype]:
                        # categ is mutated in place, so features_dict is updated too
                        categ.append("intron")
                else:
                    continue
            for biotype, categ in features_minus.iteritems():
                if set(categ) == {"gene", "transcript"}:
                    if "exon" in prev_features_minus[biotype]:
                        categ.append("intron")
                else:
                    continue
339
340
def generate_genome_index(annotation, unstranded_genome_index, stranded_genome_index, chrom_sizes):
    """ Create an index of the genome annotations and save it in a file.

    annotation: path to the input GTF file (must be position-sorted per chromosome).
    unstranded_genome_index / stranded_genome_index: output index file paths.
    chrom_sizes: dict chromosome name -> length, written as '#'-prefixed header lines.

    The index is built by cutting each chromosome into intervals at every annotation
    boundary and recording, for each interval, the biotype/category pairs covering it.
    """
    # Initializations
    intervals_dict = {}
    max_value = -1  # Rightmost stop position seen so far on the current chromosome
    prev_chrom = ""
    i = 0 # Line counter
    # Write the chromosome lengths as comment lines before the genome index
    with open(unstranded_genome_index, "w") as unstranded_index_fh, open(stranded_genome_index, "w") as stranded_index_fh:
        for key, value in chrom_sizes.items():
            unstranded_index_fh.write("#%s\t%s\n" % (key, value))
            stranded_index_fh.write("#%s\t%s\n" % (key, value))
    # Progress bar to track the genome indexes creation
    nb_lines = sum(1 for _ in open(annotation))
    # pbar = progressbar.ProgressBar(widgets=["Indexing the genome ", progressbar.Percentage(), " ", progressbar.Bar(), progressbar.Timer()], maxval=nb_lines).start()
    # Browsing the GTF file and writing into genome index files
    with open(annotation, "r") as gtf_fh:
        for line in gtf_fh:
            i += 1
            # Update the progressbar every 1k lines
            # if i % 1000 == 1:
            # pbar.update(i)
            # Processing lines except comment ones
            if not line.startswith("#"):
                # Getting the line info
                line_split = line.rstrip().split("\t")
                chrom = line_split[0]
                cat = line_split[2]
                # GTF is 1-based inclusive; convert start to 0-based half-open
                start = int(line_split[3]) - 1
                stop = int(line_split[4])
                strand = line_split[6]
                antisense = reverse_strand[strand]
                # NOTE(review): assumes the attributes field contains a "...biotype "xxx";" entry;
                # an annotation without it would raise IndexError -- confirm for the supported GTFs.
                biotype = line_split[8].split("biotype")[1].split(";")[0].strip('" ')
                # Registering stored features info in the genome index file(s) if the new line concerns a new chromosome or the new line concerns an annotation not overlapping previously recorded ones
                if start > max_value or chrom != prev_chrom:
                    # Write the previous features
                    if intervals_dict:
                        register_interval(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index)
                    prev_chrom = chrom
                    # (Re)Initializing the intervals info dict
                    intervals_dict = {strand: {start: {biotype: [cat]}, stop: {}}, antisense: {start: {}, stop: {}}}
                    max_value = stop

                # Update the dictionary which represents intervals for every distinct annotation
                else:
                    # Storing the intervals on the strand of the current feature
                    stranded_intervals = intervals_dict[strand]
                    added_info = False # Variable to know if the features info were already added
                    # Browsing the existing boundaries
                    for boundary in sorted(stranded_intervals):
                        # While the GTF line start is after the browsed boundary: keep the browsed boundary features info in case the GTF line start is before the next boundary
                        if boundary < start:
                            stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]

                        # The GTF line start is already an existing boundary: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
                        elif boundary == start:
                            stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]
                            # Adding the GTF line features info to the interval
                            try:
                                stranded_intervals[boundary][biotype] = stranded_intervals[boundary][biotype] + [cat]
                            except KeyError: # If the GTF line features info regard an unregistered biotype
                                stranded_intervals[boundary][biotype] = [cat]
                            added_info = True # The features info were added

                        # The browsed boundary is after the GTF line start: add the GTF line features info
                        elif boundary > start:
                            # Create a new boundary for the GTF line start if necessary (if it is between 2 existing boundaries, it was not created before)
                            if not added_info:
                                stranded_intervals[start] = copy.deepcopy(stored_feat_strand)
                                #stranded_intervals[start][biotype] = [cat]
                                try:
                                    stranded_intervals[start][biotype].append(cat)
                                except KeyError:
                                    stranded_intervals[start][biotype] = [cat]
                                intervals_dict[antisense][start] = copy.deepcopy(stored_feat_antisense)
                                added_info = True # The features info were added
                            # While the browsed boundary is before the GTF line stop: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
                            if boundary < stop:
                                stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]
                                try:
                                    stranded_intervals[boundary][biotype] = stranded_intervals[boundary][biotype] + [cat]
                                except KeyError:
                                    stranded_intervals[boundary][biotype] = [cat]
                            # The GTF line stop is already exists, nothing more to do, the GTF line features info are integrated
                            elif boundary == stop:
                                break
                            # The browsed boundary is after the GTF line stop: create a new boundary for the GTF line stop (with the stored features info)
                            else: # boundary > stop
                                stranded_intervals[stop] = copy.deepcopy(stored_feat_strand)
                                intervals_dict[antisense][stop] = copy.deepcopy(stored_feat_antisense)
                                break # The GTF line features info are integrated
                    # If the GTF line stop is after the last boundary, extend the dictionary
                    if stop > max_value:
                        max_value = stop
                        stranded_intervals[stop] = {}
                        intervals_dict[antisense][stop] = {}

    # Store the categories of the last chromosome
    # NOTE(review): "chrom" here is the loop variable of the last processed line; if the GTF
    # contained only comment lines this raises NameError -- confirm inputs are never empty.
    register_interval(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index)
    # pbar.finish()
441
442
def generate_bedgraph_files(sample_labels, bam_files):
    """ Creates BedGraph files from BAM ones and return filenames and labels.

    sample_labels: labels used to name the output BedGraph files.
    bam_files: input BAM file paths, paired with sample_labels by position.

    Depending on the module-level options.strandness, produces either
    "<label>.plus.bedgraph"/"<label>.minus.bedgraph" (stranded protocols, with the
    strands swapped for reverse protocols) or a single "<label>.bedgraph".
    Returns None (output goes to files).

    Security fix: the previous implementation interpolated labels/paths into a
    shell=True command string (shell-injectable and whitespace-fragile); commands are
    now argument lists with an explicit stdout redirection, shell=False.
    """
    for sample_label, bam_file in zip(sample_labels, bam_files):
        # Map each output suffix to the bedtools strand filter to use (None = unstranded)
        if options.strandness in ["forward", "fr-firststrand"]:
            strand_outputs = [("+", ".plus.bedgraph"), ("-", ".minus.bedgraph")]
        elif options.strandness in ["reverse", "fr-secondstrand"]:
            # Reverse protocols: reads on "-" represent the plus-strand signal and vice versa
            strand_outputs = [("-", ".plus.bedgraph"), ("+", ".minus.bedgraph")]
        else:
            strand_outputs = [(None, ".bedgraph")]
        for strand, suffix in strand_outputs:
            command = ["bedtools", "genomecov", "-bg", "-split"]
            if strand is not None:
                command += ["-strand", strand]
            command += ["-ibam", bam_file]
            with open(sample_label + suffix, "w") as bedgraph_fh:
                subprocess.call(command, stdout=bedgraph_fh)
    return None
480
481
def read_gtf(gtf_index_file, sign):
    """ Read lines from the genome index file until one on strand `sign` is found.

    Sets the module-level globals gtf_line, gtf_chrom, gtf_start, gtf_stop and
    gtf_features from that line. Returns endGTF: True when the end of the file was
    reached (the globals are then left untouched), otherwise the pre-existing global
    value (the caller is expected to have set endGTF = False beforehand).
    """
    global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
    strand = ""
    while strand != sign:
        gtf_line = gtf_index_file.readline()
        # If the GTF file is finished
        if not gtf_line:
            endGTF = True
            return endGTF
        splitline = gtf_line.rstrip().split("\t")
        try:
            # Index line layout: chrom, start, stop, strand, features...
            strand = splitline[3]
        # strand information can not be found in the file file header
        except IndexError:
            pass
    gtf_chrom = splitline[0]
    gtf_features = splitline[4:]
    gtf_start, gtf_stop = map(int, splitline[1:3])
    return endGTF
501
502
def read_counts(sample_labels, counts_files):
    """ Reads the counts from an input file.

    sample_labels: labels to key the per-sample counters with.
    counts_files: count file paths, paired with sample_labels by position. Each
    non-comment line holds "category,biotype<TAB>sample_count<TAB>genome_count".

    Returns (cpt, cpt_genome): per-sample and genome-wide counters keyed by
    (category, biotype) tuples.
    """
    cpt = {}
    cpt_genome = {}
    for sample_label, counts_path in zip(sample_labels, counts_files):
        sample_counts = {}
        cpt[sample_label] = sample_counts
        with open(counts_path, "r") as counts_fh:
            for line in counts_fh:
                # Header/comment lines start with '#'
                if line.startswith("#"):
                    continue
                fields = line.rstrip().split("\t")
                feature = tuple(fields[0].split(","))
                sample_counts[feature] = float(fields[1])
                cpt_genome[feature] = float(fields[2])
    return cpt, cpt_genome
524
525
def get_chromosome_names_in_index(genome_index):
    """ Returns the chromosome names in a genome index file.

    Appends any chromosome not yet known to the module-level index_chrom_list
    (header lines starting with '#' are skipped) and returns that list.
    """
    with open(genome_index, "r") as index_fh:
        for line in index_fh:
            if line.startswith("#"):
                continue
            chrom = line.split("\t")[0]
            if chrom not in index_chrom_list:
                index_chrom_list.append(chrom)
    return index_chrom_list
533
534
def read_index():
    """ Parse index files info (chromosomes list, lengths and genome features).

    Operates entirely on module-level globals: reads the "genome_index" path, fills
    the "lengths" dict ('#'-prefixed header lines), appends to "index_chrom_list"
    and accumulates genome-wide feature sizes into "cpt_genome" via
    count_genome_features().
    """
    with open(genome_index, "r") as index_fh:
        for line in index_fh:
            if line.startswith("#"):
                # Header line: "#<chrom>\t<length>"
                lengths[line.split("\t")[0][1:]] = int(line.split("\t")[1])
            else:
                chrom = line.split("\t")[0]
                if chrom not in index_chrom_list:
                    index_chrom_list.append(chrom)
                # Fields 5+ are the biotype:categories features; fields 2-3 are start/stop
                count_genome_features(cpt_genome, line.rstrip().split("\t")[4:], line.split("\t")[1], line.split("\t")[2])
546
547
def intersect_bedgraphs_and_index_to_counts_categories(sample_labels, bedgraph_files, biotype_prios=None): ## MB: To review
    """ Intersect each sample's BedGraph coverage with the genome index and count
    nucleotides per (category, biotype) pair.

    sample_labels: sample names keying the returned counter.
    bedgraph_files: BedGraph basenames (without extension), paired with sample_labels;
                    when empty, the labels themselves are used as basenames.
    biotype_prios: unused here (kept for interface compatibility).

    Returns cpt: {sample_label: {(category, biotype): nucleotide_count}}.

    Relies on module-level globals: options (strandness), index_chrom_list,
    genome_index, and the read_gtf()/count_genome_features() helpers which
    communicate through the gtf_* globals.
    """
    global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF
    unknown_chrom = []
    cpt = {}  # Counter for the nucleotides in the BAM input file(s)
    if bedgraph_files == []:
        bedgraph_files = sample_labels
    for sample_label, bedgraph_basename in zip(sample_labels, bedgraph_files):
        # Initializing the category counter dict for this sample
        init_dict(cpt, sample_label, {})
        bg_extension = ".bedgraph"
        if options.strandness == "unstranded":
            strands = [("", ".")]
            # Count the lines to process (also validates that the BedGraph file exists)
            try:
                nb_lines = sum(1 for _ in open(bedgraph_basename + bg_extension))
            except IOError:
                # Bug fix: was 'bg' (missing leading dot), producing names like "<basename>bg"
                bg_extension = ".bg"
                nb_lines = sum(1 for _ in open(bedgraph_basename + bg_extension))
        else:
            strands = [(".plus", "+"), (".minus", "-")]
            try:
                nb_lines = sum(1 for _ in open(bedgraph_basename + ".plus" + bg_extension)) + sum(1 for _ in open(bedgraph_basename + ".minus" + bg_extension))
            except IOError:
                # Bug fix: the fallback previously retried the exact same ".bedgraph" names
                # and raised the same IOError again; now it tries the short ".bg" extension
                bg_extension = ".bg"
                nb_lines = sum(1 for _ in open(bedgraph_basename + ".plus" + bg_extension)) + sum(1 for _ in open(bedgraph_basename + ".minus" + bg_extension))
        i = 0  # Processed-line counter (kept for the disabled progress bar)

        # Intersecting the BedGraph and index files
        for strand, sign in strands:
            prev_chrom = ""
            endGTF = False  # Reaching the next chr or the end of the GTF index
            intergenic_adds = 0.0
            with open(bedgraph_basename + strand + bg_extension, "r") as bedgraph_fh:
                # Running through the BedGraph file
                for bam_line in bedgraph_fh:
                    i += 1
                    # Getting the BAM line info
                    bam_chrom = bam_line.split("\t")[0]
                    bam_start, bam_stop, bam_cpt = map(float, bam_line.split("\t")[1:4])
                    # Skip the line if the chromosome is not in the index (warn once per chromosome)
                    if bam_chrom not in index_chrom_list:
                        if bam_chrom not in unknown_chrom:
                            unknown_chrom.append(bam_chrom)
                            sys.stdout.write("\r \r Chromosome '" + bam_chrom + "' not found in index.\n")
                        continue
                    # If this is a new chromosome (or the first one)
                    if bam_chrom != prev_chrom:
                        intergenic_adds = 0.0
                        # (Re)opening the GTF index and looking for the first line of the matching chr
                        try:
                            gtf_index_file.close()
                        except UnboundLocalError:
                            pass
                        gtf_index_file = open(genome_index, "r")
                        endGTF = False
                        read_gtf(gtf_index_file, sign)
                        while bam_chrom != gtf_chrom:
                            read_gtf(gtf_index_file, sign)
                            if endGTF:
                                break
                        prev_chrom = bam_chrom

                    # Looking for the first matching annotation in the GTF index
                    while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop):
                        read_gtf(gtf_index_file, sign)
                        if gtf_chrom != bam_chrom:
                            endGTF = True
                    # Processing BAM lines before the first GTF annotation if there are
                    if bam_start < gtf_start:
                        # Increase the "intergenic" category counter with all or part of the BAM interval
                        intergenic_adds += min(bam_stop, gtf_start) - bam_start
                        try:
                            cpt[sample_label][("intergenic", "intergenic")] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt
                        except KeyError:
                            cpt[sample_label][("intergenic", "intergenic")] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt
                        # Go to next line if the BAM interval was totally upstream the first GTF annotation, carry on with the remaining part otherwise
                        if endGTF or (bam_stop <= gtf_start):
                            continue
                        else:
                            bam_start = gtf_start

                    # We can start the crossover
                    while not endGTF:
                        # Update category counter
                        count_genome_features(cpt[sample_label], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt)
                        # Read the next GTF file line if the BAM line is not entirely covered
                        if bam_stop > gtf_stop:
                            # Update the BAM start pointer
                            bam_start = gtf_stop
                            endGTF = read_gtf(gtf_index_file, sign)
                            # If we read a new chromosome in the GTF file then it is considered finished
                            if bam_chrom != gtf_chrom:
                                endGTF = True
                            if endGTF:
                                break
                        else:
                            # Next if the BAM line is entirely covered
                            bam_start = bam_stop
                            break

                    # Processing the end of the BAM line if necessary
                    if endGTF and (bam_stop > bam_start):
                        try:
                            cpt[sample_label][("intergenic", "intergenic")] += (bam_stop - bam_start) * bam_cpt
                        except KeyError:
                            cpt[sample_label][("intergenic", "intergenic")] = (bam_stop - bam_start) * bam_cpt
            # NOTE(review): gtf_index_file is unbound if the BedGraph had no indexed chromosome -- confirm inputs
            gtf_index_file.close()
    return cpt
678
679
def write_counts_in_files(cpt, genome_counts):
    """ Writes, for every sample, a 'LABEL.feature_counts.tsv' file listing each
    (category, biotype) pair with its read count in the sample and its size in the genome. """
    header = "#Category,biotype\tCounts_in_bam\tSize_in_genome\n"
    for raw_label, feature_counters in cpt.items():
        # Sanitize the sample label so it can safely be used as a filename
        safe_label = "_".join(re.findall(r"[\w\-']+", raw_label))
        with open(safe_label + ".feature_counts.tsv", "w") as out:
            out.write(header)
            for feature_pair, read_count in feature_counters.items():
                out.write("%s\t%s\t%s\n" % (",".join(feature_pair), read_count, genome_counts[feature_pair]))
688
689
def recategorize_the_counts(cpt, cpt_genome, final):
    """ Sums the detailed per-category counters into the final (grouped) categories,
    both per sample and for the genome-wide category sizes.

    "final" maps each final category name to the list of raw categories it merges. """
    grouped_sample_cpt = {}
    grouped_genome_cpt = {}
    for sample in cpt:
        grouped_sample_cpt[sample] = {}
        for group_name, group_members in final.items():
            # Accumulate the raw counters belonging to this final category
            grouped_sample_cpt[sample][group_name] = sum(cpt[sample][member] for member in group_members)
            # Genome totals are identical for every sample; the last write wins harmlessly
            grouped_genome_cpt[group_name] = sum(cpt_genome[member] for member in group_members)
    return grouped_sample_cpt, grouped_genome_cpt
707
708
def group_counts_by_categ(cpt, cpt_genome, final, selected_biotype):
    """ Groups the (category, biotype) counters by final category.

    Returns three dicts: per-sample grouped counts, genome-wide grouped sizes,
    and per-sample grouped counts restricted to "selected_biotype". """
    grouped_cpt = {}
    grouped_genome_cpt = {}
    biotype_cpt = {}
    for sample in cpt:
        grouped_cpt[sample] = {}
        biotype_cpt[sample] = {}
        for final_cat, raw_categs in final.items():
            tot = 0
            tot_filter = 0
            tot_genome = 0
            for raw_categ in raw_categs:
                for feature, count in cpt[sample].items():
                    if feature[0] != raw_categ:
                        continue
                    tot += count
                    tot_genome += cpt_genome[feature]
                    if feature[1] == selected_biotype:
                        tot_filter += count
            grouped_cpt[sample][final_cat] = tot
            # A pseudo-size avoids a division by zero downstream when the
            # category is absent from the genome
            grouped_genome_cpt[final_cat] = tot_genome if tot_genome != 0 else 1e-100
            biotype_cpt[sample][final_cat] = tot_filter
    # if "antisense" in final_genome_cpt: final_genome_cpt["antisense"] = 0
    return grouped_cpt, grouped_genome_cpt, biotype_cpt
737
738
def group_counts_by_biotype(cpt, cpt_genome, biotypes):
    """ Groups the (category, biotype) counters by biotype or biotype group.

    Args:
        cpt: {sample: {(category, biotype): count}} read counters.
        cpt_genome: {(category, biotype): size} genome sizes.
        biotypes: either a mapping {group_name: [biotype, ...]} or a plain
            collection (e.g. a set) of individual biotype names.

    Returns:
        (per-sample grouped counts, genome-wide grouped sizes). Biotypes with
        no reads in a sample are omitted from that sample's counters, but
        their genome size is still recorded.
    """
    final_cpt = {}
    final_genome_cpt = {}
    for sample in cpt:
        final_cpt[sample] = {}
        for biot in biotypes:
            try:
                members = biotypes[biot]
            # Original code used a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit and could double-count after a
            # partial accumulation; only the set-indexing TypeError marks
            # the legitimate "plain biotype, not a group" fallback.
            except TypeError:
                members = [biot]
            tot = 0
            tot_genome = 0
            for feature, count in cpt[sample].items():
                if feature[1] in members:
                    tot += count
                    tot_genome += cpt_genome[feature]
            if tot != 0:
                final_cpt[sample][biot] = tot
            final_genome_cpt[biot] = tot_genome
    return final_cpt, final_genome_cpt
763
764
765 # def get_cmap(N):
766 # '''Returns a function that maps each index in 0, 1, ... N-1 to a distinct
767 # RGB color.'''
768 # color_norm = colors.Normalize(vmin=0, vmax=N-1)
769 # scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
770 # def map_index_to_rgb_color(index):
771 # return scalar_map.to_rgba(index)
772 # return map_index_to_rgb_color
773
def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type,
                    title, sample_labels):
    """ Draws the single-sample figure: two barplots (read proportions per
    feature and genome-normalized enrichment) plus the two matching piecharts.

    Args:
        ordered_categs: features (categories or biotypes) in display order.
        percentages: proportion of reads per feature for this sample.
        enrichment: genome-normalized value per feature.
        n_cat: number of features (len(ordered_categs)).
        index / index_enrichment: x positions of the bars for each subplot.
        bar_width: width of each bar.
        counts_type: "categories" or "biotypes" (used in the titles).
        title: optional custom title for the first subplot.
        sample_labels: sample labels; only the first one is displayed.

    Returns:
        (figure, proportions axis, enrichment axis).
    """
    ### Initialization
    fig = plt.figure(figsize=(13, 9))
    # Barplots on the left half, piecharts on the right column of the grid
    ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=2)
    cmap = plt.get_cmap("Spectral")
    # One distinct color per feature, evenly spread over the colormap
    cols = [cmap(x) for x in xrange(0, 256, 256 / n_cat)]
    if title:
        # NOTE(review): there is probably a missing space before "in:" here
        # (runtime string, left untouched) — confirm intended output
        ax1.set_title(title + "in: %s" % sample_labels[0])
    else:
        ax1.set_title(counts_type + " distribution in mapped reads in: %s" % sample_labels[0])
    ax2.set_title("Normalized counts of " + counts_type)

    ### Barplots
    # First barplot: percentage of reads in each category
    ax1.bar(index, percentages, bar_width,
            color=cols)
    # Second barplot: enrichment relative to the genome for each categ
    # (the reads count in a categ is divided by the categ size in the genome)
    ax2.bar(index_enrichment, enrichment, bar_width,
            color=cols, )
    ### Piecharts
    # Only label wedges bigger than 2.5% to keep the piecharts readable
    pielabels = [ordered_categs[i] if percentages[i] > 0.025 else "" for i in xrange(n_cat)]
    sum_enrichment = numpy.sum(enrichment)
    pielabels_enrichment = [ordered_categs[i] if enrichment[i] / sum_enrichment > 0.025 else "" for i in xrange(n_cat)]
    # Categories piechart
    ax3 = plt.subplot2grid((2, 4), (0, 2))
    pie_wedge_collection, texts = ax3.pie(percentages, labels=pielabels, shadow=True, colors=cols)
    # Enrichment piechart
    ax4 = plt.subplot2grid((2, 4), (1, 2))
    pie_wedge_collection, texts = ax4.pie(enrichment, labels=pielabels_enrichment, shadow=True, colors=cols)
    # Add legends (append percentages to labels)
    labels = [" ".join((ordered_categs[i], "({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
    ax3.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
               bbox_to_anchor=(1.7, 0.5))
    labels = [" ".join((ordered_categs[i], "({:.1%})".format(enrichment[i] / sum_enrichment))) for i in
              range(len(ordered_categs))]  # if ordered_categs[i] != "antisense"]
    ax4.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
               bbox_to_anchor=(1.7, 0.5))
    # Set aspect ratio to be equal so that pie is drawn as a circle
    ax3.set_aspect("equal")
    ax4.set_aspect("equal")
    return fig, ax1, ax2
818
819
#def make_plot(ordered_categs, sample_names, categ_counts, genome_counts, pdf, counts_type, threshold, title=None,
def make_plot(sample_labels, ordered_categs, categ_counts, genome_counts, pdf, counts_type, threshold, title=None, svg=None, png=None,
              categ_groups=[]): # MB: to review
    # NOTE(review): "categ_groups=[]" is a mutable default argument; harmless
    # here because it is never mutated, but worth replacing with None
    """ Draws the multi-sample distribution figure for categories or biotypes:
    one barplot of read proportions per feature and one barplot of enrichment
    relative to the genome, then saves it (pdf/svg/png) or shows it.

    Args:
        sample_labels: ordered sample labels (keys of "categ_counts").
        ordered_categs: features (categories or biotypes) in display order.
        categ_counts: {label: {feature: count}} read counts per sample.
        genome_counts: {feature: size} feature sizes in the genome.
        pdf: opened PdfPages object to append the figure to, or None.
        counts_type: "categories" or "biotypes" (titles and output basenames).
        threshold: (ymin, ymax) pair clipping the enrichment plot, or None.
        title: optional custom title for the proportions subplot.
        svg / png: True to save under a default name, or a basename string.
        categ_groups: parent-category annotations, a list of
            {name: [xmin, xmax]} dicts passed to annotate_group().
    """
    # From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample.
    existing_categs = set()
    for sample in categ_counts.values():
        existing_categs |= set(sample.keys())
    ordered_categs = filter(existing_categs.__contains__, ordered_categs)
    # Multi-word category names are wrapped on several lines ("undescribed_x" is abbreviated "und.")
    xlabels = [cat if len(cat.split("_")) == 1 else "\n".join(cat.split("_")) if cat.split("_")[0] != 'undescribed' else "\n".join(["und.",cat.split("_")[1]]) for cat in ordered_categs]
    n_cat = len(ordered_categs)
    #n_exp = len(sample_names)
    nb_samples = len(categ_counts)
    ##Initialization of the matrix of counts (nrow=nb_experiements, ncol=nb_categorie)
    #counts = numpy.matrix(numpy.zeros(shape=(n_exp, n_cat)))
    counts = numpy.matrix(numpy.zeros(shape=(nb_samples, n_cat)))
    '''
    for exp in xrange(len(sample_names)):
        for cat in xrange(len(ordered_categs)):
            try:
                counts[exp, cat] = categ_counts[sample_names[exp]][ordered_categs[cat]]
            except:
                pass
    '''
    # Missing features keep their zero count (hence the try/except)
    for sample_label in sample_labels:
        for cat in xrange(len(ordered_categs)):
            try:
                counts[sample_labels.index(sample_label), cat] = categ_counts[sample_label][ordered_categs[cat]]
            except:
                pass

    ##Normalize the categorie sizes by the total size to get percentages
    sizes = []
    sizes_sum = 0
    for cat in ordered_categs:
        sizes.append(genome_counts[cat])
        sizes_sum += genome_counts[cat]
    # "antisense" is an artificial category: give it a pseudo-size so the
    # division below cannot fail; its enrichment is masked further down
    if "antisense" in ordered_categs:
        antisense_pos = ordered_categs.index("antisense")
        sizes[antisense_pos] = 1e-100
    for cpt in xrange(len(sizes)):
        sizes[cpt] /= float(sizes_sum)

    ## Create array which contains the percentage of reads in each categ for every sample
    percentages = numpy.array(counts / numpy.sum(counts, axis=1))
    ## Create the enrichment array (counts divided by the categorie sizes in the genome)
    enrichment = numpy.array(percentages / sizes)
    if "antisense_pos" in locals():
        '''
        for i in xrange(len(sample_names)):
            enrichment[i][antisense_pos] = 0
        '''
        for n in xrange(nb_samples):
            enrichment[n][antisense_pos] = 0

    # enrichment=numpy.log(numpy.array(percentages/sizes))
    # Rescale the enrichment ratios symmetrically around 0:
    # ratio r>1 becomes r-1, ratio r<1 becomes -(1/r)+1 (depletion)
    #for exp in xrange(n_exp):
    for n in xrange(nb_samples):
        for i in xrange(n_cat):
            val = enrichment[n][i]
            if val > 1:
                enrichment[n][i] = val - 1
            elif val == 1 or val == 0:
                enrichment[n][i] = 0
            else:
                enrichment[n][i] = -1 / val + 1

    #### Finally, produce the plot

    ##Get the colors from the colormap
    # 16 hand-picked colors; fall back to the "Set3" colormap beyond that
    ncolor = 16
    cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5",
            "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
    '''
    if n_exp > ncolor:
        cmap = plt.get_cmap("Set3", n_exp)
        cmap = [cmap(i) for i in xrange(n_exp)]
    '''
    if nb_samples > ncolor:
        cmap = plt.get_cmap("Set3", nb_samples)
        cmap = [cmap(i) for i in xrange(nb_samples)]

    ## Parameters for the plot
    opacity = 1
    # Create a vector which contains the position of each bar
    index = numpy.arange(n_cat)
    # Size of the bars (depends on the categs number)
    #bar_width = 0.9 / n_exp
    bar_width = 0.9 / nb_samples

    ##Initialise the subplot
    # if there is only one sample, also plot piecharts
    # if n_exp == 1 and counts_type.lower() == 'categories':
    #    fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
    ## If more than one sample
    # else:
    # NOTE(review): both branches below are currently identical — confirm
    # whether a distinct figsize for "categories" was intended
    if counts_type.lower() != "categories":
        #fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
        fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * nb_samples) / 3, 10))
    else:
        #fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
        fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * nb_samples) / 3, 10))
    # Store the bars objects for enrichment plot
    rects = []
    # For each sample/experiment
    #for i in range(n_exp):
    for sample_label in sample_labels:
        # First barplot: percentage of reads in each categorie
        n = sample_labels.index(sample_label)
        #ax1.bar(index + i * bar_width, percentages[i], bar_width,
        ax1.bar(index + n * bar_width, percentages[n], bar_width,
                alpha=opacity,
                #color=cmap[i],
                color=cmap[n],
                #label=sample_names[i], edgecolor="#FFFFFF", lw=0)
                label=sample_label, edgecolor="#FFFFFF", lw=0)
        # Second barplot: enrichment relative to the genome for each categ
        # (the reads count in a categ is divided by the categ size in the genome)
        rects.append(ax2.bar(index + n * bar_width, enrichment[n], bar_width,
                             alpha=opacity,
                             #color=cmap[i],
                             color=cmap[n],
                             #label=sample_names[i], edgecolor=cmap[i], lw=0))
                             label=sample_label, edgecolor=cmap[n], lw=0))

    ## Graphical options for the plot
    # Adding of the legend (more columns when there are many samples)
    #if n_exp < 10:
    if nb_samples < 10:
        ax1.legend(loc="best", frameon=False)
        legend_ncol = 1
    #elif n_exp < 19:
    elif nb_samples < 19:
        legend_ncol = 2
    else:
        legend_ncol = 3
    ax1.legend(loc="best", frameon=False, ncol=legend_ncol)
    ax2.legend(loc="best", frameon=False, ncol=legend_ncol)
    # ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
    # Main titles
    if title:
        ax1.set_title(title)
    else:
        ax1.set_title(counts_type + " counts")
    ax2.set_title(counts_type + " normalized counts")

    # Adding enrichment baseline
    # ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
    # Axes limits
    ax1.set_xlim(-0.1, len(ordered_categs) + 0.1)
    if len(sizes) == 1: ax1.set_xlim(-2, 3)
    ax2.set_xlim(ax1.get_xlim())
    # Set axis limits (max/min values + 5% margin)
    ax2_ymin = []
    ax2_ymax = []
    for sample_values in enrichment:
        ax2_ymin.append(min(sample_values))
        ax2_ymax.append(max(sample_values))
    ax2_ymax = max(ax2_ymax)
    ax2_ymin = min(ax2_ymin)
    margin_top, margin_bottom = (abs(0.05 * ax2_ymax), abs(0.05 * ax2_ymin))
    ax1.set_ylim(0, ax1.get_ylim()[1] * 1.05)
    if threshold:
        # Convert the user thresholds into the rescaled enrichment space
        # (the +/-1 mirrors the r-1 / -(1/r)+1 rescaling above)
        threshold_bottom = -abs(float(threshold[0])) + 1
        threshold_top = float(threshold[1]) - 1

        # Clip every bar that overshoots the thresholds and drop the margin
        # on the clipped side
        #for i in xrange(n_exp):
        for n in xrange(nb_samples):
            for y in xrange(n_cat):
                #val = enrichment[i][y]
                val = enrichment[n][y]
                if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top):
                    #rect = rects[i][y]
                    rect = rects[n][y]
                    rect_height = rect.get_height()
                    if rect.get_y() < 0:
                        diff = rect_height + threshold_bottom
                        rect.set_y(threshold_bottom)
                        ax2_ymin = threshold_bottom
                        margin_bottom = 0
                    else:
                        diff = rect_height - threshold_top
                        ax2_ymax = threshold_top
                        margin_top = 0
                    rect.set_height(rect.get_height() - diff)
    if margin_top != 0 and margin_bottom != 0:
        margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)]
    ax2.set_ylim(ax2_ymin - margin_bottom, ax2_ymax + margin_top)
    # Y axis title
    ax1.set_ylabel("Proportion of reads (%)")
    ax2.set_ylabel("Enrichment relative to genome")


    # Add the categories on the x-axis
    #ax1.set_xticks(index + bar_width * n_exp / 2)
    ax1.set_xticks(index + bar_width * nb_samples / 2)
    #ax2.set_xticks(index + bar_width * n_exp / 2)
    ax2.set_xticks(index + bar_width * nb_samples / 2)
    if counts_type.lower() != "categories":
        ax1.set_xticklabels(ordered_categs, rotation="30", ha="right")
        ax2.set_xticklabels(ordered_categs, rotation="30", ha="right")
    else:
        ax1.set_xticklabels(xlabels)
        ax2.set_xticklabels(xlabels)

    # Display fractions values in percentages
    ax1.set_yticklabels([str(int(i * 100)) for i in ax1.get_yticks()])
    # Correct y-axis ticks labels for enrichment subplot
    # ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
    yticks = list(ax2.get_yticks())
    yticks = [yticks[i] - 1 if yticks[i] > 9 else yticks[i] + 1 if yticks[i] < -9 else yticks[i] for i in
              xrange(len(yticks))]
    ax2.set_yticks(yticks)
    # Display the rescaled values as "x^{+1}" / "x^{-1}" fold-change labels
    ax2.set_yticklabels([str(int(i + 1)) + "$^{+1}$" if i > 0 and i % 1 == 0 else str(
        i + 1) + "$^{+1}$" if i > 0 else 1 if i == 0 else str(
        int(-(i - 1))) + "$^{-1}$" if i < 0 and i % 1 == 0 else str(-(i - 1)) + "$^{-1}$" for i in ax2.get_yticks()])
    # ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()])
    # Change appearance of 'antisense' bars on enrichment plot since we cannot calculate an enrichment for this artificial category
    if "antisense_pos" in locals():  # ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA')
        #for i in xrange(n_exp):
        for n in xrange(nb_samples):
            #rect = rects[i][antisense_pos]
            rect = rects[n][antisense_pos]
            rect.set_y(ax2.get_ylim()[0])
            rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0])
            rect.set_hatch("/")
            rect.set_fill(False)
            rect.set_linewidth(0)
            # rect.set_color('lightgrey')
            # rect.set_edgecolor('#EDEDED')
            rect.set_color("#EDEDED")
        #ax2.text(index[antisense_pos] + bar_width * n_exp / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, "NA")
        ax2.text(index[antisense_pos] + bar_width * nb_samples / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, "NA")
    # Add text for features absent in sample
    #for i in xrange(n_exp):
    for n in xrange(nb_samples):
        for y in xrange(n_cat):
            #if percentages[i][y] == 0:
            if percentages[n][y] == 0:
                txt = ax1.text(y + bar_width * (n + 0.5), 0.02, "Abs.", rotation="vertical", color=cmap[n],
                               horizontalalignment="center", verticalalignment="bottom")
                txt.set_path_effects([PathEffects.Stroke(linewidth=0.5), PathEffects.Normal()])
            #elif enrichment[i][y] == 0:
            elif enrichment[n][y] == 0:
                #rects[i][y].set_linewidth(1)
                rects[n][y].set_linewidth(1)

    # Remove top/right/bottom axes
    for ax in [ax1, ax2]:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["bottom"].set_visible(False)
        ax.tick_params(axis="x", which="both", bottom="on", top="off", labelbottom="on")
        ax.tick_params(axis="y", which="both", left="on", right="off", labelleft="on")


    ### Add second axis with categ groups
    annotate_group(categ_groups, label=None, ax=ax1)
    annotate_group(categ_groups, label=None, ax=ax2)

    ### Adjust figure margins to
    if counts_type.lower() == "categories":
        plt.tight_layout(h_pad=5.0)
        fig.subplots_adjust(bottom=0.1)
    else:
        plt.tight_layout()

    ## Showing the plot
    if pdf:  ## TODO: allow several output formats
        pdf.savefig()
        plt.close()
    elif svg:
        # svg is True when the option was given without an argument
        if svg == True:
            plt.savefig(counts_type + ".svg")
        else:
            if os.path.splitext(svg)[1] == ".svg":
                plt.savefig(".".join((os.path.splitext(svg)[0], counts_type, "svg")))
            else:
                plt.savefig(".".join((svg, counts_type, "svg")))
    elif png:
        # png is True when the option was given without an argument
        if png == True:
            plt.savefig(counts_type + ".png")
        else:
            if os.path.splitext(png)[1] == ".png":
                plt.savefig(".".join((os.path.splitext(png)[0], counts_type, "png")))
            else:
                plt.savefig(".".join((png, counts_type, "png")))
    else:
        plt.show()
    ## Save on disk it as a png image
    # fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight')
1110
def annotate_group(groups, ax=None, label=None, labeloffset=30):
    """ Annotates the categories with their parent group(s) below the x-axis
    and optionally adds an x-axis label.

    Args:
        groups: list of levels; each level is a dict {group_name: [xmin, xmax]}
            giving the x-range (in data coordinates) spanned by the group.
        ax: target axis (defaults to the current axis).
        label: optional x-axis label placed under the group annotations.
        labeloffset: base vertical offset (points) for that label.
    """

    def annotate(ax, name, left, right, y, pad):
        """Draw half of the group bracket: an elbow line from one end of the
        group span to its center, with the group name at the center."""
        arrow = ax.annotate(name, xy=(left, y), xycoords="data",
                            xytext=(right, y - pad), textcoords="data",
                            annotation_clip=False, verticalalignment="top",
                            horizontalalignment="center", linespacing=2.0,
                            arrowprops={'arrowstyle': "-", 'shrinkA': 0, 'shrinkB': 0,
                                        'connectionstyle': "angle,angleB=90,angleA=0,rad=5"}
                            )
        return arrow

    if ax is None:
        ax = plt.gca()
    # "level" keeps its last value for the label positioning below
    level = 0
    for level in range(len(groups)):
        grp = groups[level]
        for name, coord in grp.items():
            # Each deeper level is drawn a bit lower under the axis
            ymin = ax.get_ylim()[0] - np.ptp(ax.get_ylim()) * 0.12 - np.ptp(ax.get_ylim()) * 0.05 * (level)
            ypad = 0.01 * np.ptp(ax.get_ylim())
            xcenter = np.mean(coord)
            # Draw the bracket from both ends toward the centered name
            annotate(ax, name, coord[0], xcenter, ymin, ypad)
            annotate(ax, name, coord[1], xcenter, ymin, ypad)

    if label is not None:
        # Define xlabel and position it according to the number of group levels
        ax.annotate(label,
                    xy=(0.5, 0), xycoords="axes fraction",
                    xytext=(0, -labeloffset - (level + 1) * 15), textcoords="offset points",
                    verticalalignment="top", horizontalalignment="center")

    return
1145
def filter_categs_on_biotype(selected_biotype, cpt):
    """ Keeps, for every sample, only the counts whose biotype matches
    "selected_biotype"; the returned counters are keyed by category alone. """
    restricted = {}
    for sample, counters in cpt.items():
        restricted[sample] = {feature[0]: count
                              for feature, count in counters.items()
                              if feature[1] == selected_biotype}
    return restricted
1154
1155
def unnecessary_param(param, message):
    """ Displays a warning on the standard error if "param" was provided.

    Args:
        param: the option value; any truthy value triggers the warning.
        message: the warning text to print.
    """
    if param:
        # sys.stderr.write() is valid under both Python 2 and 3, unlike the
        # previous Python-2-only "print >> sys.stderr" statement; behavior
        # (message plus trailing newline on stderr) is unchanged
        sys.stderr.write("%s\n" % message)
1160
1161
def usage_message():
    """ Returns the multi-scenario usage text displayed by the argparse parser
    (index creation, BAM/BedGraph processing, combined run, counts re-plotting). """
    return """
    Generate genome indexes:
         python ALFA.py -a GTF_FILE  [-g GENOME_INDEX]
                                         [--chr_len CHR_LENGTHS_FILE]
    Process BAM file(s):
         python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...]
                                         [--bedgraph] [-s STRAND]
                                         [-n] [--pdf output.pdf]
                                         [-d {1,2,3,4}] [-t YMIN YMAX]
    Index genome + process BAM:
         python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
                            -i BAM1 LABEL1 [BAM2 LABEL2 ...]
                            [--chr_len CHR_LENGTHS_FILE]
                            [--bedgraph][-s STRAND]
                            [-n] [--pdf output.pdf]
                            [-d {1,2,3,4}] [-t YMIN YMAX]

    Process previously created ALFA counts file(s):
         python ALFA.py -c COUNTS1 [COUNTS2 ...]
                            [-s STRAND]
                            [-n] [--pdf output.pdf]
                            [-d {1,2,3,4}] [-t YMIN YMAX]

    """
1187
1188 ##########################################################################
1189 # MAIN #
1190 ##########################################################################
1191
1192
1193 if __name__ == "__main__":
1194
1195 #### Parse command line arguments and store them in the variable options
1196 parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message())
1197 parser.add_argument("--version", action="version", version="version 1.0",
1198 help="Show ALFA version number and exit\n\n-----------\n\n")
1199 # Options regarding the index
1200 parser.add_argument("-g", "--genome_index",
1201 help="Genome index files path and basename for existing index, or path and basename for new index creation\n\n")
1202 parser.add_argument("-a", "--annotation", metavar="GTF_FILE", help="Genomic annotations file (GTF format)\n\n")
1203 parser.add_argument("--chr_len", help="Tabulated file containing chromosome names and lengths\n\n-----------\n\n")
1204
1205 # Options regarding the intersection step
1206 #parser.add_argument('-i', '--input', '--bam', dest='input', metavar=('BAM_FILE1 LABEL1', ""), nargs='+',
1207 #help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n')
1208 parser.add_argument("--bam", metavar=("BAM_FILE1 LABEL1", ""), nargs="+",
1209 help="Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n") ## MB: position AND chr??
1210 # parser.add_argument('--bedgraph', action='store_const',default = False, const = True, help="Use this options if your input file(s) is(are) already in bedgraph format\n\n")
1211 #parser.add_argument('--bedgraph', dest='bedgraph', action='store_const', default=False, const=True,
1212 #help="Use this options if your input file(s) is(are) already in bedgraph format\n\n")
1213 parser.add_argument("--bedgraph", metavar=("BEDGRAPH_FILE1 LABEL1", ""), nargs="+",
1214 help="Use this options if your input is/are BedGraph file(s).\n\n")
1215 parser.add_argument("-c", "--counts", metavar=("COUNTS_FILE", ""), nargs="+",
1216 help="Use this options instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n")
1217 #parser.add_argument('-s', '--strandness', dest="strandness", nargs=1, action='store', default=['unstranded'], choices=['unstranded', 'forward', 'reverse', 'fr-firststrand', 'fr-secondstrand'], metavar="", help="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n")
1218 parser.add_argument("-s", "--strandness", default="unstranded",
1219 choices=["unstranded", "forward", "reverse", "fr-firststrand", "fr-secondstrand"], metavar="",
1220 help="Library orientation. Choose within: 'unstranded', "
1221 "'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. "
1222 "(Default: 'unstranded')\n\n-----------\n\n")
1223
1224 # Options regarding the plot
1225 parser.add_argument("--biotype_filter", help=argparse.SUPPRESS) # "Make an extra plot of categories distribution using only counts of the specified biotype." ## MB: TO DISPLAY (no suppress)
1226 parser.add_argument("-d", "--categories_depth", type=int, default=3, choices=range(1, 5),
1227 help="Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n")
1228 parser.add_argument("--pdf", nargs="?", const="ALFA_plots.pdf",
1229 help="Save produced plots in PDF format at specified path ('categories_plots.pdf' if no argument provided).\n\n")
1230 parser.add_argument("--png", nargs="?", const="ALFA_plots.png",
1231 help="Save produced plots in PNG format with provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided.\n\n")
1232 parser.add_argument("--svg", nargs="?", const="ALFA_plots.svg",
1233 help="Save produced plots in SVG format with provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided.\n\n")
1234 parser.add_argument("-n", "--no_display", action="store_const", const=True, default=False, help="Do not display plots.\n\n") # We have to add "const=None" to avoid a bug in argparse
1235 parser.add_argument("-t", "--threshold", dest="threshold", nargs=2, metavar=("ymin", "ymax"), type=float,
1236 help="Set axis limits for enrichment plots.\n\n")
1237
1238 if len(sys.argv) == 1:
1239 parser.print_usage()
1240 sys.exit(1)
1241
1242 options = parser.parse_args()
1243
1244 '''
1245 # Booleans for steps to be executed
1246 make_index = False
1247 intersect_reads = False
1248 process_counts = False
1249
1250 #### Check arguments conformity and define which steps have to be performed
1251 print "### Checking parameters"
1252 if options.counts:
1253 # Aucun autre argument requis, precise that the other won't be used (if this is true!!)
1254 # Vérifier extension input
1255
1256 # Action: Only do the plot
1257 process_counts = True
1258 else:
1259 if options.annotation:
1260 # If '-gi' parameter is present
1261 if options.genome_index:
1262 genome_index_basename = options.genome_index
1263 else:
1264 # Otherwise the GTF filename without extension will be the basename
1265 genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
1266 # Check if the indexes were already created and warn the user
1267 if os.path.isfile(genome_index_basename + ".stranded.index"):
1268 if options.input:
1269 print >> sys.stderr, "Warning: an index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify an other path." % (
1270 genome_index_basename + ".stranded.index")
1271 else:
1272 sys.exit(
1273 "Error: an index file named %s already exists. If you want to create a new index, please delete this file or specify an other path.\n" % (
1274 genome_index_basename + ".stranded.index"))
1275 # Create them otherwise
1276 else:
1277 make_index = True
1278 # If the index is already done
1279 if options.input:
1280 # Required arguments are the input and the genome_index
1281 if 'genome_index_basename' not in locals():
1282 required_arg(options.genome_index, "-g/--genome_index")
1283 genome_index_basename = options.genome_index
1284 required_arg(options.input, "-i/--input/--bam")
1285 for i in xrange(0, len(options.input), 2):
1286 # Check whether the input file exists
1287 try:
1288 open(options.input[i])
1289 except IOError:
1290 sys.exit("Error: the input file " + options.input[i] + " was not found. Aborting.")
1291 # Check whether the input file extensions are 'bam', 'bedgraph' or 'bg' and the label extension are no
1292 try:
1293 extension = os.path.splitext(options.input[i + 1])[1]
1294 if extension in ('.bam', '.bedgraph', '.bg'):
1295 sys.exit("Error: it seems input files and associated labels are not correctly provided.\n\
1296 Make sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
1297 except:
1298 sys.exit(
1299 "Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
1300
1301 intersect_reads = True
1302 # Vérifier input's extension
1303 # TODO
1304 if not (options.counts or options.input or options.annotation):
1305 sys.exit(
1306 "\nError : some arguments are missing At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n")
1307 if not options.counts:
1308 # Declare genome_index variables
1309 stranded_genome_index = genome_index_basename + ".stranded.index"
1310 unstranded_genome_index = genome_index_basename + ".unstranded.index"
1311 if options.strandness[0] == "unstranded":
1312 genome_index = unstranded_genome_index
1313 else:
1314 genome_index = stranded_genome_index
1315 '''
1316 #### Initialization of some variables
1317
1318 # Miscellaneous variables
1319 reverse_strand = {"+": "-", "-": "+"}
1320 samples = collections.OrderedDict() # Structure: {<label>: [<filename1>(, <filename2>)]} # Example: {'Toy': ['toy.bam']}
1321
1322 #Sample labels and file paths
1323 labels = []
1324 bams = []
1325 bedgraphs = []
1326 count_files = []
1327
1328
1329 # Initializing the category priority order, coding biotypes and the final list
1330 # prios = {"start_codon": 7, "stop_codon": 7, "five_prime_utr": 6, "three_prime_utr": 6, "UTR": 6, "CDS": 5, "exon": 4,
1331 # "transcript": 3, "gene": 2, "antisense": 1, "intergenic": 0}
1332 prios = {"start_codon": 4, "stop_codon": 4, "five_prime_utr": 3, "three_prime_utr": 3, "UTR": 3, "CDS": 3,
1333 "exon": 2, "intron": 2, "transcript": 1, "gene": 1, "antisense": 0, "intergenic": -1}
1334
1335 biotype_prios = None
1336 # biotype_prios = {"protein_coding":1, "miRNA":2}
1337
1338 # Possibles groups of categories to plot
1339 # categs_group1 = {"start": ["start_codon"], "5UTR": ["five_prime_utr", "UTR"], "CDS": ["CDS", "exon"],
1340 # "3UTR": ["three_prime_utr"], "stop": ["stop_codon"], "introns": ["transcript", "gene"],
1341 # "intergenic": ["intergenic"], "antisense": ["antisense"]}
1342 # categs_group2 = {"5UTR": ["five_prime_utr", "UTR"], "CDS": ["CDS", "exon", "start_codon", "stop_codon"],
1343 # "3UTR": ["three_prime_utr"], "introns": ["transcript", "gene"], "intergenic": ["intergenic"],
1344 # "antisense": ["antisense"]}
1345 # categs_group3 = {"exons": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon"],
1346 # "introns": ["transcript", "gene"], "intergenic": ["intergenic"], "antisense": ["antisense"]}
1347 # categs_group4 = {
1348 # "gene": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon", "transcript",
1349 # "gene"], "intergenic": ["intergenic"], "antisense": ["antisense"]}
1350
1351
# GTF features grouped into ALFA categories, one dict per nesting depth
# (level 1 = coarsest grouping, level 4 = most detailed).
categs_level1 = {
    "gene": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon",
             "intron", "start_codon", "stop_codon", "transcript", "gene"],
    "intergenic": ["intergenic"],
    "antisense": ["antisense"],
}

categs_level2 = {
    "exons": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon",
              "start_codon", "stop_codon"],
    "introns": ["intron"],
    "undescribed_genes": ["transcript", "gene"],
    "intergenic": ["intergenic"],
    "antisense": ["antisense"],
}

categs_level3 = {
    "5UTR": ["five_prime_utr", "UTR"],
    "CDS": ["CDS", "start_codon", "stop_codon"],
    "3UTR": ["three_prime_utr"],
    "undescribed_exons": ["exon"],
    "introns": ["intron"],
    "undescribed_genes": ["transcript", "gene"],
    "intergenic": ["intergenic"],
    "antisense": ["antisense"],
}

categs_level4 = {
    "5UTR": ["five_prime_utr", "UTR"],
    "start": ["start_codon"],
    "stop": ["stop_codon"],
    "CDS_body": ["CDS"],
    "undescribed_CDS": [],  # TODO: implement CDS/undescribed_CDS distinction
    "3UTR": ["three_prime_utr"],
    "undescribed_exons": ["exon"],
    "introns": ["intron"],
    "undescribed_genes": ["transcript", "gene"],
    "intergenic": ["intergenic"],
    "antisense": ["antisense"],
}

# Grouping used for the final plot; index = categories_depth - 1
categs_levels = [categs_level1, categs_level2, categs_level3, categs_level4]

# Parent-category brackets drawn above the bars: {parent_name: [x_start, x_end]}
parent_categ_level1 = []
parent_categ_level2 = [{"gene": [0.5, 2.5]}]
parent_categ_level3 = [{"exon": [0.5, 3.5]}, {"gene": [0.5, 5.5]}]
parent_categ_level4 = [{"CDS": [1.5, 4.5]}, {"exon": [0.5, 6.5]}, {"gene": [0.5, 8.5]}]
parent_categ_groups = [parent_categ_level1, parent_categ_level2,
                       parent_categ_level3, parent_categ_level4]

# Display order of the categories on the plots
cat_list = ["5UTR", "start", "CDS", "CDS_body", "stop", "undescribed_CDS",
            "3UTR", "exons", "undescribed_exons", "introns", "gene",
            "undescribed_genes", "intergenic", "antisense"]
1394
# All known GTF biotypes; kept in a set for O(1) membership tests
biotypes = {
    "protein_coding", "polymorphic_pseudogene", "TR_C_gene", "TR_D_gene",
    "TR_J_gene", "TR_V_gene", "IG_C_gene", "IG_D_gene", "IG_J_gene",
    "IG_V_gene", "3prime_overlapping_ncrna", "lincRNA", "macro_lncRNA",
    "miRNA", "misc_RNA", "Mt_rRNA", "Mt_tRNA", "processed_transcript",
    "ribozyme", "rRNA", "scaRNA", "sense_intronic", "sense_overlapping",
    "snoRNA", "snRNA", "sRNA", "TEC", "vaultRNA", "antisense",
    "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene",
    "transcribed_unprocessed_pseudogene", "translated_unprocessed_pseudogene",
    "TR_J_pseudogene", "TR_V_pseudogene", "unitary_pseudogene",
    "unprocessed_pseudogene", "processed_pseudogene", "IG_C_pseudogene",
    "IG_J_pseudogene", "IG_V_pseudogene", "pseudogene", "ncRNA", "tRNA",
}

# Biotypes grouped by "family" for the grouped-biotype plot
biotypes_group1 = {
    "protein_coding": ["protein_coding"],
    "pseudogenes": ["polymorphic_pseudogene", "transcribed_processed_pseudogene",
                    "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene",
                    "translated_unprocessed_pseudogene", "TR_J_pseudogene",
                    "TR_V_pseudogene", "unitary_pseudogene", "unprocessed_pseudogene",
                    "processed_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
                    "IG_V_pseudogene", "pseudogene"],
    "TR": ["TR_C_gene", "TR_D_gene", "TR_J_gene", "TR_V_gene"],
    "IG": ["IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_V_gene"],
    "MT_RNA": ["Mt_rRNA", "Mt_tRNA"],
    "ncRNA": ["lincRNA", "macro_lncRNA", "3prime_overlapping_ncrna", "ncRNA"],
    "others": ["misc_RNA", "processed_transcript", "ribozyme", "scaRNA",
               "sense_intronic", "sense_overlapping", "TEC", "vaultRNA"],
    "antisense": ["antisense"],
}
# Each of these biotypes forms its own single-member group
biotypes_group1.update((b, [b]) for b in ("miRNA", "snoRNA", "snRNA", "rRNA", "sRNA", "tRNA"))
1421
# Categories/biotypes met in the data but absent from the reference lists
unknown_cat = set()
unknown_biot = set()

# Genome-wide counter: (category, biotype) -> number of nucleotides
cpt_genome = {}
1428 '''
1429 if process_counts:
1430 #### If input files are the categories counts, just load them and continue to recategorization step
1431 cpt, cpt_genome, samples_names = read_counts_files(options.counts)
1432 else:
1433 #### Create genome index if needed and get the sizes of categories
1434 index_chrom_list = []
1435 if make_index:
1436 #### Get the chromosome lengths
1437 lengths = get_chromosome_lengths(options)
1438 # Generating the genome index files if the user didn't provide them
1439 create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios,
1440 biotypes, lengths)
1441 else:
1442 # Retrieving chromosome names saved in index
1443 index_chrom_list = get_chromosome_names_in_index(genome_index)
1444
1445
1446 # print '\nChr lengths:', lengths
1447
1448 if intersect_reads:
1449 # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths
1450 if not make_index:
1451 print "\n### Reading genome indexes\n",
1452 sys.stdout.flush()
1453 lengths = {}
1454 with open(genome_index, 'r') as genome_index_file:
1455 for line in genome_index_file:
1456 if line[0] == "#":
1457 lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1])
1458 else:
1459 add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2],
1460 biotype_prios=None, categ_prios=prios)
1461
1462 #print 'Indexed chromosomes: ' + ', '.join((sorted(index_chrom_list)))
1463 index_chrom_list.sort(key=alphanum_key)
1464 print 'Indexed chromosomes: ' + ', '.join((index_chrom_list))
1465
1466 #### Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals
1467 cpt_genome[('intergenic', 'intergenic')] = sum(lengths.itervalues()) - sum(
1468 [v for x, v in cpt_genome.iteritems() if x != ('antisense', 'antisense')])
1469 if not make_index:
1470 print "Done!"
1471 # print '\nGenome category counts:'
1472 # for key,val in cpt_genome.iteritems():
1473 # print key,"\t",val
1474
1475
1476 #### Create the Bedgraph files if needed and get the files list
1477
1478 if not options.bedgraph:
1479 # Generating the BEDGRAPH files is the user provided BAM file(s) and get the samples labels (this names will be used in the plot legend)
1480 samples_files, samples_names = create_bedgraph_files(options.input, options.strandness[0])
1481 else:
1482 # Just initialize the files list with the bedgraph paths
1483 # samples_files = [options.input[i] for i in range(0,len(options.input),2)]
1484 samples_files = [re.sub('.bedgraph$', '', options.input[i]) for i in range(0, len(options.input), 2)]
1485 # and get the labels
1486 samples_names = [options.input[i] for i in range(1, len(options.input), 2)]
1487 #### Retrieving chromosome names saved in index
1488 #chrom_list = get_chromosome_names_in_index(genome_index)
1489 #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and count the number of aligned positions in each category
1490 cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files, samples_names, prios, genome_index,
1491 options.strandness[0], biotype_prios=None)
1492
1493 #### Write the counts on disk
1494 write_counts_in_files(cpt, cpt_genome)
1495
1496 if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False):
1497 quit("\n### End of program")
1498 print "\n### Generating plots"
1499 # Updating the biotypes lists (biotypes and 'biotype_group1'): adding the 'unknow biotypes' found in gtf/index
1500 if unknown_feature == []: # 'unknown_feature' is define only during the index generation
1501 # Browse the feature to determine whether some biotypes are 'unknown'
1502 for sample, counts in cpt.items():
1503 for (cat, biot) in counts:
1504 if biot not in biotypes and cat not in unknown_feature:
1505 unknown_feature.append(biot)
1506 for new_biot in unknown_feature:
1507 biotypes.add(new_biot)
1508 biotypes_group1["others"].append(new_biot)
1509 biotypes = sorted(biotypes)
1510 # move antisense categ to the end of the list
1511 biotypes.remove('antisense')
1512 biotypes.append('antisense')
1513 biotypes_group1 = sorted(biotypes_group1)
1514
1515 # print '\nCounts for every category/biotype pair: ',cpt
1516
1517 # Generating plots
1518 if options.pdf != False:
1519 if options.pdf == None:
1520 options.pdf = "categories_plots.pdf"
1521 pdf = PdfPages(options.pdf)
1522 else:
1523 pdf = False
1524
1525 selected_biotype = None
1526 if options.biotype_filter:
1527 options.biotype_filter = options.biotype_filter[0]
1528 for sample in cpt:
1529 for feature in cpt[sample]:
1530 biotype = feature[1]
1531 if options.biotype_filter.lower() == biotype.lower():
1532 selected_biotype = biotype
1533 break
1534 if selected_biotype == None:
1535 print "\nError: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)."
1536 sys.exit()
1537
1538 # Print a warning message if the UTRs are not specified as 5' or 3' (they will be ploted as 5'UTR)
1539 if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:
1540 print "\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n"
1541 ##MB
1542 #### Make the plot by categories
1543 #### Recategorizing with the final categories
1544 final_cats = categs_groups[options.categories_depth - 1]
1545 final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, selected_biotype)
1546 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
1547 # Remove the 'antisense' category if the library type is 'unstranded'
1548 for dic in cpt.values():
1549 if ('antisense', 'antisense') in dic.keys(): break
1550 else:
1551 cat_list.remove('antisense')
1552 make_plot(cat_list, samples_names, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold,
1553 svg=options.svg, png=options.png)
1554 if selected_biotype:
1555 make_plot(cat_list, samples_names, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold,
1556 title="Categories distribution for '" + selected_biotype + "' biotype", svg=options.svg, png=options.png)
1557
1558 #### Make the plot by biotypes
1559 #### Recategorizing with the final categories
1560 final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)
1561 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
1562 make_plot(biotypes, samples_names, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg,
1563 png=options.png)
1564
1565 ##### Recategorizing with the final categories
1566 # final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)
1567 ##### Display the distribution of specified categories (or biotypes) in samples on a barplot
1568 # make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png)
1569
1570
1571 if options.pdf:
1572 pdf.close()
1573 print "\n### Plots saved in pdf file: %s" % options.pdf
1574
1575 print "\n### End of program"
1576 '''
1577 print "### ALFA ###"
1578 if not (options.annotation or options.bam or options.bedgraph or options.counts):
1579 print >> sys.stderr, "\nError: At least one argument among '-a/--annotation', '--bam', '--bedgraph' or " \
1580 "'-c/--counts' is required. Please refer to help (-h/--help) and usage cases for more " \
1581 "details.\n"
1582 parser.print_usage()
1583 sys.exit()
1584
## Getting the steps to execute and checking parameters

# Pipeline step switches; each is turned on below once its parameters
# have been validated.
generate_indexes = generate_BedGraph = intersect_indexes_BedGraph = generate_plot = False
1592
# Checking parameters for the step: indexes generation.
# Indexes are (re)built only when an annotation file is supplied; without
# it the '--chr_len' parameter has no effect, so warn about it.
generate_indexes = bool(options.annotation)
if not generate_indexes and options.chr_len:
    unnecessary_param(options.chr_len, "Warning: the parameter '--chr_len' will not be used because the indexes generation step will not be performed.")
1598
# Checking parameters for the step: BedGraph files generation.
# '--bam' is a flat list alternating files and labels: file1 label1 [file2 label2 ...]
if options.bam:
    if options.bedgraph:
        sys.exit("\nError: parameters '--bam' and '--bedgraph' provided, only one should be.\n### End of program")
    else:
        if not generate_indexes and not options.genome_index:
            sys.exit("\nError: parameter '-g/--genome_index' should be provided.\n### End of program")
        # Checking the input BAM files
        for i in xrange(0, len(options.bam), 2):
            # Check whether the BAM file exists
            try:
                # Bug fix: close the probe handle instead of leaking one
                # open file descriptor per BAM file
                open(options.bam[i]).close()
            except IOError:
                sys.exit("\nError: the BAM file " + options.bam[i] + " was not found.\n### End of program")
            except IndexError:
                sys.exit("\nError: BAM files and associated labels are not correctly provided.\n"
                         "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
                         "\n### End of program ###")

            # Check whether the BAM file extension is 'bam'
            if not options.bam[i].endswith(".bam"):
                sys.exit("\nError: at least one BAM file hasn't a '.bam' extension.\n### End of program ###")
            # Check that the label (the element following the file) hasn't a '.bam'
            # extension, which would mean a label is missing
            try:
                if options.bam[i + 1].endswith(".bam"):
                    sys.exit("\nError: at least one label for a BAM file has a '.bam' extension.\n"
                             "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
                             "\n### End of program ###")
            except IndexError:
                sys.exit("\nError: BAM files and associated labels are not correctly provided.\n"
                         "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
                         "\n### End of program ###")
            # Get label and replace invalid characters by "_"
            label = "_".join(re.findall(r"[\w\-']+", options.bam[i + 1]))
            # Check whether the BedGraph file(s) and counts file that will be generated already exist
            if options.strandness == "unstranded":
                existing_file(label + ".bedgraph")
                existing_file(label + ".unstranded.feature_counts.tsv")
            else:
                existing_file(label + ".plus.bedgraph")
                existing_file(label + ".minus.bedgraph")
                existing_file(label + ".stranded.feature_counts.tsv")
            # Register the sample label and filename
            labels.append(label)
            bams.append(options.bam[i])
        # Set this step + the intersection one as tasks to process
        generate_BedGraph = True
        intersect_indexes_BedGraph = True
1647
# Checking parameters for the step: indexes and BedGraph files intersection.
# '--bedgraph' is a flat list: one sample takes 2 fields when unstranded
# (file label) and 3 when stranded (plus_file minus_file label).
if options.bedgraph:
    if not generate_indexes and not options.genome_index:
        sys.exit("\nError: parameter '-g/--genome_index' should be provided.\n### End of program")
    if options.strandness == "unstranded":
        sample_file_nb = 2
    else:
        sample_file_nb = 3
    # Checking the input BedGraph files
    for i in xrange(0, len(options.bedgraph), sample_file_nb):
        # Check whether the BedGraph file(s) exists
        for j in xrange(0, sample_file_nb - 1):
            try:
                # Bug fix: close the probe handle instead of leaking one
                # open file descriptor per BedGraph file
                open(options.bedgraph[i + j]).close()
            except IOError:
                # A missing file without a BedGraph extension probably means the
                # files/labels alternation is broken rather than a missing file
                if not (options.bedgraph[i + j].endswith(".bedgraph") or options.bedgraph[i + j].endswith(".bg")):
                    sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
                             "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                             "\n### End of program ###")
                else:
                    sys.exit("\nError: the BedGraph file " + options.bedgraph[i + j] + " was not found.\n### End of program")
            except IndexError:
                sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
                         "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                         "\n### End of program ###")
            # Check whether the BedGraph file extension is "bedgraph"/"bg"
            if not (options.bedgraph[i + j].endswith(".bedgraph") or options.bedgraph[i + j].endswith(".bg")):
                # Bug fix: the message read "BedGrapg file" and lacked the space before the filename
                sys.exit("\nError: the BedGraph file " + options.bedgraph[i + j] + " hasn't a '.bedgraph'/'.bg' extension."
                         "\n### End of program ###")
        # Check whether the labels hasn't a "bedgraph"/"bg" extension
        try:
            if options.bedgraph[i + sample_file_nb - 1].endswith('.bedgraph') or options.bedgraph[i + sample_file_nb - 1].endswith('.bg'):
                sys.exit("\nError: the label " + options.bedgraph[i + sample_file_nb - 1] + " has a '.bedgraph'/'.bg' extension.\n"
                         "Make sure to follow the expected format: "
                         "--bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                         "\n### End of program ###")
        except IndexError:
            sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
                     "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
                     "\n### End of program ###")
        # Register the sample label and filename(s).
        # Bug fix: the dots were unescaped (matched any character) and the pattern
        # was unanchored, so an extension-like substring anywhere in the name was
        # stripped; escape the dots and anchor at the end of the string.
        bedgraphs.append(re.sub(r"(\.(plus|minus))?\.(bg|bedgraph)$", "", options.bedgraph[i]))
        label = "_".join(re.findall(r"[\w\-']+", options.bedgraph[i + sample_file_nb - 1]))
        labels.append(label)
        # Check whether the count file(s) that will be created already exists
        if options.strandness == "unstranded":
            existing_file(label + ".unstranded.feature_counts.tsv")
        else:
            existing_file(label + ".stranded.feature_counts.tsv")
    # Set this step as a task to process
    intersect_indexes_BedGraph = True
1699
# Checking parameters for the step: plot generation
if options.counts:
    # Counts files were supplied directly: every upstream step becomes
    # useless, so warn about the now-ignored parameters and switch the
    # corresponding steps off.
    # Checking unnecessary parameters
    unnecessary_param(options.annotation, "Warning: the parameter '-a/--annotation' will not be used because the counts are already provided.")
    generate_indexes = False
    unnecessary_param(options.genome_index, "Warning: the parameter '-g/--genome_index' will not be used because the counts are already provided.")
    unnecessary_param(options.bam, "Warning: the parameter '--bam' will not be used because the counts are already provided.")
    generate_BedGraph = False
    unnecessary_param(options.bedgraph, "Warning: the parameter '--bedgraph' will not be used because the counts are already provided.")
    intersect_indexes_BedGraph = False
    # Registering the sample labels and filenames
    for sample in options.counts:
        # Derive the sample label from the counts filename by stripping the
        # directory part and the counts-file suffix
        label = os.path.basename(sample)
        # NOTE(review): the dots in this pattern are unescaped, so '.' matches
        # any character here — presumably they were meant as literal dots; confirm.
        label = re.sub('(.(un)?stranded)?.feature_counts.tsv', '', label)
        # Keep only [\w\-'] runs, joined by "_" (sanitizes the label)
        label = "_".join(re.findall(r"[\w\-']+", label))
        count_files.append(sample)
        labels.append(label)
else:
    # Generating the genome index filenames
    index_chrom_list = []  # Not a set because we need to sort it according to the chromosome names later
    lengths = {}  # chromosome name -> length, filled when the index is built/read
    if options.genome_index:
        genome_index_basename = options.genome_index
    elif options.annotation:
        # Otherwise the GTF filename without extension will be the basename
        genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
    stranded_genome_index = genome_index_basename + ".stranded.index"
    unstranded_genome_index = genome_index_basename + ".unstranded.index"
    # Pick the index variant matching the declared library strandness
    if options.strandness == "unstranded":
        genome_index = unstranded_genome_index
    else:
        genome_index = stranded_genome_index
# If no plot is displayed or saved, plot parameters are useless
if (options.no_display and not (options.pdf or options.png or options.svg)) or not(intersect_indexes_BedGraph or options.counts):
    # unnecessary_param(options.categories_depth, "Warning: the parameter '-d/--categories_depth' will not be used because no plots will be displayed or saved.")
    # # Cannot be tested because options.categories_depth has always a value (default value if option not specified by user)
    unnecessary_param(options.threshold, "Warning: the parameter '-t/--threshold' will not be used because no plots will be displayed or saved.")
    unnecessary_param(options.biotype_filter, "Warning: the parameter '--biotype_filter' will not be used because no plots will be displayed or saved.")
    if options.counts:
        # Counts provided but nothing can be produced from them: stop here.
        # Bug fix: the message was missing its closing parenthesis.
        sys.exit("Error: there is nothing to do (counts are provided and no display or plot saving is required)")
else:
    ## [BN] X server check
    # if not X server:
    #options.no_display = True
    #Message
    try:
        # Plotting requires a graphical display
        x_server = os.environ['DISPLAY']
        generate_plot = True
    # Bug fix: a bare 'except:' swallowed every error; os.environ lookup can
    # only raise KeyError when the variable is not set, so catch exactly that.
    except KeyError:
        x_server = False
        if options.counts:
            sys.exit("Error: your current configuration does not allow graphical interface ('DISPLAY' variable is not set on your system).\nExiting")
        else:
            print >> sys.stderr, "WARNING: your current configuration does not allow graphical interface ('DISPLAY' variable is not set on your system).\nPlotting step will not be performed."
1754
## Executing the step(s)

# Indexes generation
if generate_indexes:
    # Checking if the index files already exist (refuse to overwrite them)
    if os.path.isfile(stranded_genome_index):
        sys.exit("\nError: index files named '" + genome_index_basename +
                 ".stranded.index' already exists but you provided a GTF file. If you want to generate a new index, "
                 "please delete these files or specify an other path.\n### End of program")
    # Running the index generation commands
    print "# Generating the genome index files"
    # Getting chromosomes lengths
    lengths = get_chromosome_lengths()
    # Generating the index files
    generate_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, lengths)
    # Displaying the list of indexed chromosomes (natural sort: chr2 < chr10)
    index_chrom_list.sort(key=alphanum_key)
    print "Indexed chromosomes: " + ", ".join(index_chrom_list)
# The genome-wide category counts are needed unless they were read from counts files
if generate_indexes or not options.counts:
    # Getting index info
    read_index()
    # Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals
    # NOTE(review): the ('antisense', 'antisense') entry is excluded from the
    # subtracted total — presumably because it mirrors sense annotation and
    # would double-count those positions; confirm.
    cpt_genome[("intergenic", "intergenic")] = sum(lengths.values()) - sum([v for x, v in cpt_genome.iteritems() if x != ("antisense", "antisense")])

# BedGraph files generation
if generate_BedGraph:
    print "# Generating the BedGraph files"
    #sample_files, sample_labels = generate_bedgraph_files()
    generate_bedgraph_files(labels, bams)

# Indexes and BedGraph files intersection
if intersect_indexes_BedGraph:
    print "# Intersecting index and BedGraph files"
    cpt = intersect_bedgraphs_and_index_to_counts_categories(labels, bedgraphs)
    # Write the counts to an output file
    write_counts_in_files(cpt, cpt_genome)
1791
## Plot generation ## MB: all the section still to review
if generate_plot:
    print "# Generating plots"
    # If input files are the categories counts, the first step is to load them
    if options.counts:
        #cpt, cpt_genome, sample_names = read_counts(options.counts)
        cpt, cpt_genome = read_counts(labels, count_files)
    # Managing the unknown biotypes: collect the biotypes present in the
    # counts but absent from the reference 'biotypes' set
    for sample_label, counters in cpt.items():
        for (cat, biot) in counters:
            if biot not in biotypes:
                unknown_biot.add(biot)
    # Register them and file them under the "others" group
    for biot in unknown_biot:
        biotypes.add(biot)
        biotypes_group1["others"].append(biot)
    # From here on 'biotypes' is an ordered list, no longer a set
    biotypes = sorted(biotypes)
    # Moving antisense cat to the end of the list
    biotypes.remove("antisense")
    biotypes.append("antisense")
    # NOTE(review): sorted() on a dict returns a list of its KEYS, so
    # 'biotypes_group1' becomes a plain key list here — confirm this is
    # intended (the grouped-biotype plot using it is currently disabled).
    biotypes_group1 = sorted(biotypes_group1)
1812 # Filtering biotypes if necessary
1813 filtered_biotype = None
1814 if options.biotype_filter:
1815 for sample_label in cpt:
1816 for feature in cpt[sample_label]:
1817 biotype = feature[1]
1818 if options.biotype_filter.lower() == biotype.lower():
1819 selected_biotype = biotype
1820 break
1821 if filtered_biotype:
1822 print "\nWarning: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)."
1823 # Setting the plots filenames
1824 if options.pdf: ## MB: Do the same for svg and png??
1825 pdf = PdfPages(options.pdf)
1826 else:
1827 pdf = False
1828 ## Generate the categories plot
1829 # Recategorizing within the final categories and plot generation
1830 final_cats = categs_levels[options.categories_depth - 1]
1831 parent_categs = parent_categ_groups[options.categories_depth - 1]
1832 final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, filtered_biotype)
1833 # Remove the "antisense" category if the library type is "unstranded" ## MB: if options.strandness == "unstranded": cat_list.remove("antisense")??
1834 for dic in cpt.values():
1835 if ("antisense", "antisense") in dic.keys(): break
1836 else:
1837 cat_list.remove("antisense")
1838 #make_plot(labels, cat_list, sample_labels, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, svg=options.svg, png=options.png)
1839 make_plot(labels, cat_list, final_cat_cpt, final_genome_cpt, pdf, "Categories", options.threshold, svg=options.svg, png=options.png, categ_groups= parent_categs)
1840 if filtered_biotype:
1841 #make_plot(labels, cat_list, sample_labels, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, title="Categories distribution for '" + filtered_biotype + "' biotype", svg=options.svg, png=options.png)
1842 make_plot(labels, cat_list, filtered_cat_cpt, final_genome_cpt, pdf, "Categories", options.threshold, title="Categories distribution for '" + filtered_biotype + "' biotype", svg=options.svg, png=options.png, categ_groups= parent_categs)
1843 ## Generate the biotypes plot
1844 # Recategorization within the final biotypes and plot generation
1845 final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)
1846 #make_plot(biotypes, sample_labels, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg, png=options.png)
1847 make_plot(labels, biotypes, final_cat_cpt, final_genome_cpt, pdf, "Biotypes", options.threshold, svg=options.svg, png=options.png)
1848
1849
1850 print "### End of program ###"