annotate alfa/ALFA.py @ 0:e360f840a92e draft default tip

Uploaded
author biocomp-ibens
date Wed, 16 May 2018 09:49:18 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1 #!/usr/bin/python
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
2 # -*- coding: utf-8 -*-
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
3
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
4 __author__ = "noel & bahin"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
5 """ ALFA provides a global overview of features distribution composing NGS dataset(s). """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
6
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
7 import argparse
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
8 import os
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
9 import numpy
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
10 import copy
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
11 import sys
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
12 import subprocess
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
13 import matplotlib.pyplot as plt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
14 import matplotlib.cm as cmx
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
15 import matplotlib.colors as colors
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
16 import matplotlib.patheffects as PathEffects
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
17 import re
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
18 from matplotlib.backends.backend_pdf import PdfPages
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
19 # To correctly embed the texts when saving plots in svg format
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
20 import matplotlib
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
21 # import progressbar
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
22 import collections
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
23 import matplotlib as mpl
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
24 import numpy as np
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
25
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
26 matplotlib.rcParams["svg.fonttype"] = "none"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
27
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
28
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
29 ##########################################################################
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
30 # FUNCTIONS #
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
31 ##########################################################################
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
32
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
33 def init_dict(d, key, init):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
34 if key not in d:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
35 d[key] = init
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
36
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
37
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
38 def tryint(s):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
39 """ Function called by "alphanum_key" function to sort the chromosome names. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
40 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
41 return int(s)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
42 except ValueError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
43 return s
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
44
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
45
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
46 def alphanum_key(s):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
47 """ Turn a string into a list of string and number chunks.
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
48 "z23a" -> ["z", 23, "a"]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
49 """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
50 return [ tryint(c) for c in re.split("([0-9]+)", s) ]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
51
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
52
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
53 def required_arg(arg, aliases):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
54 """ Function to display the help and quit if a required argument is missing. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
55 if not arg:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
56 print >> sys.stderr, "\nError: %s argument is missing.\n" % aliases
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
57 parser.print_usage()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
58 sys.exit(1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
59
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
60
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
61 def existing_file(filename):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
62 """ Checks if filename already exists and exit if so. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
63 if os.path.isfile(filename):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
64 sys.exit("Error: The file '" + filename + "' is about to be produced but already exists in the directory. \n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
65
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
66
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
67 def get_chromosome_lengths():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
68 """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
69 Parse the file containing the chromosome lengths.
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
70 If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes.
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
71 """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
72 lengths = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
73 gtf_chrom_names = set()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
74 # If the user provided a chromosome length file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
75 if options.chr_len:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
76 # Getting the chromosome lengths from the chromosome lengths file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
77 with open(options.chr_len, "r") as chr_len_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
78 for line in chr_len_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
79 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
80 lengths[line.split("\t")[0]] = int(line.rstrip().split("\t")[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
81 except IndexError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
82 sys.exit("Error: The chromosome lengths file is not correctly formed. It is supposed to be tabulated file with two fields per line.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
83 # Getting the chromosome lengths from the GTF file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
84 with open(options.annotation, "r") as gtf_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
85 for line in gtf_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
86 if not line.startswith("#"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
87 gtf_chrom_names.add(line.split("\t")[0])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
88 # Checking if the chromosomes from the chromosome lengths file are present in the GTF file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
89 for chrom in lengths:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
90 if chrom not in gtf_chrom_names:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
91 print >> sys.stderr, "Warning: chromosome '" + chrom + "' of the chromosome lengths file does not match any chromosome name in the GTF file provided and was ignored."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
92 # Checking if the chromosomes from the GTF file are present in the lengths file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
93 for chrom in gtf_chrom_names:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
94 if chrom not in lengths:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
95 print >> sys.stderr, "Warning: at least one chromosome ('" + chrom + "') was found in the GTF file and does not match any chromosome provided in the lengths file."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
96 print >> sys.stderr, "\t=> All the chromosome lengths will be approximated using annotations in the GTF file."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
97 break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
98 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
99 return lengths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
100 # If no chromosome lengths file was provided or if at least one chromosome was missing in the file, the end of the last annotation of the chromosome in the GTF file is considered as the chromosome length
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
101 with open(options.annotation, "r") as gtf_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
102 for line in gtf_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
103 if not line.startswith("#"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
104 chrom = line.split("\t")[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
105 end = int(line.split("\t")[4])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
106 init_dict(lengths, chrom, 0)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
107 lengths[chrom] = max(lengths[chrom], end)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
108 print "The chromosome lengths have been approximated using the GTF file annotations (the stop position of the last annotation of each chromosome is considered as its length)."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
109 return lengths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
110
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
111
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
112 def write_feature_on_index(feat, chrom, start, stop, sign, stranded_genome_index):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
113 """ Write one new line in the stranded index file and, if necessary, the unstranded index file. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
114 grouped_by_biotype_features = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
115 for biotype, categs in feat.iteritems():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
116 categ_list = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
117 for cat in set(categs):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
118 categ_list.append(cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
119 grouped_by_biotype_features.append(":".join((str(biotype), ",".join(categ_list))))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
120
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
121 #stranded_genome_index.write('\t'.join((chrom, start, stop, sign, '')) + '\t'.join(grouped_by_biotype_features) + '\n')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
122 stranded_genome_index.write(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
123 '\t'.join((chrom, start, stop, sign)) + '\t' + '\t'.join(grouped_by_biotype_features) + '\n')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
124 if unstranded_genome_index: ## MB: Why? Not always unstranded and stranded??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
125 #unstranded_genome_index.write('\t'.join((chrom, start, stop, '.', '')) + '\t'.join(grouped_by_biotype_features) + '\n')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
126 unstranded_genome_index.write(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
127 '\t'.join((chrom, start, stop, '.')) + '\t' + '\t'.join(grouped_by_biotype_features) + '\n')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
128
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
129
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
130 def write_index_line(feat, chrom, start, stop, sign, fh):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
131 """ Write a new line in an index file. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
132 # Formatting the features info
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
133 feat_by_biotype = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
134 for biot, cat in feat.iteritems():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
135 #feat_by_biotype.append(":".join((str(biot), ",".join(set(cat)))))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
136 feat_by_biotype.append(":".join((str(biot), ",".join(set(cat)))))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
137 # Writing the features info in the index file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
138 fh.write("\t".join((chrom, start, stop, sign)) + "\t" + "\t".join(feat_by_biotype) + "\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
139
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
140
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
141 def write_index(feat_values, chrom, start, stop, stranded_genome_index, unstranded_genome_index):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
142 """ Writing the features info in the proper index files. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
143 # Writing info to the stranded indexes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
144 if feat_values[0] != {}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
145 write_index_line(feat_values[0], chrom, start, stop, "+", stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
146 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
147 stranded_genome_index.write("\t".join((chrom, start, stop, "+", "antisense\n")))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
148 if feat_values[1] != {}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
149 write_index_line(feat_values[1], chrom, start, stop, "-", stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
150 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
151 stranded_genome_index.write("\t".join((chrom, start, stop, "-", "antisense\n")))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
152 # Writing info to the unstranded index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
153 unstranded_feat = dict(feat_values[0], **feat_values[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
154 for name in set(feat_values[0]) & set(feat_values[1]):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
155 unstranded_feat[name] += feat_values[0][name]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
156 write_index_line(unstranded_feat, chrom, start, stop, ".", unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
157
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
158
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
159 def count_genome_features(cpt, features, start, stop, coverage=1):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
160 """ Reads genome index and registers feature counts. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
161 # If no biotype priority: category with the highest priority for each found biotype has the same weight (1/n_biotypes)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
162 if not biotype_prios:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
163 nb_biot = len(features)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
164 # For each categ(s)/biotype pairs
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
165 for feat in features:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
166 cur_prio = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
167 # Separate categorie(s) and biotype
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
168 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
169 biot, cats = feat.split(":")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
170 # Error if the feature is "antisense": update the "antisense/antisense" counts
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
171 except ValueError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
172 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
173 cpt[("antisense", "antisense")] += (int(stop) - int(start)) * coverage / float(nb_biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
174 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
175 cpt[("antisense", "antisense")] = (int(stop) - int(start)) * coverage / float(nb_biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
176 return None
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
177 # Browse the categories and get only the one(s) with highest priority
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
178 for cat in cats.split(","):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
179 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
180 prio = prios[cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
181 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
182 # TODO: Find a way to add unknown categories
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
183 if cat not in unknown_cat:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
184 print >> sys.stderr, "Warning: Unknown categorie '%s' found and ignored.\n" % cat,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
185 unknown_cat.add(cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
186 continue
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
187 # Check if the category has a highest priority than the current one
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
188 if prio > cur_prio:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
189 cur_prio = prio
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
190 cur_cat = {cat}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
191 if prio == cur_prio:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
192 cur_cat.add(cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
193 # Increase each counts by the coverage divided by the number of categories and biotypes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
194 nb_cat = len(cur_cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
195 for cat in cur_cat:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
196 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
197 cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
198 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
199 cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
200 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
201 # TODO: Add an option to provide biotype priorities and handle it
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
202 pass
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
203
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
204 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
205 def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index=None,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
206 biotype_prios=None, coverage=1, categ_prios=None):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
207 """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
208 From an annotated genomic interval (start/stop positions and associated feature: one or more category(ies)/biotype pair(s) )
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
209 - If a file is provided: write the information at the end of the index file being generated;
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
210 - else: browse the features and update the counts of categories/biotypes found in the genome.
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
211 """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
212 ## Writing in the file if it was provided
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
213 if stranded_genome_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
214 # If only one strand has a feature, this feature will directly be written on the unstranded index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
215 if feat_values[0] == {}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
216 stranded_genome_index.write('\t'.join((chrom, start, stop, '+', 'antisense\n')))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
217 elif feat_values[1] == {}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
218 stranded_genome_index.write('\t'.join((chrom, start, stop, '-', 'antisense\n')))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
219
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
220 write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
221 write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
222 unstranded_feat = dict(feat_values[0], **feat_values[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
223 for name in set(feat_values[0]) & set(feat_values[1]):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
224 unstranded_feat[name] += feat_values[0][name]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
225 write_feature_on_index(unstranded_feat, chrom, start, stop, '.', unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
226
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
227 if feat_values[0] == {}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
228 # An interval with no feature corresponds to a region annotated only on the reverse strand: update 'antisense' counts
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
229 stranded_genome_index.write('\t'.join((chrom, start, stop, '+', 'antisense\n')))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
230 #write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index, unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
231 write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
232 write_feature_on_index(feat_values[1], chrom, start, stop, '.', unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
233 elif feat_values[1] == {}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
234 #write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index, unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
235 write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
236 write_feature_on_index(feat_values[0], chrom, start, stop, '.', unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
237 stranded_genome_index.write('\t'.join((chrom, start, stop, '-', 'antisense\n')))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
238 # Else, the unstranded index should contain the union of plus and minus features
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
239 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
240 write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
241 write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
242 unstranded_feat = dict(feat_values[0], **feat_values[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
243 for name in set(feat_values[0]) & set(feat_values[1]):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
244 unstranded_feat[name] += feat_values[0][name]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
245 write_feature_on_index(unstranded_feat, chrom, start, stop, '.', unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
246
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
247 ## Increasing category counter(s)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
248 else: ##MB Why increase if file not provided??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
249 # Default behavior if no biotype priorities : category with the highest priority for each found biotype has the same weight (1/n_biotypes)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
250 if not biotype_prios:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
251 nb_feat = len(feat_values) ## MB: nb biotypes??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
252 # For every categ(s)/biotype pairs
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
253 for feat in feat_values: ## MB: for each biotype
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
254 cur_prio = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
255 selected_categ = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
256 # Separate categorie(s) and biotype
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
257 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
258 biot, cats = feat.split(":")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
259 # Error if feature equal "antisense" : update the 'antisense/antisense' counts
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
260 except ValueError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
261 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
262 cpt[(feat, feat)] += (int(stop) - int(start)) * coverage / float(nb_feat) ## MB: clearly write 'antisense'??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
263 except: ## MB: add a KeyError exception??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
264 cpt[(feat, feat)] = (int(stop) - int(start)) * coverage / float(nb_feat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
265 return
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
266 # Browse the categories and get only the one(s) with highest priority
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
267 for cat in cats.split(','):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
268 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
269 prio = prios[cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
270 except: ## MB: KeyError??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
271 # TODO Find a way to add unknown categories
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
272 if cat not in unknown_feature:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
273 print >> sys.stderr, "Warning: Unknown categorie %s found and ignored.\n" % cat,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
274 unknown_feature.append(cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
275 continue
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
276 if prio > cur_prio:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
277 cur_prio = prio
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
278 selected_categ = [cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
279 if prio == cur_prio:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
280 if cat != selected_categ: ## MB: work with set??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
281 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
282 if cat not in selected_categ:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
283 selected_categ.append(cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
284 except TypeError: ## What TypeError??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
285 selected_categ = [selected_categ, cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
286 # Increase each counts by the coverage divided by the number of categories and biotypes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
287 nb_cats = len(selected_categ)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
288 for cat in selected_categ:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
289 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
290 cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
291 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
292 cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
293 # else :
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
294 # cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
295 # Else, apply biotype selection according to the priority set
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
296 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
297 # TODO Add an option to pass biotype priorities and handle it
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
298 pass
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
299 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
300
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
301 def register_interval(features_dict, chrom, stranded_index_fh, unstranded_index_fh):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
302 """ Write the interval features info into the genome index files. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
303 # Adding the chromosome to the list if not present
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
304 if chrom not in index_chrom_list:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
305 index_chrom_list.append(chrom)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
306 # Writing the chromosome in the index file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
307 with open(unstranded_index_fh, "a") as unstranded_index_fh, open(stranded_index_fh, "a") as stranded_index_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
308 # Initializing the first interval start and features
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
309 sorted_pos = sorted(features_dict["+"].keys())
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
310 interval_start = sorted_pos[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
311 features_plus = features_dict["+"][interval_start]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
312 features_minus = features_dict["-"][interval_start]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
313 # Browsing the interval boundaries
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
314 for interval_stop in sorted_pos[1:]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
315 # Writing the current interval features to the indexes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
316 write_index([features_plus, features_minus], chrom, str(interval_start), str(interval_stop), stranded_index_fh, unstranded_index_fh)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
317 # Initializing the new interval start and features
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
318 interval_start = interval_stop
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
319 # Store current features
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
320 prev_features_plus = features_plus
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
321 prev_features_minus = features_minus
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
322 # Update features
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
323 features_plus = features_dict["+"][interval_start]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
324 features_minus = features_dict["-"][interval_start]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
325 # If feature == transcript and prev interval's feature is exon => add intron feature
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
326 for biotype, categ in features_plus.iteritems():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
327 if set(categ) == {"gene", "transcript"}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
328 if "exon" in prev_features_plus[biotype]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
329 categ.append("intron")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
330 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
331 continue
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
332 for biotype, categ in features_minus.iteritems():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
333 if set(categ) == {"gene", "transcript"}:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
334 if "exon" in prev_features_minus[biotype]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
335 categ.append("intron")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
336 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
337 continue
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
338
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
339
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
340
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
341 def generate_genome_index(annotation, unstranded_genome_index, stranded_genome_index, chrom_sizes):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
342 """ Create an index of the genome annotations and save it in a file. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
343 # Initializations
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
344 intervals_dict = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
345 max_value = -1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
346 prev_chrom = ""
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
347 i = 0 # Line counter
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
348 # Write the chromosome lengths as comment lines before the genome index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
349 with open(unstranded_genome_index, "w") as unstranded_index_fh, open(stranded_genome_index, "w") as stranded_index_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
350 for key, value in chrom_sizes.items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
351 unstranded_index_fh.write("#%s\t%s\n" % (key, value))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
352 stranded_index_fh.write("#%s\t%s\n" % (key, value))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
353 # Progress bar to track the genome indexes creation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
354 nb_lines = sum(1 for _ in open(annotation))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
355 # pbar = progressbar.ProgressBar(widgets=["Indexing the genome ", progressbar.Percentage(), " ", progressbar.Bar(), progressbar.Timer()], maxval=nb_lines).start()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
356 # Browsing the GTF file and writing into genome index files
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
357 with open(annotation, "r") as gtf_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
358 for line in gtf_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
359 i += 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
360 # Update the progressbar every 1k lines
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
361 # if i % 1000 == 1:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
362 # pbar.update(i)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
363 # Processing lines except comment ones
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
364 if not line.startswith("#"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
365 # Getting the line info
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
366 line_split = line.rstrip().split("\t")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
367 chrom = line_split[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
368 cat = line_split[2]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
369 start = int(line_split[3]) - 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
370 stop = int(line_split[4])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
371 strand = line_split[6]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
372 antisense = reverse_strand[strand]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
373 biotype = line_split[8].split("biotype")[1].split(";")[0].strip('" ')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
374 # Registering stored features info in the genome index file(s) if the new line concerns a new chromosome or the new line concerns an annotation not overlapping previously recorded ones
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
375 if start > max_value or chrom != prev_chrom:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
376 # Write the previous features
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
377 if intervals_dict:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
378 register_interval(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
379 prev_chrom = chrom
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
380 # (Re)Initializing the intervals info dict
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
381 intervals_dict = {strand: {start: {biotype: [cat]}, stop: {}}, antisense: {start: {}, stop: {}}}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
382 max_value = stop
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
383
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
384 # Update the dictionary which represents intervals for every distinct annotation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
385 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
386 # Storing the intervals on the strand of the current feature
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
387 stranded_intervals = intervals_dict[strand]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
388 added_info = False # Variable to know if the features info were already added
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
389 # Browsing the existing boundaries
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
390 for boundary in sorted(stranded_intervals):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
391 # While the GTF line start is after the browsed boundary: keep the browsed boundary features info in case the GTF line start is before the next boundary
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
392 if boundary < start:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
393 stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
394
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
395 # The GTF line start is already an existing boundary: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
396 elif boundary == start:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
397 stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
398 # Adding the GTF line features info to the interval
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
399 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
400 stranded_intervals[boundary][biotype] = stranded_intervals[boundary][biotype] + [cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
401 except KeyError: # If the GTF line features info regard an unregistered biotype
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
402 stranded_intervals[boundary][biotype] = [cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
403 added_info = True # The features info were added
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
404
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
405 # The browsed boundary is after the GTF line start: add the GTF line features info
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
406 elif boundary > start:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
407 # Create a new boundary for the GTF line start if necessary (if it is between 2 existing boundaries, it was not created before)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
408 if not added_info:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
409 stranded_intervals[start] = copy.deepcopy(stored_feat_strand)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
410 #stranded_intervals[start][biotype] = [cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
411 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
412 stranded_intervals[start][biotype].append(cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
413 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
414 stranded_intervals[start][biotype] = [cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
415 intervals_dict[antisense][start] = copy.deepcopy(stored_feat_antisense)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
416 added_info = True # The features info were added
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
417 # While the browsed boundary is before the GTF line stop: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
418 if boundary < stop:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
419 stored_feat_strand, stored_feat_antisense = [dict(stranded_intervals[boundary]), dict(intervals_dict[antisense][boundary])]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
420 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
421 stranded_intervals[boundary][biotype] = stranded_intervals[boundary][biotype] + [cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
422 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
423 stranded_intervals[boundary][biotype] = [cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
424 # The GTF line stop is already exists, nothing more to do, the GTF line features info are integrated
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
425 elif boundary == stop:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
426 break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
427 # The browsed boundary is after the GTF line stop: create a new boundary for the GTF line stop (with the stored features info)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
428 else: # boundary > stop
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
429 stranded_intervals[stop] = copy.deepcopy(stored_feat_strand)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
430 intervals_dict[antisense][stop] = copy.deepcopy(stored_feat_antisense)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
431 break # The GTF line features info are integrated
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
432 # If the GTF line stop is after the last boundary, extend the dictionary
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
433 if stop > max_value:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
434 max_value = stop
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
435 stranded_intervals[stop] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
436 intervals_dict[antisense][stop] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
437
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
438 # Store the categories of the last chromosome
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
439 register_interval(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
440 # pbar.finish()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
441
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
442
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
443 def generate_bedgraph_files(sample_labels, bam_files):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
444 """ Creates BedGraph files from BAM ones and return filenames and labels. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
445 #sample_files = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
446 #sample_labels = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
447 # Progress bar to track the BedGraph file creation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
448 #pbar = progressbar.ProgressBar(widgets=["Generating the BedGraph files ", progressbar.Percentage(), progressbar.Bar(), progressbar.Timer()], max_value=len(bam_files)+1).start()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
449 # pbar = progressbar.ProgressBar(widgets=["Generating the BedGraph files ", progressbar.Percentage(), progressbar.Bar(), progressbar.Timer()], maxval=len(sample_labels)+1).start()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
450 n = 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
451 # pbar.update(n)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
452 #for n in range(0, len(bam_files), 2):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
453 for sample_label, bam_file in zip(sample_labels, bam_files):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
454 # Get the label for this sample
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
455 #sample_labels.append(bam_files[n + 1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
456 # Modify it to contain only alphanumeric characters (avoid files generation with dangerous names)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
457 #modified_label = "_".join(re.findall(r"[\w']+", bam_files[n + 1]))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
458 if options.strandness in ["forward", "fr-firststrand"]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
459 #subprocess.call("bedtools genomecov -bg -split -strand + -ibam " + bam_files[n] + " > " + modified_label + ".plus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
460 subprocess.call("bedtools genomecov -bg -split -strand + -ibam " + bam_file + " > " + sample_label + ".plus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
461 # pbar.update(n + 0.5)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
462 #subprocess.call("bedtools genomecov -bg -split -strand - -ibam " + bam_files[n] + " > " + modified_label + ".minus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
463 subprocess.call("bedtools genomecov -bg -split -strand - -ibam " + bam_file + " > " + sample_label + ".minus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
464 # pbar.update(n + 0.5)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
465 elif options.strandness in ["reverse", "fr-secondstrand"]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
466 #subprocess.call("bedtools genomecov -bg -split -strand - -ibam " + bam_files[n] + " > " + modified_label + ".plus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
467 subprocess.call("bedtools genomecov -bg -split -strand - -ibam " + bam_file + " > " + sample_label + ".plus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
468 # pbar.update(n + 0.5)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
469 #subprocess.call("bedtools genomecov -bg -split -strand + -ibam " + bam_files[n] + " > " + modified_label + ".minus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
470 subprocess.call("bedtools genomecov -bg -split -strand + -ibam " + bam_file + " > " + sample_label + ".minus.bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
471 # pbar.update(n + 0.5)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
472 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
473 #subprocess.call("bedtools genomecov -bg -split -ibam " + bam_files[n] + " > " + modified_label + ".bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
474 subprocess.call("bedtools genomecov -bg -split -ibam " + bam_file + " > " + sample_label + ".bedgraph", shell=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
475 # pbar.update(n + 1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
476 #sample_files.append(modified_label)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
477 # pbar.finish()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
478 #return sample_files, sample_labels
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
479 return None
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
480
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
481
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
482 def read_gtf(gtf_index_file, sign):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
483 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
484 strand = ""
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
485 while strand != sign:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
486 gtf_line = gtf_index_file.readline()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
487 # If the GTF file is finished
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
488 if not gtf_line:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
489 endGTF = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
490 return endGTF
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
491 splitline = gtf_line.rstrip().split("\t")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
492 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
493 strand = splitline[3]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
494 # strand information can not be found in the file file header
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
495 except IndexError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
496 pass
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
497 gtf_chrom = splitline[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
498 gtf_features = splitline[4:]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
499 gtf_start, gtf_stop = map(int, splitline[1:3])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
500 return endGTF
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
501
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
502
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
503 #def read_counts(counts_files):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
504 def read_counts(sample_labels, counts_files):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
505 """ Reads the counts from an input file. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
506 cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
507 cpt_genome = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
508 #for fcounts in counts_files:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
509 for sample_label, filename in zip(sample_labels, counts_files):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
510 #label = os.path.splitext(os.path.basename(fcounts))[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
511 #labels.append(label)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
512 #cpt[label] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
513 cpt[sample_label] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
514 #with open(fcounts, "r") as counts_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
515 with open(filename, "r") as counts_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
516 for line in counts_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
517 if not line.startswith("#"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
518 feature = tuple(line.split("\t")[0].split(","))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
519 #cpt[label][feature] = float(line.split("\t")[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
520 cpt[sample_label][feature] = float(line.split("\t")[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
521 cpt_genome[feature] = float(line.rstrip().split("\t")[2])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
522 #return cpt, cpt_genome, labels
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
523 return cpt, cpt_genome
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
524
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
525
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
526 def get_chromosome_names_in_index(genome_index):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
527 """ Returns the chromosome names in a genome index file. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
528 with open(genome_index, "r") as index_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
529 for line in index_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
530 if not line.startswith("#") and (line.split("\t")[0] not in index_chrom_list):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
531 index_chrom_list.append(line.split("\t")[0])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
532 return index_chrom_list
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
533
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
534
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
535 def read_index():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
536 """ Parse index files info (chromosomes list, lengths and genome features). """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
537 with open(genome_index, "r") as index_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
538 for line in index_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
539 if line.startswith("#"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
540 lengths[line.split("\t")[0][1:]] = int(line.split("\t")[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
541 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
542 chrom = line.split("\t")[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
543 if chrom not in index_chrom_list:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
544 index_chrom_list.append(chrom)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
545 count_genome_features(cpt_genome, line.rstrip().split("\t")[4:], line.split("\t")[1], line.split("\t")[2])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
546
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
547
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
548 #def intersect_bedgraphs_and_index_to_counts_categories(sample_files, sample_labels, prios, genome_index, biotype_prios=None): ## MB: To review
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
549 def intersect_bedgraphs_and_index_to_counts_categories(sample_labels, bedgraph_files, biotype_prios=None): ## MB: To review
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
550 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
551 unknown_chrom = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
552 cpt = {} # Counter for the nucleotides in the BAM input file(s)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
553 #for n in range(len(sample_files)):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
554 if bedgraph_files == []:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
555 bedgraph_files = sample_labels
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
556 for sample_label, bedgraph_basename in zip(sample_labels, bedgraph_files):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
557 #sample_file = sample_files[n]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
558 #sample_name = sample_labels[n]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
559 # Initializing the category counter dict for this sample and the number of lines to process for the progress bar
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
560 #init_dict(cpt, sample_name, {})
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
561 init_dict(cpt, sample_label, {})
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
562 bg_extension = ".bedgraph"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
563 if options.strandness == "unstranded":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
564 strands = [("", ".")]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
565 #nb_lines = sum(1 for _ in open(sample_file + ".bedgraph"))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
566 try :
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
567 nb_lines = sum(1 for _ in open(bedgraph_basename + bg_extension))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
568 except IOError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
569 bg_extension = "bg"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
570 nb_lines = sum(1 for _ in open(bedgraph_basename + bg_extension))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
571 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
572 strands = [(".plus", "+"), (".minus", "-")]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
573 #nb_lines = sum(1 for _ in open(sample_file + ".plus.bedgraph")) + sum(1 for _ in open(sample_file + ".minus.bedgraph"))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
574 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
575 nb_lines = sum(1 for _ in open(bedgraph_basename + ".plus" + bg_extension)) + sum(1 for _ in open(bedgraph_basename + ".minus" + bg_extension))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
576 except IOError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
577 nb_lines = sum(1 for _ in open(bedgraph_basename + ".plus" + bg_extension)) + sum(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
578 1 for _ in open(bedgraph_basename + ".minus" + bg_extension))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
579
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
580 # Progress bar to track the BedGraph and index intersection
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
581 #pbar = progressbar.ProgressBar(widgets=["Processing " + sample_file + " ", progressbar.Percentage(), progressbar.Bar(), progressbar.Timer()], max_value=nb_lines).start()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
582 # pbar = progressbar.ProgressBar(widgets=["Processing " + sample_label + " ", progressbar.Percentage(), progressbar.Bar(), progressbar.Timer()], maxval=nb_lines).start()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
583 i = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
584
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
585 # Intersecting the BedGraph and index files
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
586 for strand, sign in strands:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
587 prev_chrom = ""
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
588 endGTF = False # Reaching the next chr or the end of the GTF index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
589 intergenic_adds = 0.0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
590 #with open(sample_file + strand + ".bedgraph", "r") as bedgraph_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
591 with open(bedgraph_basename + strand + bg_extension, "r") as bedgraph_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
592 # Running through the BedGraph file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
593 for bam_line in bedgraph_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
594 i += 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
595 # if i % 10000 == 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
596 # pbar.update(i)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
597 # Getting the BAM line info
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
598 bam_chrom = bam_line.split("\t")[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
599 bam_start, bam_stop, bam_cpt = map(float, bam_line.split("\t")[1:4])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
600 # Skip the line if the chromosome is not in the index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
601 if bam_chrom not in index_chrom_list:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
602 if bam_chrom not in unknown_chrom:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
603 unknown_chrom.append(bam_chrom)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
604 print "\r \r Chromosome '" + bam_chrom + "' not found in index." # MB: to adapt with the progress bar
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
605 continue
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
606 # If this is a new chromosome (or the first one)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
607 if bam_chrom != prev_chrom:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
608 intergenic_adds = 0.0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
609 # (Re)opening the GTF index and looking for the first line of the matching chr
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
610 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
611 gtf_index_file.close()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
612 except UnboundLocalError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
613 pass
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
614 gtf_index_file = open(genome_index, "r")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
615 endGTF = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
616 read_gtf(gtf_index_file, sign)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
617 while bam_chrom != gtf_chrom:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
618 read_gtf(gtf_index_file, sign)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
619 if endGTF:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
620 break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
621 prev_chrom = bam_chrom
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
622
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
623 # Looking for the first matching annotation in the GTF index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
624 while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
625 read_gtf(gtf_index_file, sign)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
626 if gtf_chrom != bam_chrom:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
627 endGTF = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
628 # Processing BAM lines before the first GTF annotation if there are
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
629 if bam_start < gtf_start:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
630 # Increase the "intergenic" category counter with all or part of the BAM interval
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
631 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
632 intergenic_adds += min(bam_stop, gtf_start) - bam_start
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
633 #cpt[sample_name][("intergenic", "intergenic")] += (min(bam_stop,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
634 cpt[sample_label][("intergenic", "intergenic")] += (min(bam_stop,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
635 gtf_start) - bam_start) * bam_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
636 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
637 #cpt[sample_name][("intergenic", "intergenic")] = (min(bam_stop,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
638 cpt[sample_label][("intergenic", "intergenic")] = (min(bam_stop,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
639 gtf_start) - bam_start) * bam_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
640 # Go to next line if the BAM interval was totally upstream the first GTF annotation, carry on with the remaining part otherwise
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
641 if endGTF or (bam_stop <= gtf_start):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
642 continue
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
643 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
644 bam_start = gtf_start
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
645
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
646 # We can start the crossover
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
647 while not endGTF:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
648 # Update category counter
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
649 #add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
650 #count_genome_features(cpt[sample_name], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
651 count_genome_features(cpt[sample_label], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
652 # Read the next GTF file line if the BAM line is not entirely covered
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
653 if bam_stop > gtf_stop:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
654 # Update the BAM start pointer
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
655 bam_start = gtf_stop
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
656 endGTF = read_gtf(gtf_index_file, sign)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
657 # If we read a new chromosome in the GTF file then it is considered finished
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
658 if bam_chrom != gtf_chrom:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
659 endGTF = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
660 if endGTF:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
661 break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
662 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
663 # Next if the BAM line is entirely covered
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
664 bam_start = bam_stop
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
665 break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
666
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
667 # Processing the end of the BAM line if necessary
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
668 if endGTF and (bam_stop > bam_start):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
669 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
670 #cpt[sample_name][("intergenic", "intergenic")] += (bam_stop - bam_start) * bam_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
671 cpt[sample_label][("intergenic", "intergenic")] += (bam_stop - bam_start) * bam_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
672 except KeyError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
673 #cpt[sample_name][("intergenic", "intergenic")] = (bam_stop - bam_start) * bam_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
674 cpt[sample_label][("intergenic", "intergenic")] = (bam_stop - bam_start) * bam_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
675 gtf_index_file.close()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
676 # pbar.finish()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
677 return cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
678
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
679
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
680 def write_counts_in_files(cpt, genome_counts):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
681 """ Writes the biotype/category counts in an output file. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
682 for sample_label, counters in cpt.items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
683 sample_label = "_".join(re.findall(r"[\w\-']+", sample_label))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
684 with open(sample_label + ".feature_counts.tsv", "w") as output_fh:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
685 output_fh.write("#Category,biotype\tCounts_in_bam\tSize_in_genome\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
686 for features_pair, counts in counters.items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
687 output_fh.write("%s\t%s\t%s\n" % (",".join(features_pair), counts, genome_counts[features_pair]))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
688
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
689
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
690 def recategorize_the_counts(cpt, cpt_genome, final):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
691 final_cat_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
692 final_genome_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
693 for f in cpt:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
694 # print "\nFinal categories for",f,"sample"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
695 final_cat_cpt[f] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
696 for final_cat in final:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
697 tot = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
698 tot_genome = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
699 for cat in final[final_cat]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
700 tot += cpt[f][cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
701 tot_genome += cpt_genome[cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
702 # output_file.write('\t'.join((final_cat, str(tot))) + '\n')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
703 # print '\t'.join((final_cat, str(tot)))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
704 final_cat_cpt[f][final_cat] = tot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
705 final_genome_cpt[final_cat] = tot_genome
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
706 return final_cat_cpt, final_genome_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
707
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
708
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
709 def group_counts_by_categ(cpt, cpt_genome, final, selected_biotype):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
710 final_cat_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
711 final_genome_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
712 filtered_cat_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
713 for f in cpt:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
714 final_cat_cpt[f] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
715 filtered_cat_cpt[f] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
716 for final_cat in final:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
717 tot = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
718 tot_filter = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
719 tot_genome = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
720 for cat in final[final_cat]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
721 for key, value in cpt[f].items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
722 if key[0] == cat:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
723 tot += value
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
724 tot_genome += cpt_genome[key]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
725 if key[1] == selected_biotype:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
726 tot_filter += value
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
727 # output_file.write('\t'.join((final_cat, str(tot))) + '\n')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
728 # print '\t'.join((final_cat, str(tot)))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
729 final_cat_cpt[f][final_cat] = tot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
730 if tot_genome == 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
731 final_genome_cpt[final_cat] = 1e-100
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
732 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
733 final_genome_cpt[final_cat] = tot_genome
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
734 filtered_cat_cpt[f][final_cat] = tot_filter
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
735 # if "antisense" in final_genome_cpt: final_genome_cpt["antisense"] = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
736 return final_cat_cpt, final_genome_cpt, filtered_cat_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
737
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
738
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
739 def group_counts_by_biotype(cpt, cpt_genome, biotypes):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
740 final_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
741 final_genome_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
742 for f in cpt:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
743 final_cpt[f] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
744 for biot in biotypes:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
745 tot = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
746 tot_genome = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
747 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
748 for final_biot in biotypes[biot]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
749 for key, value in cpt[f].items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
750 if key[1] == final_biot:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
751 tot += value
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
752 # if key[1] != 'antisense':
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
753 tot_genome += cpt_genome[key]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
754 except:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
755 for key, value in cpt[f].items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
756 if key[1] == biot:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
757 tot += value
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
758 tot_genome += cpt_genome[key]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
759 if tot != 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
760 final_cpt[f][biot] = tot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
761 final_genome_cpt[biot] = tot_genome
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
762 return final_cpt, final_genome_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
763
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
764
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
765 # def get_cmap(N):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
766 # '''Returns a function that maps each index in 0, 1, ... N-1 to a distinct
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
767 # RGB color.'''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
768 # color_norm = colors.Normalize(vmin=0, vmax=N-1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
769 # scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
770 # def map_index_to_rgb_color(index):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
771 # return scalar_map.to_rgba(index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
772 # return map_index_to_rgb_color
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
773
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
774 def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
775 title, sample_labels):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
776 ### Initialization
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
777 fig = plt.figure(figsize=(13, 9))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
778 ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
779 ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=2)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
780 cmap = plt.get_cmap("Spectral")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
781 cols = [cmap(x) for x in xrange(0, 256, 256 / n_cat)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
782 if title:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
783 ax1.set_title(title + "in: %s" % sample_labels[0])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
784 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
785 ax1.set_title(counts_type + " distribution in mapped reads in: %s" % sample_labels[0])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
786 ax2.set_title("Normalized counts of " + counts_type)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
787
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
788 ### Barplots
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
789 # First barplot: percentage of reads in each categorie
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
790 ax1.bar(index, percentages, bar_width,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
791 color=cols)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
792 # Second barplot: enrichment relative to the genome for each categ
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
793 # (the reads count in a categ is divided by the categ size in the genome)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
794 ax2.bar(index_enrichment, enrichment, bar_width,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
795 color=cols, )
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
796 ### Piecharts
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
797 pielabels = [ordered_categs[i] if percentages[i] > 0.025 else "" for i in xrange(n_cat)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
798 sum_enrichment = numpy.sum(enrichment)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
799 pielabels_enrichment = [ordered_categs[i] if enrichment[i] / sum_enrichment > 0.025 else "" for i in xrange(n_cat)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
800 # Categories piechart
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
801 ax3 = plt.subplot2grid((2, 4), (0, 2))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
802 pie_wedge_collection, texts = ax3.pie(percentages, labels=pielabels, shadow=True, colors=cols)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
803 # Enrichment piechart
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
804 ax4 = plt.subplot2grid((2, 4), (1, 2))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
805 pie_wedge_collection, texts = ax4.pie(enrichment, labels=pielabels_enrichment, shadow=True, colors=cols)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
806 # Add legends (append percentages to labels)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
807 labels = [" ".join((ordered_categs[i], "({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
808 ax3.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
809 bbox_to_anchor=(1.7, 0.5))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
810 labels = [" ".join((ordered_categs[i], "({:.1%})".format(enrichment[i] / sum_enrichment))) for i in
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
811 range(len(ordered_categs))] # if ordered_categs[i] != "antisense"]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
812 ax4.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
813 bbox_to_anchor=(1.7, 0.5))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
814 # Set aspect ratio to be equal so that pie is drawn as a circle
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
815 ax3.set_aspect("equal")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
816 ax4.set_aspect("equal")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
817 return fig, ax1, ax2
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
818
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
819
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
820 #def make_plot(ordered_categs, sample_names, categ_counts, genome_counts, pdf, counts_type, threshold, title=None,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
821 def make_plot(sample_labels, ordered_categs, categ_counts, genome_counts, pdf, counts_type, threshold, title=None, svg=None, png=None,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
822 categ_groups=[]): # MB: to review
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
823 # From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample.
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
824 existing_categs = set()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
825 for sample in categ_counts.values():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
826 existing_categs |= set(sample.keys())
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
827 ordered_categs = filter(existing_categs.__contains__, ordered_categs)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
828 xlabels = [cat if len(cat.split("_")) == 1 else "\n".join(cat.split("_")) if cat.split("_")[0] != 'undescribed' else "\n".join(["und.",cat.split("_")[1]]) for cat in ordered_categs]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
829 n_cat = len(ordered_categs)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
830 #n_exp = len(sample_names)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
831 nb_samples = len(categ_counts)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
832 ##Initialization of the matrix of counts (nrow=nb_experiements, ncol=nb_categorie)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
833 #counts = numpy.matrix(numpy.zeros(shape=(n_exp, n_cat)))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
834 counts = numpy.matrix(numpy.zeros(shape=(nb_samples, n_cat)))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
835 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
836 for exp in xrange(len(sample_names)):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
837 for cat in xrange(len(ordered_categs)):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
838 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
839 counts[exp, cat] = categ_counts[sample_names[exp]][ordered_categs[cat]]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
840 except:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
841 pass
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
842 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
843 for sample_label in sample_labels:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
844 for cat in xrange(len(ordered_categs)):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
845 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
846 counts[sample_labels.index(sample_label), cat] = categ_counts[sample_label][ordered_categs[cat]]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
847 except:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
848 pass
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
849
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
850 ##Normalize the categorie sizes by the total size to get percentages
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
851 sizes = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
852 sizes_sum = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
853 for cat in ordered_categs:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
854 sizes.append(genome_counts[cat])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
855 sizes_sum += genome_counts[cat]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
856 if "antisense" in ordered_categs:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
857 antisense_pos = ordered_categs.index("antisense")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
858 sizes[antisense_pos] = 1e-100
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
859 for cpt in xrange(len(sizes)):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
860 sizes[cpt] /= float(sizes_sum)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
861
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
862 ## Create array which contains the percentage of reads in each categ for every sample
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
863 percentages = numpy.array(counts / numpy.sum(counts, axis=1))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
864 ## Create the enrichment array (counts divided by the categorie sizes in the genome)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
865 enrichment = numpy.array(percentages / sizes)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
866 if "antisense_pos" in locals():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
867 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
868 for i in xrange(len(sample_names)):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
869 enrichment[i][antisense_pos] = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
870 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
871 for n in xrange(nb_samples):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
872 enrichment[n][antisense_pos] = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
873
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
874 # enrichment=numpy.log(numpy.array(percentages/sizes))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
875 #for exp in xrange(n_exp):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
876 for n in xrange(nb_samples):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
877 for i in xrange(n_cat):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
878 val = enrichment[n][i]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
879 if val > 1:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
880 enrichment[n][i] = val - 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
881 elif val == 1 or val == 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
882 enrichment[n][i] = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
883 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
884 enrichment[n][i] = -1 / val + 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
885
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
886 #### Finally, produce the plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
887
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
888 ##Get the colors from the colormap
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
889 ncolor = 16
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
890 cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
891 "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
892 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
893 if n_exp > ncolor:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
894 cmap = plt.get_cmap("Set3", n_exp)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
895 cmap = [cmap(i) for i in xrange(n_exp)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
896 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
897 if nb_samples > ncolor:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
898 cmap = plt.get_cmap("Set3", nb_samples)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
899 cmap = [cmap(i) for i in xrange(nb_samples)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
900
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
901 ## Parameters for the plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
902 opacity = 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
903 # Create a vector which contains the position of each bar
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
904 index = numpy.arange(n_cat)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
905 # Size of the bars (depends on the categs number)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
906 #bar_width = 0.9 / n_exp
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
907 bar_width = 0.9 / nb_samples
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
908
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
909 ##Initialise the subplot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
910 # if there is only one sample, also plot piecharts
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
911 # if n_exp == 1 and counts_type.lower() == 'categories':
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
912 # fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
913 ## If more than one sample
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
914 # else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
915 if counts_type.lower() != "categories":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
916 #fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
917 fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * nb_samples) / 3, 10))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
918 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
919 #fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
920 fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * nb_samples) / 3, 10))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
921 # Store the bars objects for enrichment plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
922 rects = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
923 # For each sample/experiment
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
924 #for i in range(n_exp):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
925 for sample_label in sample_labels:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
926 # First barplot: percentage of reads in each categorie
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
927 n = sample_labels.index(sample_label)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
928 #ax1.bar(index + i * bar_width, percentages[i], bar_width,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
929 ax1.bar(index + n * bar_width, percentages[n], bar_width,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
930 alpha=opacity,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
931 #color=cmap[i],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
932 color=cmap[n],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
933 #label=sample_names[i], edgecolor="#FFFFFF", lw=0)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
934 label=sample_label, edgecolor="#FFFFFF", lw=0)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
935 # Second barplot: enrichment relative to the genome for each categ
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
936 # (the reads count in a categ is divided by the categ size in the genome)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
937 rects.append(ax2.bar(index + n * bar_width, enrichment[n], bar_width,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
938 alpha=opacity,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
939 #color=cmap[i],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
940 color=cmap[n],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
941 #label=sample_names[i], edgecolor=cmap[i], lw=0))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
942 label=sample_label, edgecolor=cmap[n], lw=0))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
943
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
944 ## Graphical options for the plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
945 # Adding of the legend
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
946 #if n_exp < 10:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
947 if nb_samples < 10:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
948 ax1.legend(loc="best", frameon=False)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
949 legend_ncol = 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
950 #elif n_exp < 19:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
951 elif nb_samples < 19:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
952 legend_ncol = 2
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
953 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
954 legend_ncol = 3
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
955 ax1.legend(loc="best", frameon=False, ncol=legend_ncol)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
956 ax2.legend(loc="best", frameon=False, ncol=legend_ncol)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
957 # ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
958 # Main titles
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
959 if title:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
960 ax1.set_title(title)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
961 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
962 ax1.set_title(counts_type + " counts")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
963 ax2.set_title(counts_type + " normalized counts")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
964
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
965 # Adding enrichment baseline
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
966 # ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
967 # Axes limits
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
968 ax1.set_xlim(-0.1, len(ordered_categs) + 0.1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
969 if len(sizes) == 1: ax1.set_xlim(-2, 3)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
970 ax2.set_xlim(ax1.get_xlim())
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
971 # Set axis limits (max/min values + 5% margin)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
972 ax2_ymin = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
973 ax2_ymax = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
974 for sample_values in enrichment:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
975 ax2_ymin.append(min(sample_values))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
976 ax2_ymax.append(max(sample_values))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
977 ax2_ymax = max(ax2_ymax)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
978 ax2_ymin = min(ax2_ymin)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
979 margin_top, margin_bottom = (abs(0.05 * ax2_ymax), abs(0.05 * ax2_ymin))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
980 ax1.set_ylim(0, ax1.get_ylim()[1] * 1.05)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
981 if threshold:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
982 threshold_bottom = -abs(float(threshold[0])) + 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
983 threshold_top = float(threshold[1]) - 1
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
984
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
985 #for i in xrange(n_exp):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
986 for n in xrange(nb_samples):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
987 for y in xrange(n_cat):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
988 #val = enrichment[i][y]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
989 val = enrichment[n][y]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
990 if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
991 #rect = rects[i][y]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
992 rect = rects[n][y]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
993 rect_height = rect.get_height()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
994 if rect.get_y() < 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
995 diff = rect_height + threshold_bottom
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
996 rect.set_y(threshold_bottom)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
997 ax2_ymin = threshold_bottom
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
998 margin_bottom = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
999 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1000 diff = rect_height - threshold_top
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1001 ax2_ymax = threshold_top
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1002 margin_top = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1003 rect.set_height(rect.get_height() - diff)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1004 if margin_top != 0 and margin_bottom != 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1005 margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1006 ax2.set_ylim(ax2_ymin - margin_bottom, ax2_ymax + margin_top)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1007 # Y axis title
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1008 ax1.set_ylabel("Proportion of reads (%)")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1009 ax2.set_ylabel("Enrichment relative to genome")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1010
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1011
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1012 # Add the categories on the x-axis
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1013 #ax1.set_xticks(index + bar_width * n_exp / 2)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1014 ax1.set_xticks(index + bar_width * nb_samples / 2)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1015 #ax2.set_xticks(index + bar_width * n_exp / 2)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1016 ax2.set_xticks(index + bar_width * nb_samples / 2)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1017 if counts_type.lower() != "categories":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1018 ax1.set_xticklabels(ordered_categs, rotation="30", ha="right")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1019 ax2.set_xticklabels(ordered_categs, rotation="30", ha="right")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1020 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1021 ax1.set_xticklabels(xlabels)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1022 ax2.set_xticklabels(xlabels)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1023
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1024 # Display fractions values in percentages
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1025 ax1.set_yticklabels([str(int(i * 100)) for i in ax1.get_yticks()])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1026 # Correct y-axis ticks labels for enrichment subplot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1027 # ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1028 yticks = list(ax2.get_yticks())
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1029 yticks = [yticks[i] - 1 if yticks[i] > 9 else yticks[i] + 1 if yticks[i] < -9 else yticks[i] for i in
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1030 xrange(len(yticks))]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1031 ax2.set_yticks(yticks)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1032 ax2.set_yticklabels([str(int(i + 1)) + "$^{+1}$" if i > 0 and i % 1 == 0 else str(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1033 i + 1) + "$^{+1}$" if i > 0 else 1 if i == 0 else str(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1034 int(-(i - 1))) + "$^{-1}$" if i < 0 and i % 1 == 0 else str(-(i - 1)) + "$^{-1}$" for i in ax2.get_yticks()])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1035 # ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1036 # Change appearance of 'antisense' bars on enrichment plot since we cannot calculate an enrichment for this artificial category
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1037 if "antisense_pos" in locals(): # ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1038 #for i in xrange(n_exp):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1039 for n in xrange(nb_samples):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1040 #rect = rects[i][antisense_pos]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1041 rect = rects[n][antisense_pos]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1042 rect.set_y(ax2.get_ylim()[0])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1043 rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1044 rect.set_hatch("/")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1045 rect.set_fill(False)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1046 rect.set_linewidth(0)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1047 # rect.set_color('lightgrey')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1048 # rect.set_edgecolor('#EDEDED')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1049 rect.set_color("#EDEDED")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1050 #ax2.text(index[antisense_pos] + bar_width * n_exp / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, "NA")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1051 ax2.text(index[antisense_pos] + bar_width * nb_samples / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, "NA")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1052 # Add text for features absent in sample
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1053 #for i in xrange(n_exp):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1054 for n in xrange(nb_samples):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1055 for y in xrange(n_cat):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1056 #if percentages[i][y] == 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1057 if percentages[n][y] == 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1058 txt = ax1.text(y + bar_width * (n + 0.5), 0.02, "Abs.", rotation="vertical", color=cmap[n],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1059 horizontalalignment="center", verticalalignment="bottom")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1060 txt.set_path_effects([PathEffects.Stroke(linewidth=0.5), PathEffects.Normal()])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1061 #elif enrichment[i][y] == 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1062 elif enrichment[n][y] == 0:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1063 #rects[i][y].set_linewidth(1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1064 rects[n][y].set_linewidth(1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1065
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1066 # Remove top/right/bottom axes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1067 for ax in [ax1, ax2]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1068 ax.spines["top"].set_visible(False)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1069 ax.spines["right"].set_visible(False)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1070 ax.spines["bottom"].set_visible(False)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1071 ax.tick_params(axis="x", which="both", bottom="on", top="off", labelbottom="on")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1072 ax.tick_params(axis="y", which="both", left="on", right="off", labelleft="on")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1073
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1074
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1075 ### Add second axis with categ groups
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1076 annotate_group(categ_groups, label=None, ax=ax1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1077 annotate_group(categ_groups, label=None, ax=ax2)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1078
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1079 ### Adjust figure margins to
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1080 if counts_type.lower() == "categories":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1081 plt.tight_layout(h_pad=5.0)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1082 fig.subplots_adjust(bottom=0.1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1083 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1084 plt.tight_layout()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1085
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1086 ## Showing the plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1087 if pdf: ## TODO: allow several output formats
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1088 pdf.savefig()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1089 plt.close()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1090 elif svg:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1091 if svg == True:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1092 plt.savefig(counts_type + ".svg")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1093 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1094 if os.path.splitext(svg)[1] == ".svg":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1095 plt.savefig(".".join((os.path.splitext(svg)[0], counts_type, "svg")))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1096 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1097 plt.savefig(".".join((svg, counts_type, "svg")))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1098 elif png:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1099 if png == True:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1100 plt.savefig(counts_type + ".png")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1101 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1102 if os.path.splitext(png)[1] == ".png":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1103 plt.savefig(".".join((os.path.splitext(png)[0], counts_type, "png")))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1104 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1105 plt.savefig(".".join((png, counts_type, "png")))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1106 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1107 plt.show()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1108 ## Save on disk it as a png image
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1109 # fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1110
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1111 def annotate_group(groups, ax=None, label=None, labeloffset=30):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1112 """Annotates the categories with their parent group and add x-axis label"""
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1113
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1114 def annotate(ax, name, left, right, y, pad):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1115 """Draw the group annotation"""
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1116 arrow = ax.annotate(name, xy=(left, y), xycoords="data",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1117 xytext=(right, y - pad), textcoords="data",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1118 annotation_clip=False, verticalalignment="top",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1119 horizontalalignment="center", linespacing=2.0,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1120 arrowprops={'arrowstyle': "-", 'shrinkA': 0, 'shrinkB': 0,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1121 'connectionstyle': "angle,angleB=90,angleA=0,rad=5"}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1122 )
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1123 return arrow
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1124
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1125 if ax is None:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1126 ax = plt.gca()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1127 level = 0
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1128 for level in range(len(groups)):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1129 grp = groups[level]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1130 for name, coord in grp.items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1131 ymin = ax.get_ylim()[0] - np.ptp(ax.get_ylim()) * 0.12 - np.ptp(ax.get_ylim()) * 0.05 * (level)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1132 ypad = 0.01 * np.ptp(ax.get_ylim())
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1133 xcenter = np.mean(coord)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1134 annotate(ax, name, coord[0], xcenter, ymin, ypad)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1135 annotate(ax, name, coord[1], xcenter, ymin, ypad)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1136
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1137 if label is not None:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1138 # Define xlabel and position it according to the number of group levels
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1139 ax.annotate(label,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1140 xy=(0.5, 0), xycoords="axes fraction",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1141 xytext=(0, -labeloffset - (level + 1) * 15), textcoords="offset points",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1142 verticalalignment="top", horizontalalignment="center")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1143
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1144 return
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1145
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1146 def filter_categs_on_biotype(selected_biotype, cpt):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1147 filtered_cpt = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1148 for sample in cpt:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1149 filtered_cpt[sample] = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1150 for feature, count in cpt[sample].items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1151 if feature[1] == selected_biotype:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1152 filtered_cpt[sample][feature[0]] = count
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1153 return filtered_cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1154
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1155
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1156 def unnecessary_param(param, message):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1157 """ Function to display a warning on the standard error. """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1158 if param:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1159 print >> sys.stderr, message
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1160
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1161
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1162 def usage_message():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1163 return """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1164 Generate genome indexes:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1165 python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1166 [--chr_len CHR_LENGTHS_FILE]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1167 Process BAM file(s):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1168 python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1169 [--bedgraph] [-s STRAND]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1170 [-n] [--pdf output.pdf]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1171 [-d {1,2,3,4}] [-t YMIN YMAX]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1172 Index genome + process BAM:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1173 python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1174 -i BAM1 LABEL1 [BAM2 LABEL2 ...]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1175 [--chr_len CHR_LENGTHS_FILE]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1176 [--bedgraph][-s STRAND]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1177 [-n] [--pdf output.pdf]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1178 [-d {1,2,3,4}] [-t YMIN YMAX]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1179
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1180 Process previously created ALFA counts file(s):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1181 python ALFA.py -c COUNTS1 [COUNTS2 ...]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1182 [-s STRAND]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1183 [-n] [--pdf output.pdf]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1184 [-d {1,2,3,4}] [-t YMIN YMAX]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1185
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1186 """
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1187
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1188 ##########################################################################
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1189 # MAIN #
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1190 ##########################################################################
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1191
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1192
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1193 if __name__ == "__main__":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1194
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1195 #### Parse command line arguments and store them in the variable options
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1196 parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message())
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1197 parser.add_argument("--version", action="version", version="version 1.0",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1198 help="Show ALFA version number and exit\n\n-----------\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1199 # Options regarding the index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1200 parser.add_argument("-g", "--genome_index",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1201 help="Genome index files path and basename for existing index, or path and basename for new index creation\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1202 parser.add_argument("-a", "--annotation", metavar="GTF_FILE", help="Genomic annotations file (GTF format)\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1203 parser.add_argument("--chr_len", help="Tabulated file containing chromosome names and lengths\n\n-----------\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1204
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1205 # Options regarding the intersection step
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1206 #parser.add_argument('-i', '--input', '--bam', dest='input', metavar=('BAM_FILE1 LABEL1', ""), nargs='+',
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1207 #help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1208 parser.add_argument("--bam", metavar=("BAM_FILE1 LABEL1", ""), nargs="+",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1209 help="Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n") ## MB: position AND chr??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1210 # parser.add_argument('--bedgraph', action='store_const',default = False, const = True, help="Use this options if your input file(s) is(are) already in bedgraph format\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1211 #parser.add_argument('--bedgraph', dest='bedgraph', action='store_const', default=False, const=True,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1212 #help="Use this options if your input file(s) is(are) already in bedgraph format\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1213 parser.add_argument("--bedgraph", metavar=("BEDGRAPH_FILE1 LABEL1", ""), nargs="+",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1214 help="Use this options if your input is/are BedGraph file(s).\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1215 parser.add_argument("-c", "--counts", metavar=("COUNTS_FILE", ""), nargs="+",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1216 help="Use this options instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1217 #parser.add_argument('-s', '--strandness', dest="strandness", nargs=1, action='store', default=['unstranded'], choices=['unstranded', 'forward', 'reverse', 'fr-firststrand', 'fr-secondstrand'], metavar="", help="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1218 parser.add_argument("-s", "--strandness", default="unstranded",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1219 choices=["unstranded", "forward", "reverse", "fr-firststrand", "fr-secondstrand"], metavar="",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1220 help="Library orientation. Choose within: 'unstranded', "
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1221 "'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. "
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1222 "(Default: 'unstranded')\n\n-----------\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1223
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1224 # Options regarding the plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1225 parser.add_argument("--biotype_filter", help=argparse.SUPPRESS) # "Make an extra plot of categories distribution using only counts of the specified biotype." ## MB: TO DISPLAY (no suppress)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1226 parser.add_argument("-d", "--categories_depth", type=int, default=3, choices=range(1, 5),
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1227 help="Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1228 parser.add_argument("--pdf", nargs="?", const="ALFA_plots.pdf",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1229 help="Save produced plots in PDF format at specified path ('categories_plots.pdf' if no argument provided).\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1230 parser.add_argument("--png", nargs="?", const="ALFA_plots.png",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1231 help="Save produced plots in PNG format with provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided.\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1232 parser.add_argument("--svg", nargs="?", const="ALFA_plots.svg",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1233 help="Save produced plots in SVG format with provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided.\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1234 parser.add_argument("-n", "--no_display", action="store_const", const=True, default=False, help="Do not display plots.\n\n") # We have to add "const=None" to avoid a bug in argparse
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1235 parser.add_argument("-t", "--threshold", dest="threshold", nargs=2, metavar=("ymin", "ymax"), type=float,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1236 help="Set axis limits for enrichment plots.\n\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1237
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1238 if len(sys.argv) == 1:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1239 parser.print_usage()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1240 sys.exit(1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1241
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1242 options = parser.parse_args()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1243
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1244 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1245 # Booleans for steps to be executed
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1246 make_index = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1247 intersect_reads = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1248 process_counts = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1249
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1250 #### Check arguments conformity and define which steps have to be performed
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1251 print "### Checking parameters"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1252 if options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1253 # Aucun autre argument requis, precise that the other won't be used (if this is true!!)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1254 # Vérifier extension input
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1255
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1256 # Action: Only do the plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1257 process_counts = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1258 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1259 if options.annotation:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1260 # If '-gi' parameter is present
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1261 if options.genome_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1262 genome_index_basename = options.genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1263 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1264 # Otherwise the GTF filename without extension will be the basename
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1265 genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1266 # Check if the indexes were already created and warn the user
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1267 if os.path.isfile(genome_index_basename + ".stranded.index"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1268 if options.input:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1269 print >> sys.stderr, "Warning: an index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify an other path." % (
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1270 genome_index_basename + ".stranded.index")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1271 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1272 sys.exit(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1273 "Error: an index file named %s already exists. If you want to create a new index, please delete this file or specify an other path.\n" % (
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1274 genome_index_basename + ".stranded.index"))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1275 # Create them otherwise
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1276 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1277 make_index = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1278 # If the index is already done
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1279 if options.input:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1280 # Required arguments are the input and the genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1281 if 'genome_index_basename' not in locals():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1282 required_arg(options.genome_index, "-g/--genome_index")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1283 genome_index_basename = options.genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1284 required_arg(options.input, "-i/--input/--bam")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1285 for i in xrange(0, len(options.input), 2):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1286 # Check whether the input file exists
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1287 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1288 open(options.input[i])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1289 except IOError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1290 sys.exit("Error: the input file " + options.input[i] + " was not found. Aborting.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1291 # Check whether the input file extensions are 'bam', 'bedgraph' or 'bg' and the label extension are no
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1292 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1293 extension = os.path.splitext(options.input[i + 1])[1]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1294 if extension in ('.bam', '.bedgraph', '.bg'):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1295 sys.exit("Error: it seems input files and associated labels are not correctly provided.\n\
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1296 Make sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1297 except:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1298 sys.exit(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1299 "Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1300
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1301 intersect_reads = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1302 # Vérifier input's extension
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1303 # TODO
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1304 if not (options.counts or options.input or options.annotation):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1305 sys.exit(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1306 "\nError : some arguments are missing At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1307 if not options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1308 # Declare genome_index variables
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1309 stranded_genome_index = genome_index_basename + ".stranded.index"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1310 unstranded_genome_index = genome_index_basename + ".unstranded.index"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1311 if options.strandness[0] == "unstranded":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1312 genome_index = unstranded_genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1313 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1314 genome_index = stranded_genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1315 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1316 #### Initialization of some variables
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1317
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1318 # Miscellaneous variables
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1319 reverse_strand = {"+": "-", "-": "+"}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1320 samples = collections.OrderedDict() # Structure: {<label>: [<filename1>(, <filename2>)]} # Example: {'Toy': ['toy.bam']}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1321
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1322 #Sample labels and file paths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1323 labels = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1324 bams = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1325 bedgraphs = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1326 count_files = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1327
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1328
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1329 # Initializing the category priority order, coding biotypes and the final list
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1330 # prios = {"start_codon": 7, "stop_codon": 7, "five_prime_utr": 6, "three_prime_utr": 6, "UTR": 6, "CDS": 5, "exon": 4,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1331 # "transcript": 3, "gene": 2, "antisense": 1, "intergenic": 0}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1332 prios = {"start_codon": 4, "stop_codon": 4, "five_prime_utr": 3, "three_prime_utr": 3, "UTR": 3, "CDS": 3,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1333 "exon": 2, "intron": 2, "transcript": 1, "gene": 1, "antisense": 0, "intergenic": -1}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1334
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1335 biotype_prios = None
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1336 # biotype_prios = {"protein_coding":1, "miRNA":2}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1337
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1338 # Possibles groups of categories to plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1339 # categs_group1 = {"start": ["start_codon"], "5UTR": ["five_prime_utr", "UTR"], "CDS": ["CDS", "exon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1340 # "3UTR": ["three_prime_utr"], "stop": ["stop_codon"], "introns": ["transcript", "gene"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1341 # "intergenic": ["intergenic"], "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1342 # categs_group2 = {"5UTR": ["five_prime_utr", "UTR"], "CDS": ["CDS", "exon", "start_codon", "stop_codon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1343 # "3UTR": ["three_prime_utr"], "introns": ["transcript", "gene"], "intergenic": ["intergenic"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1344 # "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1345 # categs_group3 = {"exons": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1346 # "introns": ["transcript", "gene"], "intergenic": ["intergenic"], "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1347 # categs_group4 = {
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1348 # "gene": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon", "transcript",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1349 # "gene"], "intergenic": ["intergenic"], "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1350
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1351
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1352 categs_level1 = {"gene": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "intron", "start_codon",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1353 "stop_codon", "transcript", "gene"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1354 "intergenic": ["intergenic"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1355 "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1356
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1357 categs_level2 = {"exons": ["five_prime_utr", "three_prime_utr", "UTR", "CDS", "exon", "start_codon", "stop_codon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1358 "introns": ["intron"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1359 "undescribed_genes": ["transcript", "gene"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1360 "intergenic": ["intergenic"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1361 "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1362
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1363 categs_level3 = {"5UTR": ["five_prime_utr", "UTR"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1364 "CDS": ["CDS", "start_codon", "stop_codon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1365 "3UTR": ["three_prime_utr"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1366 "undescribed_exons": ["exon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1367 "introns": ["intron"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1368 "undescribed_genes": ["transcript", "gene"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1369 "intergenic": ["intergenic"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1370 "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1371
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1372 categs_level4 = {"5UTR": ["five_prime_utr", "UTR"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1373 "start": ["start_codon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1374 "stop": ["stop_codon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1375 "CDS_body": ["CDS"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1376 "undescribed_CDS": [], #TODO: implement CDS/undescribed_CDS distinction
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1377 "3UTR": ["three_prime_utr"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1378 "undescribed_exons": ["exon"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1379 "introns": ["intron"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1380 "undescribed_genes": ["transcript", "gene"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1381 "intergenic": ["intergenic"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1382 "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1383
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1384 # categs_groups = [categs_group4, categs_group3, categs_group2, categs_group1] # Order and merging for the final plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1385 categs_levels = [categs_level1, categs_level2, categs_level3, categs_level4]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1386
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1387 parent_categ_level1 = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1388 parent_categ_level2 = [{"gene":[0.5,2.5]}]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1389 parent_categ_level3 = [{"exon":[0.5,3.5]}, {"gene":[0.5,5.5]}]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1390 parent_categ_level4 = [{"CDS":[1.5,4.5]}, {"exon":[0.5,6.5]},{"gene":[0.5,8.5]}]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1391 parent_categ_groups = [parent_categ_level1, parent_categ_level2, parent_categ_level3, parent_categ_level4]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1392
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1393 cat_list = ["5UTR", "start", "CDS", "CDS_body", "stop", "undescribed_CDS", "3UTR", "exons", "undescribed_exons", "introns", "gene", "undescribed_genes", "intergenic", "antisense"]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1394
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1395 # biotypes list
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1396 biotypes = {"protein_coding", "polymorphic_pseudogene", "TR_C_gene", "TR_D_gene", "TR_J_gene", "TR_V_gene", "IG_C_gene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1397 "IG_D_gene", "IG_J_gene", "IG_V_gene", "3prime_overlapping_ncrna", "lincRNA", "macro_lncRNA", "miRNA",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1398 "misc_RNA", "Mt_rRNA", "Mt_tRNA", "processed_transcript", "ribozyme", "rRNA", "scaRNA", "sense_intronic",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1399 "sense_overlapping", "snoRNA", "snRNA", "sRNA", "TEC", "vaultRNA", "antisense",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1400 "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1401 "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene", "unitary_pseudogene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1402 "unprocessed_pseudogene", "processed_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1403 "pseudogene", "ncRNA", "tRNA"} # Type: set (to access quickly)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1404
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1405 # Grouping of biotypes:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1406 biotypes_group1 = {"protein_coding": ["protein_coding"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1407 "pseudogenes": ["polymorphic_pseudogene", "transcribed_processed_pseudogene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1408 "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1409 "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1410 "unitary_pseudogene", "unprocessed_pseudogene", "processed_pseudogene",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1411 "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene", "pseudogene"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1412 "TR": ["TR_C_gene", "TR_D_gene", "TR_J_gene", "TR_V_gene"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1413 "IG": ["IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_V_gene"], \
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1414 "MT_RNA": ["Mt_rRNA", "Mt_tRNA"], \
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1415 "ncRNA": ["lincRNA", "macro_lncRNA", "3prime_overlapping_ncrna", "ncRNA"], \
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1416 "others": ["misc_RNA", "processed_transcript", "ribozyme", "scaRNA", "sense_intronic",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1417 "sense_overlapping", "TEC", "vaultRNA"],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1418 "antisense": ["antisense"]}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1419 for biot in ["miRNA", "snoRNA", "snRNA", "rRNA", "sRNA", "tRNA"]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1420 biotypes_group1[biot] = [biot]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1421
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1422 # Initializing the unknown features list
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1423 unknown_cat = set()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1424 unknown_biot= set()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1425
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1426 # Initializing the genome category counter dict
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1427 cpt_genome = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1428 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1429 if process_counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1430 #### If input files are the categories counts, just load them and continue to recategorization step
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1431 cpt, cpt_genome, samples_names = read_counts_files(options.counts)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1432 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1433 #### Create genome index if needed and get the sizes of categories
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1434 index_chrom_list = []
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1435 if make_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1436 #### Get the chromosome lengths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1437 lengths = get_chromosome_lengths(options)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1438 # Generating the genome index files if the user didn't provide them
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1439 create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1440 biotypes, lengths)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1441 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1442 # Retrieving chromosome names saved in index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1443 index_chrom_list = get_chromosome_names_in_index(genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1444
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1445
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1446 # print '\nChr lengths:', lengths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1447
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1448 if intersect_reads:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1449 # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1450 if not make_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1451 print "\n### Reading genome indexes\n",
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1452 sys.stdout.flush()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1453 lengths = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1454 with open(genome_index, 'r') as genome_index_file:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1455 for line in genome_index_file:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1456 if line[0] == "#":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1457 lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1458 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1459 add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2],
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1460 biotype_prios=None, categ_prios=prios)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1461
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1462 #print 'Indexed chromosomes: ' + ', '.join((sorted(index_chrom_list)))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1463 index_chrom_list.sort(key=alphanum_key)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1464 print 'Indexed chromosomes: ' + ', '.join((index_chrom_list))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1465
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1466 #### Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1467 cpt_genome[('intergenic', 'intergenic')] = sum(lengths.itervalues()) - sum(
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1468 [v for x, v in cpt_genome.iteritems() if x != ('antisense', 'antisense')])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1469 if not make_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1470 print "Done!"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1471 # print '\nGenome category counts:'
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1472 # for key,val in cpt_genome.iteritems():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1473 # print key,"\t",val
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1474
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1475
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1476 #### Create the Bedgraph files if needed and get the files list
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1477
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1478 if not options.bedgraph:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1479 # Generating the BEDGRAPH files is the user provided BAM file(s) and get the samples labels (this names will be used in the plot legend)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1480 samples_files, samples_names = create_bedgraph_files(options.input, options.strandness[0])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1481 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1482 # Just initialize the files list with the bedgraph paths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1483 # samples_files = [options.input[i] for i in range(0,len(options.input),2)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1484 samples_files = [re.sub('.bedgraph$', '', options.input[i]) for i in range(0, len(options.input), 2)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1485 # and get the labels
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1486 samples_names = [options.input[i] for i in range(1, len(options.input), 2)]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1487 #### Retrieving chromosome names saved in index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1488 #chrom_list = get_chromosome_names_in_index(genome_index)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1489 #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and count the number of aligned positions in each category
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1490 cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files, samples_names, prios, genome_index,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1491 options.strandness[0], biotype_prios=None)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1492
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1493 #### Write the counts on disk
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1494 write_counts_in_files(cpt, cpt_genome)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1495
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1496 if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1497 quit("\n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1498 print "\n### Generating plots"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1499 # Updating the biotypes lists (biotypes and 'biotype_group1'): adding the 'unknow biotypes' found in gtf/index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1500 if unknown_feature == []: # 'unknown_feature' is define only during the index generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1501 # Browse the feature to determine whether some biotypes are 'unknown'
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1502 for sample, counts in cpt.items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1503 for (cat, biot) in counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1504 if biot not in biotypes and cat not in unknown_feature:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1505 unknown_feature.append(biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1506 for new_biot in unknown_feature:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1507 biotypes.add(new_biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1508 biotypes_group1["others"].append(new_biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1509 biotypes = sorted(biotypes)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1510 # move antisense categ to the end of the list
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1511 biotypes.remove('antisense')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1512 biotypes.append('antisense')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1513 biotypes_group1 = sorted(biotypes_group1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1514
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1515 # print '\nCounts for every category/biotype pair: ',cpt
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1516
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1517 # Generating plots
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1518 if options.pdf != False:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1519 if options.pdf == None:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1520 options.pdf = "categories_plots.pdf"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1521 pdf = PdfPages(options.pdf)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1522 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1523 pdf = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1524
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1525 selected_biotype = None
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1526 if options.biotype_filter:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1527 options.biotype_filter = options.biotype_filter[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1528 for sample in cpt:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1529 for feature in cpt[sample]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1530 biotype = feature[1]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1531 if options.biotype_filter.lower() == biotype.lower():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1532 selected_biotype = biotype
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1533 break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1534 if selected_biotype == None:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1535 print "\nError: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1536 sys.exit()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1537
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1538 # Print a warning message if the UTRs are not specified as 5' or 3' (they will be ploted as 5'UTR)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1539 if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1540 print "\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1541 ##MB
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1542 #### Make the plot by categories
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1543 #### Recategorizing with the final categories
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1544 final_cats = categs_groups[options.categories_depth - 1]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1545 final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, selected_biotype)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1546 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1547 # Remove the 'antisense' category if the library type is 'unstranded'
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1548 for dic in cpt.values():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1549 if ('antisense', 'antisense') in dic.keys(): break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1550 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1551 cat_list.remove('antisense')
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1552 make_plot(cat_list, samples_names, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1553 svg=options.svg, png=options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1554 if selected_biotype:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1555 make_plot(cat_list, samples_names, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1556 title="Categories distribution for '" + selected_biotype + "' biotype", svg=options.svg, png=options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1557
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1558 #### Make the plot by biotypes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1559 #### Recategorizing with the final categories
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1560 final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1561 #### Display the distribution of specified categories (or biotypes) in samples on a barplot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1562 make_plot(biotypes, samples_names, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg,
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1563 png=options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1564
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1565 ##### Recategorizing with the final categories
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1566 # final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1567 ##### Display the distribution of specified categories (or biotypes) in samples on a barplot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1568 # make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1569
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1570
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1571 if options.pdf:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1572 pdf.close()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1573 print "\n### Plots saved in pdf file: %s" % options.pdf
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1574
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1575 print "\n### End of program"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1576 '''
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1577 print "### ALFA ###"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1578 if not (options.annotation or options.bam or options.bedgraph or options.counts):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1579 print >> sys.stderr, "\nError: At least one argument among '-a/--annotation', '--bam', '--bedgraph' or " \
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1580 "'-c/--counts' is required. Please refer to help (-h/--help) and usage cases for more " \
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1581 "details.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1582 parser.print_usage()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1583 sys.exit()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1584
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1585 ## Getting the steps to execute and checking parameters
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1586
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1587 # Getting the steps to execute
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1588 generate_indexes = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1589 generate_BedGraph = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1590 intersect_indexes_BedGraph = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1591 generate_plot = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1592
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1593 # Checking parameters for the step: indexes generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1594 if options.annotation:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1595 generate_indexes = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1596 elif options.chr_len:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1597 unnecessary_param(options.chr_len, "Warning: the parameter '--chr_len' will not be used because the indexes generation step will not be performed.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1598
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1599 # Checking parameters for the step: BedGraph files generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1600 if options.bam:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1601 if options.bedgraph:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1602 sys.exit("\nError: parameters '--bam' and '--bedgraph' provided, only one should be.\n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1603 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1604 if not generate_indexes and not options.genome_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1605 sys.exit("\nError: parameter '-g/--genome_index' should be provided.\n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1606 # Checking the input BAM files
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1607 for i in xrange(0, len(options.bam), 2):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1608 # Check whether the BAM file exists
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1609 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1610 open(options.bam[i])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1611 except IOError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1612 sys.exit("\nError: the BAM file " + options.bam[i] + " was not found.\n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1613 except IndexError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1614 sys.exit("\nError: BAM files and associated labels are not correctly provided.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1615 "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1616 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1617
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1618 # Check whether the BAM file extension in 'bam'
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1619 if not options.bam[i].endswith(".bam"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1620 sys.exit("\nError: at least one BAM file hasn't a '.bam' extension.\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1621 # Check whether the labels hasn't a "bam" extension
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1622 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1623 if options.bam[i + 1].endswith(".bam"):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1624 sys.exit("\nError: at least one label for a BAM file has a '.bam' extension.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1625 "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1626 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1627 except IndexError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1628 sys.exit("\nError: BAM files and associated labels are not correctly provided.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1629 "Make sure to follow the expected format: --bam BAM_file1 Label1 [BAM_file2 Label2 ...]."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1630 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1631 # Get label and replace invalid characters by "_"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1632 label = "_".join(re.findall(r"[\w\-']+", options.bam[i + 1]))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1633 # Check whether the BedGraph file(s) and counts file that will be generated already exists
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1634 if options.strandness == "unstranded":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1635 existing_file(label + ".bedgraph")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1636 existing_file(label + ".unstranded.feature_counts.tsv")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1637 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1638 existing_file(label + ".plus.bedgraph")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1639 existing_file(label + ".minus.bedgraph")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1640 existing_file(label + ".stranded.feature_counts.tsv")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1641 # Register the sample label and filename
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1642 labels.append(label)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1643 bams.append(options.bam[i])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1644 # Set this step + the intersection one as tasks to process
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1645 generate_BedGraph = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1646 intersect_indexes_BedGraph = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1647
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1648 # Checking parameters for the step: indexes and BedGraph files intersection
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1649 if options.bedgraph:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1650 if not generate_indexes and not options.genome_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1651 sys.exit("\nError: parameter '-g/--genome_index' should be provided.\n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1652 if options.strandness == "unstranded":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1653 sample_file_nb = 2
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1654 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1655 sample_file_nb = 3
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1656 # Checking the input BedGraph files
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1657 for i in xrange(0, len(options.bedgraph), sample_file_nb):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1658 # Check whether the BedGraph file(s) exists
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1659 for j in xrange(0,sample_file_nb - 1):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1660 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1661 open(options.bedgraph[i + j])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1662 except IOError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1663 if not (options.bedgraph[i + j].endswith(".bedgraph") or options.bedgraph[i + j].endswith(".bg")):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1664 sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1665 "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1666 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1667 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1668 sys.exit("\nError: the BedGraph file " + options.bedgraph[i + j] + " was not found.\n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1669 except IndexError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1670 sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1671 "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1672 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1673 # Check whether the BedGraph file extension is "bedgraph"/"bg"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1674 if not (options.bedgraph[i + j].endswith(".bedgraph") or options.bedgraph[i + j].endswith(".bg")):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1675 sys.exit("\nError: the BedGrapg file" + options.bedgraph[i + j] + " hasn't a '.bedgraph'/'.bg' extension."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1676 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1677 # Check whether the labels hasn't a "bedgraph"/"bg" extension
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1678 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1679 if options.bedgraph[i + sample_file_nb - 1].endswith('.bedgraph') or options.bedgraph[i + sample_file_nb - 1].endswith('.bg'):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1680 sys.exit("\nError: the label " + options.bedgraph[i + sample_file_nb - 1] + " has a '.bedgraph'/'.bg' extension.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1681 "Make sure to follow the expected format: "
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1682 "--bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1683 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1684 except IndexError:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1685 sys.exit("\nError: it looks like BedGraph files and associated labels are not correctly provided.\n"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1686 "Make sure to follow the expected format: --bedgraph BedGraph_file1 Label1 [BedGraph_file2 Label2 ...]."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1687 "\n### End of program ###")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1688 # Register the sample label and filename(s)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1689 bedgraphs.append(re.sub("(.(plus|minus))?.((bg)|(bedgraph))", "", options.bedgraph[i]))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1690 label = "_".join(re.findall(r"[\w\-']+", options.bedgraph[i + sample_file_nb - 1]))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1691 labels.append(label)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1692 # Check whether the count file(s) that will be created already exists
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1693 if options.strandness == "unstranded":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1694 existing_file(label + ".unstranded.feature_counts.tsv")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1695 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1696 existing_file(label + ".stranded.feature_counts.tsv")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1697 # Set this step as a task to process
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1698 intersect_indexes_BedGraph = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1699
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1700 # Checking parameters for the step: plot generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1701 if options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1702 # Checking unnecessary parameters
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1703 unnecessary_param(options.annotation, "Warning: the parameter '-a/--annotation' will not be used because the counts are already provided.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1704 generate_indexes = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1705 unnecessary_param(options.genome_index, "Warning: the parameter '-g/--genome_index' will not be used because the counts are already provided.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1706 unnecessary_param(options.bam, "Warning: the parameter '--bam' will not be used because the counts are already provided.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1707 generate_BedGraph = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1708 unnecessary_param(options.bedgraph, "Warning: the parameter '--bedgraph' will not be used because the counts are already provided.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1709 intersect_indexes_BedGraph = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1710 # Registering the sample labels and filenames
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1711 for sample in options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1712 label = os.path.basename(sample)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1713 label = re.sub('(.(un)?stranded)?.feature_counts.tsv', '', label)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1714 label = "_".join(re.findall(r"[\w\-']+", label))
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1715 count_files.append(sample)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1716 labels.append(label)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1717 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1718 # Generating the genome index filenames
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1719 index_chrom_list = [] # Not a set because we need to sort it according to the chromosome names later
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1720 lengths = {}
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1721 if options.genome_index:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1722 genome_index_basename = options.genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1723 elif options.annotation:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1724 # Otherwise the GTF filename without extension will be the basename
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1725 genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1726 stranded_genome_index = genome_index_basename + ".stranded.index"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1727 unstranded_genome_index = genome_index_basename + ".unstranded.index"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1728 if options.strandness == "unstranded":
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1729 genome_index = unstranded_genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1730 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1731 genome_index = stranded_genome_index
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1732 # If no plot is displayed or saved, plot parameters are useless
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1733 if (options.no_display and not (options.pdf or options.png or options.svg)) or not(intersect_indexes_BedGraph or options.counts):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1734 # unnecessary_param(options.categories_depth, "Warning: the parameter '-d/--categories_depth' will not be used because no plots will be displayed or saved.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1735 # # Cannot be tested because options.categories_depth has always a value (default value if option not specified by user)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1736 unnecessary_param(options.threshold, "Warning: the parameter '-t/--threshold' will not be used because no plots will be displayed or saved.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1737 unnecessary_param(options.biotype_filter, "Warning: the parameter '--biotype_filter' will not be used because no plots will be displayed or saved.")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1738 if options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1739 sys.exit("Error: there is nothing to do (counts are provided and no display or plot saving is required")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1740 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1741 ## [BN] X server check
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1742 # if not X server:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1743 #options.no_display = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1744 #Message
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1745 try:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1746 x_server = os.environ['DISPLAY']
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1747 generate_plot = True
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1748 except:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1749 x_server = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1750 if options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1751 sys.exit("Error: your current configuration does not allow graphical interface ('DISPLAY' variable is not set on your system).\nExiting")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1752 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1753 print >> sys.stderr, "WARNING: your current configuration does not allow graphical interface ('DISPLAY' variable is not set on your system).\nPlotting step will not be performed."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1754
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1755 ## Executing the step(s)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1756
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1757 # Indexes generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1758 if generate_indexes:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1759 # Checking if the index files already exist
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1760 if os.path.isfile(stranded_genome_index):
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1761 sys.exit("\nError: index files named '" + genome_index_basename +
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1762 ".stranded.index' already exists but you provided a GTF file. If you want to generate a new index, "
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1763 "please delete these files or specify an other path.\n### End of program")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1764 # Running the index generation commands
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1765 print "# Generating the genome index files"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1766 # Getting chromosomes lengths
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1767 lengths = get_chromosome_lengths()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1768 # Generating the index files
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1769 generate_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, lengths)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1770 # Displaying the list of indexed chromosomes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1771 index_chrom_list.sort(key=alphanum_key)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1772 print "Indexed chromosomes: " + ", ".join(index_chrom_list)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1773 if generate_indexes or not options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1774 # Getting index info
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1775 read_index()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1776 # Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1777 cpt_genome[("intergenic", "intergenic")] = sum(lengths.values()) - sum([v for x, v in cpt_genome.iteritems() if x != ("antisense", "antisense")])
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1778
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1779 # BedGraph files generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1780 if generate_BedGraph:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1781 print "# Generating the BedGraph files"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1782 #sample_files, sample_labels = generate_bedgraph_files()
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1783 generate_bedgraph_files(labels, bams)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1784
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1785 # Indexes and BedGraph files intersection
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1786 if intersect_indexes_BedGraph:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1787 print "# Intersecting index and BedGraph files"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1788 cpt = intersect_bedgraphs_and_index_to_counts_categories(labels, bedgraphs)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1789 # Write the counts to an output file
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1790 write_counts_in_files(cpt, cpt_genome)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1791
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1792 ## Plot generation ## MB: all the section still to review
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1793 if generate_plot:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1794 print "# Generating plots"
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1795 # If input files are the categories counts, the first step is to load them
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1796 if options.counts:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1797 #cpt, cpt_genome, sample_names = read_counts(options.counts)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1798 cpt, cpt_genome = read_counts(labels, count_files)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1799 # Managing the unknown biotypes
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1800 for sample_label, counters in cpt.items():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1801 for (cat, biot) in counters:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1802 if biot not in biotypes:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1803 unknown_biot.add(biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1804 for biot in unknown_biot:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1805 biotypes.add(biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1806 biotypes_group1["others"].append(biot)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1807 biotypes = sorted(biotypes)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1808 # Moving antisense cat to the end of the list
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1809 biotypes.remove("antisense")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1810 biotypes.append("antisense")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1811 biotypes_group1 = sorted(biotypes_group1)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1812 # Filtering biotypes if necessary
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1813 filtered_biotype = None
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1814 if options.biotype_filter:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1815 for sample_label in cpt:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1816 for feature in cpt[sample_label]:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1817 biotype = feature[1]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1818 if options.biotype_filter.lower() == biotype.lower():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1819 selected_biotype = biotype
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1820 break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1821 if filtered_biotype:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1822 print "\nWarning: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)."
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1823 # Setting the plots filenames
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1824 if options.pdf: ## MB: Do the same for svg and png??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1825 pdf = PdfPages(options.pdf)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1826 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1827 pdf = False
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1828 ## Generate the categories plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1829 # Recategorizing within the final categories and plot generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1830 final_cats = categs_levels[options.categories_depth - 1]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1831 parent_categs = parent_categ_groups[options.categories_depth - 1]
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1832 final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, filtered_biotype)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1833 # Remove the "antisense" category if the library type is "unstranded" ## MB: if options.strandness == "unstranded": cat_list.remove("antisense")??
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1834 for dic in cpt.values():
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1835 if ("antisense", "antisense") in dic.keys(): break
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1836 else:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1837 cat_list.remove("antisense")
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1838 #make_plot(labels, cat_list, sample_labels, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, svg=options.svg, png=options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1839 make_plot(labels, cat_list, final_cat_cpt, final_genome_cpt, pdf, "Categories", options.threshold, svg=options.svg, png=options.png, categ_groups= parent_categs)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1840 if filtered_biotype:
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1841 #make_plot(labels, cat_list, sample_labels, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, title="Categories distribution for '" + filtered_biotype + "' biotype", svg=options.svg, png=options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1842 make_plot(labels, cat_list, filtered_cat_cpt, final_genome_cpt, pdf, "Categories", options.threshold, title="Categories distribution for '" + filtered_biotype + "' biotype", svg=options.svg, png=options.png, categ_groups= parent_categs)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1843 ## Generate the biotypes plot
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1844 # Recategorization within the final biotypes and plot generation
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1845 final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1846 #make_plot(biotypes, sample_labels, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg, png=options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1847 make_plot(labels, biotypes, final_cat_cpt, final_genome_cpt, pdf, "Biotypes", options.threshold, svg=options.svg, png=options.png)
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1848
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1849
e360f840a92e Uploaded
biocomp-ibens
parents:
diff changeset
1850 print "### End of program ###"