Mercurial repository: charles-bernard / alfa
comparison ALFA/ALFA.py @ 18:a1e2ab10b317 draft
Uploaded
author: charles-bernard
date: Tue, 11 Oct 2016 09:18:48 -0400
parents:
children:
17:e3d439570972 | 18:a1e2ab10b317 |
1 #!/usr/bin/python | |
2 #-*- coding: utf-8 -*- | |
3 | |
4 __author__ = 'noel & bahin' | |
5 ''' <description> ''' | |
6 | |
7 import argparse | |
8 import os | |
9 import numpy | |
10 import sys | |
11 import subprocess | |
12 import matplotlib.pyplot as plt | |
13 import matplotlib.cm as cmx | |
14 import matplotlib.colors as colors | |
15 import matplotlib.patheffects as PathEffects | |
16 import re | |
17 from matplotlib.backends.backend_pdf import PdfPages | |
18 # To correctly embed the text when saving plots in svg format | |
19 import matplotlib | |
20 matplotlib.rcParams['svg.fonttype'] = 'none' | |
21 | |
22 ########################################################################## | |
23 # FUNCTIONS # | |
24 ########################################################################## | |
25 | |
26 def init_dict(d, key, init): | |
27 if key not in d: | |
28 d[key] = init | |
29 | |
30 def get_chromosome_lengths(args): | |
31 """ | |
32 Parse the file containing the chromosome lengths. | |
33 If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes (the end of the last annotation of each chromosome is taken as its length). | |
34 """ | |
35 lengths={} | |
36 gtf_chrom_names=set() | |
37 force_get_lengths = False | |
38 # If the user provided the chromosome length file | |
39 if args.chr_len: | |
40 with open(args.chr_len, 'r') as chr_len_file: | |
41 for line in chr_len_file: | |
42 lengths[line.split('\t')[0]] = int(line.rstrip().split('\t')[1]) | |
43 with open(args.annotation,'r') as gtf_file: | |
44 for line in gtf_file: | |
45 if not line.startswith('#'): | |
46 chrom = line.split('\t')[0] | |
47 if chrom not in gtf_chrom_names: | |
48 gtf_chrom_names.add(chrom) | |
49 for chrom in lengths.keys(): | |
50 if chrom not in gtf_chrom_names: | |
51 print "Warning: at least one chromosome name ('"+chrom+"') of the file '"+args.chr_len+"' does not match any chromosome name in the GTF and was ignored." | |
52 #del lengths[chrom] | |
53 break | |
54 for chrom in gtf_chrom_names: | |
55 if force_get_lengths: break | |
56 if chrom not in lengths.keys(): | |
57 print "WARNING: chromosome name '"+chrom+"' was found in the GTF and does not match any chromosome name provided in",args.chr_len+". " | |
58 print "\t=> The chromosome lengths will be approximated using annotations in the GTF file." | |
59 continue_value ="" | |
60 while continue_value not in {"yes","y","no","n"}: | |
61 continue_value = raw_input("\tDo you want to continue ('yes' or 'y')?\n\tElse write 'no' or 'n' to exit the script and check your file of lengths.\n") | |
62 if continue_value == "no" or continue_value == "n": | |
63 sys.exit("Exiting") | |
64 elif continue_value == "yes" or continue_value == "y": | |
65 force_get_lengths = True | |
66 break | |
67 print "Error: use 'yes/y/no/n' only" | |
68 if not force_get_lengths: | |
69 return lengths | |
70 # Otherwise, (or if at least one chromosome was missing in chromosome lengths file) we consider the end of the last annotation of the chromosome in the GTF file as the chromosome length | |
71 with open(args.annotation, 'r') as gtf_file: | |
72 for line in gtf_file: | |
73 if not line.startswith('#'): | |
74 chrom = line.split('\t')[0] | |
75 end = int(line.split('\t')[4]) | |
76 init_dict(lengths, chrom, 0) | |
77 lengths[chrom] = max(lengths[chrom], end) | |
78 if force_get_lengths: | |
79 print "The chromosome lengths have been approximated using the last annotations in the GTF file." | |
80 return lengths | |
81 | |
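For reference, a sketch of the tab-separated --chr_len file this function parses: one chromosome name and its length per line (the names and lengths below are purely illustrative):

    chr1	248956422
    chr2	242193529
    chrM	16569
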
82 def write_feature_on_index(feat,chrom, start, stop, sign, stranded_genome_index, unstranded_genome_index=None): | |
83 grouped_by_biotype_features = [] | |
84 for biotype,categs in feat.iteritems(): | |
85 categ_list=[] | |
86 for cat in set(categs): | |
87 categ_list.append(cat) | |
88 grouped_by_biotype_features.append(":".join((str(biotype),",".join(categ_list)))) | |
89 stranded_genome_index.write('\t'.join((chrom, start, stop, sign,''))+'\t'.join(grouped_by_biotype_features)+'\n') | |
90 if unstranded_genome_index : | |
91 unstranded_genome_index.write('\t'.join((chrom, start, stop, '.',''))+'\t'.join(grouped_by_biotype_features)+'\n') | |
92 | |
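For reference, a line written by write_feature_on_index() holds chrom, start, stop and strand, followed by one "biotype:category1,category2" field per biotype, all tab-separated; with illustrative coordinates:

    chr1	11868	12227	+	protein_coding:exon,CDS	lincRNA:exon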
93 | |
94 def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index = None , biotype_prios=None, coverage=1, categ_prios=None): | |
95 """ | |
96 From an annotated genomic interval (start/stop positions and associated feature : one or more category(ies)/biotype pair(s) ) | |
97 - If a file is provided: write the information at the end of the index file being generated; | |
98 - else : browse the features and update the counts of categories/biotypes found in the genome. | |
99 """ | |
100 ## Writing in the file if it was provided | |
101 if stranded_genome_index : | |
102 unstranded_features = None | |
103 # If only one strand has a feature, this feature will directly be written on the unstranded index | |
104 if feat_values[0] == {} : | |
105 # An interval with no feature corresponds to a region annotated only on the reverse strand: update 'antisense' counts | |
106 stranded_genome_index.write('\t'.join((chrom, start, stop, '+','antisense\n'))) | |
107 write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index, unstranded_genome_index) | |
108 else : | |
109 if feat_values[1] == {} : | |
110 write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index, unstranded_genome_index) | |
111 stranded_genome_index.write('\t'.join((chrom, start, stop, '-','antisense\n'))) | |
112 | |
113 # Else, the unstranded index should contain the union of plus and minus features | |
114 else : | |
115 write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index) | |
116 write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index) | |
117 unstranded_feat = dict(feat_values[0], **feat_values[1]) | |
118 for name in set(feat_values[0]) & set(feat_values[1]): | |
119 unstranded_feat[name]+=feat_values[0][name] | |
120 write_feature_on_index(unstranded_feat, chrom ,start, stop, '.', unstranded_genome_index) | |
121 | |
122 ## Increasing category counter(s) | |
123 else : | |
124 # Default behavior if no biotype priorities: the category(ies) with the highest priority for each biotype found get the same weight (1/n_biotypes) | |
125 if not biotype_prios: | |
126 nb_feat = len(feat_values) | |
127 # For every category/biotype pair | |
128 for feat in feat_values: | |
129 cur_prio = 0 | |
130 selected_categ = [] | |
131 # Separate category(ies) and biotype | |
132 try: | |
133 biot,cats = feat.split(":") | |
134 # A ValueError means the feature has no biotype (e.g. "antisense"): update the 'antisense/antisense' counts | |
135 except ValueError : | |
136 try : | |
137 cpt[(feat,feat)] += (int(stop) - int(start)) * coverage / float(nb_feat) | |
138 except : | |
139 cpt[(feat,feat)] = (int(stop) - int(start)) * coverage / float(nb_feat) | |
140 return | |
141 # Browse the categories and get only the one(s) with highest priority | |
142 for cat in cats.split(','): | |
143 try: prio = prios[cat] | |
144 except: | |
145 #TODO Find a way to add unknown categories | |
146 if cat not in unknown_feature: | |
147 print >> sys.stderr, "Warning: Unknown category %s found and ignored\n...\r" %cat, | |
148 unknown_feature.append(cat) | |
149 continue | |
150 if prio > cur_prio : | |
151 cur_prio = prio | |
152 selected_categ = [cat] | |
153 if prio == cur_prio : | |
154 if cat != selected_categ : | |
155 try: | |
156 if cat not in selected_categ : | |
157 selected_categ.append(cat) | |
158 except TypeError : | |
159 selected_categ = [selected_categ,cat] | |
160 # Increase each count by the coverage divided by the number of categories and biotypes | |
161 nb_cats = len(selected_categ) | |
162 for cat in selected_categ : | |
163 try: | |
164 cpt[(cat,biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats)) | |
165 except KeyError: | |
166 cpt[(cat,biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats)) | |
167 #else : | |
168 #cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage | |
169 # Else, apply biotype selection according to the priority set | |
170 else: | |
171 #TODO Add an option to pass biotype priorities and handle it | |
172 pass | |
173 | |
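As a sketch of the counting branch above (assuming the module-level prios dictionary defined further down and purely illustrative coordinates):

    cpt = {}
    add_info(cpt, ['protein_coding:exon,CDS'], 100, 150, coverage=2)
    # 'CDS' (priority 5) outranks 'exon' (priority 4), so a single category/biotype pair is counted:
    # cpt == {('CDS', 'protein_coding'): 100.0}   # (150-100) * 2 / (nb_feat=1 * nb_cats=1)
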
174 def print_chrom(features_dict, chrom, stranded_index_file, unstranded_index_file, cpt_genome): | |
175 with open(unstranded_index_file,'a') as findex, open(stranded_index_file,'a') as fstrandedindex: | |
176 # Initialization of variables : start position of the first interval and associated features for +/- strands | |
177 start = "" | |
178 for pos in sorted(features_dict['+'].keys()): | |
179 if start != "": | |
180 add_info(cpt_genome, [features_plus,features_minus], str(start), str(pos), chrom, stranded_genome_index = fstrandedindex, unstranded_genome_index = findex) | |
181 start = pos | |
182 features_plus = features_dict['+'][pos] | |
183 features_minus = features_dict['-'][pos] | |
184 | |
185 | |
186 def create_genome_index(annotation, unstranded_genome_index, stranded_genome_index,cpt_genome,prios,biotypes,chrom_sizes): | |
187 ''' Create an index of the genome annotations and save it in a file''' | |
188 print '\n### Generating genome indexes\n', | |
189 sys.stdout.flush() | |
190 # Initializations | |
191 intervals_dict = 0 | |
192 max_value = -1 | |
193 prev_chrom = '' | |
194 reverse_strand = {'+':'-','-':'+'} | |
195 i = 0 # Line counter | |
196 # Write the chromosome lengths as comment lines before the genome index | |
197 with open(unstranded_genome_index,'w') as findex, open(stranded_genome_index,'w') as fstrandedindex: | |
198 for key,value in chrom_sizes.items(): | |
199 findex.write("#%s\t%s\n" %(key, value)) | |
200 fstrandedindex.write("#%s\t%s\n" %(key, value)) | |
201 # Running through the GTF file and writing into genome index files | |
202 with open(annotation, 'r') as gtf_file: | |
203 for line in gtf_file: | |
204 # Print progress messages every X processed lines | |
205 i += 1 | |
206 if i % 100000 == 0: | |
207 print >> sys.stderr, '\r%s lines processed...' %str(i) | |
208 print '\r \r. . .', | |
209 sys.stdout.flush() | |
210 elif i % 20000 == 0: | |
211 print '\r \r. . .', | |
212 sys.stdout.flush() | |
213 elif i % 2000 == 0: | |
214 print '.', | |
215 sys.stdout.flush() | |
216 # Processing lines except comment ones | |
217 if not line.startswith('#'): | |
218 # Getting the line infos | |
219 line_split=line[:-1].split('\t') | |
220 chrom = line_split[0] | |
221 cat=line_split[2] | |
222 start = int(line_split[3]) - 1 | |
223 stop = int(line_split[4]) | |
224 strand = line_split[6] | |
225 antisense = reverse_strand[strand] | |
226 biotype=line_split[8].split('biotype')[1].split(';')[0].strip('" ') | |
227 feat = [(cat,biotype)] | |
228 # Registering chromosome info in the genome index file if this is a new chromosome or an annotation not overlapping previously recorded features | |
229 if start > max_value or chrom != prev_chrom: | |
230 # Write the previous features | |
231 if intervals_dict != 0: | |
232 if chrom != prev_chrom : | |
233 print_chrom(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index, cpt_genome) | |
234 print "\rChromosome '" + prev_chrom + "' registered." | |
235 else: | |
236 print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome) | |
237 prev_chrom = chrom | |
238 # (Re)Initializing the chromosome lists | |
239 intervals_dict = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}} | |
240 max_value = stop | |
241 | |
242 # Update the dictionary which represents intervals for every distinct annotation | |
243 else: | |
244 # Get intervals on the same strand as the current feature | |
245 stranded_intervals = intervals_dict[strand] | |
246 start_added = False | |
247 for pos in sorted(stranded_intervals.keys()): | |
248 #print pos | |
249 #print stranded_intervals[pos] | |
250 #print | |
251 # While the position is below the feature's interval, store the features | |
252 if pos < start : | |
253 cur_cat_dict = dict(stranded_intervals[pos]) | |
254 cur_antisense_dict = dict(intervals_dict[antisense][pos]) | |
255 | |
256 # If the start position already exists: update it by adding the new feature | |
257 elif pos == start : | |
258 cur_cat_dict = dict(stranded_intervals[pos]) | |
259 cur_antisense_dict = dict(intervals_dict[antisense][pos]) | |
260 #print "cur",cur_cat_dict | |
261 try : stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype]+[cat] | |
262 except KeyError: stranded_intervals[pos][biotype] = [cat] | |
263 start_added = True | |
264 #print "cur",cur_cat_dict | |
265 | |
266 elif pos > start : | |
267 # Create a new entry for the start position if necessary | |
268 if not start_added : | |
269 #print "cur",cur_cat_dict | |
270 stranded_intervals[start] = dict(cur_cat_dict) | |
271 stranded_intervals[start][biotype] = [cat] | |
272 #stranded_intervals[pos][biotype].append(cat) | |
273 intervals_dict[antisense][start] = cur_antisense_dict | |
274 start_added = True | |
275 #print "cur",cur_cat_dict | |
276 # While the position is below the stop, just add the new feature | |
277 if pos < stop : | |
278 cur_cat_dict = dict(stranded_intervals[pos]) | |
279 cur_antisense_dict = dict(intervals_dict[antisense][pos]) | |
280 try: stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat] | |
281 except KeyError: stranded_intervals[pos][biotype] = [cat] | |
282 # Close the created interval : create an entry at the stop position and restore the features | |
283 elif pos > stop : | |
284 stranded_intervals[stop] = dict(cur_cat_dict) | |
285 intervals_dict[antisense][stop] = cur_antisense_dict | |
286 break | |
287 else : | |
288 break | |
289 #try: | |
290 #cur_cat_dict = list(stranded_intervals[pos][biotype]) | |
291 #except KeyError: cur_cat_dict = list() | |
292 #print stranded_intervals[pos] | |
293 #print | |
294 # Extend the dictionary if necessary | |
295 if stop > max_value: | |
296 max_value = stop | |
297 stranded_intervals[stop] = {} | |
298 intervals_dict[antisense][stop] = {} | |
299 #except KeyError: | |
300 #print intervals_dict | |
301 #quit() | |
302 #intervals_dict[strand] = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}} | |
303 #continue | |
304 #for sign in ['-','+']: | |
305 #print sign | |
306 #for key,val in sorted(intervals_dict[sign].iteritems()): | |
307 #print key,'\t',val | |
308 #print | |
309 #print "-------\n" | |
310 | |
311 | |
312 #Store the categories of the last chromosome | |
313 print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome) | |
314 print "\rChromosome '" + prev_chrom + "' registered.\nDone!" | |
315 | |
316 | |
317 def create_bedgraph_files(bams,strandness): | |
318 samples_files = [] | |
319 labels = [] | |
320 print "\n### Generating the bedgraph files" | |
321 for n in range(0, len(bams), 2): | |
322 print "\rProcessing '%s'\n..." %bams[n], | |
323 sys.stdout.flush() | |
324 #Get the label for this sample | |
325 label = bams[n+1] | |
326 #Modify it to contain only alphanumeric characters (avoids generating files with unsafe names) | |
327 modified_label = "_".join(re.findall(r"[\w']+", label)) | |
328 if strandness in ["reverse","fr-secondstrand"]: | |
329 subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True) | |
330 subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True) | |
331 elif strandness in ["forward","fr-firststrand"]: | |
332 subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True) | |
333 subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True) | |
334 else : | |
335 subprocess.call('bedtools genomecov -bg -split -ibam ' + bams[n] + ' > ' + modified_label + '.bedgraph', shell=True) | |
336 samples_files.append(modified_label) | |
337 labels.append(label) | |
338 print "\rDone!" | |
339 return samples_files, labels | |
340 | |
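For reference, the bedgraph files produced above by bedtools genomecov -bg (and consumed by intersect_bedgraphs_and_index_to_counts_categories() below) contain four tab-separated fields per line: chromosome, start, stop and coverage depth; with illustrative values:

    chr1	100	150	3
    chr1	150	180	7
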
341 def read_gtf(gtf_index_file, sign): | |
342 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF | |
343 strand = "" | |
344 while strand != sign : | |
345 gtf_line = gtf_index_file.readline() | |
346 # If the GTF file is finished | |
347 if not gtf_line: | |
348 endGTF = True | |
349 return endGTF | |
350 splitline = gtf_line.rstrip().split('\t') | |
351 try: strand = splitline[3] | |
352 # strand information cannot be found in the file header | |
353 except IndexError: pass | |
354 gtf_chrom = splitline[0] | |
355 gtf_features = splitline[4:] | |
356 gtf_start, gtf_stop = map(int, splitline[1:3]) | |
357 return endGTF | |
358 | |
359 def read_counts_files(counts_files): | |
360 cpt={} | |
361 cpt_genome={} | |
362 labels=[] | |
363 for fcounts in counts_files: | |
364 label=os.path.splitext(os.path.basename(fcounts))[0] | |
365 labels.append(label) | |
366 cpt[label]={} | |
367 with open (fcounts,"r") as f: | |
368 for line in f: | |
369 if line[0]=="#": | |
370 continue | |
371 line_split=line[:-1].split('\t') | |
372 feature=tuple(line_split[0].split(',')) | |
373 cpt[label][feature]=float(line_split[1]) | |
374 cpt_genome[feature]=float(line_split[2]) | |
375 return cpt,cpt_genome,labels | |
376 | |
377 | |
378 def get_chromosome_names_in_index(genome_index): | |
379 chrom_list = [] | |
380 with open(genome_index, 'r') as findex: | |
381 chrom = "" | |
382 for line in findex: | |
383 cur_chrom = line.split('\t')[0] | |
384 if cur_chrom == chrom: | |
385 pass | |
386 else: | |
387 chrom = cur_chrom | |
388 if chrom not in chrom_list: | |
389 chrom_list.append(chrom) | |
390 return set(chrom_list) | |
391 | |
392 | |
393 def intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, strandness, biotype_prios = None): | |
394 global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF | |
395 print "\n### Intersecting files with indexes" | |
396 unknown_chrom = [] | |
397 cpt = {} # Counter for the nucleotides in the BAM input file(s) | |
398 for n in range(len(samples_files)): | |
399 sample_file=samples_files[n] | |
400 sample_name=samples_names[n] | |
401 # Initializing the category counter dict for this sample | |
402 init_dict(cpt, sample_name, {}) | |
403 if strandness == "unstranded": | |
404 strands = [("",".")] | |
405 else: | |
406 strands = [('.plus','+'), ('.minus','-')] | |
407 | |
408 # Intersecting the BEDGRAPH and genome index files | |
409 print "\rProcessing '%s'\n. . ." %sample_file, | |
410 sys.stdout.flush() | |
411 | |
412 for strand,sign in strands: | |
413 prev_chrom = '' | |
414 endGTF = False # Set to True when the next chromosome or the end of the GTF index is reached | |
415 intergenic_adds = 0.0 | |
416 i = 0 | |
417 i_chgt = 0 | |
418 with open(sample_file + strand + '.bedgraph', 'r') as bam_count_file: | |
419 # Running through the BEDGRAPH file | |
420 for bam_line in bam_count_file: | |
421 i += 1 | |
422 if i % 10000 == 0: | |
423 print ".", | |
424 sys.stdout.flush() | |
425 if i % 100000 == 0: | |
426 print "\r \r. . .", | |
427 sys.stdout.flush() | |
428 # Getting the BAM line info | |
429 bam_chrom = bam_line.split('\t')[0] | |
430 bam_start, bam_stop, bam_cpt = map(float, bam_line.split('\t')[1:4]) | |
431 # Skip the line if the chromosome is not in the index | |
432 if bam_chrom not in chrom_list: | |
433 if bam_chrom not in unknown_chrom: | |
434 unknown_chrom.append(bam_chrom) | |
435 print "\r \r Chromosome '" + bam_chrom + "' not found in index." | |
436 continue | |
437 # If this is a new chromosome (or the first one) | |
438 if bam_chrom != prev_chrom: | |
439 i_chgt = i | |
440 intergenic_adds = 0.0 | |
441 # (Re)opening the GTF index and looking for the first line of the matching chr | |
442 try: gtf_index_file.close() | |
443 except UnboundLocalError: pass | |
444 gtf_index_file = open(genome_index, 'r') | |
445 endGTF = False | |
446 read_gtf(gtf_index_file, sign) | |
447 while bam_chrom != gtf_chrom: | |
448 read_gtf(gtf_index_file, sign) | |
449 if endGTF: | |
450 break | |
451 prev_chrom = bam_chrom | |
452 | |
453 # Looking for the first matching annotation in the GTF index | |
454 while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop): | |
455 read_gtf(gtf_index_file, sign) | |
456 if gtf_chrom != bam_chrom: | |
457 endGTF = True | |
458 # Processing BAM lines before the first GTF annotation if there are any | |
459 if bam_start < gtf_start: | |
460 # Increase the 'intergenic' category counter with all or part of the BAM interval | |
461 try: | |
462 intergenic_adds += min(bam_stop,gtf_start)-bam_start | |
463 cpt[sample_name][('intergenic','intergenic')] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt | |
464 except KeyError: | |
465 cpt[sample_name][('intergenic','intergenic')] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt | |
466 # Go to the next line if the BAM interval was entirely upstream of the first GTF annotation; otherwise carry on with the remaining part | |
467 if endGTF or (bam_stop <= gtf_start): | |
468 continue | |
469 else: | |
470 bam_start = gtf_start | |
471 | |
472 # We can start the crossover | |
473 while not endGTF: | |
474 # Update category counter | |
475 add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop,gtf_stop), coverage = bam_cpt, categ_prios = prios) | |
476 # Read the next GTF file line if the BAM line is not entirely covered | |
477 if bam_stop > gtf_stop: | |
478 # Update the BAM start pointer | |
479 bam_start = gtf_stop | |
480 endGTF = read_gtf(gtf_index_file, sign) | |
481 # If we read a new chromosome in the GTF file then it is considered finished | |
482 if bam_chrom != gtf_chrom: | |
483 endGTF = True | |
484 if endGTF: | |
485 break | |
486 else: | |
487 # Next if the BAM line is entirely covered | |
488 bam_start = bam_stop | |
489 break | |
490 | |
491 # Processing the end of the BAM line if necessary | |
492 if endGTF and (bam_stop > bam_start): | |
493 try: | |
494 cpt[sample_name][('intergenic','intergenic')] += (bam_stop - bam_start) * bam_cpt | |
495 except KeyError: | |
496 cpt[sample_name][('intergenic','intergenic')] = (bam_stop - bam_start) * bam_cpt | |
497 gtf_index_file.close() | |
498 print "\r \rDone!" | |
499 return cpt | |
500 | |
501 def write_counts_in_files(cpt,genome_counts): | |
502 for label,dico in cpt.items(): | |
503 label = "_".join(re.findall(r"[\w']+", label)) | |
504 with open(label+".categories_counts","w") as fout: | |
505 fout.write("#Category,biotype\tCounts_in_bam\tSize_in_genome\n" ) | |
506 for feature,counts in dico.items(): | |
507 fout.write("%s\t%s\t%s\n" %(','.join(feature),counts,genome_counts[feature])) | |
508 | |
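For reference, a counts file written by write_counts_in_files() (and read back by read_counts_files() above) looks like the following, with illustrative numbers:

    #Category,biotype	Counts_in_bam	Size_in_genome
    CDS,protein_coding	123456.0	33000000
    intergenic,intergenic	9876.0	1900000000
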
509 def recategorize_the_counts(cpt,cpt_genome,final): | |
510 final_cat_cpt={} | |
511 final_genome_cpt={} | |
512 for f in cpt: | |
513 #print "\nFinal categories for",f,"sample" | |
514 final_cat_cpt[f]={} | |
515 for final_cat in final: | |
516 tot = 0 | |
517 tot_genome=0 | |
518 for cat in final[final_cat]: | |
519 tot += cpt[f][cat] | |
520 tot_genome+=cpt_genome[cat] | |
521 #output_file.write('\t'.join((final_cat, str(tot))) + '\n') | |
522 #print '\t'.join((final_cat, str(tot))) | |
523 final_cat_cpt[f][final_cat]=tot | |
524 final_genome_cpt[final_cat]=tot_genome | |
525 return final_cat_cpt,final_genome_cpt | |
526 | |
527 def group_counts_by_categ(cpt,cpt_genome,final,selected_biotype): | |
528 final_cat_cpt={} | |
529 final_genome_cpt={} | |
530 filtered_cat_cpt = {} | |
531 for f in cpt: | |
532 final_cat_cpt[f]={} | |
533 filtered_cat_cpt[f] = {} | |
534 for final_cat in final: | |
535 tot = 0 | |
536 tot_filter = 0 | |
537 tot_genome=0 | |
538 for cat in final[final_cat]: | |
539 for key,value in cpt[f].items(): | |
540 if key[0] == cat : | |
541 tot += value | |
542 tot_genome+=cpt_genome[key] | |
543 if key[1] == selected_biotype : | |
544 tot_filter += value | |
545 #output_file.write('\t'.join((final_cat, str(tot))) + '\n') | |
546 #print '\t'.join((final_cat, str(tot))) | |
547 final_cat_cpt[f][final_cat]=tot | |
548 if tot_genome == 0: | |
549 final_genome_cpt[final_cat]= 1e-100 | |
550 else: | |
551 final_genome_cpt[final_cat]=tot_genome | |
552 filtered_cat_cpt[f][final_cat]=tot_filter | |
553 #if 'antisense' in final_genome_cpt: final_genome_cpt['antisense'] = 0 | |
554 return final_cat_cpt,final_genome_cpt,filtered_cat_cpt | |
555 | |
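A sketch of how group_counts_by_categ() collapses (category, biotype) pairs into the final plotting categories, with illustrative counts and a 'CDS' group taken from categs_group2 defined further down (selected_biotype=None, so the filtered counter stays at 0):

    cpt = {'sample1': {('CDS', 'protein_coding'): 60.0, ('exon', 'lincRNA'): 40.0}}
    cpt_genome = {('CDS', 'protein_coding'): 3e7, ('exon', 'lincRNA'): 1e7}
    final = {'CDS': ['CDS', 'exon', 'start_codon', 'stop_codon']}
    # group_counts_by_categ(cpt, cpt_genome, final, None) returns:
    # ({'sample1': {'CDS': 100.0}}, {'CDS': 40000000.0}, {'sample1': {'CDS': 0}})
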
556 def group_counts_by_biotype(cpt,cpt_genome,biotypes): | |
557 final_cpt={} | |
558 final_genome_cpt={} | |
559 for f in cpt: | |
560 final_cpt[f]={} | |
561 for biot in biotypes: | |
562 tot = 0 | |
563 tot_genome=0 | |
564 try: | |
565 for final_biot in biotypes[biot]: | |
566 for key,value in cpt[f].items(): | |
567 if key[1] == final_biot : | |
568 tot += value | |
569 #if key[1] != 'antisense': | |
570 tot_genome+=cpt_genome[key] | |
571 except: | |
572 for key,value in cpt[f].items(): | |
573 if key[1] == biot : | |
574 tot += value | |
575 tot_genome+=cpt_genome[key] | |
576 if tot != 0: | |
577 final_cpt[f][biot]=tot | |
578 final_genome_cpt[biot]=tot_genome | |
579 return final_cpt,final_genome_cpt | |
580 | |
581 #def get_cmap(N): | |
582 #'''Returns a function that maps each index in 0, 1, ... N-1 to a distinct | |
583 #RGB color.''' | |
584 #color_norm = colors.Normalize(vmin=0, vmax=N-1) | |
585 #scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv') | |
586 #def map_index_to_rgb_color(index): | |
587 #return scalar_map.to_rgba(index) | |
588 #return map_index_to_rgb_color | |
589 | |
590 def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type, title) : | |
591 ### Initialization | |
592 fig = plt.figure(figsize=(13,9)) | |
593 ax1 = plt.subplot2grid((2,4),(0,0),colspan=2) | |
594 ax2 = plt.subplot2grid((2,4),(1,0),colspan=2) | |
595 cmap= plt.get_cmap('Spectral') | |
596 cols=[cmap(x) for x in xrange(0,256,256/n_cat)] | |
597 if title: | |
598 ax1.set_title(title+" in: %s" %samples_names[0]) | |
599 else : | |
600 ax1.set_title(counts_type+" distribution in mapped reads in: %s" %samples_names[0]) | |
601 ax2.set_title('Normalized counts of '+counts_type) | |
602 | |
603 | |
604 ### Barplots | |
605 #First barplot: percentage of reads in each category | |
606 ax1.bar(index, percentages, bar_width, | |
607 color=cols) | |
608 #Second barplot: enrichment relative to the genome for each categ | |
609 # (the reads count in a categ is divided by the categ size in the genome) | |
610 ax2.bar(index_enrichment, enrichment, bar_width, | |
611 color=cols,) | |
612 ### Piecharts | |
613 pielabels = [ordered_categs[i] if percentages[i]>0.025 else '' for i in xrange(n_cat)] | |
614 sum_enrichment = numpy.sum(enrichment) | |
615 pielabels_enrichment = [ordered_categs[i] if enrichment[i]/sum_enrichment>0.025 else '' for i in xrange(n_cat)] | |
616 # Categories piechart | |
617 ax3 = plt.subplot2grid((2,4),(0,2)) | |
618 pie_wedge_collection, texts = ax3.pie(percentages,labels=pielabels, shadow=True, colors=cols) | |
619 # Enrichment piechart | |
620 ax4 = plt.subplot2grid((2,4),(1,2)) | |
621 pie_wedge_collection, texts = ax4.pie(enrichment,labels=pielabels_enrichment, shadow=True, colors=cols) | |
622 # Add legends (append percentages to labels) | |
623 labels = [" ".join((ordered_categs[i],"({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))] | |
624 ax3.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'}, bbox_to_anchor=(1.7,0.5)) | |
625 labels = [" ".join((ordered_categs[i],"({:.1%})".format(enrichment[i]/sum_enrichment))) for i in range(len(ordered_categs))]# if ordered_categs[i] != 'antisense'] | |
626 ax4.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'},bbox_to_anchor=(1.7,0.5)) | |
627 # Set aspect ratio to be equal so that pie is drawn as a circle | |
628 ax3.set_aspect('equal') | |
629 ax4.set_aspect('equal') | |
630 return fig, ax1, ax2 | |
631 | |
632 def make_plot(ordered_categs,samples_names,categ_counts,genome_counts,pdf, counts_type, threshold, title = None ,svg = None, png = None): | |
633 # From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample. | |
634 existing_categs = set() | |
635 for sample in categ_counts.values(): | |
636 existing_categs |= set(sample.keys()) | |
637 ordered_categs = filter(existing_categs.__contains__, ordered_categs) | |
638 n_cat = len(ordered_categs) | |
639 n_exp=len(samples_names) | |
640 ##Initialization of the matrix of counts (nrow=nb_experiments, ncol=nb_categories) | |
641 counts=numpy.matrix(numpy.zeros(shape=(n_exp,n_cat))) | |
642 for exp in xrange(len(samples_names)): | |
643 for cat in xrange(len(ordered_categs)): | |
644 try: counts[exp,cat]=categ_counts[samples_names[exp]][ordered_categs[cat]] | |
645 except: pass | |
646 | |
647 ##Normalize the category sizes by the total size to get percentages | |
648 sizes=[] | |
649 sizes_sum=0 | |
650 for cat in ordered_categs: | |
651 sizes.append(genome_counts[cat]) | |
652 sizes_sum+=genome_counts[cat] | |
653 if 'antisense' in ordered_categs: | |
654 antisense_pos = ordered_categs.index('antisense') | |
655 sizes[antisense_pos] = 1e-100 | |
656 for cpt in xrange(len(sizes)): | |
657 sizes[cpt]/=float(sizes_sum) | |
658 | |
659 ## Create array which contains the percentage of reads in each categ for every sample | |
660 percentages=numpy.array(counts/numpy.sum(counts,axis=1)) | |
661 ## Create the enrichment array (counts divided by the category sizes in the genome) | |
662 enrichment=numpy.array(percentages/sizes) | |
663 if 'antisense_pos' in locals(): | |
664 for i in xrange(len(samples_names)): | |
665 enrichment[i][antisense_pos] = 0 | |
666 #enrichment=numpy.log(numpy.array(percentages/sizes)) | |
667 for exp in xrange(n_exp): | |
668 for i in xrange(n_cat): | |
669 val = enrichment[exp][i] | |
670 if val > 1: | |
671 enrichment[exp][i] = val-1 | |
672 elif val == 1 or val == 0: | |
673 enrichment[exp][i] = 0 | |
674 else: | |
675 enrichment[exp][i] = -1/val+1 | |
676 | |
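# Illustration of the transformation above, with made-up numbers: a category holding 30% of the
# reads but only 10% of the genome has enrichment 0.30/0.10 = 3, stored as 3-1 = 2; a category with
# 5% of the reads and 10% of the genome has enrichment 0.05/0.10 = 0.5, stored as -1/0.5+1 = -1.
# The y-tick relabelling further down then displays a tick at 2 as "3^+1" (3-fold enriched) and a
# tick at -1 as "2^-1" (2-fold depleted).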
677 #### Finally, produce the plot | |
678 | |
679 ##Get the colors from the colormap | |
680 ncolor=16 | |
681 cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5", "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"] | |
682 if n_exp > ncolor: | |
683 cmap = plt.get_cmap('Set3',n_exp) | |
684 cmap = [cmap(i) for i in xrange(n_exp)] | |
685 | |
686 ## Parameters for the plot | |
687 opacity = 1 | |
688 #Create a vector which contains the position of each bar | |
689 index = numpy.arange(n_cat) | |
690 #Size of the bars (depends on the categs number) | |
691 bar_width = 0.9/n_exp | |
692 | |
693 | |
694 ##Initialise the subplot | |
695 # if there is only one sample, also plot piecharts | |
696 #if n_exp == 1 and counts_type.lower() == 'categories': | |
697 #fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title) | |
698 ## If more than one sample | |
699 #else: | |
700 fig, (ax1,ax2) = plt.subplots(2,figsize=(5+(n_cat+2*n_exp)/3,10)) | |
701 # Store the bars objects for enrichment plot | |
702 rects = [] | |
703 #For each sample/experiment | |
704 for i in range(n_exp): | |
705 #First barplot: percentage of reads in each category | |
706 ax1.bar(index+i*bar_width, percentages[i], bar_width, | |
707 alpha=opacity, | |
708 color=cmap[i], | |
709 label=samples_names[i], edgecolor='#FFFFFF', lw=0) | |
710 #Second barplot: enrichment relative to the genome for each categ | |
711 # (the reads count in a categ is divided by the categ size in the genome) | |
712 rects.append( ax2.bar(index+i*bar_width, enrichment[i], bar_width, | |
713 alpha=opacity, | |
714 color=cmap[i], | |
715 label=samples_names[i], edgecolor=cmap[i], lw=0)) | |
716 | |
717 ## Graphical options for the plot | |
718 # Adding the legend | |
719 ax1.legend(loc='best',frameon=False) | |
720 #ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True) | |
721 # Main titles | |
722 if title: | |
723 ax1.set_title(title) | |
724 else : | |
725 ax1.set_title(counts_type+" distribution in mapped reads") | |
726 ax2.set_title('Normalized counts of '+counts_type) | |
727 | |
728 # Adding enrichment baseline | |
729 #ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5') | |
730 # Axes limits | |
731 ax1.set_xlim(-0.1,len(ordered_categs)+0.1) | |
732 if len(sizes) == 1: ax1.set_xlim(-2,3) | |
733 ax2.set_xlim(ax1.get_xlim()) | |
734 # Set axis limits (max/min values + 5% margin) | |
735 ax2_ymin = [] | |
736 ax2_ymax = [] | |
737 for sample_values in enrichment: | |
738 ax2_ymin.append(min(sample_values)) | |
739 ax2_ymax.append(max(sample_values)) | |
740 ax2_ymax = max(ax2_ymax) | |
741 ax2_ymin = min(ax2_ymin) | |
742 margin_top, margin_bottom = (abs(0.05*ax2_ymax), abs(0.05*ax2_ymin)) | |
743 ax1.set_ylim(0,ax1.get_ylim()[1]*1.05) | |
744 if threshold: | |
745 threshold_bottom = -abs(float(threshold[0]))+1 | |
746 threshold_top = float(threshold[1])-1 | |
747 | |
748 for i in xrange(n_exp): | |
749 for y in xrange(n_cat): | |
750 val = enrichment[i][y] | |
751 if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top): | |
752 rect = rects[i][y] | |
753 rect_height = rect.get_height() | |
754 if rect.get_y() < 0: | |
755 diff = rect_height + threshold_bottom | |
756 rect.set_y(threshold_bottom) | |
757 ax2_ymin = threshold_bottom | |
758 margin_bottom = 0 | |
759 else: | |
760 diff = rect_height - threshold_top | |
761 ax2_ymax = threshold_top | |
762 margin_top = 0 | |
763 rect.set_height(rect.get_height()-diff) | |
764 if margin_top != 0 and margin_bottom != 0: | |
765 margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)] | |
766 ax2.set_ylim(ax2_ymin-margin_bottom,ax2_ymax+margin_top) | |
767 # Y axis title | |
768 ax1.set_ylabel('Proportion of reads (%)') | |
769 ax2.set_ylabel('Enrichment relative to genome') | |
770 # X axis title | |
771 ax1.set_xlabel(counts_type) | |
772 ax2.set_xlabel(counts_type) | |
773 # Add the categories on the x-axis | |
774 ax1.set_xticks(index + bar_width*n_exp/2) | |
775 ax2.set_xticks(index + bar_width*n_exp/2) | |
776 if counts_type.lower() != 'categories': | |
777 ax1.set_xticklabels(ordered_categs,rotation='30',ha='right') | |
778 ax2.set_xticklabels(ordered_categs,rotation='30',ha='right') | |
779 else: | |
780 ax1.set_xticklabels(ordered_categs) | |
781 ax2.set_xticklabels(ordered_categs) | |
782 # Display fractions values in percentages | |
783 ax1.set_yticklabels([str(int(i*100)) for i in ax1.get_yticks()]) | |
784 # Correct y-axis ticks labels for enrichment subplot | |
785 #ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()]) | |
786 yticks = list(ax2.get_yticks()) | |
787 yticks = [ yticks[i]-1 if yticks[i]>9 else yticks[i]+1 if yticks[i]<-9 else yticks[i] for i in xrange(len(yticks))] | |
788 ax2.set_yticks(yticks) | |
789 ax2.set_yticklabels([str(int(i+1))+"$^{+1}$" if i>0 and i%1==0 else str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(int(-(i-1)))+"$^{-1}$" if i<0 and i%1==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()]) | |
790 #ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()]) | |
791 # Change appearance of 'antisense' bars on enrichment plot since we cannot calculate an enrichment for this artificial category | |
792 if 'antisense_pos' in locals(): #ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA') | |
793 for i in xrange(n_exp) : | |
794 rect = rects[i][antisense_pos] | |
795 rect.set_y(ax2.get_ylim()[0]) | |
796 rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0]) | |
797 rect.set_hatch('/') | |
798 rect.set_fill(False) | |
799 rect.set_linewidth(0) | |
800 #rect.set_color('lightgrey') | |
801 #rect.set_edgecolor('#EDEDED') | |
802 rect.set_color('#EDEDED') | |
803 ax2.text(index[antisense_pos] + bar_width*n_exp/2 - 0.1, (ax2_ymax+ax2_ymin)/2, 'NA') | |
804 # Add text for features absent in sample | |
805 for i in xrange(n_exp): | |
806 for y in xrange(n_cat): | |
807 if percentages[i][y] == 0: | |
808 txt = ax1.text(y + bar_width*(i+0.5), 0.02, 'Absent in sample', rotation = 'vertical', color = cmap[i], horizontalalignment ='center', verticalalignment = 'bottom') | |
809 txt.set_path_effects([PathEffects.Stroke(linewidth=0.5),PathEffects.Normal()]) | |
810 elif enrichment[i][y] == 0: | |
811 rects[i][y].set_linewidth(1) | |
812 | |
813 # Remove top/right/bottom axes | |
814 for ax in [ax1,ax2]: | |
815 ax.spines['top'].set_visible(False) | |
816 ax.spines['right'].set_visible(False) | |
817 ax.spines['bottom'].set_visible(False) | |
818 ax.tick_params(axis='x', which='both', bottom='on', top='off', labelbottom='on') | |
819 ax.tick_params(axis='y', which='both', left='on', right='off', labelleft='on') | |
820 | |
821 | |
822 | |
823 #ax1.legend(loc='center right', bbox_to_anchor=(1.2, 0),fancybox=True, shadow=True) | |
824 ## Showing the plot | |
825 plt.tight_layout() | |
826 fig.subplots_adjust(wspace=0.2) | |
827 if pdf: | |
828 pdf.savefig() | |
829 plt.close() | |
830 elif svg: | |
831 if svg == True: | |
832 plt.savefig(counts_type+'.svg') | |
833 else : | |
834 if os.path.splitext(svg)[1] == '.svg': | |
835 plt.savefig('.'.join((os.path.splitext(svg)[0],counts_type,'svg'))) | |
836 else: | |
837 plt.savefig('.'.join((svg,counts_type,'svg'))) | |
838 elif png: | |
839 if png == True: | |
840 plt.savefig(counts_type+'.png') | |
841 else : | |
842 if os.path.splitext(png)[1] == '.png': | |
843 plt.savefig('.'.join((os.path.splitext(png)[0],counts_type,'png'))) | |
844 else: | |
845 plt.savefig('.'.join((png,counts_type,'png'))) | |
846 else: | |
847 plt.show() | |
848 ## Save on disk it as a png image | |
849 #fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight') | |
850 | |
851 | |
852 def filter_categs_on_biotype(selected_biotype,cpt) : | |
853 filtered_cpt = {} | |
854 for sample in cpt: | |
855 filtered_cpt[sample] = {} | |
856 for feature,count in cpt[sample].items(): | |
857 if feature[1] == selected_biotype: | |
858 filtered_cpt[sample][feature[0]] = count | |
859 return filtered_cpt | |
860 | |
861 | |
862 ########################################################################## | |
863 # MAIN # | |
864 ########################################################################## | |
865 | |
866 def usage_message(name=None): | |
867 return ''' | |
868 Generate genome indexes: | |
869 python ALFA.py -a GTF_FILE [-g GENOME_INDEX] | |
870 [--chr_len CHR_LENGTHS_FILE] | |
871 Process BAM file(s): | |
872 python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...] | |
873 [--bedgraph] [-s STRAND] | |
874 [-n] [--pdf output.pdf] | |
875 [-d {1,2,3,4}] [-t YMIN YMAX] | |
876 Index genome + process BAM: | |
877 python ALFA.py -a GTF_FILE [-g GENOME_INDEX] | |
878 -i BAM1 LABEL1 [BAM2 LABEL2 ...] | |
879 [--chr_len CHR_LENGTHS_FILE] | |
880 [--bedgraph][-s STRAND] | |
881 [-n] [--pdf output.pdf] | |
882 [-d {1,2,3,4}] [-t YMIN YMAX] | |
883 | |
884 Process previously created ALFA counts file(s): | |
885 python ALFA.py -c COUNTS1 [COUNTS2 ...] | |
886 [-s STRAND] | |
887 [-n] [--pdf output.pdf] | |
888 [-d {1,2,3,4}] [-t YMIN YMAX] | |
889 | |
890 ''' | |
891 | |
892 | |
893 #### Parse command line arguments and store them in 'options' | |
894 parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message()) | |
895 parser.add_argument('--version', action='version', version='version 1.0', help="show program's version number and exit\n\n-----------\n\n") | |
896 # Index-related options | |
897 parser.add_argument('-g','--genome_index', help="Genome index files path and basename for existing index, or path and basename for new index creation\n\n") | |
898 parser.add_argument('-a','--annotation', metavar = "GTF_FILE", help='Genomic annotations file (GTF format)\n\n') | |
899 parser.add_argument('--chr_len', help='Tabulated file containing chromosome names and lengths\n\n-----------\n\n') | |
900 | |
901 # Options for the intersection step | |
902 parser.add_argument('-i','--input','--bam', dest='input', metavar=('BAM_FILE1 LABEL1',""), nargs='+', help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n') | |
903 parser.add_argument('--bedgraph', action='store_const',default = False, const = True, help="Use this option if your input file(s) is(are) already in bedgraph format\n\n") | |
904 parser.add_argument('-c','--counts',metavar=('COUNTS_FILE',""), nargs='+', help="Use this option instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n") | |
905 parser.add_argument('-s','--strandness', dest="strandness", nargs=1, action = 'store', default = ['unstranded'], choices = ['unstranded','forward','reverse','fr-firststrand','fr-secondstrand'], metavar="", help ="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n") | |
906 | |
907 # Plot-related options | |
908 parser.add_argument('-biotype_filter',nargs=1,help=argparse.SUPPRESS)#"Make an extra plot of categories distribution using only counts of the specified biotype.") | |
909 parser.add_argument('-d','--categories_depth', type=int, default='3', choices=range(1,5), help = "Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n") | |
910 parser.add_argument('--pdf', nargs='?', default=False, help="Save produced plots in PDF format at specified path ('categories_plots.pdf' if no argument provided)\n\n") | |
911 parser.add_argument('--png', nargs='?', default=False, const=True, help="Save produced plots in PNG format with provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided\n\n") | |
912 parser.add_argument('--svg', nargs='?', default=False, const=True, help="Save produced plots in SVG format with provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided\n\n") | |
913 parser.add_argument('-n','--no_plot', dest='quiet', action='store_const', default=False, const=True, help="Do not show plots\n\n") | |
914 parser.add_argument('-t','--threshold', dest='threshold', nargs = 2, metavar=("ymin","ymax"), type=float , help="Set axis limits for enrichment plots\n\n") | |
915 | |
916 if len(sys.argv)==1: | |
917 parser.print_usage() | |
918 sys.exit(1) | |
919 | |
920 options = parser.parse_args() | |
921 | |
922 def required_arg(arg, aliases): | |
923 if not arg: | |
924 print >> sys.stderr, "\nError: %s argument is missing.\n" %aliases | |
925 parser.print_usage() | |
926 sys.exit() | |
927 | |
928 def check_files_extension(files): | |
929 return | |
930 | |
931 # Booleans for steps to be executed | |
932 make_index = False | |
933 intersect_reads = False | |
934 process_counts = False | |
935 | |
936 #### Check arguments conformity and define which steps have to be performed | |
937 if options.counts : | |
938 # No other argument required | |
939 # Check the input extension | |
940 | |
941 # Action: only generate the plot | |
942 process_counts = True | |
943 else: | |
944 if options.annotation : | |
945 # Check whether -g/--genome_index was provided | |
946 if options.genome_index : | |
947 genome_index_basename = options.genome_index | |
948 else: | |
949 genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0] | |
950 # Check whether an index file already exists: | |
951 if os.path.isfile(genome_index_basename+".stranded.index") : | |
952 if options.input: | |
953 print >> sys.stderr, "\nWarning: an index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify another path." %(genome_index_basename+".stranded.index") | |
954 else: | |
955 sys.exit("\nError: an index file named %s already exists. If you want to create a new index, please delete this file or specify another path.\n" %(genome_index_basename+".stranded.index")) | |
956 # otherwise -> action: the index has to be created | |
957 else : | |
958 make_index = True | |
959 # if the index does not have to be created: | |
960 if options.input: | |
961 # Required arguments: input, genome_index | |
962 if 'genome_index_basename' not in locals(): | |
963 required_arg(options.genome_index, "-g/--genome_index") | |
964 genome_index_basename = options.genome_index | |
965 required_arg(options.input, "-i/--input/--bam") | |
966 for i in xrange(0, len(options.input), 2): | |
967 try : | |
968 extension = os.path.splitext(options.input[i+1])[1] | |
969 if extension == ".bam" or extension == '.bedgraph' or extension == '.bg': | |
970 sys.exit("Error: it seems input files and associated labels are not correctly provided.\n\ | |
971 Make sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].") | |
972 except: | |
973 sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].") | |
974 | |
975 intersect_reads = True | |
976 # Check the input extension | |
977 #TODO | |
978 if not (options.counts or options.input or options.annotation): | |
979 sys.exit("\nError: some arguments are missing. At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n") | |
980 if not options.counts: | |
981 # Declare genome_index variables | |
982 stranded_genome_index = genome_index_basename+".stranded.index" | |
983 unstranded_genome_index = genome_index_basename+".unstranded.index" | |
984 if options.strandness[0] == "unstranded": | |
985 genome_index = unstranded_genome_index | |
986 else: | |
987 genome_index = stranded_genome_index | |
988 | |
989 | |
990 | |
991 #### Initialization of some variables | |
992 | |
993 # Initializing the category priority order, coding biotypes and the final list | |
994 prios = {'start_codon': 7, 'stop_codon': 7, 'five_prime_utr': 6, 'three_prime_utr': 6, 'UTR': 6, 'CDS': 5, 'exon': 4, 'transcript': 3, 'gene': 2, 'antisense': 1,'intergenic': 0} | |
995 | |
996 biotype_prios = None | |
997 #biotype_prios = {"protein_coding":1, "miRNA":2} | |
998 | |
999 # Possible groups of categories to plot | |
1000 categs_group1={'start': ['start_codon'], '5UTR': ['five_prime_utr','UTR'], 'CDS': ['CDS', 'exon'], '3UTR': ['three_prime_utr'], 'stop': ['stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} | |
1001 categs_group2={'5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon','start_codon','stop_codon'], '3UTR': ['three_prime_utr'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} | |
1002 categs_group3={'exons': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} | |
1003 categs_group4={'gene': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon', 'transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} | |
1004 categs_groups = [categs_group4,categs_group3,categs_group2,categs_group1] # Order and merging for the final plot | |
1005 cat_list = ['5UTR', 'start', 'CDS', 'stop', '3UTR', 'exons', 'introns', 'gene', 'intergenic', 'antisense'] | |
1006 | |
1007 | |
1008 # biotypes list | |
1009 biotypes ={'protein_coding','polymorphic_pseudogene','TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene','IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene',"3prime_overlapping_ncrna","lincRNA","macro_lncRNA","miRNA","misc_RNA","Mt_rRNA","Mt_tRNA","processed_transcript","ribozyme","rRNA","scaRNA","sense_intronic","sense_overlapping","snoRNA","snRNA","sRNA","TEC","vaultRNA","antisense","transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene","ncRNA","tRNA"} # Type: set (to access quickly) | |
1010 | |
1011 # Grouping of biotypes: | |
1012 biotypes_group1={'protein_coding':['protein_coding'],'pseudogenes':['polymorphic_pseudogene',"transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene"],'TR':['TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene'],'IG':['IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene'],\ | |
1013 'MT_RNA':["Mt_rRNA","Mt_tRNA"],\ | |
1014 'ncRNA':["lincRNA","macro_lncRNA","3prime_overlapping_ncrna","ncRNA"],\ | |
1015 "others":["misc_RNA","processed_transcript","ribozyme","scaRNA","sense_intronic","sense_overlapping","TEC","vaultRNA"], | |
1016 "antisense":["antisense"]} | |
1017 for biot in ["miRNA","snoRNA","snRNA","rRNA","sRNA","tRNA"]: | |
1018 biotypes_group1[biot]=[biot] | |
1019 | |
1020 # Initializing the unknown features list | |
1021 unknown_feature = [] | |
1022 | |
1023 # Initializing the genome category counter dict | |
1024 cpt_genome = {} | |
1025 | |
1026 | |
1027 if process_counts : | |
1028 #### If input files are the categories counts, just load them and continue to recategorization step | |
1029 cpt,cpt_genome,samples_names = read_counts_files(options.counts) | |
1030 else: | |
1031 #### Create genome index if needed and get the sizes of categories | |
1032 if make_index : | |
1033 #### Get the chromosome lengths | |
1034 lengths = get_chromosome_lengths(options) | |
1035 # Generating the genome index files if the user didn't provide them | |
1036 create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, lengths) | |
1037 | |
1038 | |
1039 #print '\nChr lengths:', lengths | |
1040 | |
1041 if intersect_reads: | |
1042 # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths | |
1043 if not make_index : | |
1044 print "\n### Reading genome indexes\n...\r", | |
1045 sys.stdout.flush() | |
1046 lengths={} | |
1047 with open(genome_index, 'r') as genome_index_file: | |
1048 for line in genome_index_file: | |
1049 if line[0] == "#": | |
1050 lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1]) | |
1051 else : | |
1052 add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2], biotype_prios = None, categ_prios = prios) | |
1053 | |
1054 #### Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals | |
1055 cpt_genome[('intergenic','intergenic')] = sum(lengths.itervalues()) - sum([v for x,v in cpt_genome.iteritems() if x != ('antisense','antisense')]) | |
1056 if not make_index : | |
1057 print "Done!" | |
1058 #print '\nGenome category counts:' | |
1059 #for key,val in cpt_genome.iteritems(): | |
1060 #print key,"\t",val | |
1061 | |
1062 | |
1063 #### Create the Bedgraph files if needed and get the files list | |
1064 | |
1065 if not options.bedgraph: | |
1066 # Generating the BEDGRAPH files if the user provided BAM file(s) and getting the sample labels (these names will be used in the plot legend) | |
1067 samples_files, samples_names = create_bedgraph_files(options.input,options.strandness[0]) | |
1068 else: | |
1069 # Just initialize the files list with the bedgraph paths | |
1070 samples_files = [options.input[i] for i in range(0,len(options.input),2)] | |
1071 # and get the labels | |
1072 samples_names = [options.input[i] for i in range(1,len(options.input),2)] | |
1073 #### Retrieving chromosome names saved in index | |
1074 chrom_list = get_chromosome_names_in_index(genome_index) | |
1075 #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and counting the number of aligned positions in each category | |
1076 cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, options.strandness[0], biotype_prios = None) | |
1077 | |
1078 #### Write the counts on disk | |
1079 write_counts_in_files(cpt,cpt_genome) | |
1080 | |
1081 if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False): | |
1082 quit("\n### End of program") | |
1083 print "\n### Generating plots" | |
1084 # Updating the biotypes lists (biotypes and 'biotype_group1'): adding the 'unknown biotypes' found in the gtf/index | |
1085 if unknown_feature == []: # 'unknown_feature' is defined only during the index generation | |
1086 # Browse the feature to determine whether some biotypes are 'unknown' | |
1087 for sample,counts in cpt.items(): | |
1088 for (cat,biot) in counts: | |
1089 if biot not in biotypes and biot not in unknown_feature: | |
1090 unknown_feature.append(biot) | |
1091 for new_biot in unknown_feature: | |
1092 biotypes.add(new_biot) | |
1093 biotypes_group1["others"].append(new_biot) | |
1094 biotypes = sorted(biotypes) | |
1095 # move antisense categ to the end of the list | |
1096 biotypes.remove('antisense') | |
1097 biotypes.append('antisense') | |
1098 biotypes_group1 = sorted(biotypes_group1) | |
1099 | |
1100 | |
1101 #print '\nCounts for every category/biotype pair: ',cpt | |
1102 | |
1103 # Generating plots | |
1104 if options.pdf != False: | |
1105 if options.pdf == None: | |
1106 options.pdf = "categories_plots.pdf" | |
1107 pdf = PdfPages(options.pdf) | |
1108 else: | |
1109 pdf = False | |
1110 | |
1111 selected_biotype = None | |
1112 if options.biotype_filter: | |
1113 options.biotype_filter = options.biotype_filter[0] | |
1114 for sample in cpt: | |
1115 for feature in cpt[sample]: | |
1116 biotype = feature[1] | |
1117 if options.biotype_filter.lower() == biotype.lower(): | |
1118 selected_biotype=biotype | |
1119 break | |
1120 if selected_biotype == None : | |
1121 print "\nError: biotype '"+options.biotype_filter+"' not found. Please check the biotype name and that this biotype exists in your sample(s)." | |
1122 sys.exit() | |
1123 | |
1124 #Print a warning message if the UTRs are not specified as 5' or 3' (they will be plotted as 5'UTR) | |
1125 if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]: | |
1126 print '''\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n''' | |
1127 | |
1128 #### Make the plot by categories | |
1129 #### Recategorizing with the final categories | |
1130 final_cats=categs_groups[options.categories_depth-1] | |
1131 final_cat_cpt,final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt,cpt_genome,final_cats,selected_biotype) | |
1132 #### Display the distribution of specified categories (or biotypes) in samples on a barplot | |
1133 # Remove the 'antisense' category if the library type is 'unstranded' | |
1134 for dic in cpt.values(): | |
1135 if ('antisense','antisense') in dic.keys(): break | |
1136 else: | |
1137 cat_list.remove('antisense') | |
1138 make_plot(cat_list,samples_names,final_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold, svg = options.svg, png = options.png) | |
1139 if selected_biotype : | |
1140 make_plot(cat_list,samples_names,filtered_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold,title="Categories distribution for '"+selected_biotype+"' biotype", svg = options.svg, png = options.png) | |
1141 | |
1142 #### Make the plot by biotypes | |
1143 #### Recategorizing with the final categories | |
1144 final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes) | |
1145 #### Display the distribution of specified categories (or biotypes) in samples on a barplot | |
1146 make_plot(biotypes,samples_names,final_cat_cpt,final_genome_cpt,pdf, "biotypes",options.threshold, svg = options.svg, png = options.png) | |
1147 | |
1148 | |
1149 | |
1150 ##### Recategorizing with the final categories | |
1151 #final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1) | |
1152 ##### Display the distribution of specified categories (or biotypes) in samples on a barplot | |
1153 #make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png) | |
1154 | |
1155 | |
1156 if options.pdf: | |
1157 pdf.close() | |
1158 print "\n### Plots saved in pdf file: %s" %options.pdf | |
1159 | |
1160 print "\n### End of program" |