#!/usr/bin/python
# -*- coding: utf-8 -*-

__author__ = 'noel & bahin'
''' <description> '''

import argparse
import os
import numpy
import sys
import subprocess
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.patheffects as PathEffects
import re
from matplotlib.backends.backend_pdf import PdfPages
# To correctly embed the text when saving plots in SVG format
import matplotlib
matplotlib.rcParams['svg.fonttype'] = 'none'

##########################################################################
#                               FUNCTIONS                                #
##########################################################################

def init_dict(d, key, init):
    if key not in d:
        d[key] = init

def get_chromosome_lengths(args):
    """
    Parse the file containing the chromosome lengths.
    If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes (using the end of the last annotation of each chromosome).
    """
    lengths = {}
    gtf_chrom_names = set()
    force_get_lengths = False
    # If the user provided the chromosome length file
    if args.chr_len:
        with open(args.chr_len, 'r') as chr_len_file:
            for line in chr_len_file:
                lengths[line.split('\t')[0]] = int(line.rstrip().split('\t')[1])
        with open(args.annotation, 'r') as gtf_file:
            for line in gtf_file:
                if not line.startswith('#'):
                    chrom = line.split('\t')[0]
                    if chrom not in gtf_chrom_names:
                        gtf_chrom_names.add(chrom)
        for chrom in lengths.keys():
            if chrom not in gtf_chrom_names:
                print "Warning: at least one chromosome name ('" + chrom + "') of the file '" + args.chr_len + "' does not match any chromosome name in the GTF and was ignored."
                #del lengths[chrom]
                break
        for chrom in gtf_chrom_names:
            if force_get_lengths:
                break
            if chrom not in lengths.keys():
                print "WARNING: chromosome name '" + chrom + "' was found in the GTF and does not match any chromosome name provided in", args.chr_len + ". "
                print "\t=> The chromosome lengths will be approximated using annotations in the GTF file."
                continue_value = ""
                while continue_value not in {"yes", "y", "no", "n"}:
                    continue_value = raw_input("\tDo you want to continue ('yes' or 'y')?\n\tElse write 'no' or 'n' to exit the script and check your file of lengths.\n")
                    if continue_value == "no" or continue_value == "n":
                        sys.exit("Exiting")
                    elif continue_value == "yes" or continue_value == "y":
                        force_get_lengths = True
                        break
                    print "Error: use 'yes/y/no/n' only"
        if not force_get_lengths:
            return lengths
    # Otherwise (or if at least one chromosome was missing in the chromosome lengths file), the end of the last annotation of the chromosome in the GTF file is considered as the chromosome length
    with open(args.annotation, 'r') as gtf_file:
        for line in gtf_file:
            if not line.startswith('#'):
                chrom = line.split('\t')[0]
                end = int(line.split('\t')[4])
                init_dict(lengths, chrom, 0)
                lengths[chrom] = max(lengths[chrom], end)
    if force_get_lengths:
        print "The chromosome lengths have been approximated using the last annotations in the GTF file."
    return lengths

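# Illustrative sketch (not taken from the original source): the --chr_len file parsed
# above is assumed to be a two-column, tab-separated table "chromosome_name<TAB>length",
# for instance:
#   1    248956422
#   MT   16569
# The chromosome names and lengths shown here are hypothetical examples.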
def write_feature_on_index(feat, chrom, start, stop, sign, stranded_genome_index, unstranded_genome_index=None):
    grouped_by_biotype_features = []
    for biotype, categs in feat.iteritems():
        categ_list = []
        for cat in set(categs):
            categ_list.append(cat)
        grouped_by_biotype_features.append(":".join((str(biotype), ",".join(categ_list))))
    stranded_genome_index.write('\t'.join((chrom, start, stop, sign, '')) + '\t'.join(grouped_by_biotype_features) + '\n')
    if unstranded_genome_index:
        unstranded_genome_index.write('\t'.join((chrom, start, stop, '.', '')) + '\t'.join(grouped_by_biotype_features) + '\n')

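# Illustrative sketch of the line layout written above (inferred from the code, not from
# the original documentation): each genome index line is tab-separated as
#   chrom  start  stop  strand  biotype1:cat1,cat2  biotype2:cat3 ...
# e.g. "1<TAB>11868<TAB>12227<TAB>+<TAB>protein_coding:exon,transcript"
# (the coordinates and biotype in this example are hypothetical).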
def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index=None, biotype_prios=None, coverage=1, categ_prios=None):
    """
    From an annotated genomic interval (start/stop positions and associated features: one or more category/biotype pair(s)):
    - if an index file is provided, write the information at the end of the index file being generated;
    - else, browse the features and update the counts of categories/biotypes found in the genome.
    """
    ## Writing in the file if it was provided
    if stranded_genome_index:
        unstranded_features = None
        # If only one strand has a feature, this feature will directly be written on the unstranded index
        if feat_values[0] == {}:
            # An interval with no feature on the plus strand corresponds to a region annotated only on the reverse strand: update the 'antisense' counts
            stranded_genome_index.write('\t'.join((chrom, start, stop, '+', 'antisense\n')))
            write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index, unstranded_genome_index)
        else:
            if feat_values[1] == {}:
                write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index, unstranded_genome_index)
                stranded_genome_index.write('\t'.join((chrom, start, stop, '-', 'antisense\n')))
            # Else, the unstranded index should contain the union of plus and minus features
            else:
                write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
                write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
                unstranded_feat = dict(feat_values[0], **feat_values[1])
                for name in set(feat_values[0]) & set(feat_values[1]):
                    unstranded_feat[name] += feat_values[0][name]
                write_feature_on_index(unstranded_feat, chrom, start, stop, '.', unstranded_genome_index)

    ## Increasing category counter(s)
    else:
        # Default behavior if no biotype priorities: the category with the highest priority for each found biotype has the same weight (1/n_biotypes)
        if not biotype_prios:
            nb_feat = len(feat_values)
            # For every category(s)/biotype pair
            for feat in feat_values:
                cur_prio = 0
                selected_categ = []
                # Separate category(ies) and biotype
                try:
                    biot, cats = feat.split(":")
                # If the feature equals "antisense" (no ':' separator): update the 'antisense/antisense' counts
                except ValueError:
                    try:
                        cpt[(feat, feat)] += (int(stop) - int(start)) * coverage / float(nb_feat)
                    except KeyError:
                        cpt[(feat, feat)] = (int(stop) - int(start)) * coverage / float(nb_feat)
                    return
                # Browse the categories and keep only the one(s) with the highest priority
                for cat in cats.split(','):
                    try:
                        prio = prios[cat]
                    except KeyError:
                        #TODO Find a way to add unknown categories
                        if cat not in unknown_feature:
                            print >> sys.stderr, "Warning: unknown category '%s' found and ignored\n...\r" % cat,
                            unknown_feature.append(cat)
                        continue
                    if prio > cur_prio:
                        cur_prio = prio
                        selected_categ = [cat]
                    if prio == cur_prio:
                        if cat != selected_categ:
                            try:
                                if cat not in selected_categ:
                                    selected_categ.append(cat)
                            except TypeError:
                                selected_categ = [selected_categ, cat]
                # Increase each count by the coverage divided by the number of categories and biotypes
                nb_cats = len(selected_categ)
                for cat in selected_categ:
                    try:
                        cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
                    except KeyError:
                        cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
                #else:
                    #cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage
        # Else, apply biotype selection according to the priority set
        else:
            #TODO Add an option to pass biotype priorities and handle it
            pass

def print_chrom(features_dict, chrom, stranded_index_file, unstranded_index_file, cpt_genome):
    with open(unstranded_index_file, 'a') as findex, open(stranded_index_file, 'a') as fstrandedindex:
        # Initialization of variables: start position of the first interval and associated features for +/- strands
        start = ""
        for pos in sorted(features_dict['+'].keys()):
            if start != "":
                add_info(cpt_genome, [features_plus, features_minus], str(start), str(pos), chrom, stranded_genome_index=fstrandedindex, unstranded_genome_index=findex)
            start = pos
            features_plus = features_dict['+'][pos]
            features_minus = features_dict['-'][pos]

def create_genome_index(annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, chrom_sizes):
    ''' Create an index of the genome annotations and save it in a file '''
    print '\n### Generating genome indexes\n',
    sys.stdout.flush()
    # Initializations
    intervals_dict = 0
    max_value = -1
    prev_chrom = ''
    reverse_strand = {'+': '-', '-': '+'}
    i = 0  # Line counter
    # Write the chromosome lengths as comment lines before the genome index
    with open(unstranded_genome_index, 'w') as findex, open(stranded_genome_index, 'w') as fstrandedindex:
        for key, value in chrom_sizes.items():
            findex.write("#%s\t%s\n" % (key, value))
            fstrandedindex.write("#%s\t%s\n" % (key, value))
    # Running through the GTF file and writing into the genome index files
    with open(annotation, 'r') as gtf_file:
        for line in gtf_file:
            # Print messages after X processed lines
            i += 1
            if i % 100000 == 0:
                print >> sys.stderr, '\r%s lines processed...' % str(i)
                print '\r                          \r. . .',
                sys.stdout.flush()
            elif i % 20000 == 0:
                print '\r                          \r. . .',
                sys.stdout.flush()
            elif i % 2000 == 0:
                print '.',
                sys.stdout.flush()
            # Processing lines except comment ones
            if not line.startswith('#'):
                # Getting the line info
                line_split = line[:-1].split('\t')
                chrom = line_split[0]
                cat = line_split[2]
                start = int(line_split[3]) - 1
                stop = int(line_split[4])
                strand = line_split[6]
                antisense = reverse_strand[strand]
                biotype = line_split[8].split('biotype')[1].split(';')[0].strip('" ')
                feat = [(cat, biotype)]
                # Registering chromosome info in the genome index file if this is a new chromosome or an annotation not overlapping previously recorded features
                if start > max_value or chrom != prev_chrom:
                    # Write the previous features
                    if intervals_dict != 0:
                        if chrom != prev_chrom:
                            print_chrom(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
                            print "\rChromosome '" + prev_chrom + "' registered."
                        else:
                            print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
                    prev_chrom = chrom
                    # (Re)Initializing the chromosome lists
                    intervals_dict = {strand: {start: {biotype: [cat]}, stop: {}}, antisense: {start: {}, stop: {}}}
                    max_value = stop

                # Update the dictionary which represents intervals for every distinct annotation
                else:
                    # Get intervals on the same strand as the current feature
                    stranded_intervals = intervals_dict[strand]
                    start_added = False
                    for pos in sorted(stranded_intervals.keys()):
                        # While the position is below the feature's interval, store the features
                        if pos < start:
                            cur_cat_dict = dict(stranded_intervals[pos])
                            cur_antisense_dict = dict(intervals_dict[antisense][pos])

                        # If the start position already exists: update it by adding the new feature
                        elif pos == start:
                            cur_cat_dict = dict(stranded_intervals[pos])
                            cur_antisense_dict = dict(intervals_dict[antisense][pos])
                            try:
                                stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat]
                            except KeyError:
                                stranded_intervals[pos][biotype] = [cat]
                            start_added = True

                        elif pos > start:
                            # Create a new entry for the start position if necessary
                            if not start_added:
                                stranded_intervals[start] = dict(cur_cat_dict)
                                stranded_intervals[start][biotype] = [cat]
                                intervals_dict[antisense][start] = cur_antisense_dict
                                start_added = True
                            # While the position is below the stop, just add the new feature
                            if pos < stop:
                                cur_cat_dict = dict(stranded_intervals[pos])
                                cur_antisense_dict = dict(intervals_dict[antisense][pos])
                                try:
                                    stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat]
                                except KeyError:
                                    stranded_intervals[pos][biotype] = [cat]
                            # Close the created interval: create an entry at the stop position and restore the features
                            elif pos > stop:
                                stranded_intervals[stop] = dict(cur_cat_dict)
                                intervals_dict[antisense][stop] = cur_antisense_dict
                                break
                            else:
                                break
                    # Extend the dictionary if necessary
                    if stop > max_value:
                        max_value = stop
                        stranded_intervals[stop] = {}
                        intervals_dict[antisense][stop] = {}

    # Store the categories of the last chromosome
    print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
    print "\rChromosome '" + prev_chrom + "' registered.\nDone!"

def create_bedgraph_files(bams, strandness):
    samples_files = []
    labels = []
    print "\n### Generating the bedgraph files"
    for n in range(0, len(bams), 2):
        print "\rProcessing '%s'\n..." % bams[n],
        sys.stdout.flush()
        # Get the label for this sample
        label = bams[n + 1]
        # Modify it to contain only alphanumeric characters (avoid generating files with unsafe names)
        modified_label = "_".join(re.findall(r"[\w']+", label))
        if strandness in ["reverse", "fr-secondstrand"]:
            subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
            subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
        elif strandness in ["forward", "fr-firststrand"]:
            subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
            subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
        else:
            subprocess.call('bedtools genomecov -bg -split -ibam ' + bams[n] + ' > ' + modified_label + '.bedgraph', shell=True)
        samples_files.append(modified_label)
        labels.append(label)
    print "\rDone!"
    return samples_files, labels

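# Example of a command generated above for a 'reverse'/'fr-secondstrand' library (the BAM
# path and label below are hypothetical placeholders):
#   bedtools genomecov -bg -split -strand - -ibam sample1.bam > sample1.plus.bedgraph
# This assumes 'bedtools' is available on the PATH, as the subprocess calls above already require.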
def read_gtf(gtf_index_file, sign):
    global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
    strand = ""
    while strand != sign:
        gtf_line = gtf_index_file.readline()
        # If the GTF index file is finished
        if not gtf_line:
            endGTF = True
            return endGTF
        splitline = gtf_line.rstrip().split('\t')
        try:
            strand = splitline[3]
        # The strand information cannot be found in the file header
        except IndexError:
            pass
    gtf_chrom = splitline[0]
    gtf_features = splitline[4:]
    gtf_start, gtf_stop = map(int, splitline[1:3])
    return endGTF

def read_counts_files(counts_files):
    cpt = {}
    cpt_genome = {}
    labels = []
    for fcounts in counts_files:
        label = os.path.splitext(os.path.basename(fcounts))[0]
        labels.append(label)
        cpt[label] = {}
        with open(fcounts, "r") as f:
            for line in f:
                if line[0] == "#":
                    continue
                line_split = line[:-1].split('\t')
                feature = tuple(line_split[0].split(','))
                cpt[label][feature] = float(line_split[1])
                cpt_genome[feature] = float(line_split[2])
    return cpt, cpt_genome, labels

def get_chromosome_names_in_index(genome_index):
    chrom_list = []
    with open(genome_index, 'r') as findex:
        chrom = ""
        for line in findex:
            cur_chrom = line.split('\t')[0]
            if cur_chrom == chrom:
                pass
            else:
                chrom = cur_chrom
                if chrom not in chrom_list:
                    chrom_list.append(chrom)
    return set(chrom_list)

def intersect_bedgraphs_and_index_to_counts_categories(samples_files, samples_names, prios, genome_index, strandness, biotype_prios=None):
    global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF
    print "\n### Intersecting files with indexes"
    unknown_chrom = []
    cpt = {}  # Counter for the nucleotides in the BAM input file(s)
    for n in range(len(samples_files)):
        sample_file = samples_files[n]
        sample_name = samples_names[n]
        # Initializing the category counter dict for this sample
        init_dict(cpt, sample_name, {})
        if strandness == "unstranded":
            strands = [("", ".")]
        else:
            strands = [('.plus', '+'), ('.minus', '-')]

        # Intersecting the BEDGRAPH and genome index files
        print "\rProcessing '%s'\n. . ." % sample_file,
        sys.stdout.flush()

        for strand, sign in strands:
            prev_chrom = ''
            endGTF = False  # Reaching the next chromosome or the end of the GTF index
            intergenic_adds = 0.0
            i = 0
            i_chgt = 0
            with open(sample_file + strand + '.bedgraph', 'r') as bam_count_file:
                # Running through the BEDGRAPH file
                for bam_line in bam_count_file:
                    i += 1
                    if i % 10000 == 0:
                        print ".",
                        sys.stdout.flush()
                    if i % 100000 == 0:
                        print "\r                          \r. . .",
                        sys.stdout.flush()
                    # Getting the BAM line info
                    bam_chrom = bam_line.split('\t')[0]
                    bam_start, bam_stop, bam_cpt = map(float, bam_line.split('\t')[1:4])
                    # Skip the line if the chromosome is not in the index
                    if bam_chrom not in chrom_list:
                        if bam_chrom not in unknown_chrom:
                            unknown_chrom.append(bam_chrom)
                            print "\r                          \r Chromosome '" + bam_chrom + "' not found in index."
                        continue
                    # If this is a new chromosome (or the first one)
                    if bam_chrom != prev_chrom:
                        i_chgt = i
                        intergenic_adds = 0.0
                        # (Re)opening the GTF index and looking for the first line of the matching chromosome
                        try:
                            gtf_index_file.close()
                        except UnboundLocalError:
                            pass
                        gtf_index_file = open(genome_index, 'r')
                        endGTF = False
                        read_gtf(gtf_index_file, sign)
                        while bam_chrom != gtf_chrom:
                            read_gtf(gtf_index_file, sign)
                            if endGTF:
                                break
                        prev_chrom = bam_chrom

                    # Looking for the first matching annotation in the GTF index
                    while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop):
                        read_gtf(gtf_index_file, sign)
                        if gtf_chrom != bam_chrom:
                            endGTF = True
                    # Processing BAM lines before the first GTF annotation if there are any
                    if bam_start < gtf_start:
                        # Increase the 'intergenic' category counter with all or part of the BAM interval
                        try:
                            intergenic_adds += min(bam_stop, gtf_start) - bam_start
                            cpt[sample_name][('intergenic', 'intergenic')] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt
                        except KeyError:
                            cpt[sample_name][('intergenic', 'intergenic')] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt
                        # Go to the next line if the BAM interval was entirely upstream of the first GTF annotation, carry on with the remaining part otherwise
                        if endGTF or (bam_stop <= gtf_start):
                            continue
                        else:
                            bam_start = gtf_start

                    # We can start the crossover
                    while not endGTF:
                        # Update the category counter
                        add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt, categ_prios=prios)
                        # Read the next GTF file line if the BAM line is not entirely covered
                        if bam_stop > gtf_stop:
                            # Update the BAM start pointer
                            bam_start = gtf_stop
                            endGTF = read_gtf(gtf_index_file, sign)
                            # If we read a new chromosome in the GTF file then it is considered finished
                            if bam_chrom != gtf_chrom:
                                endGTF = True
                            if endGTF:
                                break
                        else:
                            # Next if the BAM line is entirely covered
                            bam_start = bam_stop
                            break

                    # Processing the end of the BAM line if necessary
                    if endGTF and (bam_stop > bam_start):
                        try:
                            cpt[sample_name][('intergenic', 'intergenic')] += (bam_stop - bam_start) * bam_cpt
                        except KeyError:
                            cpt[sample_name][('intergenic', 'intergenic')] = (bam_stop - bam_start) * bam_cpt
            gtf_index_file.close()
    print "\r                                           \rDone!"
    return cpt

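# Worked example of the crossover above (hypothetical numbers): a bedgraph interval
# 100-250 with coverage 2 crossing index intervals 100-180 (CDS) and 180-300 (intron)
# adds (180-100)*2 to the CDS counter via add_info(), then moves bam_start to 180 and
# adds (250-180)*2 to the intron counter, i.e. the counts are expressed in covered nucleotides.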
def write_counts_in_files(cpt, genome_counts):
    for label, dico in cpt.items():
        label = "_".join(re.findall(r"[\w']+", label))
        with open(label + ".categories_counts", "w") as fout:
            fout.write("#Category,biotype\tCounts_in_bam\tSize_in_genome\n")
            for feature, counts in dico.items():
                fout.write("%s\t%s\t%s\n" % (','.join(feature), counts, genome_counts[feature]))

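# Illustrative sketch of a <label>.categories_counts file written above (the header is the
# one written by this function; the count values are hypothetical):
#   #Category,biotype<TAB>Counts_in_bam<TAB>Size_in_genome
#   CDS,protein_coding<TAB>1234567.0<TAB>33302023
#   intergenic,intergenic<TAB>890123.0<TAB>1312577047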
def recategorize_the_counts(cpt, cpt_genome, final):
    final_cat_cpt = {}
    final_genome_cpt = {}
    for f in cpt:
        #print "\nFinal categories for",f,"sample"
        final_cat_cpt[f] = {}
        for final_cat in final:
            tot = 0
            tot_genome = 0
            for cat in final[final_cat]:
                tot += cpt[f][cat]
                tot_genome += cpt_genome[cat]
            #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
            #print '\t'.join((final_cat, str(tot)))
            final_cat_cpt[f][final_cat] = tot
            final_genome_cpt[final_cat] = tot_genome
    return final_cat_cpt, final_genome_cpt

def group_counts_by_categ(cpt, cpt_genome, final, selected_biotype):
    final_cat_cpt = {}
    final_genome_cpt = {}
    filtered_cat_cpt = {}
    for f in cpt:
        final_cat_cpt[f] = {}
        filtered_cat_cpt[f] = {}
        for final_cat in final:
            tot = 0
            tot_filter = 0
            tot_genome = 0
            for cat in final[final_cat]:
                for key, value in cpt[f].items():
                    if key[0] == cat:
                        tot += value
                        tot_genome += cpt_genome[key]
                        if key[1] == selected_biotype:
                            tot_filter += value
            #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
            #print '\t'.join((final_cat, str(tot)))
            final_cat_cpt[f][final_cat] = tot
            if tot_genome == 0:
                final_genome_cpt[final_cat] = 1e-100
            else:
                final_genome_cpt[final_cat] = tot_genome
            filtered_cat_cpt[f][final_cat] = tot_filter
    #if 'antisense' in final_genome_cpt: final_genome_cpt['antisense'] = 0
    return final_cat_cpt, final_genome_cpt, filtered_cat_cpt

def group_counts_by_biotype(cpt, cpt_genome, biotypes):
    final_cpt = {}
    final_genome_cpt = {}
    for f in cpt:
        final_cpt[f] = {}
        for biot in biotypes:
            tot = 0
            tot_genome = 0
            try:
                # 'biotypes' is a dict of biotype groups: sum the counts of every member
                for final_biot in biotypes[biot]:
                    for key, value in cpt[f].items():
                        if key[1] == final_biot:
                            tot += value
                            #if key[1] != 'antisense':
                            tot_genome += cpt_genome[key]
            except:
                # 'biotypes' is a plain set: count the biotype itself
                for key, value in cpt[f].items():
                    if key[1] == biot:
                        tot += value
                        tot_genome += cpt_genome[key]
            if tot != 0:
                final_cpt[f][biot] = tot
                final_genome_cpt[biot] = tot_genome
    return final_cpt, final_genome_cpt

#def get_cmap(N):
    #'''Returns a function that maps each index in 0, 1, ..., N-1 to a distinct RGB color.'''
    #color_norm = colors.Normalize(vmin=0, vmax=N-1)
    #scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
    #def map_index_to_rgb_color(index):
        #return scalar_map.to_rgba(index)
    #return map_index_to_rgb_color

def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type, title):
    ### Initialization
    fig = plt.figure(figsize=(13, 9))
    ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)
    ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=2)
    cmap = plt.get_cmap('Spectral')
    cols = [cmap(x) for x in xrange(0, 256, 256 / n_cat)]
    if title:
        ax1.set_title(title + " in: %s" % samples_names[0])
    else:
        ax1.set_title(counts_type + " distribution in mapped reads in: %s" % samples_names[0])
    ax2.set_title('Normalized counts of ' + counts_type)

    ### Barplots
    # First barplot: percentage of reads in each category
    ax1.bar(index, percentages, bar_width, color=cols)
    # Second barplot: enrichment relative to the genome for each category
    # (the read count in a category is divided by the category size in the genome)
    ax2.bar(index_enrichment, enrichment, bar_width, color=cols)
    ### Piecharts
    pielabels = [ordered_categs[i] if percentages[i] > 0.025 else '' for i in xrange(n_cat)]
    sum_enrichment = numpy.sum(enrichment)
    pielabels_enrichment = [ordered_categs[i] if enrichment[i] / sum_enrichment > 0.025 else '' for i in xrange(n_cat)]
    # Categories piechart
    ax3 = plt.subplot2grid((2, 4), (0, 2))
    pie_wedge_collection, texts = ax3.pie(percentages, labels=pielabels, shadow=True, colors=cols)
    # Enrichment piechart
    ax4 = plt.subplot2grid((2, 4), (1, 2))
    pie_wedge_collection, texts = ax4.pie(enrichment, labels=pielabels_enrichment, shadow=True, colors=cols)
    # Add legends (append percentages to labels)
    labels = [" ".join((ordered_categs[i], "({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
    ax3.legend(pie_wedge_collection, labels, loc='center', fancybox=True, shadow=True, prop={'size': 'medium'}, bbox_to_anchor=(1.7, 0.5))
    labels = [" ".join((ordered_categs[i], "({:.1%})".format(enrichment[i] / sum_enrichment))) for i in range(len(ordered_categs))]  # if ordered_categs[i] != 'antisense']
    ax4.legend(pie_wedge_collection, labels, loc='center', fancybox=True, shadow=True, prop={'size': 'medium'}, bbox_to_anchor=(1.7, 0.5))
    # Set the aspect ratio to be equal so that each pie is drawn as a circle
    ax3.set_aspect('equal')
    ax4.set_aspect('equal')
    return fig, ax1, ax2

def make_plot(ordered_categs, samples_names, categ_counts, genome_counts, pdf, counts_type, threshold, title=None, svg=None, png=None):
    # From ordered_categs, keep only the features (categories or biotypes) that are found in at least one sample.
    existing_categs = set()
    for sample in categ_counts.values():
        existing_categs |= set(sample.keys())
    ordered_categs = filter(existing_categs.__contains__, ordered_categs)
    n_cat = len(ordered_categs)
    n_exp = len(samples_names)
    ## Initialization of the matrix of counts (nrow=nb_experiments, ncol=nb_categories)
    counts = numpy.matrix(numpy.zeros(shape=(n_exp, n_cat)))
    for exp in xrange(len(samples_names)):
        for cat in xrange(len(ordered_categs)):
            try:
                counts[exp, cat] = categ_counts[samples_names[exp]][ordered_categs[cat]]
            except KeyError:
                pass

    ## Normalize the category sizes by the total size to get percentages
    sizes = []
    sizes_sum = 0
    for cat in ordered_categs:
        sizes.append(genome_counts[cat])
        sizes_sum += genome_counts[cat]
    if 'antisense' in ordered_categs:
        antisense_pos = ordered_categs.index('antisense')
        sizes[antisense_pos] = 1e-100
    for cpt in xrange(len(sizes)):
        sizes[cpt] /= float(sizes_sum)

    ## Create an array which contains the percentage of reads in each category for every sample
    percentages = numpy.array(counts / numpy.sum(counts, axis=1))
    ## Create the enrichment array (counts divided by the category sizes in the genome)
    enrichment = numpy.array(percentages / sizes)
    if 'antisense_pos' in locals():
        for i in xrange(len(samples_names)):
            enrichment[i][antisense_pos] = 0
    #enrichment=numpy.log(numpy.array(percentages/sizes))
    for exp in xrange(n_exp):
        for i in xrange(n_cat):
            val = enrichment[exp][i]
            if val > 1:
                enrichment[exp][i] = val - 1
            elif val == 1 or val == 0:
                enrichment[exp][i] = 0
            else:
                enrichment[exp][i] = -1 / val + 1

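    # Note on the rescaling above (a reading aid, not part of the original code): raw
    # enrichment ratios are mapped onto a symmetric scale around 0, e.g. a category covered
    # twice as much as expected (ratio 2) becomes +1 and one covered half as much (ratio 0.5)
    # becomes -1, which matches the "2^{+1}" / "2^{-1}"-style tick labels set further down.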
    #### Finally, produce the plot

    ## Get the colors from the colormap
    ncolor = 16
    cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5", "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
    if n_exp > ncolor:
        cmap = plt.get_cmap('Set3', n_exp)
        cmap = [cmap(i) for i in xrange(n_exp)]

    ## Parameters for the plot
    opacity = 1
    # Create a vector which contains the position of each bar
    index = numpy.arange(n_cat)
    # Size of the bars (depends on the number of categories)
    bar_width = 0.9 / n_exp

    ## Initialise the subplot
    # If there is only one sample, also plot piecharts
    #if n_exp == 1 and counts_type.lower() == 'categories':
        #fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
    ## If more than one sample
    #else:
    fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
    # Store the bar objects for the enrichment plot
    rects = []
    # For each sample/experiment
    for i in range(n_exp):
        # First barplot: percentage of reads in each category
        ax1.bar(index + i * bar_width, percentages[i], bar_width,
                alpha=opacity,
                color=cmap[i],
                label=samples_names[i], edgecolor='#FFFFFF', lw=0)
        # Second barplot: enrichment relative to the genome for each category
        # (the read count in a category is divided by the category size in the genome)
        rects.append(ax2.bar(index + i * bar_width, enrichment[i], bar_width,
                             alpha=opacity,
                             color=cmap[i],
                             label=samples_names[i], edgecolor=cmap[i], lw=0))

    ## Graphical options for the plot
    # Adding the legend
    ax1.legend(loc='best', frameon=False)
    #ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
    # Main titles
    if title:
        ax1.set_title(title)
    else:
        ax1.set_title(counts_type + " distribution in mapped reads")
    ax2.set_title('Normalized counts of ' + counts_type)

    # Adding the enrichment baseline
    #ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
    # Axes limits
    ax1.set_xlim(-0.1, len(ordered_categs) + 0.1)
    if len(sizes) == 1:
        ax1.set_xlim(-2, 3)
    ax2.set_xlim(ax1.get_xlim())
    # Set axis limits (max/min values + 5% margin)
    ax2_ymin = []
    ax2_ymax = []
    for sample_values in enrichment:
        ax2_ymin.append(min(sample_values))
        ax2_ymax.append(max(sample_values))
    ax2_ymax = max(ax2_ymax)
    ax2_ymin = min(ax2_ymin)
    margin_top, margin_bottom = (abs(0.05 * ax2_ymax), abs(0.05 * ax2_ymin))
    ax1.set_ylim(0, ax1.get_ylim()[1] * 1.05)
    if threshold:
        threshold_bottom = -abs(float(threshold[0])) + 1
        threshold_top = float(threshold[1]) - 1

        for i in xrange(n_exp):
            for y in xrange(n_cat):
                val = enrichment[i][y]
                if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top):
                    rect = rects[i][y]
                    rect_height = rect.get_height()
                    if rect.get_y() < 0:
                        diff = rect_height + threshold_bottom
                        rect.set_y(threshold_bottom)
                        ax2_ymin = threshold_bottom
                        margin_bottom = 0
                    else:
                        diff = rect_height - threshold_top
                        ax2_ymax = threshold_top
                        margin_top = 0
                    rect.set_height(rect.get_height() - diff)
    if margin_top != 0 and margin_bottom != 0:
        margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)]
    ax2.set_ylim(ax2_ymin - margin_bottom, ax2_ymax + margin_top)
    # Y axis titles
    ax1.set_ylabel('Proportion of reads (%)')
    ax2.set_ylabel('Enrichment relative to genome')
    # X axis titles
    ax1.set_xlabel(counts_type)
    ax2.set_xlabel(counts_type)
    # Add the categories on the x-axis
    ax1.set_xticks(index + bar_width * n_exp / 2)
    ax2.set_xticks(index + bar_width * n_exp / 2)
    if counts_type.lower() != 'categories':
        ax1.set_xticklabels(ordered_categs, rotation='30', ha='right')
        ax2.set_xticklabels(ordered_categs, rotation='30', ha='right')
    else:
        ax1.set_xticklabels(ordered_categs)
        ax2.set_xticklabels(ordered_categs)
    # Display fraction values as percentages
    ax1.set_yticklabels([str(int(i * 100)) for i in ax1.get_yticks()])
    # Correct the y-axis tick labels for the enrichment subplot
    #ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
    yticks = list(ax2.get_yticks())
    yticks = [yticks[i] - 1 if yticks[i] > 9 else yticks[i] + 1 if yticks[i] < -9 else yticks[i] for i in xrange(len(yticks))]
    ax2.set_yticks(yticks)
    ax2.set_yticklabels([str(int(i + 1)) + "$^{+1}$" if i > 0 and i % 1 == 0 else str(i + 1) + "$^{+1}$" if i > 0 else 1 if i == 0 else str(int(-(i - 1))) + "$^{-1}$" if i < 0 and i % 1 == 0 else str(-(i - 1)) + "$^{-1}$" for i in ax2.get_yticks()])
    #ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()])
    # Change the appearance of the 'antisense' bars on the enrichment plot since no enrichment can be computed for this artificial category
    if 'antisense_pos' in locals():  #ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA')
        for i in xrange(n_exp):
            rect = rects[i][antisense_pos]
            rect.set_y(ax2.get_ylim()[0])
            rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0])
            rect.set_hatch('/')
            rect.set_fill(False)
            rect.set_linewidth(0)
            #rect.set_color('lightgrey')
            #rect.set_edgecolor('#EDEDED')
            rect.set_color('#EDEDED')
        ax2.text(index[antisense_pos] + bar_width * n_exp / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, 'NA')
    # Add text for features absent in a sample
    for i in xrange(n_exp):
        for y in xrange(n_cat):
            if percentages[i][y] == 0:
                txt = ax1.text(y + bar_width * (i + 0.5), 0.02, 'Absent in sample', rotation='vertical', color=cmap[i], horizontalalignment='center', verticalalignment='bottom')
                txt.set_path_effects([PathEffects.Stroke(linewidth=0.5), PathEffects.Normal()])
            elif enrichment[i][y] == 0:
                rects[i][y].set_linewidth(1)

    # Remove top/right/bottom axes
    for ax in [ax1, ax2]:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        ax.tick_params(axis='x', which='both', bottom='on', top='off', labelbottom='on')
        ax.tick_params(axis='y', which='both', left='on', right='off', labelleft='on')

    #ax1.legend(loc='center right', bbox_to_anchor=(1.2, 0),fancybox=True, shadow=True)
    ## Showing the plot
    plt.tight_layout()
    fig.subplots_adjust(wspace=0.2)
    if pdf:
        pdf.savefig()
        plt.close()
    elif svg:
        if svg == True:
            plt.savefig(counts_type + '.svg')
        else:
            if os.path.splitext(svg)[1] == '.svg':
                plt.savefig('.'.join((os.path.splitext(svg)[0], counts_type, 'svg')))
            else:
                plt.savefig('.'.join((svg, counts_type, 'svg')))
    elif png:
        if png == True:
            plt.savefig(counts_type + '.png')
        else:
            if os.path.splitext(png)[1] == '.png':
                plt.savefig('.'.join((os.path.splitext(png)[0], counts_type, 'png')))
            else:
                plt.savefig('.'.join((png, counts_type, 'png')))
    else:
        plt.show()
    ## Save it on disk as a png image
    #fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight')

def filter_categs_on_biotype(selected_biotype, cpt):
    filtered_cpt = {}
    for sample in cpt:
        filtered_cpt[sample] = {}
        for feature, count in cpt[sample].items():
            if feature[1] == selected_biotype:
                filtered_cpt[sample][feature[0]] = count
    return filtered_cpt

##########################################################################
#                                  MAIN                                  #
##########################################################################

def usage_message(name=None):
    return '''
    Generate genome indexes:
         python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
                        [--chr_len CHR_LENGTHS_FILE]
    Process BAM file(s):
         python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...]
                        [--bedgraph] [-s STRAND]
                        [-n] [--pdf output.pdf]
                        [-d {1,2,3,4}] [-t YMIN YMAX]
    Index genome + process BAM:
         python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
                        -i BAM1 LABEL1 [BAM2 LABEL2 ...]
                        [--chr_len CHR_LENGTHS_FILE]
                        [--bedgraph] [-s STRAND]
                        [-n] [--pdf output.pdf]
                        [-d {1,2,3,4}] [-t YMIN YMAX]

    Process previously created ALFA counts file(s):
         python ALFA.py -c COUNTS1 [COUNTS2 ...]
                        [-s STRAND]
                        [-n] [--pdf output.pdf]
                        [-d {1,2,3,4}] [-t YMIN YMAX]

    '''


#### Parse command line arguments and store them in 'options'
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message())
parser.add_argument('--version', action='version', version='version 1.0', help="show program's version number and exit\n\n-----------\n\n")
# Index options
parser.add_argument('-g', '--genome_index', help="Genome index files path and basename for an existing index, or path and basename for a new index creation\n\n")
parser.add_argument('-a', '--annotation', metavar="GTF_FILE", help='Genomic annotations file (GTF format)\n\n')
parser.add_argument('--chr_len', help='Tabulated file containing chromosome names and lengths\n\n-----------\n\n')

# Intersection step options
parser.add_argument('-i', '--input', '--bam', dest='input', metavar=('BAM_FILE1 LABEL1', ""), nargs='+', help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n')
parser.add_argument('--bedgraph', action='store_const', default=False, const=True, help="Use this option if your input file(s) is(are) already in bedgraph format\n\n")
parser.add_argument('-c', '--counts', metavar=('COUNTS_FILE', ""), nargs='+', help="Use this option instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n")
parser.add_argument('-s', '--strandness', dest="strandness", nargs=1, action='store', default=['unstranded'], choices=['unstranded', 'forward', 'reverse', 'fr-firststrand', 'fr-secondstrand'], metavar="", help="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n")

# Plot options
parser.add_argument('-biotype_filter', nargs=1, help=argparse.SUPPRESS)  # "Make an extra plot of categories distribution using only counts of the specified biotype."
parser.add_argument('-d', '--categories_depth', type=int, default=3, choices=range(1, 5), help="Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n")
parser.add_argument('--pdf', nargs='?', default=False, help="Save produced plots in PDF format at the specified path ('categories_plots.pdf' if no argument provided)\n\n")
parser.add_argument('--png', nargs='?', default=False, const=True, help="Save produced plots in PNG format with the provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided\n\n")
parser.add_argument('--svg', nargs='?', default=False, const=True, help="Save produced plots in SVG format with the provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided\n\n")
parser.add_argument('-n', '--no_plot', dest='quiet', action='store_const', default=False, const=True, help="Do not show plots\n\n")
parser.add_argument('-t', '--threshold', dest='threshold', nargs=2, metavar=("ymin", "ymax"), type=float, help="Set axis limits for enrichment plots\n\n")

if len(sys.argv) == 1:
    parser.print_usage()
    sys.exit(1)

options = parser.parse_args()

def required_arg(arg, aliases):
    if not arg:
        print >> sys.stderr, "\nError: %s argument is missing.\n" % aliases
        parser.print_usage()
        sys.exit()

def check_files_extension(files):
    return

# Booleans for the steps to be executed
make_index = False
intersect_reads = False
process_counts = False

#### Check arguments conformity and define which steps have to be performed
if options.counts:
    # No other argument is required
    # TODO: check the input extension

    # Action: only make the plots
    process_counts = True
else:
    if options.annotation:
        # Check whether a genome index basename (-g) was provided
        if options.genome_index:
            genome_index_basename = options.genome_index
        else:
            genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
        # Check whether an index file already exists
        if os.path.isfile(genome_index_basename + ".stranded.index"):
            if options.input:
                print >> sys.stderr, "\nWarning: an index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify another path." % (genome_index_basename + ".stranded.index")
            else:
                sys.exit("\nError: an index file named %s already exists. If you want to create a new index, please delete this file or specify another path.\n" % (genome_index_basename + ".stranded.index"))
        # Otherwise, the index has to be created
        else:
            make_index = True
    # If the index does not have to be created:
    if options.input:
        # Required arguments: input, genome_index
        if 'genome_index_basename' not in locals():
            required_arg(options.genome_index, "-g/--genome_index")
            genome_index_basename = options.genome_index
        required_arg(options.input, "-i/--input/--bam")
        for i in xrange(0, len(options.input), 2):
            try:
                extension = os.path.splitext(options.input[i + 1])[1]
                if extension == ".bam" or extension == '.bedgraph' or extension == '.bg':
                    sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format: -i Input_file1 Label1 [Input_file2 Label2 ...].")
            except IndexError:
                sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format: -i Input_file1 Label1 [Input_file2 Label2 ...].")

        intersect_reads = True
    # Check the input extension
    #TODO
if not (options.counts or options.input or options.annotation):
    sys.exit("\nError: some arguments are missing. At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n")
if not options.counts:
    # Declare the genome_index variables
    stranded_genome_index = genome_index_basename + ".stranded.index"
    unstranded_genome_index = genome_index_basename + ".unstranded.index"
    if options.strandness[0] == "unstranded":
        genome_index = unstranded_genome_index
    else:
        genome_index = stranded_genome_index

#### Initialization of some variables

# Initializing the category priority order, coding biotypes and the final list
prios = {'start_codon': 7, 'stop_codon': 7, 'five_prime_utr': 6, 'three_prime_utr': 6, 'UTR': 6, 'CDS': 5, 'exon': 4, 'transcript': 3, 'gene': 2, 'antisense': 1, 'intergenic': 0}

biotype_prios = None
#biotype_prios = {"protein_coding":1, "miRNA":2}

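# Illustrative note (inferred from add_info() above, not from the original documentation):
# when an index feature lists several categories for one biotype, only the highest-priority
# one(s) are counted, e.g. for "protein_coding:exon,CDS" the CDS category (priority 5)
# wins over exon (priority 4), so the whole interval is counted as CDS.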
# Possible groups of categories to plot
categs_group1 = {'start': ['start_codon'], '5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon'], '3UTR': ['three_prime_utr'], 'stop': ['stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
categs_group2 = {'5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon', 'start_codon', 'stop_codon'], '3UTR': ['three_prime_utr'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
categs_group3 = {'exons': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon', 'start_codon', 'stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
categs_group4 = {'gene': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon', 'start_codon', 'stop_codon', 'transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
categs_groups = [categs_group4, categs_group3, categs_group2, categs_group1]  # Order and merging for the final plot
cat_list = ['5UTR', 'start', 'CDS', 'stop', '3UTR', 'exons', 'introns', 'gene', 'intergenic', 'antisense']


# Biotypes list
biotypes = {'protein_coding', 'polymorphic_pseudogene', 'TR_C_gene', 'TR_D_gene', 'TR_J_gene', 'TR_V_gene', 'IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_V_gene', "3prime_overlapping_ncrna", "lincRNA", "macro_lncRNA", "miRNA", "misc_RNA", "Mt_rRNA", "Mt_tRNA", "processed_transcript", "ribozyme", "rRNA", "scaRNA", "sense_intronic", "sense_overlapping", "snoRNA", "snRNA", "sRNA", "TEC", "vaultRNA", "antisense", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene", "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene", "unitary_pseudogene", "unprocessed_pseudogene", "processed_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene", "pseudogene", "ncRNA", "tRNA"}  # Type: set (for fast membership tests)

# Grouping of biotypes:
biotypes_group1 = {'protein_coding': ['protein_coding'],
                   'pseudogenes': ['polymorphic_pseudogene', "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene", "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene", "unitary_pseudogene", "unprocessed_pseudogene", "processed_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene", "pseudogene"],
                   'TR': ['TR_C_gene', 'TR_D_gene', 'TR_J_gene', 'TR_V_gene'],
                   'IG': ['IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_V_gene'],
                   'MT_RNA': ["Mt_rRNA", "Mt_tRNA"],
                   'ncRNA': ["lincRNA", "macro_lncRNA", "3prime_overlapping_ncrna", "ncRNA"],
                   "others": ["misc_RNA", "processed_transcript", "ribozyme", "scaRNA", "sense_intronic", "sense_overlapping", "TEC", "vaultRNA"],
                   "antisense": ["antisense"]}
for biot in ["miRNA", "snoRNA", "snRNA", "rRNA", "sRNA", "tRNA"]:
    biotypes_group1[biot] = [biot]

# Initializing the unknown features list
unknown_feature = []

# Initializing the genome category counter dict
cpt_genome = {}

if process_counts:
    #### If the input files are the categories counts, just load them and continue to the recategorization step
    cpt, cpt_genome, samples_names = read_counts_files(options.counts)
else:
    #### Create the genome index if needed and get the sizes of categories
    if make_index:
        #### Get the chromosome lengths
        lengths = get_chromosome_lengths(options)
        # Generating the genome index files if the user didn't provide them
        create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, lengths)

    #print '\nChr lengths:', lengths

if intersect_reads:
    # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths
    if not make_index:
        print "\n### Reading genome indexes\n...\r",
        sys.stdout.flush()
    lengths = {}
    with open(genome_index, 'r') as genome_index_file:
        for line in genome_index_file:
            if line[0] == "#":
                lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1])
            else:
                add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2], biotype_prios=None, categ_prios=prios)

    #### Computing the genome intergenic count: sum of the chromosome lengths minus the sum of the genome annotated intervals
    cpt_genome[('intergenic', 'intergenic')] = sum(lengths.itervalues()) - sum([v for x, v in cpt_genome.iteritems() if x != ('antisense', 'antisense')])
    if not make_index:
        print "Done!"
    #print '\nGenome category counts:'
    #for key,val in cpt_genome.iteritems():
        #print key,"\t",val

    #### Create the Bedgraph files if needed and get the files list

    if not options.bedgraph:
        # Generating the BEDGRAPH files if the user provided BAM file(s) and getting the sample labels (these names will be used in the plot legend)
        samples_files, samples_names = create_bedgraph_files(options.input, options.strandness[0])
    else:
        # Just initialize the files list with the bedgraph paths
        samples_files = [options.input[i] for i in range(0, len(options.input), 2)]
        # and get the labels
        samples_names = [options.input[i] for i in range(1, len(options.input), 2)]
    #### Retrieving the chromosome names saved in the index
    chrom_list = get_chromosome_names_in_index(genome_index)
    #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and counting the number of aligned positions in each category
    cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files, samples_names, prios, genome_index, options.strandness[0], biotype_prios=None)

    #### Write the counts on disk
    write_counts_in_files(cpt, cpt_genome)

if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False):
    quit("\n### End of program")
print "\n### Generating plots"
# Updating the biotypes lists (biotypes and 'biotypes_group1'): adding the 'unknown' biotypes found in the GTF/index
if unknown_feature == []:  # 'unknown_feature' is only filled during the index generation
    # Browse the features to determine whether some biotypes are 'unknown'
    for sample, counts in cpt.items():
        for (cat, biot) in counts:
            if biot not in biotypes and cat not in unknown_feature:
                unknown_feature.append(biot)
for new_biot in unknown_feature:
    biotypes.add(new_biot)
    biotypes_group1["others"].append(new_biot)
biotypes = sorted(biotypes)
# Move the 'antisense' category to the end of the list
biotypes.remove('antisense')
biotypes.append('antisense')
biotypes_group1 = sorted(biotypes_group1)


#print '\nCounts for every category/biotype pair: ',cpt

# Generating plots
if options.pdf != False:
    if options.pdf == None:
        options.pdf = "categories_plots.pdf"
    pdf = PdfPages(options.pdf)
else:
    pdf = False

selected_biotype = None
if options.biotype_filter:
    options.biotype_filter = options.biotype_filter[0]
    for sample in cpt:
        for feature in cpt[sample]:
            biotype = feature[1]
            if options.biotype_filter.lower() == biotype.lower():
                selected_biotype = biotype
                break
    if selected_biotype == None:
        print "\nError: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)."
        sys.exit()

# Print a warning message if the UTRs are not specified as 5' or 3' (they will be plotted as 5'UTR)
if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:
    print '''\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n'''

#### Make the plot by categories
#### Recategorizing with the final categories
final_cats = categs_groups[options.categories_depth - 1]
final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, selected_biotype)
#### Display the distribution of specified categories (or biotypes) in samples on a barplot
# Remove the 'antisense' category if the library type is 'unstranded'
for dic in cpt.values():
    if ('antisense', 'antisense') in dic.keys():
        break
else:
    cat_list.remove('antisense')
make_plot(cat_list, samples_names, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, svg=options.svg, png=options.png)
if selected_biotype:
    make_plot(cat_list, samples_names, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, title="Categories distribution for '" + selected_biotype + "' biotype", svg=options.svg, png=options.png)

#### Make the plot by biotypes
#### Recategorizing with the final categories
final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)
#### Display the distribution of specified categories (or biotypes) in samples on a barplot
make_plot(biotypes, samples_names, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg, png=options.png)


##### Recategorizing with the final categories
#final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)
##### Display the distribution of specified categories (or biotypes) in samples on a barplot
#make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png)


if options.pdf:
    pdf.close()
    print "\n### Plots saved in pdf file: %s" % options.pdf

print "\n### End of program"