# HG changeset patch # User charles-bernard # Date 1513868491 18000 # Node ID 950982dda2aac23ae1fc03069841854d7aec1ce2 # Parent 1e508bfd35c5d738d938f978ac48b952a77bbbf7 Uploaded diff -r 1e508bfd35c5 -r 950982dda2aa alfa/.shed.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/.shed.yml Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,14 @@ +categories: +- Graphics +- Next Gen Mappers +- Sequence Analysis +- Visualization +description: A tool to Compute and display distribution of reads by genomic categories +long_description: | + ALFA provides a global overview of features distribution composing New Generation Sequencing dataset(s). + Given a set of aligned reads (BAM files) and an annotation file (GTF format), the tool produces plots of the raw and normalized distributions of those reads among genomic categories (stop codon, 5'-UTR, CDS, intergenic, etc.) and biotypes (protein coding genes, miRNA, tRNA, etc.). Whatever the sequencing technique, whatever the organism. + https://github.com/biocompibens/ALFA +name: alfa +owner: charles_bernard +remote_repository_url: https://github.com/biocompibens/ALFA/tree/master/Galaxy_toolshed_repositories/ALFA +type: unrestricted diff -r 1e508bfd35c5 -r 950982dda2aa alfa/ALFA.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/ALFA.py Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,1160 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +__author__ = 'noel & bahin' +''' ''' + +import argparse +import os +import numpy +import sys +import subprocess +import matplotlib.pyplot as plt +import matplotlib.cm as cmx +import matplotlib.colors as colors +import matplotlib.patheffects as PathEffects +import re +from matplotlib.backends.backend_pdf import PdfPages +# To correctly embbed the texts when saving plots in svg format +import matplotlib +matplotlib.rcParams['svg.fonttype'] = 'none' + +########################################################################## +# FUNCTIONS # +########################################################################## + +def init_dict(d, key, init): + if key not in d: + d[key] = init + +def get_chromosome_lengths(args): + """ + Parse the file containing the chromosomes lengths. + If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes ( + """ + lengths={} + gtf_chrom_names=set() + force_get_lengths = False + # If the user provided the chromosome length file + if args.chr_len: + with open(args.chr_len, 'r') as chr_len_file: + for line in chr_len_file: + lengths[line.split('\t')[0]] = int(line.rstrip().split('\t')[1]) + with open(args.annotation,'r') as gtf_file: + for line in gtf_file: + if not line.startswith('#'): + chrom = line.split('\t')[0] + if chrom not in gtf_chrom_names: + gtf_chrom_names.add(chrom) + for chrom in lengths.keys(): + if chrom not in gtf_chrom_names: + print "Warning: at least one chromosome name ('"+chrom+"') of the file '"+args.chr_len+"'does not match any chromosome name if GTF and was ignored." + #del lengths[chrom] + break + for chrom in gtf_chrom_names: + if force_get_lengths: break + if chrom not in lengths.keys(): + print "WARNING: chromosome name '"+chrom+"' was found in gtf and does not match any chromosome name provided in",args.chr_len+". " + print "\t=> The chromosome lenghts will be approximated using annotations in the GTF file." + continue_value ="" + while continue_value not in {"yes","y","no","n"}: + continue_value = raw_input("\tDo you want to continue ('yes' or 'y')?\n\tElse write 'no' or 'n' to exit the script and check your file of lengths.\n") + if continue_value == "no" or continue_value == "n": + sys.exit("Exiting") + elif continue_value == "yes" or continue_value == "y": + force_get_lengths = True + break + print "Error: use 'yes/y/no/n' only" + if not force_get_lengths: + return lengths + # Otherwise, (or if at least one chromosome was missing in chromosome lengths file) we consider the end of the last annotation of the chromosome in the GTF file as the chromosome length + with open(args.annotation, 'r') as gtf_file: + for line in gtf_file: + if not line.startswith('#'): + chrom = line.split('\t')[0] + end = int(line.split('\t')[4]) + init_dict(lengths, chrom, 0) + lengths[chrom] = max(lengths[chrom], end) + if force_get_lengths: + print "The chromosome lenghts have been approximated using the last annotations in the GTF file." + return lengths + +def write_feature_on_index(feat,chrom, start, stop, sign, stranded_genome_index, unstranded_genome_index=None): + grouped_by_biotype_features = [] + for biotype,categs in feat.iteritems(): + categ_list=[] + for cat in set(categs): + categ_list.append(cat) + grouped_by_biotype_features.append(":".join((str(biotype),",".join(categ_list)))) + stranded_genome_index.write('\t'.join((chrom, start, stop, sign,''))+'\t'.join(grouped_by_biotype_features)+'\n') + if unstranded_genome_index : + unstranded_genome_index.write('\t'.join((chrom, start, stop, '.',''))+'\t'.join(grouped_by_biotype_features)+'\n') + + +def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index = None , biotype_prios=None, coverage=1, categ_prios=None): + """ + From an annotated genomic interval (start/stop positions and associated feature : one or more category(ies)/biotype pair(s) ) + - If a file is provided: write the information at the end of the index file being generated; + - else : browse the features and update the counts of categories/biotypes found in the genome. + """ + ## Writing in the file is it was provided + if stranded_genome_index : + unstranded_features = None + # If only one strand has a feature, this feature will directly be written on the unstranded index + if feat_values[0] == {} : + # A interval with no feature corresponds to a region annotated only on the reverse strand : update 'antisense' counts + stranded_genome_index.write('\t'.join((chrom, start, stop, '+','antisense\n'))) + write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index, unstranded_genome_index) + else : + if feat_values[1] == {} : + write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index, unstranded_genome_index) + stranded_genome_index.write('\t'.join((chrom, start, stop, '-','antisense\n'))) + + # Else, the unstranded index should contain the union of plus and minus features + else : + write_feature_on_index(feat_values[0], chrom ,start, stop, '+', stranded_genome_index) + write_feature_on_index(feat_values[1], chrom ,start, stop, '-', stranded_genome_index) + unstranded_feat = dict(feat_values[0], **feat_values[1]) + for name in set(feat_values[0]) & set(feat_values[1]): + unstranded_feat[name]+=feat_values[0][name] + write_feature_on_index(unstranded_feat, chrom ,start, stop, '.', unstranded_genome_index) + + ## Increasing category counter(s) + else : + # Default behavior if no biotype priorities : category with the highest priority for each found biotype has the same weight (1/n_biotypes) + if not biotype_prios: + nb_feat = len(feat_values) + # For every categ(s)/biotype pairs + for feat in feat_values: + cur_prio = 0 + selected_categ = [] + # Separate categorie(s) and biotype + try: + biot,cats = feat.split(":") + # Error if feature equal "antisense" : update the 'antisense/antisense' counts + except ValueError : + try : + cpt[(feat,feat)] += (int(stop) - int(start)) * coverage / float(nb_feat) + except : + cpt[(feat,feat)] = (int(stop) - int(start)) * coverage / float(nb_feat) + return + # Browse the categories and get only the one(s) with highest priority + for cat in cats.split(','): + try: prio = prios[cat] + except: + #TODO Find a way to add unknown categories + if cat not in unknown_feature: + print >> sys.stderr, "Warning: Unknown categorie %s found and ignored\n...\r" %cat, + unknown_feature.append(cat) + continue + if prio > cur_prio : + cur_prio = prio + selected_categ = [cat] + if prio == cur_prio : + if cat != selected_categ : + try: + if cat not in selected_categ : + selected_categ.append(cat) + except TypeError : + selected_categ = [selected_categ,cat] + # Increase each counts by the coverage divided by the number of categories and biotypes + nb_cats = len(selected_categ) + for cat in selected_categ : + try: + cpt[(cat,biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats)) + except KeyError: + cpt[(cat,biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats)) + #else : + #cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage + # Else, apply biotype selection according to the priority set + else: + #TODO Add an option to pass biotype priorities and handle it + pass + +def print_chrom(features_dict, chrom, stranded_index_file, unstranded_index_file, cpt_genome): + with open(unstranded_index_file,'a') as findex, open(stranded_index_file,'a') as fstrandedindex: + # Initialization of variables : start position of the first interval and associated features for +/- strands + start = "" + for pos in sorted(features_dict['+'].keys()): + if start != "": + add_info(cpt_genome, [features_plus,features_minus], str(start), str(pos), chrom, stranded_genome_index = fstrandedindex, unstranded_genome_index = findex) + start = pos + features_plus = features_dict['+'][pos] + features_minus = features_dict['-'][pos] + + +def create_genome_index(annotation, unstranded_genome_index, stranded_genome_index,cpt_genome,prios,biotypes,chrom_sizes): + ''' Create an index of the genome annotations and save it in a file''' + print '\n### Generating genome indexes\n', + sys.stdout.flush() + # Initializations + intervals_dict = 0 + max_value = -1 + prev_chrom = '' + reverse_strand = {'+':'-','-':'+'} + i = 0 # Line counter + # Write the chromosomes lengths as comment lines before the genome index + with open(unstranded_genome_index,'w') as findex, open(stranded_genome_index,'w') as fstrandedindex: + for key,value in chrom_sizes.items(): + findex.write("#%s\t%s\n" %(key, value)) + fstrandedindex.write("#%s\t%s\n" %(key, value)) + # Running through the GTF file and writing into genome index files + with open(annotation, 'r') as gtf_file: + for line in gtf_file: + # Print messages after X processed lines + i += 1 + if i % 100000 == 0: + print >> sys.stderr, '\r%s line processed...' %str(i) + print '\r \r. . .', + sys.stdout.flush() + elif i % 20000 == 0: + print '\r \r. . .', + sys.stdout.flush() + elif i % 2000 == 0: + print '.', + sys.stdout.flush() + # Processing lines except comment ones + if not line.startswith('#'): + # Getting the line infos + line_split=line[:-1].split('\t') + chrom = line_split[0] + cat=line_split[2] + start = int(line_split[3]) - 1 + stop = int(line_split[4]) + strand = line_split[6] + antisense = reverse_strand[strand] + biotype=line_split[8].split('biotype')[1].split(';')[0].strip('" ') + feat = [(cat,biotype)] + # Registering chromosome info in the genome index file if this is a new chromosome or a annotation not overlapping previously recorded features + if start > max_value or chrom != prev_chrom: + # Write the previous features + if intervals_dict != 0: + if chrom != prev_chrom : + print_chrom(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index, cpt_genome) + print "\rChromosome '" + prev_chrom + "' registered." + else: + print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome) + prev_chrom = chrom + # (Re)Initializing the chromosome lists + intervals_dict = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}} + max_value = stop + + # Update the dictionary which represents intervals for every disctinct annotation + else: + # Get intervals on the same strand as the current feature + stranded_intervals = intervals_dict[strand] + start_added = False + for pos in sorted(stranded_intervals.keys()): + #print pos + #print stranded_intervals[pos] + #print + # While the position is below the feature's interval, store the features + if pos < start : + cur_cat_dict = dict(stranded_intervals[pos]) + cur_antisense_dict = dict(intervals_dict[antisense][pos]) + + # If the start position already exists: update it by addind the new feature + elif pos == start : + cur_cat_dict = dict(stranded_intervals[pos]) + cur_antisense_dict = dict(intervals_dict[antisense][pos]) + #print "cur",cur_cat_dict + try : stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype]+[cat] + except KeyError: stranded_intervals[pos][biotype] = [cat] + start_added = True + #print "cur",cur_cat_dict + + elif pos > start : + # Create a new entry for the start position if necessary + if not start_added : + #print "cur",cur_cat_dict + stranded_intervals[start] = dict(cur_cat_dict) + stranded_intervals[start][biotype] = [cat] + #stranded_intervals[pos][biotype].append(cat) + intervals_dict[antisense][start] = cur_antisense_dict + start_added = True + #print "cur",cur_cat_dict + # While the position is below the stop, just add the new feature + if pos < stop : + cur_cat_dict = dict(stranded_intervals[pos]) + cur_antisense_dict = dict(intervals_dict[antisense][pos]) + try: stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat] + except KeyError: stranded_intervals[pos][biotype] = [cat] + # Close the created interval : create an entry at the stop position and restore the features + elif pos > stop : + stranded_intervals[stop] = dict(cur_cat_dict) + intervals_dict[antisense][stop] = cur_antisense_dict + break + else : + break + #try: + #cur_cat_dict = list(stranded_intervals[pos][biotype]) + #except KeyError: cur_cat_dict = list() + #print stranded_intervals[pos] + #print + # Extend the dictionary if necessary + if stop > max_value: + max_value = stop + stranded_intervals[stop] = {} + intervals_dict[antisense][stop] = {} + #except KeyError: + #print intervals_dict + #quit() + #intervals_dict[strand] = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}} + #continue + #for sign in ['-','+']: + #print sign + #for key,val in sorted(intervals_dict[sign].iteritems()): + #print key,'\t',val + #print + #print "-------\n" + + + #Store the categories of the last chromosome + print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome) + print "\rChromosome '" + prev_chrom + "' registered.\nDone!" + + +def create_bedgraph_files(bams,strandness): + samples_files = [] + labels = [] + print "\n### Generating the bedgraph files" + for n in range(0, len(bams), 2): + print "\rProcessing '%s'\n..." %bams[n], + sys.stdout.flush() + #Get the label for this sample + label = bams[n+1] + #Modify it to contain only alphanumeric caracters (avoid files generation with dangerous names) + modified_label = "_".join(re.findall(r"[\w']+", label)) + if strandness in ["reverse","fr-secondstrand"]: + subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True) + subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True) + elif strandness in ["forward","fr-firststrand"]: + subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True) + subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True) + else : + subprocess.call('bedtools genomecov -bg -split -ibam ' + bams[n] + ' > ' + modified_label + '.bedgraph', shell=True) + samples_files.append(modified_label) + labels.append(label) + print "\rDone!" + return samples_files, labels + +def read_gtf(gtf_index_file, sign): + global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF + strand = "" + while strand != sign : + gtf_line = gtf_index_file.readline() + # If the GTF file is finished + if not gtf_line: + endGTF = True + return endGTF + splitline = gtf_line.rstrip().split('\t') + try: strand = splitline[3] + # strand information can not be found in the file file header + except IndexError: pass + gtf_chrom = splitline[0] + gtf_features = splitline[4:] + gtf_start, gtf_stop = map(int, splitline[1:3]) + return endGTF + +def read_counts_files(counts_files): + cpt={} + cpt_genome={} + labels=[] + for fcounts in counts_files: + label=os.path.splitext(os.path.basename(fcounts))[0] + labels.append(label) + cpt[label]={} + with open (fcounts,"r") as f: + for line in f: + if line[0]=="#": + continue + line_split=line[:-1].split('\t') + feature=tuple(line_split[0].split(',')) + cpt[label][feature]=float(line_split[1]) + cpt_genome[feature]=float(line_split[2]) + return cpt,cpt_genome,labels + + +def get_chromosome_names_in_index(genome_index): + chrom_list = [] + with open(genome_index, 'r') as findex: + chrom = "" + for line in findex: + cur_chrom = line.split('\t')[0] + if cur_chrom == chrom: + pass + else: + chrom = cur_chrom + if chrom not in chrom_list: + chrom_list.append(chrom) + return set(chrom_list) + + +def intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, strandness, biotype_prios = None): + global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF + print "\n### Intersecting files with indexes" + unknown_chrom = [] + cpt = {} # Counter for the nucleotides in the BAM input file(s) + for n in range(len(samples_files)): + sample_file=samples_files[n] + sample_name=samples_names[n] + # Initializing the category counter dict for this sample + init_dict(cpt, sample_name, {}) + if strandness == "unstranded": + strands = [("",".")] + else: + strands = [('.plus','+'), ('.minus','-')] + + # Intersecting the BEDGRAPH and genome index files + print "\rProcessing '%s'\n. . ." %sample_file, + sys.stdout.flush() + + for strand,sign in strands: + prev_chrom = '' + endGTF = False # Reaching the next chr or the end of the GTF index + intergenic_adds = 0.0 + i = 0 + i_chgt = 0 + with open(sample_file + strand + '.bedgraph', 'r') as bam_count_file: + # Running through the BEDGRAPH file + for bam_line in bam_count_file: + i += 1 + if i % 10000 == 0: + print ".", + sys.stdout.flush() + if i % 100000 == 0: + print "\r \r. . .", + sys.stdout.flush() + # Getting the BAM line info + bam_chrom = bam_line.split('\t')[0] + bam_start, bam_stop, bam_cpt = map(float, bam_line.split('\t')[1:4]) + # Skip the line if the chromosome is not in the index + if bam_chrom not in chrom_list: + if bam_chrom not in unknown_chrom: + unknown_chrom.append(bam_chrom) + print "\r \r Chromosome '" + bam_chrom + "' not found in index." + continue + # If this is a new chromosome (or the first one) + if bam_chrom != prev_chrom: + i_chgt = i + intergenic_adds = 0.0 + # (Re)opening the GTF index and looking for the first line of the matching chr + try: gtf_index_file.close() + except UnboundLocalError: pass + gtf_index_file = open(genome_index, 'r') + endGTF = False + read_gtf(gtf_index_file, sign) + while bam_chrom != gtf_chrom: + read_gtf(gtf_index_file, sign) + if endGTF: + break + prev_chrom = bam_chrom + + # Looking for the first matching annotation in the GTF index + while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop): + read_gtf(gtf_index_file, sign) + if gtf_chrom != bam_chrom: + endGTF = True + # Processing BAM lines before the first GTF annotation if there are + if bam_start < gtf_start: + # Increase the 'intergenic' category counter with all or part of the BAM interval + try: + intergenic_adds += min(bam_stop,gtf_start)-bam_start + cpt[sample_name][('intergenic','intergenic')] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt + except KeyError: + cpt[sample_name][('intergenic','intergenic')] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt + # Go to next line if the BAM interval was totally upstream the first GTF annotation, carry on with the remaining part otherwise + if endGTF or (bam_stop <= gtf_start): + continue + else: + bam_start = gtf_start + + # We can start the crossover + while not endGTF: + # Update category counter + add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop,gtf_stop), coverage = bam_cpt, categ_prios = prios) + # Read the next GTF file line if the BAM line is not entirely covered + if bam_stop > gtf_stop: + # Update the BAM start pointer + bam_start = gtf_stop + endGTF = read_gtf(gtf_index_file, sign) + # If we read a new chromosome in the GTF file then it is considered finished + if bam_chrom != gtf_chrom: + endGTF = True + if endGTF: + break + else: + # Next if the BAM line is entirely covered + bam_start = bam_stop + break + + # Processing the end of the BAM line if necessary + if endGTF and (bam_stop > bam_start): + try: + cpt[sample_name][('intergenic','intergenic')] += (bam_stop - bam_start) * bam_cpt + except KeyError: + cpt[sample_name][('intergenic','intergenic')] = (bam_stop - bam_start) * bam_cpt + gtf_index_file.close() + print "\r \rDone!" + return cpt + +def write_counts_in_files(cpt,genome_counts): + for label,dico in cpt.items(): + label = "_".join(re.findall(r"[\w']+", label)) + with open(label+".categories_counts","w") as fout: + fout.write("#Category,biotype\tCounts_in_bam\tSize_in_genome\n" ) + for feature,counts in dico.items(): + fout.write("%s\t%s\t%s\n" %(','.join(feature),counts,genome_counts[feature])) + +def recategorize_the_counts(cpt,cpt_genome,final): + final_cat_cpt={} + final_genome_cpt={} + for f in cpt: + #print "\nFinal categories for",f,"sample" + final_cat_cpt[f]={} + for final_cat in final: + tot = 0 + tot_genome=0 + for cat in final[final_cat]: + tot += cpt[f][cat] + tot_genome+=cpt_genome[cat] + #output_file.write('\t'.join((final_cat, str(tot))) + '\n') + #print '\t'.join((final_cat, str(tot))) + final_cat_cpt[f][final_cat]=tot + final_genome_cpt[final_cat]=tot_genome + return final_cat_cpt,final_genome_cpt + +def group_counts_by_categ(cpt,cpt_genome,final,selected_biotype): + final_cat_cpt={} + final_genome_cpt={} + filtered_cat_cpt = {} + for f in cpt: + final_cat_cpt[f]={} + filtered_cat_cpt[f] = {} + for final_cat in final: + tot = 0 + tot_filter = 0 + tot_genome=0 + for cat in final[final_cat]: + for key,value in cpt[f].items(): + if key[0] == cat : + tot += value + tot_genome+=cpt_genome[key] + if key[1] == selected_biotype : + tot_filter += value + #output_file.write('\t'.join((final_cat, str(tot))) + '\n') + #print '\t'.join((final_cat, str(tot))) + final_cat_cpt[f][final_cat]=tot + if tot_genome == 0: + final_genome_cpt[final_cat]= 1e-100 + else: + final_genome_cpt[final_cat]=tot_genome + filtered_cat_cpt[f][final_cat]=tot_filter + #if 'antisense' in final_genome_cpt: final_genome_cpt['antisense'] = 0 + return final_cat_cpt,final_genome_cpt,filtered_cat_cpt + +def group_counts_by_biotype(cpt,cpt_genome,biotypes): + final_cpt={} + final_genome_cpt={} + for f in cpt: + final_cpt[f]={} + for biot in biotypes: + tot = 0 + tot_genome=0 + try: + for final_biot in biotypes[biot]: + for key,value in cpt[f].items(): + if key[1] == final_biot : + tot += value + #if key[1] != 'antisense': + tot_genome+=cpt_genome[key] + except: + for key,value in cpt[f].items(): + if key[1] == biot : + tot += value + tot_genome+=cpt_genome[key] + if tot != 0: + final_cpt[f][biot]=tot + final_genome_cpt[biot]=tot_genome + return final_cpt,final_genome_cpt + +#def get_cmap(N): + #'''Returns a function that maps each index in 0, 1, ... N-1 to a distinct + #RGB color.''' + #color_norm = colors.Normalize(vmin=0, vmax=N-1) + #scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv') + #def map_index_to_rgb_color(index): + #return scalar_map.to_rgba(index) + #return map_index_to_rgb_color + +def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type, title) : + ### Initialization + fig = plt.figure(figsize=(13,9)) + ax1 = plt.subplot2grid((2,4),(0,0),colspan=2) + ax2 = plt.subplot2grid((2,4),(1,0),colspan=2) + cmap= plt.get_cmap('Spectral') + cols=[cmap(x) for x in xrange(0,256,256/n_cat)] + if title: + ax1.set_title(title+"in: %s" %samples_names[0]) + else : + ax1.set_title(counts_type+" distribution in mapped reads in: %s" %samples_names[0]) + ax2.set_title('Normalized counts of '+counts_type) + + + ### Barplots + #First barplot: percentage of reads in each categorie + ax1.bar(index, percentages, bar_width, + color=cols) + #Second barplot: enrichment relative to the genome for each categ + # (the reads count in a categ is divided by the categ size in the genome) + ax2.bar(index_enrichment, enrichment, bar_width, + color=cols,) + ### Piecharts + pielabels = [ordered_categs[i] if percentages[i]>0.025 else '' for i in xrange(n_cat)] + sum_enrichment = numpy.sum(enrichment) + pielabels_enrichment = [ordered_categs[i] if enrichment[i]/sum_enrichment>0.025 else '' for i in xrange(n_cat)] + # Categories piechart + ax3 = plt.subplot2grid((2,4),(0,2)) + pie_wedge_collection, texts = ax3.pie(percentages,labels=pielabels, shadow=True, colors=cols) + # Enrichment piechart + ax4 = plt.subplot2grid((2,4),(1,2)) + pie_wedge_collection, texts = ax4.pie(enrichment,labels=pielabels_enrichment, shadow=True, colors=cols) + # Add legends (append percentages to labels) + labels = [" ".join((ordered_categs[i],"({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))] + ax3.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'}, bbox_to_anchor=(1.7,0.5)) + labels = [" ".join((ordered_categs[i],"({:.1%})".format(enrichment[i]/sum_enrichment))) for i in range(len(ordered_categs))]# if ordered_categs[i] != 'antisense'] + ax4.legend(pie_wedge_collection,labels,loc='center',fancybox=True, shadow=True,prop={'size':'medium'},bbox_to_anchor=(1.7,0.5)) + # Set aspect ratio to be equal so that pie is drawn as a circle + ax3.set_aspect('equal') + ax4.set_aspect('equal') + return fig, ax1, ax2 + +def make_plot(ordered_categs,samples_names,categ_counts,genome_counts,pdf, counts_type, threshold, title = None ,svg = None, png = None): + # From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample. + existing_categs = set() + for sample in categ_counts.values(): + existing_categs |= set(sample.keys()) + ordered_categs = filter(existing_categs.__contains__, ordered_categs) + n_cat = len(ordered_categs) + n_exp=len(samples_names) + ##Initialization of the matrix of counts (nrow=nb_experiements, ncol=nb_categorie) + counts=numpy.matrix(numpy.zeros(shape=(n_exp,n_cat))) + for exp in xrange(len(samples_names)): + for cat in xrange(len(ordered_categs)): + try: counts[exp,cat]=categ_counts[samples_names[exp]][ordered_categs[cat]] + except: pass + + ##Normalize the categorie sizes by the total size to get percentages + sizes=[] + sizes_sum=0 + for cat in ordered_categs: + sizes.append(genome_counts[cat]) + sizes_sum+=genome_counts[cat] + if 'antisense' in ordered_categs: + antisense_pos = ordered_categs.index('antisense') + sizes[antisense_pos] = 1e-100 + for cpt in xrange(len(sizes)): + sizes[cpt]/=float(sizes_sum) + + ## Create array which contains the percentage of reads in each categ for every sample + percentages=numpy.array(counts/numpy.sum(counts,axis=1)) + ## Create the enrichment array (counts divided by the categorie sizes in the genome) + enrichment=numpy.array(percentages/sizes) + if 'antisense_pos' in locals(): + for i in xrange(len(samples_names)): + enrichment[i][antisense_pos] = 0 + #enrichment=numpy.log(numpy.array(percentages/sizes)) + for exp in xrange(n_exp): + for i in xrange(n_cat): + val = enrichment[exp][i] + if val > 1: + enrichment[exp][i] = val-1 + elif val == 1 or val == 0: + enrichment[exp][i] = 0 + else: + enrichment[exp][i] = -1/val+1 + + #### Finally, produce the plot + + ##Get the colors from the colormap + ncolor=16 + cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5", "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"] + if n_exp > ncolor: + cmap = plt.get_cmap('Set3',n_exp) + cmap = [cmap(i) for i in xrange(n_exp)] + + ## Parameters for the plot + opacity = 1 + #Create a vector which contains the position of each bar + index = numpy.arange(n_cat) + #Size of the bars (depends on the categs number) + bar_width = 0.9/n_exp + + + ##Initialise the subplot + # if there is only one sample, also plot piecharts + #if n_exp == 1 and counts_type.lower() == 'categories': + #fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title) + ## If more than one sample + #else: + fig, (ax1,ax2) = plt.subplots(2,figsize=(5+(n_cat+2*n_exp)/3,10)) + # Store the bars objects for enrichment plot + rects = [] + #For each sample/experiment + for i in range(n_exp): + #First barplot: percentage of reads in each categorie + ax1.bar(index+i*bar_width, percentages[i], bar_width, + alpha=opacity, + color=cmap[i], + label=samples_names[i], edgecolor='#FFFFFF', lw=0) + #Second barplot: enrichment relative to the genome for each categ + # (the reads count in a categ is divided by the categ size in the genome) + rects.append( ax2.bar(index+i*bar_width, enrichment[i], bar_width, + alpha=opacity, + color=cmap[i], + label=samples_names[i], edgecolor=cmap[i], lw=0)) + + ## Graphical options for the plot + # Adding of the legend + ax1.legend(loc='best',frameon=False) + #ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True) + # Main titles + if title: + ax1.set_title(title) + else : + ax1.set_title(counts_type+" distribution in mapped reads") + ax2.set_title('Normalized counts of '+counts_type) + + # Adding enrichment baseline + #ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5') + # Axes limits + ax1.set_xlim(-0.1,len(ordered_categs)+0.1) + if len(sizes) == 1: ax1.set_xlim(-2,3) + ax2.set_xlim(ax1.get_xlim()) + # Set axis limits (max/min values + 5% margin) + ax2_ymin = [] + ax2_ymax = [] + for sample_values in enrichment: + ax2_ymin.append(min(sample_values)) + ax2_ymax.append(max(sample_values)) + ax2_ymax = max(ax2_ymax) + ax2_ymin = min(ax2_ymin) + margin_top, margin_bottom = (abs(0.05*ax2_ymax), abs(0.05*ax2_ymin)) + ax1.set_ylim(0,ax1.get_ylim()[1]*1.05) + if threshold: + threshold_bottom = -abs(float(threshold[0]))+1 + threshold_top = float(threshold[1])-1 + + for i in xrange(n_exp): + for y in xrange(n_cat): + val = enrichment[i][y] + if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top): + rect = rects[i][y] + rect_height = rect.get_height() + if rect.get_y() < 0: + diff = rect_height + threshold_bottom + rect.set_y(threshold_bottom) + ax2_ymin = threshold_bottom + margin_bottom = 0 + else: + diff = rect_height - threshold_top + ax2_ymax = threshold_top + margin_top = 0 + rect.set_height(rect.get_height()-diff) + if margin_top != 0 and margin_bottom != 0: + margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)] + ax2.set_ylim(ax2_ymin-margin_bottom,ax2_ymax+margin_top) + # Y axis title + ax1.set_ylabel('Proportion of reads (%)') + ax2.set_ylabel('Enrichment relative to genome') + # X axis title + ax1.set_xlabel(counts_type) + ax2.set_xlabel(counts_type) + # Add the categories on the x-axis + ax1.set_xticks(index + bar_width*n_exp/2) + ax2.set_xticks(index + bar_width*n_exp/2) + if counts_type.lower() != 'categories': + ax1.set_xticklabels(ordered_categs,rotation='30',ha='right') + ax2.set_xticklabels(ordered_categs,rotation='30',ha='right') + else: + ax1.set_xticklabels(ordered_categs) + ax2.set_xticklabels(ordered_categs) + # Display fractions values in percentages + ax1.set_yticklabels([str(int(i*100)) for i in ax1.get_yticks()]) + # Correct y-axis ticks labels for enrichment subplot + #ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()]) + yticks = list(ax2.get_yticks()) + yticks = [ yticks[i]-1 if yticks[i]>9 else yticks[i]+1 if yticks[i]<-9 else yticks[i] for i in xrange(len(yticks))] + ax2.set_yticks(yticks) + ax2.set_yticklabels([str(int(i+1))+"$^{+1}$" if i>0 and i%1==0 else str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(int(-(i-1)))+"$^{-1}$" if i<0 and i%1==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()]) + #ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()]) + # Change appearance of 'antisense' bars on enrichment plot since we cannot calculate an enrichment for this artificial category + if 'antisense_pos' in locals(): #ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA') + for i in xrange(n_exp) : + rect = rects[i][antisense_pos] + rect.set_y(ax2.get_ylim()[0]) + rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0]) + rect.set_hatch('/') + rect.set_fill(False) + rect.set_linewidth(0) + #rect.set_color('lightgrey') + #rect.set_edgecolor('#EDEDED') + rect.set_color('#EDEDED') + ax2.text(index[antisense_pos] + bar_width*n_exp/2 - 0.1, (ax2_ymax+ax2_ymin)/2, 'NA') + # Add text for features absent in sample + for i in xrange(n_exp): + for y in xrange(n_cat): + if percentages[i][y] == 0: + txt = ax1.text(y + bar_width*(i+0.5), 0.02, 'Absent in sample', rotation = 'vertical', color = cmap[i], horizontalalignment ='center', verticalalignment = 'bottom') + txt.set_path_effects([PathEffects.Stroke(linewidth=0.5),PathEffects.Normal()]) + elif enrichment[i][y] == 0: + rects[i][y].set_linewidth(1) + + # Remove top/right/bottom axes + for ax in [ax1,ax2]: + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['bottom'].set_visible(False) + ax.tick_params(axis='x', which='both', bottom='on', top='off', labelbottom='on') + ax.tick_params(axis='y', which='both', left='on', right='off', labelleft='on') + + + + #ax1.legend(loc='center right', bbox_to_anchor=(1.2, 0),fancybox=True, shadow=True) + ## Showing the plot + plt.tight_layout() + fig.subplots_adjust(wspace=0.2) + if pdf: + pdf.savefig() + plt.close() + elif svg: + if svg == True: + plt.savefig(counts_type+'.svg') + else : + if os.path.splitext(svg)[1] == '.svg': + plt.savefig('.'.join((os.path.splitext(svg)[0],counts_type,'svg'))) + else: + plt.savefig('.'.join((svg,counts_type,'svg'))) + elif png: + if png == True: + plt.savefig(counts_type+'.png') + else : + if os.path.splitext(png)[1] == '.png': + plt.savefig('.'.join((os.path.splitext(png)[0],counts_type,'png'))) + else: + plt.savefig('.'.join((png,counts_type,'png'))) + else: + plt.show() + ## Save on disk it as a png image + #fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight') + + +def filter_categs_on_biotype(selected_biotype,cpt) : + filtered_cpt = {} + for sample in cpt: + filtered_cpt[sample] = {} + for feature,count in cpt[sample].items(): + if feature[1] == selected_biotype: + filtered_cpt[sample][feature[0]] = count + return filtered_cpt + + +########################################################################## +# MAIN # +########################################################################## + +def usage_message(name=None): + return ''' + Generate genome indexes: + python ALFA.py -a GTF_FILE [-g GENOME_INDEX] + [--chr_len CHR_LENGTHS_FILE] + Process BAM file(s): + python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...] + [--bedgraph] [-s STRAND] + [-n] [--pdf output.pdf] + [-d {1,2,3,4}] [-t YMIN YMAX] + Index genome + process BAM: + python ALFA.py -a GTF_FILE [-g GENOME_INDEX] + -i BAM1 LABEL1 [BAM2 LABEL2 ...] + [--chr_len CHR_LENGTHS_FILE] + [--bedgraph][-s STRAND] + [-n] [--pdf output.pdf] + [-d {1,2,3,4}] [-t YMIN YMAX] + + Process previously created ALFA counts file(s): + python ALFA.py -c COUNTS1 [COUNTS2 ...] + [-s STRAND] + [-n] [--pdf output.pdf] + [-d {1,2,3,4}] [-t YMIN YMAX] + + ''' + + +#### Parse command line arguments and store them in 'options' +parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message()) +parser.add_argument('--version', action='version', version='version 1.0', help="show program's version number and exit\n\n-----------\n\n") +# Options concernant l'index +parser.add_argument('-g','--genome_index', help="Genome index files path and basename for existing index, or path and basename for new index creation\n\n") +parser.add_argument('-a','--annotation', metavar = "GTF_FILE", help='Genomic annotations file (GTF format)\n\n') +parser.add_argument('--chr_len', help='Tabulated file containing chromosome names and lengths\n\n-----------\n\n') + +# Options pour l'étape d'intersection +parser.add_argument('-i','--input','--bam', dest='input', metavar=('BAM_FILE1 LABEL1',""), nargs='+', help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n') +parser.add_argument('--bedgraph', action='store_const',default = False, const = True, help="Use this options if your input file(s) is(are) already in bedgraph format\n\n") +parser.add_argument('-c','--counts',metavar=('COUNTS_FILE',""), nargs='+', help="Use this options instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n") +parser.add_argument('-s','--strandness', dest="strandness", nargs=1, action = 'store', default = ['unstranded'], choices = ['unstranded','forward','reverse','fr-firststrand','fr-secondstrand'], metavar="", help ="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n") + +# Options concernant le plot +parser.add_argument('-biotype_filter',nargs=1,help=argparse.SUPPRESS)#"Make an extra plot of categories distribution using only counts of the specified biotype.") +parser.add_argument('-d','--categories_depth', type=int, default='3', choices=range(1,5), help = "Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n") +parser.add_argument('--pdf', nargs='?', default=False, help="Save produced plots in PDF format at specified path ('categories_plots.pdf' if no argument provided)\n\n") +parser.add_argument('--png', nargs='?', default=False, const=True, help="Save produced plots in PNG format with provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided\n\n") +parser.add_argument('--svg', nargs='?', default=False, const=True, help="Save produced plots in SVG format with provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided\n\n") +parser.add_argument('-n','--no_plot', dest='quiet', action='store_const', default=False, const=True, help="Do not show plots\n\n") +parser.add_argument('-t','--threshold', dest='threshold', nargs = 2, metavar=("ymin","ymax"), type=float , help="Set axis limits for enrichment plots\n\n") + +if len(sys.argv)==1: + parser.print_usage() + sys.exit(1) + +options = parser.parse_args() + +def required_arg(arg, aliases): + if not arg: + print >> sys.stderr, "\nError: %s argument is missing.\n" %aliases + parser.print_usage() + sys.exit() + +def check_files_enxtension(files): + return + +# Booleans for steps to be executed +make_index = False +intersect_reads = False +process_counts = False + +#### Check arguments conformity and define which steps have to be perfomed +if options.counts : + # Aucun autre argument requis + # Vérifier extension input + + # Action : Faire le plot uniquement + process_counts = True +else: + if options.annotation : + # Vérifier si présence -gi + if options.genome_index : + genome_index_basename = options.genome_index + else: + genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0] + # Vérifier si un fichier existe déjà: + if os.path.isfile(genome_index_basename+".stranded.index") : + if options.input: + print >> sys.stderr, "\nWarning: a index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify an other path." %(genome_index_basename+".stranded.index") + else: + sys.exit("\nError: a index file named %s already exists. If you want to create a new index, please delete this file or specify an other path.\n" %(genome_index_basename+".stranded.index")) + # sinon -> action : index à faire + else : + make_index = True + # si l'index n'est pas à faire : + if options.input: + # Arguments requis: input, genome_index + if 'genome_index_basename' not in locals(): + required_arg(options.genome_index, "-g/--genome_index") + genome_index_basename = options.genome_index + required_arg(options.input, "-i/--input/--bam") + for i in xrange(0, len(options.input), 2): + try : + extension = os.path.splitext(options.input[i+1])[1] + if extension == ".bam" or extension == '.bedgraph' or extension == '.bg': + sys.exit("Error: it seems input files and associated labels are not correctly provided.\n\ + Make sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].") + except: + sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format : -i Input_file1 Label1 [Input_file2 Label2 ...].") + + intersect_reads = True + # Vérifier input's extension + #TODO +if not (options.counts or options.input or options.annotation): + sys.exit("\nError : some arguments are missing At least '-a', '-c' or '-i' is required. Please refer to help (-h/--help) and usage cases for more details.\n") +if not options.counts: + # Declare genome_index variables + stranded_genome_index = genome_index_basename+".stranded.index" + unstranded_genome_index = genome_index_basename+".unstranded.index" + if options.strandness[0] == "unstranded": + genome_index = unstranded_genome_index + else: + genome_index = stranded_genome_index + + + +#### Initialization of some variables + +# Initializing the category priority order, coding biotypes and the final list +prios = {'start_codon': 7, 'stop_codon': 7, 'five_prime_utr': 6, 'three_prime_utr': 6, 'UTR': 6, 'CDS': 5, 'exon': 4, 'transcript': 3, 'gene': 2, 'antisense': 1,'intergenic': 0} + +biotype_prios = None +#biotype_prios = {"protein_coding":1, "miRNA":2} + +# Possibles groups of categories to plot +categs_group1={'start': ['start_codon'], '5UTR': ['five_prime_utr','UTR'], 'CDS': ['CDS', 'exon'], '3UTR': ['three_prime_utr'], 'stop': ['stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} +categs_group2={'5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon','start_codon','stop_codon'], '3UTR': ['three_prime_utr'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} +categs_group3={'exons': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} +categs_group4={'gene': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon','start_codon','stop_codon', 'transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']} +categs_groups = [categs_group4,categs_group3,categs_group2,categs_group1] # Order and merging for the final plot +cat_list = ['5UTR', 'start', 'CDS', 'stop', '3UTR', 'exons', 'introns', 'gene', 'intergenic', 'antisense'] + + +# biotypes list +biotypes ={'protein_coding','polymorphic_pseudogene','TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene','IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene',"3prime_overlapping_ncrna","lincRNA","macro_lncRNA","miRNA","misc_RNA","Mt_rRNA","Mt_tRNA","processed_transcript","ribozyme","rRNA","scaRNA","sense_intronic","sense_overlapping","snoRNA","snRNA","sRNA","TEC","vaultRNA","antisense","transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene","ncRNA","tRNA"} # Type: set (to access quickly) + +# Grouping of biotypes: +biotypes_group1={'protein_coding':['protein_coding'],'pseudogenes':['polymorphic_pseudogene',"transcribed_processed_pseudogene","transcribed_unitary_pseudogene","transcribed_unprocessed_pseudogene","translated_unprocessed_pseudogene","TR_J_pseudogene","TR_V_pseudogene","unitary_pseudogene","unprocessed_pseudogene","processed_pseudogene","IG_C_pseudogene","IG_J_pseudogene","IG_V_pseudogene","pseudogene"],'TR':['TR_C_gene','TR_D_gene','TR_J_gene','TR_V_gene'],'IG':['IG_C_gene','IG_D_gene','IG_J_gene','IG_V_gene'],\ + 'MT_RNA':["Mt_rRNA","Mt_tRNA"],\ + 'ncRNA':["lincRNA","macro_lncRNA","3prime_overlapping_ncrna","ncRNA"],\ + "others":["misc_RNA","processed_transcript","ribozyme","scaRNA","sense_intronic","sense_overlapping","TEC","vaultRNA"], + "antisense":["antisense"]} +for biot in ["miRNA","snoRNA","snRNA","rRNA","sRNA","tRNA"]: + biotypes_group1[biot]=[biot] + +# # Initializing the unkown features lits +unknown_feature = [] + +# Initializing the genome category counter dict +cpt_genome = {} + + +if process_counts : + #### If input files are the categories counts, just load them and continue to recategorization step + cpt,cpt_genome,samples_names = read_counts_files(options.counts) +else: + #### Create genome index if needed and get the sizes of categories + if make_index : + #### Get the chromosome lengths + lengths = get_chromosome_lengths(options) + # Generating the genome index files if the user didn't provide them + create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, lengths) + + + #print '\nChr lengths:', lengths + +if intersect_reads: + # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths + if not make_index : + print "\n### Reading genome indexes\n...\r", + sys.stdout.flush() + lengths={} + with open(genome_index, 'r') as genome_index_file: + for line in genome_index_file: + if line[0] == "#": + lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1]) + else : + add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2], biotype_prios = None, categ_prios = prios) + + #### Computing the genome intergenic count: sum of the chr lengths minus sum of the genome annotated intervals + cpt_genome[('intergenic','intergenic')] = sum(lengths.itervalues()) - sum([v for x,v in cpt_genome.iteritems() if x != ('antisense','antisense')]) + if not make_index : + print "Done!" + #print '\nGenome category counts:' + #for key,val in cpt_genome.iteritems(): + #print key,"\t",val + + + #### Create the Bedgraph files if needed and get the files list + + if not options.bedgraph: + # Generating the BEDGRAPH files is the user provided BAM file(s) and get the samples labels (this names will be used in the plot legend) + samples_files, samples_names = create_bedgraph_files(options.input,options.strandness[0]) + else: + # Just initialize the files list with the bedgraph paths + samples_files = [options.input[i] for i in range(0,len(options.input),2)] + # and get the labels + samples_names = [options.input[i] for i in range(1,len(options.input),2)] + #### Retrieving chromosome names saved in index + chrom_list = get_chromosome_names_in_index(genome_index) + #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and count the number of aligned positions in each category + cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files,samples_names,prios,genome_index, options.strandness[0], biotype_prios = None) + + #### Write the counts on disk + write_counts_in_files(cpt,cpt_genome) + +if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False): + quit("\n### End of program") +print "\n### Generating plots" +# Updating the biotypes lists (biotypes and 'biotype_group1'): adding the 'unknow biotypes' found in gtf/index +if unknown_feature == []: # 'unknown_feature' is define only during the index generation + # Browse the feature to determine whether some biotypes are 'unknown' + for sample,counts in cpt.items(): + for (cat,biot) in counts: + if biot not in biotypes and cat not in unknown_feature: + unknown_feature.append(biot) +for new_biot in unknown_feature: + biotypes.add(new_biot) + biotypes_group1["others"].append(new_biot) +biotypes = sorted(biotypes) +# move antisense categ to the end of the list +biotypes.remove('antisense') +biotypes.append('antisense') +biotypes_group1 = sorted(biotypes_group1) + + +#print '\nCounts for every category/biotype pair: ',cpt + +# Generating plots +if options.pdf != False: + if options.pdf == None: + options.pdf = "categories_plots.pdf" + pdf = PdfPages(options.pdf) +else: + pdf = False + +selected_biotype = None +if options.biotype_filter: + options.biotype_filter = options.biotype_filter[0] + for sample in cpt: + for feature in cpt[sample]: + biotype = feature[1] + if options.biotype_filter.lower() == biotype.lower(): + selected_biotype=biotype + break + if selected_biotype == None : + print "\nError: biotype '"+options.biotype_filter+"' not found. Please check the biotype name and that this biotype exists in your sample(s)." + sys.exit() + +#Print a warning message if the UTRs are not specified as 5' or 3' (they will be ploted as 5'UTR) +if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]: + print '''\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n''' + +#### Make the plot by categories + #### Recategorizing with the final categories +final_cats=categs_groups[options.categories_depth-1] +final_cat_cpt,final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt,cpt_genome,final_cats,selected_biotype) + #### Display the distribution of specified categories (or biotypes) in samples on a barplot +# Remove the 'antisense' category if the library type is 'unstranded' +for dic in cpt.values(): + if ('antisense','antisense') in dic.keys(): break +else: + cat_list.remove('antisense') +make_plot(cat_list,samples_names,final_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold, svg = options.svg, png = options.png) +if selected_biotype : + make_plot(cat_list,samples_names,filtered_cat_cpt,final_genome_cpt,pdf, "categories",options.threshold,title="Categories distribution for '"+selected_biotype+"' biotype", svg = options.svg, png = options.png) + +#### Make the plot by biotypes + #### Recategorizing with the final categories +final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes) + #### Display the distribution of specified categories (or biotypes) in samples on a barplot +make_plot(biotypes,samples_names,final_cat_cpt,final_genome_cpt,pdf, "biotypes",options.threshold, svg = options.svg, png = options.png) + + + + ##### Recategorizing with the final categories +#final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1) + ##### Display the distribution of specified categories (or biotypes) in samples on a barplot +#make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png) + + +if options.pdf: + pdf.close() + print "\n### Plots saved in pdf file: %s" %options.pdf + +print "\n### End of program" \ No newline at end of file diff -r 1e508bfd35c5 -r 950982dda2aa alfa/ALFA.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/ALFA.xml Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,329 @@ + + - Plot the distribution of genomic features in your aligned reads + + + + bedtools + samtools + matplotlib + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + +
+ +
+ + + + + + + + + + + + + + + + + + + +
+
+ + + + + outputFiles['plot'] is True and outputOptions['plotFormat'] == 'pdf' + + + outputFiles['plot'] is True and outputOptions['plotFormat'] == 'png' + + + outputFiles['plot'] is True and outputOptions['plotFormat'] == 'png' + + + outputFiles['plot'] is True and outputOptions['plotFormat'] == 'svg' + + + outputFiles['plot'] is True and outputOptions['plotFormat'] == 'svg' + + + outputFiles['countFile'] is True + + + outputFiles['index'] is True + + + outputFiles['index'] is True + + + + + + +
+ + + + +
+
+ + + + + + + + +
+
+ + + +
+
+ + + + + +
+ + + + + + + +
+
+ + + + + + + @MISC{ + author="Benoît Noël and Mathieu Bahin" + title="ALFA: Annotation Landscape For Aligned reads" + crossref="https://github.com/biocompibens/ALFA" + institution="Institut de Biologie de l'Ecole Normale Supérieure de Paris" + } + + +
diff -r 1e508bfd35c5 -r 950982dda2aa alfa/ALFA_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/ALFA_wrapper.py Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,183 @@ +#!/usr/bin/python + +import argparse +import logging +import os +import re +import shutil +import subprocess +import sys +import tempfile + +def exit_and_explain(msg): + logging.critical(msg) + sys.exit(msg) + +def cleanup_before_exit(tmp_dir): + if tmp_dir and os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + +def get_arg(): + parser = argparse.ArgumentParser() + parser.add_argument('--project_name', dest='project_name', action='store', nargs=1, metavar='project_name', type=str) + #Input 1: Annotation File + parser.add_argument('--index', dest='indexes', action='store', nargs=2, metavar=('stranded_index_filename', 'unstranded_index_filename'), type=str) + parser.add_argument('--bi_index', dest='bi_indexes', action='store', nargs=1, metavar='built_in_indexes_dir_path', type=str ) + parser.add_argument('--annotation', dest='annotation_file', action='store', nargs=1, metavar='annotation_gtf_file', type=str ) + #Input 2: Mapped Reads + parser.add_argument('--reads_format', dest='reads_format', action='store', nargs=1, choices=['bam', 'bedgraph'], metavar='reads_format', type=str) + parser.add_argument('--reads', dest='reads', action='store', nargs='+', metavar=('bam_file1 label1',""), type=str) + parser.add_argument('--strandness', dest='strandness', action='store', nargs=1, default=['unstranded'], choices=['unstranded', 'forward', 'reverse'], metavar='strandness', type=str) + #Output files + parser.add_argument('--output_pdf', dest='output_pdf', action='store', nargs=1, metavar='output_pdf_filename', type=str) + parser.add_argument('--output_svg', dest='output_svg', action='store', nargs=2, metavar=('categories_svg_filename', 'biotypes_svg_filename'), type=str) + parser.add_argument('--output_png', dest='output_png', action='store', nargs=2, metavar=('categories_png_filename', 'biotypes_png_filename'), type=str) + parser.add_argument('--output_count', dest='output_count', action='store', nargs=1, metavar='output_count_filename', type=str) + parser.add_argument('--output_index', dest='output_indexes', action='store', nargs=2, metavar=('output_stranded_index_filename', 'output_unstranded_index_filename'), type=str) + #Output Options + parser.add_argument('--categories_depth', dest='categories_depth', action='store', nargs=1, default=[3], choices=range(1,5), metavar='categories_depth', type=int) + parser.add_argument('--plot_format', dest='plot_format', action='store', nargs=1, choices=['pdf', 'png', 'svg'], metavar='plot_format', type=str) + parser.add_argument('--threshold', dest='threshold', action='store', nargs=2, metavar=('yMin', 'yMax'), type=float) + #Internal variables + parser.add_argument('--log_report', dest='log_report', action='store', nargs=1, metavar='log_filename', type=str) + parser.add_argument('--tool_dir', dest='GALAXY_TOOL_DIR', action='store', nargs=1, metavar='galaxy_tool_dir_path', type=str) + args = parser.parse_args() + return args + +def symlink_user_indexes(stranded_index, unstranded_index): + index='index' + os.symlink(stranded_index, index + '.stranded.index') + os.symlink(unstranded_index, index + '.unstranded.index') + return index + +def get_input2_args(reads_list, format): + n = len(reads_list) + if n%2 != 0: + exit_and_explain('Problem with pairing reads filename and reads label') + if format == 'bam': + input2_args = '--bam' + elif format == 'begraph': + input2_args = '--bedgraph' + input2_args='-i' + k = 0 + reads_filenames = [''] * (n/2) + reads_labels = [''] * (n/2) + for i in range(0, n, 2): + reads_filenames[k] = reads_list[i].split('__fname__')[1] + cur_label = reads_list[i+1].split('__label__')[1] + reads_labels[k] = re.sub(r' ', '_', cur_label) + if not reads_labels[k]: + reads_labels[k] = 'sample_%s' % str(k) + input2_args='%s "%s" "%s"' % (input2_args, reads_filenames[k], reads_labels[k]) + k += 1 + return input2_args, reads_filenames, reads_labels + +def redirect_errors(alfa_out, alfa_err): + # When the option --n is enabled, alfa prints '### End of the program' in stderr even if the process worked- + # The following lines to avoid the tool from crashing in this case + if alfa_err and not re.search('### End of program', alfa_err): + # When alfa prints '### End of program' in stdout, all the messages in stderr are considered + # as warnings and not as errors. + if re.search('### End of program', alfa_out): + logging.warning("The script ALFA.py encountered the following warning:\n\n%s" % alfa_err) + logging.info("\n******************************************************************\n") + # True errors make the script exits + else: + exit_and_explain("The script ALFA.py encountered the following error:\n\n%s" % alfa_err) + +def merge_count_files(reads_labels): + merged_count_file = open('count_file.txt', 'wb') + for i in range(0, len(reads_labels)): + current_count_file = open('%s.categories_counts' % reads_labels[i], 'r') + merged_count_file.write('##LABEL: %s\n\n' % reads_labels[i]) + merged_count_file.write(current_count_file.read()) + merged_count_file.write('__________________________________________________________________\n') + current_count_file.close() + merged_count_file.close() + return 'count_file.txt' + +def main(): + args = get_arg() + + if not (args.output_pdf or args.output_png or args.output_svg or args.output_indexes or args.output_count): + exit_and_explain('Error: no output to return\nProcess Aborted\n') + tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='') + logging.basicConfig(level=logging.INFO, filename=args.log_report[0], filemode="a+", format='%(message)s') + alfa_path = os.path.join(args.GALAXY_TOOL_DIR[0], 'ALFA.py') + + #INPUT1: Annotation File + if args.indexes: + # The indexes submitted by the user must exhibit the suffix '.(un)stranded.index' and will be called by alfa by their prefix + index = symlink_user_indexes(args.indexes[0], args.indexes[1]) + input1_args = '-g "%s"' % index + elif args.bi_indexes: + input1_args = '-g "%s"' % args.bi_indexes[0] + elif args.annotation_file: + input1_args = '-a "%s"' % args.annotation_file[0] + else: + exit_and_explain('No annotation file submitted !') + + #INPUT 2: Mapped Reads + if args.reads: + input2_args, reads_filenames, reads_labels = get_input2_args(args.reads, args.reads_format[0]) + strandness = '-s %s' % args.strandness[0] + else: + exit_and_explain('No reads submitted !') + + ##Output options + categories_depth = '-d %s' % args.categories_depth[0] + if not (args.output_pdf or args.output_png or args.output_svg): + output_args = '--n' + else: + if args.output_pdf: + output_args = '--pdf plot.pdf' + if args.output_png: + output_args = '--png plot' + if args.output_svg: + output_args = '--svg plot' + if args.threshold: + output_args = '%s -t %.3f %.3f' % (output_args, args.threshold[0], args.threshold[1]) + + ##Run alfa + cmd = 'python %s %s %s %s %s %s' % (alfa_path, input1_args, input2_args, strandness, categories_depth, output_args) + logging.info("__________________________________________________________________\n") + logging.info("Alfa execution") + logging.info("__________________________________________________________________\n") + logging.info("Command Line:\n%s\n" % cmd) + logging.info("------------------------------------------------------------------\n") + alfa_result = subprocess.Popen(args=cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + alfa_out, alfa_err = alfa_result.communicate() + + ##Handle stdout, warning, errors... + redirect_errors(alfa_out, alfa_err) + + logging.info("Alfa prompt:\n%s" % alfa_out) + + ##Redirect outputs + if args.output_pdf: + shutil.move('plot.pdf', args.output_pdf[0]) + if args.output_png: + shutil.move('plot' + '.categories.png', args.output_png[0]) + shutil.move('plot' + '.biotypes.png', args.output_png[1]) + if args.output_svg: + shutil.move('plot' + '.categories.svg', args.output_svg[0]) + shutil.move('plot' + '.biotypes.svg', args.output_svg[1]) + if args.output_count: + count_filename = merge_count_files(reads_labels) + shutil.move(count_filename, args.output_count[0]) + if args.output_indexes: + if args.annotation_file: + indexes_regex = re.compile('.*\.index') + indexes = filter(indexes_regex.search, os.listdir('.')) + indexes.sort() + shutil.move(indexes[0], args.output_indexes[0]) + shutil.move(indexes[1], args.output_indexes[1]) + if args.indexes: + shutil.move(index + '.stranded.index', args.output_indexes[0]) + shutil.move(index + '.unstranded.index', args.output_indexes[1]) + if args.bi_indexes: + shutil.move(args.bi_indexes[0] + '.stranded.index', args.output_index[0]) + shutil.move(args.bi_indexes[1] + '.unstranded.index', args.output_index[1]) + + cleanup_before_exit(tmp_dir) +main() diff -r 1e508bfd35c5 -r 950982dda2aa alfa/alfa/test-data/alfa/test-data/toto.txt --- a/alfa/alfa/test-data/alfa/test-data/toto.txt Thu Dec 21 09:58:48 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ -# example repository config (see "hg help config" for more info) -[paths] -default = https://charles-bernard@toolshed.g2.bx.psu.edu/repos/charles-bernard/alfa - -# path aliases to other clones of this repo in URLs or filesystem paths -# (see "hg help config.paths" for more info) -# -# default-push = ssh://jdoe@example.net/hg/jdoes-fork -# my-fork = ssh://jdoe@example.net/hg/jdoes-fork -# my-clone = /home/jdoe/jdoes-clone - -[ui] -# name and email (local to this repository, optional), e.g. -# username = charles-bernard diff -r 1e508bfd35c5 -r 950982dda2aa alfa/test-data/alfa_toy-Biofeatures Distribution.pdf Binary file alfa/test-data/alfa_toy-Biofeatures Distribution.pdf has changed diff -r 1e508bfd35c5 -r 950982dda2aa alfa/test-data/alfa_toy.bam Binary file alfa/test-data/alfa_toy.bam has changed diff -r 1e508bfd35c5 -r 950982dda2aa alfa/test-data/alfa_toy.bedgraph --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/test-data/alfa_toy.bedgraph Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,4 @@ +Chr1 149 199 2 +Chr1 299 349 1 +Chr1 499 549 6 +Chr1 1099 1149 1 diff -r 1e508bfd35c5 -r 950982dda2aa alfa/test-data/alfa_toy.categories_counts --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/test-data/alfa_toy.categories_counts Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,5 @@ +#Category,biotype Counts_in_bam Size_in_genome +CDS,protein_coding 300.0 624.0 +five_prime_utr,protein_coding 75.0 250.5 +three_prime_utr,protein_coding 25.0 126.5 +intergenic,intergenic 100.0 249.0 diff -r 1e508bfd35c5 -r 950982dda2aa alfa/test-data/alfa_toy.gtf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/test-data/alfa_toy.gtf Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,6 @@ +Chr1 ensembl_havana gene 250 1250 . + . gene_id "ENSMUSG00000051951"; gene_biotype "protein_coding"; +Chr1 ensembl_havana transcript 250 1250 . + . gene_id "ENSMUSG00000051951"; gene_biotype "protein_coding"; +Chr1 ensembl_havana exon 375 1000 . + . gene_id "ENSMUSG00000051951"; gene_biotype "protein_coding"; +Chr1 ensembl_havana CDS 375 1000 . + 0 gene_id "ENSMUSG00000051951"; gene_biotype "protein_coding"; +Chr1 ensembl_havana five_prime_utr 250 375 . - . gene_id "ENSMUSG00000051951"; gene_biotype "protein_coding"; +Chr1 ensembl_havana three_prime_utr 1000 1250 . - . gene_id "ENSMUSG00000051951"; gene_biotype "protein_coding"; diff -r 1e508bfd35c5 -r 950982dda2aa alfa/test-data/alfa_toy.stranded.index --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/test-data/alfa_toy.stranded.index Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,11 @@ +#Chr1 1250 +Chr1 249 374 + protein_coding:gene,transcript +Chr1 249 374 - protein_coding:five_prime_utr +Chr1 374 375 + protein_coding:exon,CDS +Chr1 374 375 - protein_coding:five_prime_utr,three_prime_utr +Chr1 375 999 + protein_coding:exon,CDS +Chr1 375 999 - antisense +Chr1 999 1000 + protein_coding:exon,CDS +Chr1 999 1000 - protein_coding:three_prime_utr +Chr1 1000 1250 + protein_coding:gene,transcript +Chr1 1000 1250 - protein_coding:five_prime_utr,three_prime_utr,exon,CDS diff -r 1e508bfd35c5 -r 950982dda2aa alfa/test-data/alfa_toy.unstranded.index --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/test-data/alfa_toy.unstranded.index Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,6 @@ +#Chr1 1250 +Chr1 249 374 . protein_coding:five_prime_utr,gene,transcript +Chr1 374 375 . protein_coding:five_prime_utr,three_prime_utr,exon,CDS +Chr1 375 999 . protein_coding:exon,CDS +Chr1 999 1000 . protein_coding:three_prime_utr,exon,CDS +Chr1 1000 1250 . protein_coding:five_prime_utr,exon,CDS,three_prime_utr,gene,transcript diff -r 1e508bfd35c5 -r 950982dda2aa alfa/tool-data/alfa_indexes.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/tool-data/alfa_indexes.loc.sample Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,2 @@ +# +#Dictyostelium_discoideum dicty_2 7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum: dicty_2 (release 7) diff -r 1e508bfd35c5 -r 950982dda2aa alfa/tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/tool_data_table_conf.xml.sample Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,7 @@ + + + + species, version, release, value, dbkey, name, prefix + +
+
diff -r 1e508bfd35c5 -r 950982dda2aa alfa/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alfa/tool_dependencies.xml Thu Dec 21 10:01:31 2017 -0500 @@ -0,0 +1,12 @@ + + + + + + + + + + + +