repository: biocomp-ibens / data_manager_build_alfa_indexes
changeset 0:f68a60c3d768 draft default tip
Uploaded
author    biocomp-ibens
date      Wed, 16 May 2018 09:56:43 -0400
parents   (none)
children  (none)
files     data_manager_build_alfa_indexes/.shed.yml
          data_manager_build_alfa_indexes/data_manager/ALFA.py
          data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py
          data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml
          data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes_testchr.py
          data_manager_build_alfa_indexes/data_manager/tool_dependencies.xml
          data_manager_build_alfa_indexes/data_manager_conf.xml
          data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample
          data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample
diffstat  9 files changed, 1785 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/.shed.yml	Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,13 @@
+categories:
+- Data Managers
+description: A tool to build ALFA indexes from an automatically downloaded GTF annotation file
+long_description: |
+  1. The tool asks the admin to enter a 'species_name' and automatically downloads the latest release of the corresponding GTF annotation file from Ensembl.
+  2. The tool calls ALFA to generate the ALFA indexes from this GTF file.
+  3. The resulting indexes are stored in the child directory 'alfa_indexes' of the directory <galaxy_data_manager_data_path> defined in config/galaxy.ini.
+  4. Finally, the tool adds the new entry to the table 'alfa_indexes.loc'. This .loc file is where the data table 'alfa_indexes' points, as defined in config/shed_tool_data_table.conf.xml.
+  5. At the end of the process, when a user runs ALFA (https://toolshed.g2.bx.psu.edu/view/charles-bernard/alfa), the built-in indexes corresponding to the 'species_name' will be available.
+name: data_manager_build_alfa_indexes
+owner: charles_bernard
+remote_repository_url: https://github.com/charles-bernard/Galaxy_tools/tree/master/data_manager_build_alfa_indexes
+type: unrestricted
\ No newline at end of file
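For orientation, the 'alfa_indexes.loc' entry mentioned in step 4 is a plain tab-separated line. The sketch below is hypothetical: the authoritative column layout lives in data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample (part of this changeset but not expanded on this page), and the field values simply mirror those built by get_data_table_new_entry() in the data manager script further down, with an invented human GRCh38 example:

    # assumed column order: value <TAB> dbkey <TAB> name <TAB> prefix
    Homo_sapiens_GRCh38_92	Homo_sapiens_GRCh38_92	Homo_sapiens: GRCh38 (release 92)	Homo_sapiens.GRCh38.92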
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/ALFA.py	Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,1160 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+__author__ = 'noel & bahin'
+''' <description> '''
+
+import argparse
+import os
+import numpy
+import sys
+import subprocess
+import matplotlib.pyplot as plt
+import matplotlib.cm as cmx
+import matplotlib.colors as colors
+import matplotlib.patheffects as PathEffects
+import re
+from matplotlib.backends.backend_pdf import PdfPages
+# To correctly embed the texts when saving plots in svg format
+import matplotlib
+matplotlib.rcParams['svg.fonttype'] = 'none'
+
+##########################################################################
+#                              FUNCTIONS                                 #
+##########################################################################
+
+def init_dict(d, key, init):
+    if key not in d:
+        d[key] = init
+
+def get_chromosome_lengths(args):
+    """
+    Parse the file containing the chromosome lengths.
+    If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes.
+    """
+    lengths = {}
+    gtf_chrom_names = set()
+    force_get_lengths = False
+    # If the user provided the chromosome lengths file
+    if args.chr_len:
+        with open(args.chr_len, 'r') as chr_len_file:
+            for line in chr_len_file:
+                lengths[line.split('\t')[0]] = int(line.rstrip().split('\t')[1])
+        with open(args.annotation, 'r') as gtf_file:
+            for line in gtf_file:
+                if not line.startswith('#'):
+                    chrom = line.split('\t')[0]
+                    if chrom not in gtf_chrom_names:
+                        gtf_chrom_names.add(chrom)
+        for chrom in lengths.keys():
+            if chrom not in gtf_chrom_names:
+                print "Warning: at least one chromosome name ('" + chrom + "') of the file '" + args.chr_len + "' does not match any chromosome name in the GTF and was ignored."
+                #del lengths[chrom]
+                break
+        for chrom in gtf_chrom_names:
+            if force_get_lengths:
+                break
+            if chrom not in lengths.keys():
+                print "WARNING: chromosome name '" + chrom + "' was found in the GTF and does not match any chromosome name provided in", args.chr_len + ". "
+                print "\t=> The chromosome lengths will be approximated using annotations in the GTF file."
+                continue_value = ""
+                while continue_value not in {"yes", "y", "no", "n"}:
+                    continue_value = raw_input("\tDo you want to continue ('yes' or 'y')?\n\tElse write 'no' or 'n' to exit the script and check your file of lengths.\n")
+                    if continue_value == "no" or continue_value == "n":
+                        sys.exit("Exiting")
+                    elif continue_value == "yes" or continue_value == "y":
+                        force_get_lengths = True
+                        break
+                    print "Error: use 'yes/y/no/n' only"
+        if not force_get_lengths:
+            return lengths
+    # Otherwise (or if at least one chromosome was missing in the chromosome lengths file), we consider the end of the last annotation of the chromosome in the GTF file as the chromosome length
+    with open(args.annotation, 'r') as gtf_file:
+        for line in gtf_file:
+            if not line.startswith('#'):
+                chrom = line.split('\t')[0]
+                end = int(line.split('\t')[4])
+                init_dict(lengths, chrom, 0)
+                lengths[chrom] = max(lengths[chrom], end)
+    if force_get_lengths:
+        print "The chromosome lengths have been approximated using the last annotations in the GTF file."
+    return lengths
+
+def write_feature_on_index(feat, chrom, start, stop, sign, stranded_genome_index, unstranded_genome_index=None):
+    grouped_by_biotype_features = []
+    for biotype, categs in feat.iteritems():
+        categ_list = []
+        for cat in set(categs):
+            categ_list.append(cat)
+        grouped_by_biotype_features.append(":".join((str(biotype), ",".join(categ_list))))
+    stranded_genome_index.write('\t'.join((chrom, start, stop, sign, '')) + '\t'.join(grouped_by_biotype_features) + '\n')
+    if unstranded_genome_index:
+        unstranded_genome_index.write('\t'.join((chrom, start, stop, '.', '')) + '\t'.join(grouped_by_biotype_features) + '\n')
+
+
+def add_info(cpt, feat_values, start, stop, chrom=None, unstranded_genome_index=None, stranded_genome_index=None, biotype_prios=None, coverage=1, categ_prios=None):
+    """
+    From an annotated genomic interval (start/stop positions and the associated feature: one or more category/biotype pair(s)):
+    - if an index file is provided, write the information at the end of the index file being generated;
+    - else, browse the features and update the counts of categories/biotypes found in the genome.
+    """
+    ## Writing in the file if it was provided
+    if stranded_genome_index:
+        # If only one strand has a feature, this feature will directly be written on the unstranded index
+        if feat_values[0] == {}:
+            # An interval with no feature corresponds to a region annotated only on the reverse strand: update the 'antisense' counts
+            stranded_genome_index.write('\t'.join((chrom, start, stop, '+', 'antisense\n')))
+            write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index, unstranded_genome_index)
+        else:
+            if feat_values[1] == {}:
+                write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index, unstranded_genome_index)
+                stranded_genome_index.write('\t'.join((chrom, start, stop, '-', 'antisense\n')))
+            # Else, the unstranded index should contain the union of the plus and minus features
+            else:
+                write_feature_on_index(feat_values[0], chrom, start, stop, '+', stranded_genome_index)
+                write_feature_on_index(feat_values[1], chrom, start, stop, '-', stranded_genome_index)
+                unstranded_feat = dict(feat_values[0], **feat_values[1])
+                for name in set(feat_values[0]) & set(feat_values[1]):
+                    unstranded_feat[name] += feat_values[0][name]
+                write_feature_on_index(unstranded_feat, chrom, start, stop, '.', unstranded_genome_index)
+
+    ## Increasing the category counter(s)
+    else:
+        # Default behavior if no biotype priorities: the category with the highest priority for each found biotype has the same weight (1/n_biotypes)
+        if not biotype_prios:
+            nb_feat = len(feat_values)
+            # For every category(ies)/biotype pair
+            for feat in feat_values:
+                cur_prio = 0
+                selected_categ = []
+                # Separate the category(ies) and the biotype
+                try:
+                    biot, cats = feat.split(":")
+                # If the feature equals "antisense": update the 'antisense/antisense' counts
+                except ValueError:
+                    try:
+                        cpt[(feat, feat)] += (int(stop) - int(start)) * coverage / float(nb_feat)
+                    except:
+                        cpt[(feat, feat)] = (int(stop) - int(start)) * coverage / float(nb_feat)
+                    return
+                # Browse the categories and keep only the one(s) with the highest priority
+                for cat in cats.split(','):
+                    try:
+                        prio = prios[cat]
+                    except:
+                        #TODO Find a way to add unknown categories
+                        if cat not in unknown_feature:
+                            print >> sys.stderr, "Warning: unknown category %s found and ignored\n...\r" % cat,
+                            unknown_feature.append(cat)
+                        continue
+                    if prio > cur_prio:
+                        cur_prio = prio
+                        selected_categ = [cat]
+                    if prio == cur_prio:
+                        if cat != selected_categ:
+                            try:
+                                if cat not in selected_categ:
+                                    selected_categ.append(cat)
+                            except TypeError:
+                                selected_categ = [selected_categ, cat]
+                # Increase each count by the coverage divided by the number of categories and biotypes
+                nb_cats = len(selected_categ)
+                for cat in selected_categ:
+                    try:
+                        cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
+                    except KeyError:
+                        cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_feat * nb_cats))
+                #else:
+                    #cpt[(cats,biot)] = (int(stop) - int(start)) / float(nb_feat) * coverage
+        # Else, apply the biotype selection according to the priority set
+        else:
+            #TODO Add an option to pass biotype priorities and handle it
+            pass
+
+def print_chrom(features_dict, chrom, stranded_index_file, unstranded_index_file, cpt_genome):
+    with open(unstranded_index_file, 'a') as findex, open(stranded_index_file, 'a') as fstrandedindex:
+        # Initialization of variables: start position of the first interval and associated features for the +/- strands
+        start = ""
+        for pos in sorted(features_dict['+'].keys()):
+            if start != "":
+                add_info(cpt_genome, [features_plus, features_minus], str(start), str(pos), chrom, stranded_genome_index=fstrandedindex, unstranded_genome_index=findex)
+            start = pos
+            features_plus = features_dict['+'][pos]
+            features_minus = features_dict['-'][pos]
+
+
+def create_genome_index(annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, chrom_sizes):
+    ''' Create an index of the genome annotations and save it in a file. '''
+    print '\n### Generating genome indexes\n',
+    sys.stdout.flush()
+    # Initializations
+    intervals_dict = 0
+    max_value = -1
+    prev_chrom = ''
+    reverse_strand = {'+': '-', '-': '+'}
+    i = 0  # Line counter
+    # Write the chromosome lengths as comment lines before the genome index
+    with open(unstranded_genome_index, 'w') as findex, open(stranded_genome_index, 'w') as fstrandedindex:
+        for key, value in chrom_sizes.items():
+            findex.write("#%s\t%s\n" % (key, value))
+            fstrandedindex.write("#%s\t%s\n" % (key, value))
+    # Running through the GTF file and writing into the genome index files
+    with open(annotation, 'r') as gtf_file:
+        for line in gtf_file:
+            # Print a progress message every X processed lines
+            i += 1
+            if i % 100000 == 0:
+                print >> sys.stderr, '\r%s lines processed...' % str(i)
+                print '\r \r. . .',
+                sys.stdout.flush()
+            elif i % 20000 == 0:
+                print '\r \r. . .',
+                sys.stdout.flush()
+            elif i % 2000 == 0:
+                print '.',
+                sys.stdout.flush()
+            # Processing all lines except comment ones
+            if not line.startswith('#'):
+                # Getting the line info
+                line_split = line[:-1].split('\t')
+                chrom = line_split[0]
+                cat = line_split[2]
+                start = int(line_split[3]) - 1
+                stop = int(line_split[4])
+                strand = line_split[6]
+                antisense = reverse_strand[strand]
+                biotype = line_split[8].split('biotype')[1].split(';')[0].strip('" ')
+                feat = [(cat, biotype)]
+                # Registering the chromosome info in the genome index file if this is a new chromosome or an annotation not overlapping previously recorded features
+                if start > max_value or chrom != prev_chrom:
+                    # Write the previous features
+                    if intervals_dict != 0:
+                        if chrom != prev_chrom:
+                            print_chrom(intervals_dict, prev_chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
+                            print "\rChromosome '" + prev_chrom + "' registered."
+                        else:
+                            print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
+                    prev_chrom = chrom
+                    # (Re)Initializing the chromosome lists
+                    intervals_dict = {strand: {start: {biotype: [cat]}, stop: {}}, antisense: {start: {}, stop: {}}}
+                    max_value = stop
+
+                # Update the dictionary which represents the intervals for every distinct annotation
+                else:
+                    # Get the intervals on the same strand as the current feature
+                    stranded_intervals = intervals_dict[strand]
+                    start_added = False
+                    for pos in sorted(stranded_intervals.keys()):
+                        #print pos
+                        #print stranded_intervals[pos]
+                        #print
+                        # While the position is below the feature's interval, store the features
+                        if pos < start:
+                            cur_cat_dict = dict(stranded_intervals[pos])
+                            cur_antisense_dict = dict(intervals_dict[antisense][pos])
+
+                        # If the start position already exists: update it by adding the new feature
+                        elif pos == start:
+                            cur_cat_dict = dict(stranded_intervals[pos])
+                            cur_antisense_dict = dict(intervals_dict[antisense][pos])
+                            #print "cur",cur_cat_dict
+                            try:
+                                stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat]
+                            except KeyError:
+                                stranded_intervals[pos][biotype] = [cat]
+                            start_added = True
+                            #print "cur",cur_cat_dict
+
+                        elif pos > start:
+                            # Create a new entry for the start position if necessary
+                            if not start_added:
+                                #print "cur",cur_cat_dict
+                                stranded_intervals[start] = dict(cur_cat_dict)
+                                stranded_intervals[start][biotype] = [cat]
+                                #stranded_intervals[pos][biotype].append(cat)
+                                intervals_dict[antisense][start] = cur_antisense_dict
+                                start_added = True
+                                #print "cur",cur_cat_dict
+                            # While the position is below the stop, just add the new feature
+                            if pos < stop:
+                                cur_cat_dict = dict(stranded_intervals[pos])
+                                cur_antisense_dict = dict(intervals_dict[antisense][pos])
+                                try:
+                                    stranded_intervals[pos][biotype] = stranded_intervals[pos][biotype] + [cat]
+                                except KeyError:
+                                    stranded_intervals[pos][biotype] = [cat]
+                            # Close the created interval: create an entry at the stop position and restore the features
+                            elif pos > stop:
+                                stranded_intervals[stop] = dict(cur_cat_dict)
+                                intervals_dict[antisense][stop] = cur_antisense_dict
+                                break
+                            else:
+                                break
+                    #try:
+                        #cur_cat_dict = list(stranded_intervals[pos][biotype])
+                    #except KeyError: cur_cat_dict = list()
+                    #print stranded_intervals[pos]
+                    #print
+                    # Extend the dictionary if necessary
+                    if stop > max_value:
+                        max_value = stop
+                        stranded_intervals[stop] = {}
+                        intervals_dict[antisense][stop] = {}
+                    #except KeyError:
+                        #print intervals_dict
+                        #quit()
+                        #intervals_dict[strand] = {strand:{start:{biotype:[cat]}, stop:{}},antisense:{start:{},stop:{}}}
+                        #continue
+                    #for sign in ['-','+']:
+                        #print sign
+                        #for key,val in sorted(intervals_dict[sign].iteritems()):
+                            #print key,'\t',val
+                        #print
+                    #print "-------\n"
+
+    # Store the categories of the last chromosome
+    print_chrom(intervals_dict, chrom, stranded_genome_index, unstranded_genome_index, cpt_genome)
+    print "\rChromosome '" + prev_chrom + "' registered.\nDone!"
+
+
+def create_bedgraph_files(bams, strandness):
+    samples_files = []
+    labels = []
+    print "\n### Generating the bedgraph files"
+    for n in range(0, len(bams), 2):
+        print "\rProcessing '%s'\n..." % bams[n],
+        sys.stdout.flush()
+        # Get the label for this sample
+        label = bams[n + 1]
+        # Modify it to contain only alphanumeric characters (avoid generating files with dangerous names)
+        modified_label = "_".join(re.findall(r"[\w']+", label))
+        if strandness in ["reverse", "fr-secondstrand"]:
+            subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
+            subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
+        elif strandness in ["forward", "fr-firststrand"]:
+            subprocess.call('bedtools genomecov -bg -split -strand + -ibam ' + bams[n] + ' > ' + modified_label + '.plus.bedgraph', shell=True)
+            subprocess.call('bedtools genomecov -bg -split -strand - -ibam ' + bams[n] + ' > ' + modified_label + '.minus.bedgraph', shell=True)
+        else:
+            subprocess.call('bedtools genomecov -bg -split -ibam ' + bams[n] + ' > ' + modified_label + '.bedgraph', shell=True)
+        samples_files.append(modified_label)
+        labels.append(label)
+    print "\rDone!"
+    return samples_files, labels
+
+def read_gtf(gtf_index_file, sign):
+    global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
+    strand = ""
+    while strand != sign:
+        gtf_line = gtf_index_file.readline()
+        # If the GTF file is finished
+        if not gtf_line:
+            endGTF = True
+            return endGTF
+        splitline = gtf_line.rstrip().split('\t')
+        try:
+            strand = splitline[3]
+        # The strand information cannot be found in the file header
+        except IndexError:
+            pass
+    gtf_chrom = splitline[0]
+    gtf_features = splitline[4:]
+    gtf_start, gtf_stop = map(int, splitline[1:3])
+    return endGTF
+
+def read_counts_files(counts_files):
+    cpt = {}
+    cpt_genome = {}
+    labels = []
+    for fcounts in counts_files:
+        label = os.path.splitext(os.path.basename(fcounts))[0]
+        labels.append(label)
+        cpt[label] = {}
+        with open(fcounts, "r") as f:
+            for line in f:
+                if line[0] == "#":
+                    continue
+                line_split = line[:-1].split('\t')
+                feature = tuple(line_split[0].split(','))
+                cpt[label][feature] = float(line_split[1])
+                cpt_genome[feature] = float(line_split[2])
+    return cpt, cpt_genome, labels
+
+
+def get_chromosome_names_in_index(genome_index):
+    chrom_list = []
+    with open(genome_index, 'r') as findex:
+        chrom = ""
+        for line in findex:
+            cur_chrom = line.split('\t')[0]
+            if cur_chrom == chrom:
+                pass
+            else:
+                chrom = cur_chrom
+                if chrom not in chrom_list:
+                    chrom_list.append(chrom)
+    return set(chrom_list)
+
+
+def intersect_bedgraphs_and_index_to_counts_categories(samples_files, samples_names, prios, genome_index, strandness, biotype_prios=None):
+    global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_cat, endGTF
+    print "\n### Intersecting files with indexes"
+    unknown_chrom = []
+    cpt = {}  # Counter for the nucleotides in the BAM input file(s)
+    for n in range(len(samples_files)):
+        sample_file = samples_files[n]
+        sample_name = samples_names[n]
+        # Initializing the category counter dict for this sample
+        init_dict(cpt, sample_name, {})
+        if strandness == "unstranded":
+            strands = [("", ".")]
+        else:
+            strands = [('.plus', '+'), ('.minus', '-')]
+
+        # Intersecting the BEDGRAPH and genome index files
+        print "\rProcessing '%s'\n. . ." % sample_file,
+        sys.stdout.flush()
+
+        for strand, sign in strands:
+            prev_chrom = ''
+            endGTF = False  # Whether we reached the next chromosome or the end of the GTF index
+            intergenic_adds = 0.0
+            i = 0
+            i_chgt = 0
+            with open(sample_file + strand + '.bedgraph', 'r') as bam_count_file:
+                # Running through the BEDGRAPH file
+                for bam_line in bam_count_file:
+                    i += 1
+                    if i % 10000 == 0:
+                        print ".",
+                        sys.stdout.flush()
+                    if i % 100000 == 0:
+                        print "\r \r. . .",
+                        sys.stdout.flush()
+                    # Getting the BAM line info
+                    bam_chrom = bam_line.split('\t')[0]
+                    bam_start, bam_stop, bam_cpt = map(float, bam_line.split('\t')[1:4])
+                    # Skip the line if the chromosome is not in the index
+                    if bam_chrom not in chrom_list:
+                        if bam_chrom not in unknown_chrom:
+                            unknown_chrom.append(bam_chrom)
+                            print "\r \r Chromosome '" + bam_chrom + "' not found in index."
+                        continue
+                    # If this is a new chromosome (or the first one)
+                    if bam_chrom != prev_chrom:
+                        i_chgt = i
+                        intergenic_adds = 0.0
+                        # (Re)opening the GTF index and looking for the first line of the matching chromosome
+                        try:
+                            gtf_index_file.close()
+                        except UnboundLocalError:
+                            pass
+                        gtf_index_file = open(genome_index, 'r')
+                        endGTF = False
+                        read_gtf(gtf_index_file, sign)
+                        while bam_chrom != gtf_chrom:
+                            read_gtf(gtf_index_file, sign)
+                            if endGTF:
+                                break
+                        prev_chrom = bam_chrom
+
+                    # Looking for the first matching annotation in the GTF index
+                    while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop):
+                        read_gtf(gtf_index_file, sign)
+                        if gtf_chrom != bam_chrom:
+                            endGTF = True
+                    # Processing the BAM lines that come before the first GTF annotation, if there are any
+                    if bam_start < gtf_start:
+                        # Increase the 'intergenic' category counter with all or part of the BAM interval
+                        try:
+                            intergenic_adds += min(bam_stop, gtf_start) - bam_start
+                            cpt[sample_name][('intergenic', 'intergenic')] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt
+                        except KeyError:
+                            cpt[sample_name][('intergenic', 'intergenic')] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt
+                        # Go to the next line if the BAM interval was entirely upstream of the first GTF annotation; carry on with the remaining part otherwise
+                        if endGTF or (bam_stop <= gtf_start):
+                            continue
+                        else:
+                            bam_start = gtf_start
+
+                    # We can start the crossover
+                    while not endGTF:
+                        # Update the category counter
+                        add_info(cpt[sample_name], gtf_features, bam_start, min(bam_stop, gtf_stop), coverage=bam_cpt, categ_prios=prios)
+                        # Read the next GTF file line if the BAM line is not entirely covered
+                        if bam_stop > gtf_stop:
+                            # Update the BAM start pointer
+                            bam_start = gtf_stop
+                            endGTF = read_gtf(gtf_index_file, sign)
+                            # If we read a new chromosome in the GTF file, then it is considered finished
+                            if bam_chrom != gtf_chrom:
+                                endGTF = True
+                            if endGTF:
+                                break
+                        else:
+                            # Next line if the BAM line is entirely covered
+                            bam_start = bam_stop
+                            break
+
+                    # Processing the end of the BAM line if necessary
+                    if endGTF and (bam_stop > bam_start):
+                        try:
+                            cpt[sample_name][('intergenic', 'intergenic')] += (bam_stop - bam_start) * bam_cpt
+                        except KeyError:
+                            cpt[sample_name][('intergenic', 'intergenic')] = (bam_stop - bam_start) * bam_cpt
+            gtf_index_file.close()
+    print "\r \rDone!"
+    return cpt
+
+def write_counts_in_files(cpt, genome_counts):
+    for label, dico in cpt.items():
+        label = "_".join(re.findall(r"[\w']+", label))
+        with open(label + ".categories_counts", "w") as fout:
+            fout.write("#Category,biotype\tCounts_in_bam\tSize_in_genome\n")
+            for feature, counts in dico.items():
+                fout.write("%s\t%s\t%s\n" % (','.join(feature), counts, genome_counts[feature]))
+
+def recategorize_the_counts(cpt, cpt_genome, final):
+    final_cat_cpt = {}
+    final_genome_cpt = {}
+    for f in cpt:
+        #print "\nFinal categories for",f,"sample"
+        final_cat_cpt[f] = {}
+        for final_cat in final:
+            tot = 0
+            tot_genome = 0
+            for cat in final[final_cat]:
+                tot += cpt[f][cat]
+                tot_genome += cpt_genome[cat]
+            #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
+            #print '\t'.join((final_cat, str(tot)))
+            final_cat_cpt[f][final_cat] = tot
+            final_genome_cpt[final_cat] = tot_genome
+    return final_cat_cpt, final_genome_cpt
+
+def group_counts_by_categ(cpt, cpt_genome, final, selected_biotype):
+    final_cat_cpt = {}
+    final_genome_cpt = {}
+    filtered_cat_cpt = {}
+    for f in cpt:
+        final_cat_cpt[f] = {}
+        filtered_cat_cpt[f] = {}
+        for final_cat in final:
+            tot = 0
+            tot_filter = 0
+            tot_genome = 0
+            for cat in final[final_cat]:
+                for key, value in cpt[f].items():
+                    if key[0] == cat:
+                        tot += value
+                        tot_genome += cpt_genome[key]
+                        if key[1] == selected_biotype:
+                            tot_filter += value
+            #output_file.write('\t'.join((final_cat, str(tot))) + '\n')
+            #print '\t'.join((final_cat, str(tot)))
+            final_cat_cpt[f][final_cat] = tot
+            if tot_genome == 0:
+                final_genome_cpt[final_cat] = 1e-100
+            else:
+                final_genome_cpt[final_cat] = tot_genome
+            filtered_cat_cpt[f][final_cat] = tot_filter
+    #if 'antisense' in final_genome_cpt: final_genome_cpt['antisense'] = 0
+    return final_cat_cpt, final_genome_cpt, filtered_cat_cpt
+
+def group_counts_by_biotype(cpt, cpt_genome, biotypes):
+    final_cpt = {}
+    final_genome_cpt = {}
+    for f in cpt:
+        final_cpt[f] = {}
+        for biot in biotypes:
+            tot = 0
+            tot_genome = 0
+            try:
+                for final_biot in biotypes[biot]:
+                    for key, value in cpt[f].items():
+                        if key[1] == final_biot:
+                            tot += value
+                            #if key[1] != 'antisense':
+                            tot_genome += cpt_genome[key]
+            except:
+                for key, value in cpt[f].items():
+                    if key[1] == biot:
+                        tot += value
+                        tot_genome += cpt_genome[key]
+            if tot != 0:
+                final_cpt[f][biot] = tot
+                final_genome_cpt[biot] = tot_genome
+    return final_cpt, final_genome_cpt
+
+#def get_cmap(N):
+    #'''Returns a function that maps each index in 0, 1, ... N-1 to a distinct
+    #RGB color.'''
+    #color_norm = colors.Normalize(vmin=0, vmax=N-1)
+    #scalar_map = cmx.ScalarMappable(norm=color_norm, cmap='hsv')
+    #def map_index_to_rgb_color(index):
+        #return scalar_map.to_rgba(index)
+    #return map_index_to_rgb_color
+
+def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type, title):
+    ### Initialization
+    fig = plt.figure(figsize=(13, 9))
+    ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)
+    ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=2)
+    cmap = plt.get_cmap('Spectral')
+    cols = [cmap(x) for x in xrange(0, 256, 256 / n_cat)]
+    if title:
+        ax1.set_title(title + " in: %s" % samples_names[0])
+    else:
+        ax1.set_title(counts_type + " distribution in mapped reads in: %s" % samples_names[0])
+    ax2.set_title('Normalized counts of ' + counts_type)
+
+    ### Barplots
+    # First barplot: percentage of reads in each category
+    ax1.bar(index, percentages, bar_width, color=cols)
+    # Second barplot: enrichment relative to the genome for each category
+    # (the read count in a category is divided by the category size in the genome)
+    ax2.bar(index_enrichment, enrichment, bar_width, color=cols)
+    ### Piecharts
+    pielabels = [ordered_categs[i] if percentages[i] > 0.025 else '' for i in xrange(n_cat)]
+    sum_enrichment = numpy.sum(enrichment)
+    pielabels_enrichment = [ordered_categs[i] if enrichment[i] / sum_enrichment > 0.025 else '' for i in xrange(n_cat)]
+    # Categories piechart
+    ax3 = plt.subplot2grid((2, 4), (0, 2))
+    pie_wedge_collection, texts = ax3.pie(percentages, labels=pielabels, shadow=True, colors=cols)
+    # Enrichment piechart
+    ax4 = plt.subplot2grid((2, 4), (1, 2))
+    pie_wedge_collection, texts = ax4.pie(enrichment, labels=pielabels_enrichment, shadow=True, colors=cols)
+    # Add the legends (append percentages to the labels)
+    labels = [" ".join((ordered_categs[i], "({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
+    ax3.legend(pie_wedge_collection, labels, loc='center', fancybox=True, shadow=True, prop={'size': 'medium'}, bbox_to_anchor=(1.7, 0.5))
+    labels = [" ".join((ordered_categs[i], "({:.1%})".format(enrichment[i] / sum_enrichment))) for i in range(len(ordered_categs))]  # if ordered_categs[i] != 'antisense']
+    ax4.legend(pie_wedge_collection, labels, loc='center', fancybox=True, shadow=True, prop={'size': 'medium'}, bbox_to_anchor=(1.7, 0.5))
+    # Set the aspect ratio to be equal so that each pie is drawn as a circle
+    ax3.set_aspect('equal')
+    ax4.set_aspect('equal')
+    return fig, ax1, ax2
+
+def make_plot(ordered_categs, samples_names, categ_counts, genome_counts, pdf, counts_type, threshold, title=None, svg=None, png=None):
+    # From ordered_categs, keep only the features (categories or biotypes) that are found in at least one sample.
+    existing_categs = set()
+    for sample in categ_counts.values():
+        existing_categs |= set(sample.keys())
+    ordered_categs = filter(existing_categs.__contains__, ordered_categs)
+    n_cat = len(ordered_categs)
+    n_exp = len(samples_names)
+    ## Initialization of the matrix of counts (nrow = number of experiments, ncol = number of categories)
+    counts = numpy.matrix(numpy.zeros(shape=(n_exp, n_cat)))
+    for exp in xrange(len(samples_names)):
+        for cat in xrange(len(ordered_categs)):
+            try:
+                counts[exp, cat] = categ_counts[samples_names[exp]][ordered_categs[cat]]
+            except:
+                pass
+
+    ## Normalize the category sizes by the total size to get percentages
+    sizes = []
+    sizes_sum = 0
+    for cat in ordered_categs:
+        sizes.append(genome_counts[cat])
+        sizes_sum += genome_counts[cat]
+    if 'antisense' in ordered_categs:
+        antisense_pos = ordered_categs.index('antisense')
+        sizes[antisense_pos] = 1e-100
+    for cpt in xrange(len(sizes)):
+        sizes[cpt] /= float(sizes_sum)
+
+    ## Create an array which contains the percentage of reads in each category for every sample
+    percentages = numpy.array(counts / numpy.sum(counts, axis=1))
+    ## Create the enrichment array (counts divided by the category sizes in the genome)
+    enrichment = numpy.array(percentages / sizes)
+    if 'antisense_pos' in locals():
+        for i in xrange(len(samples_names)):
+            enrichment[i][antisense_pos] = 0
+    #enrichment=numpy.log(numpy.array(percentages/sizes))
+    for exp in xrange(n_exp):
+        for i in xrange(n_cat):
+            val = enrichment[exp][i]
+            if val > 1:
+                enrichment[exp][i] = val - 1
+            elif val == 1 or val == 0:
+                enrichment[exp][i] = 0
+            else:
+                enrichment[exp][i] = -1 / val + 1
+
+    #### Finally, produce the plot
+
+    ## Get the colors from the colormap
+    ncolor = 16
+    cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5", "#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
+    if n_exp > ncolor:
+        cmap = plt.get_cmap('Set3', n_exp)
+        cmap = [cmap(i) for i in xrange(n_exp)]
+
+    ## Parameters for the plot
+    opacity = 1
+    # Create a vector which contains the position of each bar
+    index = numpy.arange(n_cat)
+    # Size of the bars (depends on the number of categories)
+    bar_width = 0.9 / n_exp
+
+    ## Initialise the subplot
+    # If there is only one sample, also plot piecharts
+    #if n_exp == 1 and counts_type.lower() == 'categories':
+        #fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
+    ## If more than one sample
+    #else:
+    fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * n_exp) / 3, 10))
+    # Store the bar objects for the enrichment plot
+    rects = []
+    # For each sample/experiment
+    for i in range(n_exp):
+        # First barplot: percentage of reads in each category
+        ax1.bar(index + i * bar_width, percentages[i], bar_width,
+                alpha=opacity,
+                color=cmap[i],
+                label=samples_names[i], edgecolor='#FFFFFF', lw=0)
+        # Second barplot: enrichment relative to the genome for each category
+        # (the read count in a category is divided by the category size in the genome)
+        rects.append(ax2.bar(index + i * bar_width, enrichment[i], bar_width,
+                             alpha=opacity,
+                             color=cmap[i],
+                             label=samples_names[i], edgecolor=cmap[i], lw=0))
+
+    ## Graphical options for the plot
+    # Adding the legend
+    ax1.legend(loc='best', frameon=False)
+    #ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
+    # Main titles
+    if title:
+        ax1.set_title(title)
+    else:
+        ax1.set_title(counts_type + " distribution in mapped reads")
+    ax2.set_title('Normalized counts of ' + counts_type)
+
+    # Adding the enrichment baseline
+    #ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
+    # Axes limits
+    ax1.set_xlim(-0.1, len(ordered_categs) + 0.1)
+    if len(sizes) == 1:
+        ax1.set_xlim(-2, 3)
+    ax2.set_xlim(ax1.get_xlim())
+    # Set the axis limits (max/min values + 5% margin)
+    ax2_ymin = []
+    ax2_ymax = []
+    for sample_values in enrichment:
+        ax2_ymin.append(min(sample_values))
+        ax2_ymax.append(max(sample_values))
+    ax2_ymax = max(ax2_ymax)
+    ax2_ymin = min(ax2_ymin)
+    margin_top, margin_bottom = (abs(0.05 * ax2_ymax), abs(0.05 * ax2_ymin))
+    ax1.set_ylim(0, ax1.get_ylim()[1] * 1.05)
+    if threshold:
+        threshold_bottom = -abs(float(threshold[0])) + 1
+        threshold_top = float(threshold[1]) - 1
+
+        for i in xrange(n_exp):
+            for y in xrange(n_cat):
+                val = enrichment[i][y]
+                if not numpy.isnan(val) and not (threshold_bottom < val < threshold_top):
+                    rect = rects[i][y]
+                    rect_height = rect.get_height()
+                    if rect.get_y() < 0:
+                        diff = rect_height + threshold_bottom
+                        rect.set_y(threshold_bottom)
+                        ax2_ymin = threshold_bottom
+                        margin_bottom = 0
+                    else:
+                        diff = rect_height - threshold_top
+                        ax2_ymax = threshold_top
+                        margin_top = 0
+                    rect.set_height(rect.get_height() - diff)
+    if margin_top != 0 and margin_bottom != 0:
+        margin_top, margin_bottom = [max(margin_top, margin_bottom) for i in xrange(2)]
+    ax2.set_ylim(ax2_ymin - margin_bottom, ax2_ymax + margin_top)
+    # Y axis titles
+    ax1.set_ylabel('Proportion of reads (%)')
+    ax2.set_ylabel('Enrichment relative to genome')
+    # X axis titles
+    ax1.set_xlabel(counts_type)
+    ax2.set_xlabel(counts_type)
+    # Add the categories on the x-axis
+    ax1.set_xticks(index + bar_width * n_exp / 2)
+    ax2.set_xticks(index + bar_width * n_exp / 2)
+    if counts_type.lower() != 'categories':
+        ax1.set_xticklabels(ordered_categs, rotation='30', ha='right')
+        ax2.set_xticklabels(ordered_categs, rotation='30', ha='right')
+    else:
+        ax1.set_xticklabels(ordered_categs)
+        ax2.set_xticklabels(ordered_categs)
+    # Display the fraction values as percentages
+    ax1.set_yticklabels([str(int(i * 100)) for i in ax1.get_yticks()])
+    # Correct the y-axis tick labels for the enrichment subplot
+    #ax2.set_yticklabels([str(i+1)+"$^{+1}$" if i>0 else 1 if i==0 else str(-(i-1))+"$^{-1}$" for i in ax2.get_yticks()])
+    yticks = list(ax2.get_yticks())
+    yticks = [yticks[i] - 1 if yticks[i] > 9 else yticks[i] + 1 if yticks[i] < -9 else yticks[i] for i in xrange(len(yticks))]
+    ax2.set_yticks(yticks)
+    ax2.set_yticklabels([str(int(i + 1)) + "$^{+1}$" if i > 0 and i % 1 == 0 else str(i + 1) + "$^{+1}$" if i > 0 else 1 if i == 0 else str(int(-(i - 1))) + "$^{-1}$" if i < 0 and i % 1 == 0 else str(-(i - 1)) + "$^{-1}$" for i in ax2.get_yticks()])
+    #ax2.set_yticklabels([i+1 if i>0 else 1 if i==0 else "$\\frac{1}{%s}$" %-(i-1) for i in ax2.get_yticks()])
+    # Change the appearance of the 'antisense' bars on the enrichment plot since we cannot compute an enrichment for this artificial category
+    if 'antisense_pos' in locals():  #ax2.text(antisense_pos+bar_width/2,ax2.get_ylim()[1]/10,'NA')
+        for i in xrange(n_exp):
+            rect = rects[i][antisense_pos]
+            rect.set_y(ax2.get_ylim()[0])
+            rect.set_height(ax2.get_ylim()[1] - ax2.get_ylim()[0])
+            rect.set_hatch('/')
+            rect.set_fill(False)
+            rect.set_linewidth(0)
+            #rect.set_color('lightgrey')
+            #rect.set_edgecolor('#EDEDED')
+            rect.set_color('#EDEDED')
+        ax2.text(index[antisense_pos] + bar_width * n_exp / 2 - 0.1, (ax2_ymax + ax2_ymin) / 2, 'NA')
+    # Add a text label for the features absent in a sample
+    for i in xrange(n_exp):
+        for y in xrange(n_cat):
+            if percentages[i][y] == 0:
+                txt = ax1.text(y + bar_width * (i + 0.5), 0.02, 'Absent in sample', rotation='vertical', color=cmap[i], horizontalalignment='center', verticalalignment='bottom')
+                txt.set_path_effects([PathEffects.Stroke(linewidth=0.5), PathEffects.Normal()])
+            elif enrichment[i][y] == 0:
+                rects[i][y].set_linewidth(1)
+
+    # Remove the top/right/bottom axes
+    for ax in [ax1, ax2]:
+        ax.spines['top'].set_visible(False)
+        ax.spines['right'].set_visible(False)
+        ax.spines['bottom'].set_visible(False)
+        ax.tick_params(axis='x', which='both', bottom='on', top='off', labelbottom='on')
+        ax.tick_params(axis='y', which='both', left='on', right='off', labelleft='on')
+
+    #ax1.legend(loc='center right', bbox_to_anchor=(1.2, 0),fancybox=True, shadow=True)
+    ## Showing the plot
+    plt.tight_layout()
+    fig.subplots_adjust(wspace=0.2)
+    if pdf:
+        pdf.savefig()
+        plt.close()
+    elif svg:
+        if svg == True:
+            plt.savefig(counts_type + '.svg')
+        else:
+            if os.path.splitext(svg)[1] == '.svg':
+                plt.savefig('.'.join((os.path.splitext(svg)[0], counts_type, 'svg')))
+            else:
+                plt.savefig('.'.join((svg, counts_type, 'svg')))
+    elif png:
+        if png == True:
+            plt.savefig(counts_type + '.png')
+        else:
+            if os.path.splitext(png)[1] == '.png':
+                plt.savefig('.'.join((os.path.splitext(png)[0], counts_type, 'png')))
+            else:
+                plt.savefig('.'.join((png, counts_type, 'png')))
+    else:
+        plt.show()
+    ## Save it on disk as a png image
+    #fig.savefig('image_output.png', dpi=300, format='png', bbox_extra_artists=(lgd,), bbox_inches='tight')
+
+
+def filter_categs_on_biotype(selected_biotype, cpt):
+    filtered_cpt = {}
+    for sample in cpt:
+        filtered_cpt[sample] = {}
+        for feature, count in cpt[sample].items():
+            if feature[1] == selected_biotype:
+                filtered_cpt[sample][feature[0]] = count
+    return filtered_cpt
+
+
+##########################################################################
+#                                 MAIN                                   #
+##########################################################################
+
+def usage_message(name=None):
+    return '''
+    Generate genome indexes:
+        python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
+                       [--chr_len CHR_LENGTHS_FILE]
+    Process BAM file(s):
+        python ALFA.py -g GENOME_INDEX -i BAM1 LABEL1 [BAM2 LABEL2 ...]
+                       [--bedgraph] [-s STRAND]
+                       [-n] [--pdf output.pdf]
+                       [-d {1,2,3,4}] [-t YMIN YMAX]
+    Index genome + process BAM:
+        python ALFA.py -a GTF_FILE [-g GENOME_INDEX]
+                       -i BAM1 LABEL1 [BAM2 LABEL2 ...]
+                       [--chr_len CHR_LENGTHS_FILE]
+                       [--bedgraph] [-s STRAND]
+                       [-n] [--pdf output.pdf]
+                       [-d {1,2,3,4}] [-t YMIN YMAX]
+
+    Process previously created ALFA counts file(s):
+        python ALFA.py -c COUNTS1 [COUNTS2 ...]
+                       [-s STRAND]
+                       [-n] [--pdf output.pdf]
+                       [-d {1,2,3,4}] [-t YMIN YMAX]
+
+    '''
+
+
+#### Parse the command line arguments and store them in 'options'
+parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, usage=usage_message())
+parser.add_argument('--version', action='version', version='version 1.0', help="show program's version number and exit\n\n-----------\n\n")
+# Index options
+parser.add_argument('-g', '--genome_index', help="Genome index files path and basename for an existing index, or path and basename for a new index creation\n\n")
+parser.add_argument('-a', '--annotation', metavar="GTF_FILE", help='Genomic annotation file (GTF format)\n\n')
+parser.add_argument('--chr_len', help='Tabulated file containing chromosome names and lengths\n\n-----------\n\n')
+
+# Options for the intersection step
+parser.add_argument('-i', '--input', '--bam', dest='input', metavar=('BAM_FILE1 LABEL1', ""), nargs='+', help='Input BAM file(s) and label(s). The BAM files must be sorted by position.\n\n')
+parser.add_argument('--bedgraph', action='store_const', default=False, const=True, help="Use this option if your input file(s) is(are) already in bedgraph format\n\n")
+parser.add_argument('-c', '--counts', metavar=('COUNTS_FILE', ""), nargs='+', help="Use this option instead of '-i/--input' to provide ALFA counts files as input \ninstead of bam/bedgraph files.\n\n")
+parser.add_argument('-s', '--strandness', dest="strandness", nargs=1, action='store', default=['unstranded'], choices=['unstranded', 'forward', 'reverse', 'fr-firststrand', 'fr-secondstrand'], metavar="", help="Library orientation. Choose within: 'unstranded', 'forward'/'fr-firststrand' \nor 'reverse'/'fr-secondstrand'. (Default: 'unstranded')\n\n-----------\n\n")
+
+# Plot options
+parser.add_argument('-biotype_filter', nargs=1, help=argparse.SUPPRESS)  # "Make an extra plot of the categories distribution using only counts of the specified biotype."
+parser.add_argument('-d', '--categories_depth', type=int, default=3, choices=range(1, 5), help="Use this option to set the hierarchical level that will be considered in the GTF file (default=3): \n(1) gene,intergenic; \n(2) intron,exon,intergenic; \n(3) 5'UTR,CDS,3'UTR,intron,intergenic; \n(4) start_codon,5'UTR,CDS,3'UTR,stop_codon,intron,intergenic. \n\n")
+parser.add_argument('--pdf', nargs='?', default=False, help="Save the produced plots in PDF format at the specified path ('categories_plots.pdf' if no argument provided)\n\n")
+parser.add_argument('--png', nargs='?', default=False, const=True, help="Save the produced plots in PNG format with the provided argument as basename \nor 'categories.png' and 'biotypes.png' if no argument provided\n\n")
+parser.add_argument('--svg', nargs='?', default=False, const=True, help="Save the produced plots in SVG format with the provided argument as basename \nor 'categories.svg' and 'biotypes.svg' if no argument provided\n\n")
+parser.add_argument('-n', '--no_plot', dest='quiet', action='store_const', default=False, const=True, help="Do not show the plots\n\n")
+parser.add_argument('-t', '--threshold', dest='threshold', nargs=2, metavar=("ymin", "ymax"), type=float, help="Set the axis limits for the enrichment plots\n\n")
+
+if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+
+options = parser.parse_args()
+
+def required_arg(arg, aliases):
+    if not arg:
+        print >> sys.stderr, "\nError: the %s argument is missing.\n" % aliases
+        parser.print_usage()
+        sys.exit()
+
+def check_files_extension(files):
+    return
+
+# Booleans for the steps to be executed
+make_index = False
+intersect_reads = False
+process_counts = False
+
+#### Check the arguments' conformity and define which steps have to be performed
+if options.counts:
+    # No other argument is required
+    # Check the input extension
+
+    # Action: only make the plots
+    process_counts = True
+else:
+    if options.annotation:
+        # Check whether -g was provided
+        if options.genome_index:
+            genome_index_basename = options.genome_index
+        else:
+            genome_index_basename = options.annotation.split("/")[-1].split(".gtf")[0]
+        # Check whether an index file already exists:
+        if os.path.isfile(genome_index_basename + ".stranded.index"):
+            if options.input:
+                print >> sys.stderr, "\nWarning: an index file named '%s' already exists and will be used. If you want to create a new index, please delete this file or specify another path." % (genome_index_basename + ".stranded.index")
+            else:
+                sys.exit("\nError: an index file named %s already exists. If you want to create a new index, please delete this file or specify another path.\n" % (genome_index_basename + ".stranded.index"))
+        # Otherwise -> action: the index has to be created
+        else:
+            make_index = True
+    # If the index does not have to be created:
+    if options.input:
+        # Required arguments: input, genome_index
+        if 'genome_index_basename' not in locals():
+            required_arg(options.genome_index, "-g/--genome_index")
+            genome_index_basename = options.genome_index
+        required_arg(options.input, "-i/--input/--bam")
+        for i in xrange(0, len(options.input), 2):
+            try:
+                extension = os.path.splitext(options.input[i + 1])[1]
+                if extension == ".bam" or extension == '.bedgraph' or extension == '.bg':
+                    sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format: -i Input_file1 Label1 [Input_file2 Label2 ...].")
+            except:
+                sys.exit("Error: it seems input files and associated labels are not correctly provided.\nMake sure to follow the expected format: -i Input_file1 Label1 [Input_file2 Label2 ...].")
+
+        intersect_reads = True
+        # Check the input's extension
+        #TODO
+if not (options.counts or options.input or options.annotation):
+    sys.exit("\nError: some arguments are missing. At least '-a', '-c' or '-i' is required. Please refer to the help (-h/--help) and usage cases for more details.\n")
+if not options.counts:
+    # Declare the genome_index variables
+    stranded_genome_index = genome_index_basename + ".stranded.index"
+    unstranded_genome_index = genome_index_basename + ".unstranded.index"
+    if options.strandness[0] == "unstranded":
+        genome_index = unstranded_genome_index
+    else:
+        genome_index = stranded_genome_index
+
+
+#### Initialization of some variables
+
+# Initializing the category priority order, the coding biotypes and the final list
+prios = {'start_codon': 7, 'stop_codon': 7, 'five_prime_utr': 6, 'three_prime_utr': 6, 'UTR': 6, 'CDS': 5, 'exon': 4, 'transcript': 3, 'gene': 2, 'antisense': 1, 'intergenic': 0}
+
+biotype_prios = None
+#biotype_prios = {"protein_coding":1, "miRNA":2}
+
+# Possible groups of categories to plot
+categs_group1 = {'start': ['start_codon'], '5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon'], '3UTR': ['three_prime_utr'], 'stop': ['stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
+categs_group2 = {'5UTR': ['five_prime_utr', 'UTR'], 'CDS': ['CDS', 'exon', 'start_codon', 'stop_codon'], '3UTR': ['three_prime_utr'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
+categs_group3 = {'exons': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon', 'start_codon', 'stop_codon'], 'introns': ['transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
+categs_group4 = {'gene': ['five_prime_utr', 'three_prime_utr', 'UTR', 'CDS', 'exon', 'start_codon', 'stop_codon', 'transcript', 'gene'], 'intergenic': ['intergenic'], 'antisense': ['antisense']}
+categs_groups = [categs_group4, categs_group3, categs_group2, categs_group1]  # Order and merging for the final plot
+cat_list = ['5UTR', 'start', 'CDS', 'stop', '3UTR', 'exons', 'introns', 'gene', 'intergenic', 'antisense']
+
+
+# Biotypes list
+biotypes = {'protein_coding', 'polymorphic_pseudogene', 'TR_C_gene', 'TR_D_gene', 'TR_J_gene', 'TR_V_gene', 'IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_V_gene', "3prime_overlapping_ncrna", "lincRNA", "macro_lncRNA", "miRNA", "misc_RNA", "Mt_rRNA", "Mt_tRNA", "processed_transcript", "ribozyme", "rRNA", "scaRNA", "sense_intronic", "sense_overlapping", "snoRNA", "snRNA", "sRNA", "TEC", "vaultRNA", "antisense", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene", "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene", "unitary_pseudogene", "unprocessed_pseudogene", "processed_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene", "pseudogene", "ncRNA", "tRNA"}  # Type: set (for fast access)
+
+# Grouping of biotypes:
+biotypes_group1 = {'protein_coding': ['protein_coding'],
+                   'pseudogenes': ['polymorphic_pseudogene', "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene", "translated_unprocessed_pseudogene", "TR_J_pseudogene", "TR_V_pseudogene", "unitary_pseudogene", "unprocessed_pseudogene", "processed_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", "IG_V_pseudogene", "pseudogene"],
+                   'TR': ['TR_C_gene', 'TR_D_gene', 'TR_J_gene', 'TR_V_gene'],
+                   'IG': ['IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_V_gene'],
+                   'MT_RNA': ["Mt_rRNA", "Mt_tRNA"],
+                   'ncRNA': ["lincRNA", "macro_lncRNA", "3prime_overlapping_ncrna", "ncRNA"],
+                   "others": ["misc_RNA", "processed_transcript", "ribozyme", "scaRNA", "sense_intronic", "sense_overlapping", "TEC", "vaultRNA"],
+                   "antisense": ["antisense"]}
+for biot in ["miRNA", "snoRNA", "snRNA", "rRNA", "sRNA", "tRNA"]:
+    biotypes_group1[biot] = [biot]
+
+# Initializing the unknown features list
+unknown_feature = []
+
+# Initializing the genome category counter dict
+cpt_genome = {}
+
+
+if process_counts:
+    #### If the input files are the categories counts, just load them and continue to the recategorization step
+    cpt, cpt_genome, samples_names = read_counts_files(options.counts)
+else:
+    #### Create the genome index if needed and get the sizes of the categories
+    if make_index:
+        #### Get the chromosome lengths
+        lengths = get_chromosome_lengths(options)
+        # Generating the genome index files if the user didn't provide them
+        create_genome_index(options.annotation, unstranded_genome_index, stranded_genome_index, cpt_genome, prios, biotypes, lengths)
+
+    #print '\nChr lengths:', lengths
+
+if intersect_reads:
+    # If the indexes already exist, read them to compute the sizes of the categories in the genome and retrieve the chromosome lengths
+    if not make_index:
+        print "\n### Reading genome indexes\n...\r",
+        sys.stdout.flush()
+        lengths = {}
+        with open(genome_index, 'r') as genome_index_file:
+            for line in genome_index_file:
+                if line[0] == "#":
+                    lengths[line.split('\t')[0][1:]] = int(line.split('\t')[1])
+                else:
+                    add_info(cpt_genome, line.rstrip().split('\t')[4:], line.split('\t')[1], line.split('\t')[2], biotype_prios=None, categ_prios=prios)
+
+    #### Computing the genome intergenic count: sum of the chromosome lengths minus the sum of the genome annotated intervals
+    cpt_genome[('intergenic', 'intergenic')] = sum(lengths.itervalues()) - sum([v for x, v in cpt_genome.iteritems() if x != ('antisense', 'antisense')])
+    if not make_index:
+        print "Done!"
+    #print '\nGenome category counts:'
+    #for key,val in cpt_genome.iteritems():
+        #print key,"\t",val
+
+
+    #### Create the bedgraph files if needed and get the files list
+
+    if not options.bedgraph:
+        # Generating the BEDGRAPH files if the user provided BAM file(s) and getting the sample labels (these names will be used in the plot legend)
+        samples_files, samples_names = create_bedgraph_files(options.input, options.strandness[0])
+    else:
+        # Just initialize the files list with the bedgraph paths
+        samples_files = [options.input[i] for i in range(0, len(options.input), 2)]
+        # and get the labels
+        samples_names = [options.input[i] for i in range(1, len(options.input), 2)]
+    #### Retrieving the chromosome names saved in the index
+    chrom_list = get_chromosome_names_in_index(genome_index)
+    #### Processing the BEDGRAPH files: intersecting the bedgraph with the genome index and counting the number of aligned positions in each category
+    cpt = intersect_bedgraphs_and_index_to_counts_categories(samples_files, samples_names, prios, genome_index, options.strandness[0], biotype_prios=None)
+
+    #### Write the counts on disk
+    write_counts_in_files(cpt, cpt_genome)
+
+if not (intersect_reads or process_counts) or (options.quiet and options.pdf == False):
+    quit("\n### End of program")
+print "\n### Generating plots"
+# Updating the biotypes lists ('biotypes' and 'biotypes_group1'): adding the 'unknown' biotypes found in the gtf/index
+if unknown_feature == []:  # 'unknown_feature' is defined only during the index generation
+    # Browse the features to determine whether some biotypes are 'unknown'
+    for sample, counts in cpt.items():
+        for (cat, biot) in counts:
+            if biot not in biotypes and cat not in unknown_feature:
+                unknown_feature.append(biot)
+for new_biot in unknown_feature:
+    biotypes.add(new_biot)
+    biotypes_group1["others"].append(new_biot)
+biotypes = sorted(biotypes)
+# Move the 'antisense' category to the end of the list
+biotypes.remove('antisense')
+biotypes.append('antisense')
+biotypes_group1 = sorted(biotypes_group1)
+
+
+#print '\nCounts for every category/biotype pair: ',cpt
+
+# Generating the plots
+if options.pdf != False:
+    if options.pdf == None:
+        options.pdf = "categories_plots.pdf"
+    pdf = PdfPages(options.pdf)
+else:
+    pdf = False
+
+selected_biotype = None
+if options.biotype_filter:
+    options.biotype_filter = options.biotype_filter[0]
+    for sample in cpt:
+        for feature in cpt[sample]:
+            biotype = feature[1]
+            if options.biotype_filter.lower() == biotype.lower():
+                selected_biotype = biotype
+                break
+    if selected_biotype == None:
+        print "\nError: biotype '" + options.biotype_filter + "' not found. Please check the biotype name and that this biotype exists in your sample(s)."
+        sys.exit()
+
+# Print a warning message if the UTRs are not specified as 5' or 3' (they will be plotted as 5'UTR)
+if 'UTR' in [categ[0] for counts in cpt.values() for categ in counts.keys()]:
+    print '''\nWARNING: (some) 5'UTR/3'UTR are not precisely defined. Consequently, positions annotated as "UTR" will be counted as "5'UTR"\n'''
+
+#### Make the plot by categories
+    #### Recategorizing with the final categories
+final_cats = categs_groups[options.categories_depth - 1]
+final_cat_cpt, final_genome_cpt, filtered_cat_cpt = group_counts_by_categ(cpt, cpt_genome, final_cats, selected_biotype)
+    #### Display the distribution of the specified categories (or biotypes) in the samples on a barplot
+# Remove the 'antisense' category if the library type is 'unstranded'
+for dic in cpt.values():
+    if ('antisense', 'antisense') in dic.keys():
+        break
+else:
+    cat_list.remove('antisense')
+make_plot(cat_list, samples_names, final_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, svg=options.svg, png=options.png)
+if selected_biotype:
+    make_plot(cat_list, samples_names, filtered_cat_cpt, final_genome_cpt, pdf, "categories", options.threshold, title="Categories distribution for '" + selected_biotype + "' biotype", svg=options.svg, png=options.png)
+
+#### Make the plot by biotypes
+    #### Recategorizing with the final categories
+final_cat_cpt, final_genome_cpt = group_counts_by_biotype(cpt, cpt_genome, biotypes)
+    #### Display the distribution of the specified categories (or biotypes) in the samples on a barplot
+make_plot(biotypes, samples_names, final_cat_cpt, final_genome_cpt, pdf, "biotypes", options.threshold, svg=options.svg, png=options.png)
+
+
+    ##### Recategorizing with the final categories
+#final_cat_cpt,final_genome_cpt = group_counts_by_biotype(cpt,cpt_genome,biotypes_group1)
+    ##### Display the distribution of the specified categories (or biotypes) in the samples on a barplot
+#make_plot(biotypes_group1,samples_names,final_cat_cpt,final_genome_cpt,pdf,"Biotype groups", options.threshold, title="Biotypes distribution in mapped reads \n(biotypes are grouped by 'family')", svg = options.svg, png = options.png)
+
+
+if options.pdf:
+    pdf.close()
+    print "\n### Plots saved in pdf file: %s" % options.pdf
+
+print "\n### End of program"
\ No newline at end of file
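As a reading aid for the ALFA.py diff above: create_genome_index() first writes '#chromosome<TAB>length' header lines, then write_feature_on_index() emits one tab-separated line per genomic interval (chromosome, start, stop, strand, followed by one 'biotype:category[,category...]' field per biotype, or the literal 'antisense' for regions annotated only on the opposite strand). A toy sketch of a stranded index, with invented coordinates and an Ensembl-style human annotation assumed:

    #1	248956422
    1	11868	12227	+	processed_transcript:exon	protein_coding:exon,transcript
    1	12227	12612	-	antisense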
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py Wed May 16 09:56:43 2018 -0400 @@ -0,0 +1,239 @@ +#!/usr/bin/python + +import sys +import shutil +import re +import urllib2 +import subprocess +import gzip +import os +import tempfile +from optparse import OptionParser +from galaxy.util.json import from_json_string, to_json_string + +def get_arg(): + parser = OptionParser() + parser.add_option("-e", "--ensembl", dest = 'ensembl_info', action = "store", nargs = 2, metavar = ("kingdom", "species_name"), type = "str") + parser.add_option("-o", "--output", dest='output_filename', action="store", nargs = 1, metavar = 'JSON_FILE') + parser.add_option("--log", dest='log_filename', action="store", nargs=1, metavar='log_report') + (options, args) = parser.parse_args() + return options, args + +def cleanup_before_exit(tmp_dir): + if tmp_dir and os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + +def get_page_content(url): + req = urllib2.Request(url) + page = urllib2.urlopen(req) + return page.read() + +def download_file(link, local_file_name): + req = urllib2.Request(link) + src_file = urllib2.urlopen(req) + local_file = open(local_file_name, 'wb') + local_file.write(src_file.read()) + local_file.close() + +def uncompress_gz(gz_file_name, uncompressed_file_name): + print("____________________________________________________________") + print("*** Uncompressing %s" % gz_file_name) + uncompressed_file = open(uncompressed_file_name, 'wb') + with gzip.open(gz_file_name, 'rb') as src_file: + uncompressed_file.write(src_file.read()) + uncompressed_file.close() + print("-> Uncompressed !\n") + +def add_data_table_entry( data_manager_dict, data_table_entry ): + data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) + data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry ) + return data_manager_dict + +def standardize_species_name(species_name): + # substitute all capital letters, replace every succession of chars that are not letters to one underscore + standard_species_name = re.sub(r'[)]$', '', species_name) + standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name) + return standard_species_name.lower() + +def get_ensembl_url_root(kingdom): + print("____________________________________________________________") + print("*** Determining Ensembl ftp root url") + if kingdom == 'vertebrates': + root = 'ftp://ftp.ensembl.org/pub/current_gtf/' + else: + root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom + print("-> Determined !\n") + return root + +def test_ensembl_species_exists(kingdom, url, species_name): + """ + Test if a species exist on the ftp & return the species name with the species_line if so. 
+    If the species_name matches a single string, that string is returned as the species name.
+    If the species_name matches several strings, an error is printed listing all the possible
+    species names to enter for a new run.
+    """
+    print("____________________________________________________________")
+    print("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
+    list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
+    if kingdom == 'vertebrates':
+        download_file(url, list_species_file_name)
+    else:
+        download_file(url + list_species_file_name, list_species_file_name)
+
+    grep_result = subprocess.Popen(['grep', species_name, list_species_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    species_lines_matched, grep_error = grep_result.communicate()
+    if grep_error is not None or species_lines_matched == "":
+        msg = 'The species \'%s\' is not referenced on Ensembl (%s)' % (species_name, kingdom)
+        sys.exit(msg)
+
+    species_lines = species_lines_matched.split('\n')
+    del species_lines[-1]
+    nb_lines = len(species_lines)
+
+    if nb_lines == 1:
+        if kingdom == 'vertebrates':
+            fields = species_lines[0].split(' ')
+            columns = fields[-1].split('\r')
+            found_species_name = columns[0]
+        else:
+            columns = species_lines[0].split('\t')
+            found_species_name = columns[1]
+        if species_name != found_species_name:
+            print('-> \'%s\' has been replaced with the complete species name \'%s\'' % (species_name, found_species_name))
+            return found_species_name, species_lines_matched
+        print("-> Referenced!\n")
+        return species_name, species_lines_matched
+    else:
+        list_species = [''] * nb_lines
+        for i in range(0, nb_lines):
+            if kingdom == 'vertebrates':
+                fields = species_lines[i].split(' ')
+                columns = fields[-1].split('\r')
+                list_species[i] = columns[0]
+            else:
+                columns = species_lines[i].split('\t')
+                list_species[i] = columns[1]
+            exact_match = re.search('^%s$' % species_name, list_species[i])
+            if exact_match:
+                print("-> Referenced!\n")
+                return species_name, species_lines[i]
+        msg = ("The string \'%s\' has been matched against the list of Ensembl species but is not a complete species name.\n"
+               "Please retry with one of the following species names:\n" % species_name)
+        for s in list_species:
+            msg = ("%s- %s\n" % (msg, s))
+        sys.exit(msg)
+
+def get_ensembl_collection(kingdom, species_line):
+    print("*** Extracting the %s_collection of the species" % kingdom)
+    collection_regex = re.compile('%s_.+_collection' % kingdom.lower())
+    collection_match = re.search(collection_regex, species_line)
+    if not collection_match:
+        print("-> Skipped: this species is not classified in an Ensembl %s collection\n" % kingdom)
+        return None
+    print("-> Extracted!\n")
+    return collection_match.group(0)
+
+def get_ensembl_gtf_archive_name(url_dir, species_name):
+    print("____________________________________________________________")
+    print("*** Extracting the gtf archive name of %s" % species_name)
+    gtf_archive_regex = re.compile('%s\..*\.[0-9]+\.gtf\.gz' % species_name, flags = re.IGNORECASE)
+    dir_content = get_page_content(url_dir)
+    gtf_archive_match = re.search(gtf_archive_regex, dir_content)
+    if not gtf_archive_match:
+        sys.exit('The species is referenced on Ensembl but a nomenclature error prevented the download of its gtf archive')
+    gtf_archive_name = gtf_archive_match.group(0)
+    print("-> Extracted!\n")
+    return gtf_archive_name
+
+def get_ensembl_gtf_archive(kingdom, url, species_name, species_line):
+    if kingdom != 'vertebrates':
+        url = url + 'gtf/'
+    if kingdom == 'bacteria' or kingdom == 'protists' or kingdom == 'fungi':
+        collection = get_ensembl_collection(kingdom, species_line)
+        if collection is not None:
+            url = url + "%s/" % collection
+    final_url = url + species_name + '/'
+    gtf_archive_name = get_ensembl_gtf_archive_name(final_url, species_name)
+    print("____________________________________________________________")
+    print("*** Downloading the gtf archive of %s" % species_name)
+    download_file(final_url + gtf_archive_name, gtf_archive_name)
+    print("-> Downloaded!\n")
+    return gtf_archive_name
+
+def generate_alfa_indexes(path_to_alfa, gtf_file_name):
+    print("____________________________________________________________")
+    print("*** Generating alfa indexes from %s" % gtf_file_name)
+    alfa_result = subprocess.Popen(['python', path_to_alfa, '-a', gtf_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    alfa_out, alfa_err = alfa_result.communicate()
+    if alfa_err is not None and not re.search('### End of program', alfa_err):
+        msg = 'Generation failed due to an ALFA error: %s' % alfa_err
+        sys.exit(msg)
+    print("Alfa prompt:\n%s" % alfa_out)
+    print("-> Generated!\n")
+
+def get_data_table_new_entry(gtf_archive_name):
+    # A gtf archive is named <species>.<version>.<release>.gtf.gz
+    info_list = gtf_archive_name.split('.')
+    species = info_list[0]
+    version = info_list[1]
+    release = info_list[2]
+    value = '%s_%s_%s' % (species, version, release)
+    dbkey = value
+    name = '%s: %s (release %s)' % (species, version, release)
+    prefix = '%s.%s.%s' % (species, version, release)
+    entry_dict = {'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix}
+    return entry_dict
+
+def main():
+    options, args = get_arg()
+    tool_dir = args[0]
+
+    path_to_alfa = os.path.join(tool_dir, 'ALFA.py')
+
+    if options.output_filename is None:
+        msg = 'No json output file specified'
+        sys.exit(msg)
+    output_filename = options.output_filename
+
+    # Interestingly, the output file to return is not empty initially:
+    # it contains a dictionary holding, notably, the path to the directory where
+    # the alfa indexes are expected to be stored
+    params = from_json_string(open(output_filename).read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.mkdir(target_directory)
+
+    tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='')
+    os.chdir(tmp_dir)
+
+    data_manager_dict = {}
+
+    if options.ensembl_info:
+        kingdom, species_name = options.ensembl_info
+        species_name = standardize_species_name(species_name)
+        url = get_ensembl_url_root(kingdom)
+        species_name, species_line = test_ensembl_species_exists(kingdom, url, species_name)
+        gtf_archive_name = get_ensembl_gtf_archive(kingdom, url, species_name, species_line)
+        data_table_entry = get_data_table_new_entry(gtf_archive_name)
+        gtf_file_name = '%s.gtf' % data_table_entry['prefix']
+        uncompress_gz(gtf_archive_name, gtf_file_name)
+        generate_alfa_indexes(path_to_alfa, gtf_file_name)
+        stranded_index_name = '%s.stranded.index' % data_table_entry['prefix']
+        unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
+        add_data_table_entry(data_manager_dict, data_table_entry)
+
+        print("____________________________________________________________")
+        print("*** General Info")
+        print("URL ROOT:\t%s" % url)
+        print("SPECIES:\t%s" % data_table_entry['species'])
+        print("VERSION:\t%s" % data_table_entry['version'])
+        print("RELEASE:\t%s" % data_table_entry['release'])
+        print("VALUE:\t%s" % data_table_entry['value'])
+        print("DBKEY:\t%s" % data_table_entry['dbkey'])
+        print("NAME:\t%s" % data_table_entry['name'])
+        print("PREFIX:\t%s" % data_table_entry['prefix'])
+
+        shutil.move(stranded_index_name, os.path.join(target_directory, stranded_index_name))
+        shutil.move(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))
+
+    cleanup_before_exit(tmp_dir)
+
+    open(output_filename, 'wb').write(to_json_string(data_manager_dict))
+
+main()
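For context, the JSON handshake in main() works in two steps: Galaxy pre-populates the output file with job parameters, and the data manager overwrites it with the table rows to add. A minimal sketch of the two payloads, assuming the field names read and written by the code above (the 'data_tables' layout is the one built explicitly in the testchr variant below); the concrete paths and values are purely illustrative:

    # What Galaxy writes into output_filename before the script runs:
    incoming = {
        "output_data": [
            {"extra_files_path": "/galaxy/database/jobs/000/42/dataset_42_files"}
        ]
    }
    # What the script writes back once the indexes are built:
    outgoing = {
        "data_tables": {
            "alfa_indexes": [
                {"species": "Homo_sapiens", "version": "GRCh38", "release": "92",
                 "value": "Homo_sapiens_GRCh38_92", "dbkey": "Homo_sapiens_GRCh38_92",
                 "name": "Homo_sapiens: GRCh38 (release 92)",
                 "prefix": "Homo_sapiens.GRCh38.92"}
            ]
        }
    }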
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml	Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,57 @@
+<tool id="build_alfa_indexes" name="ALFA indexes" version="0.0.1" tool_type="manage_data">
+    <description>build ALFA indexes from an automatically downloaded gtf annotation file</description>
+    <!-- ALFA requires the bedtools suite v2.20.0 and above -->
+    <requirements>
+        <requirement type="package" version="2.24">bedtools</requirement>
+        <requirement type="package" version="1.2">samtools</requirement>
+        <requirement type="package" version="1.4">matplotlib</requirement>
+    </requirements>
+
+    <command interpreter="python">data_manager_build_alfa_indexes.py -e "${reference_source['kingdom']}" "${reference_source['species_name']}" -o "${out_file}" "$__tool_directory__"</command>
+
+    <inputs>
+        <conditional name="reference_source">
+            <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
+                <option value="ensembl" selected="True">Ensembl Genomes Websites</option>
+            </param>
+            <when value="ensembl">
+                <param name="kingdom" type="select" label="Select the Ensembl website from which to fetch the genome">
+                    <option value="vertebrates" selected="True">Ensembl (Vertebrates)</option>
+                    <option value="bacteria">Ensembl Bacteria</option>
+                    <option value="fungi">Ensembl Fungi</option>
+                    <option value="metazoa">Ensembl Metazoa</option>
+                    <option value="plants">Ensembl Plants</option>
+                    <option value="protists">Ensembl Protists</option>
+                </param>
+                <param name="species_name" type="text" value="Homo sapiens" label="Complete Species Name" optional="False">
+                    <validator type="empty_field" message="Please enter a species name."/>
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data name="out_file" format="data_manager_json"/>
+    </outputs>
+
+    <help>
+<![CDATA[
+**What it does**
+
+ | 1. The tool asks the admin to enter a *species_name* and automatically downloads the latest release of the corresponding gtf annotation file from Ensembl.
+ |
+ | 2. The tool calls ALFA.py to generate the alfa indexes from this gtf file.
+ |
+ | 3. The resulting indexes are stored in the child directory *alfa_indexes/* of the dir *<galaxy_data_manager_data_path>* defined in *config/galaxy.ini*.
+ |
+ | 4. Finally, the tool adds the new entry to the table *alfa_indexes.loc*. This .loc file is where the data table *alfa_indexes* points, as defined in *config/shed_tool_data_table.conf.xml*.
+ |
+ | 5. At the end of the process, when a user runs `alfa`_, the built-in indexes corresponding to the *species_name* will be available.
+
+ .. _alfa: https://toolshed.g2.bx.psu.edu/view/charles-bernard/alfa
+]]>
+    </help>
+
+</tool>
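Once Galaxy renders the Cheetah variables in the <command> template above, the call reduces to an ordinary script invocation. An illustrative rendering (all paths hypothetical):

    python data_manager_build_alfa_indexes.py -e "vertebrates" "Homo sapiens" -o "/galaxy/database/files/dataset_42.dat" "/shed_tools/.../data_manager"

The kingdom and species name are the two values consumed by the script's -e/--ensembl option (declared with nargs = 2), and the trailing positional argument is the tool directory used to locate ALFA.py.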
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes_testchr.py	Wed May 16 09:56:43 2018 -0400
@@ -0,0 +1,272 @@
+#!/usr/bin/python
+
+import sys
+import shutil
+import re
+import urllib2
+import subprocess
+import gzip
+import os
+import tempfile
+from optparse import OptionParser
+from galaxy.util.json import from_json_string, to_json_string
+
+def get_arg():
+    parser = OptionParser()
+    parser.add_option("-e", "--ensembl", dest='ensembl_info', action="store", nargs=2, metavar=("kingdom", "species_name"), type="str")
+    parser.add_option("-o", "--output", dest='output_filename', action="store", nargs=1, metavar='JSON_FILE')
+    parser.add_option("--log", dest='log_filename', action="store", nargs=1, metavar='log_report')
+    (options, args) = parser.parse_args()
+    return options, args
+
+def cleanup_before_exit(tmp_dir):
+    if tmp_dir and os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir)
+
+def get_page_content(url):
+    req = urllib2.Request(url)
+    page = urllib2.urlopen(req)
+    return page.read()
+
+def download_file(link, local_file_name):
+    req = urllib2.Request(link)
+    src_file = urllib2.urlopen(req)
+    local_file = open(local_file_name, 'wb')
+    local_file.write(src_file.read())
+    local_file.close()
+
+def uncompress_gz(gz_file_name, uncompressed_file_name):
+    print("____________________________________________________________")
+    print("*** Uncompressing %s" % gz_file_name)
+    uncompressed_file = open(uncompressed_file_name, 'wb')
+    with gzip.open(gz_file_name, 'rb') as src_file:
+        uncompressed_file.write(src_file.read())
+    uncompressed_file.close()
+    print("-> Uncompressed!\n")
+
+def standardize_species_name(species_name):
+    # Strip a trailing closing parenthesis, then replace every run of
+    # separator characters with a single underscore and lower-case the result
+    standard_species_name = re.sub(r'[)]$', '', species_name)
+    standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name)
+    return standard_species_name.lower()
+
+def get_ensembl_url_root(kingdom):
+    print("____________________________________________________________")
+    print("*** Determining the Ensembl ftp root url")
+    if kingdom == 'vertebrates':
+        root = 'ftp://ftp.ensembl.org/pub/current_gtf/'
+    else:
+        root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom
+    print("-> Determined!\n")
+    return root
+
+def test_ensembl_species_exists(kingdom, url, species_name):
+    """
+    Test whether a species exists on the ftp and, if so, return the species name together with its species_line.
+    If the species_name matches a single string, that string is returned as the species name.
+    If the species_name matches several strings, an error is printed listing all the possible
+    species names to enter for a new run.
+    """
+    print("____________________________________________________________")
+    print("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
+    list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
+    if kingdom == 'vertebrates':
+        download_file(url, list_species_file_name)
+    else:
+        download_file(url + list_species_file_name, list_species_file_name)
+
+    grep_result = subprocess.Popen(['grep', species_name, list_species_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    species_lines_matched, grep_error = grep_result.communicate()
+    if grep_error is not None or species_lines_matched == "":
+        msg = 'The species \'%s\' is not referenced on Ensembl (%s)' % (species_name, kingdom)
+        sys.exit(msg)
+
+    species_lines = species_lines_matched.split('\n')
+    del species_lines[-1]
+    nb_lines = len(species_lines)
+
+    if nb_lines == 1:
+        if kingdom == 'vertebrates':
+            fields = species_lines[0].split(' ')
+            columns = fields[-1].split('\r')
+            found_species_name = columns[0]
+        else:
+            columns = species_lines[0].split('\t')
+            found_species_name = columns[1]
+        if species_name != found_species_name:
+            print('-> \'%s\' has been replaced with the complete species name \'%s\'' % (species_name, found_species_name))
+            return found_species_name, species_lines_matched
+        print("-> Referenced!\n")
+        return species_name, species_lines_matched
+    else:
+        list_species = [''] * nb_lines
+        for i in range(0, nb_lines):
+            if kingdom == 'vertebrates':
+                fields = species_lines[i].split(' ')
+                columns = fields[-1].split('\r')
+                list_species[i] = columns[0]
+            else:
+                columns = species_lines[i].split('\t')
+                list_species[i] = columns[1]
+            exact_match = re.search('^%s$' % species_name, list_species[i])
+            if exact_match:
+                print("-> Referenced!\n")
+                return species_name, species_lines[i]
+        msg = ("The string \'%s\' has been matched against the list of Ensembl species but is not a complete species name.\n"
+               "Please retry with one of the following species names:\n" % species_name)
+        for s in list_species:
+            msg = ("%s- %s\n" % (msg, s))
+        sys.exit(msg)
+
+def get_ensembl_collection(kingdom, species_line):
+    print("*** Extracting the %s_collection of the species" % kingdom)
+    collection_regex = re.compile('%s_.+_collection' % kingdom.lower())
+    collection_match = re.search(collection_regex, species_line)
+    if not collection_match:
+        print("-> Skipped: this species is not classified in an Ensembl %s collection\n" % kingdom)
+        return None
+    print("-> Extracted!\n")
+    return collection_match.group(0)
+
+def get_ensembl_gtf_archive_name(url_dir, species_name):
+    print("____________________________________________________________")
+    print("*** Extracting the gtf archive name of %s" % species_name)
+    gtf_archive_regex = re.compile('%s\..*\.[0-9]+\.gtf\.gz' % species_name, flags = re.IGNORECASE)
+    chr_gtf_archive_regex = re.compile('%s\..*\.[0-9]+\.chr\.gtf\.gz' % species_name, flags = re.IGNORECASE)
+    dir_content = get_page_content(url_dir)
+    gtf_archive_match = re.search(gtf_archive_regex, dir_content)
+    chr_gtf_archive_match = re.search(chr_gtf_archive_regex, dir_content)
+    if not gtf_archive_match:
+        sys.exit('The species is referenced on Ensembl but a nomenclature error prevented the download of its gtf archive')
+    if not chr_gtf_archive_match:
+        chr_gtf_archive_name = ""
+    else:
+        chr_gtf_archive_name = chr_gtf_archive_match.group(0)
+    gtf_archive_name = gtf_archive_match.group(0)
+    print("-> Extracted!\n")
+    return gtf_archive_name, chr_gtf_archive_name
+
+def get_ensembl_gtf_archive(kingdom, url, species_name, species_line):
+    if kingdom != 'vertebrates':
+        url = url + 'gtf/'
+    if kingdom == 'bacteria' or kingdom == 'protists' or kingdom == 'fungi':
+        collection = get_ensembl_collection(kingdom, species_line)
+        if collection is not None:
+            url = url + "%s/" % collection
+    final_url = url + species_name + '/'
+    gtf_archive_name, chr_gtf_archive_name = get_ensembl_gtf_archive_name(final_url, species_name)
+    print("____________________________________________________________")
+    print("*** Downloading the gtf archive of %s" % species_name)
+    download_file(final_url + gtf_archive_name, gtf_archive_name)
+    print("-> Downloaded!\n")
+    if chr_gtf_archive_name:
+        print("*** Downloading the chr gtf archive of %s" % species_name)
+        download_file(final_url + chr_gtf_archive_name, chr_gtf_archive_name)
+        print("-> Downloaded!\n")
+    return gtf_archive_name, chr_gtf_archive_name
+
+def generate_alfa_indexes(path_to_alfa, gtf_file_name):
+    print("____________________________________________________________")
+    print("*** Generating alfa indexes from %s" % gtf_file_name)
+    alfa_result = subprocess.Popen(['python', path_to_alfa, '-a', gtf_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    alfa_out, alfa_err = alfa_result.communicate()
+    if alfa_err is not None and not re.search('### End of program', alfa_err):
+        msg = 'Generation failed due to an ALFA error: %s' % alfa_err
+        sys.exit(msg)
+    print("Alfa prompt:\n%s" % alfa_out)
+    print("-> Generated!\n")
+
+def get_data_table_new_entry(gtf_archive_name):
+    # A gtf archive is named <species>.<version>.<release>.gtf.gz
+    info_list = gtf_archive_name.split('.')
+    species = info_list[0]
+    version = info_list[1]
+    release = info_list[2]
+    value = '%s_%s_%s' % (species, version, release)
+    dbkey = value
+    name = '%s: %s (release %s)' % (species, version, release)
+    prefix = '%s.%s.%s' % (species, version, release)
+    entry_dict = {'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix}
+    return entry_dict
+
+def chr_get_data_table_new_entry(chr_gtf_archive_name):
+    # A chr gtf archive is named <species>.<version>.<release>.chr.gtf.gz
+    info_list = chr_gtf_archive_name.split('.')
+    species = info_list[0]
+    version = info_list[1]
+    release = info_list[2]
+    value = '%s_%s_%s.chr' % (species, version, release)
+    dbkey = value
+    name = '%s: %s (release %s) - Chr' % (species, version, release)
+    prefix = '%s.%s.%s.chr' % (species, version, release)
+    entry_dict = {'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix}
+    return entry_dict
+
+def main():
+    options, args = get_arg()
+    tool_dir = args[0]
+
+    path_to_alfa = os.path.join(tool_dir, 'ALFA.py')
+
+    if options.output_filename is None:
+        msg = 'No json output file specified'
+        sys.exit(msg)
+    output_filename = options.output_filename
+
+    # Interestingly, the output file to return is not empty initially:
+    # it contains a dictionary holding, notably, the path to the directory where
+    # the alfa indexes are expected to be stored
+    params = from_json_string(open(output_filename).read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.mkdir(target_directory)
+
+    tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='')
+    os.chdir(tmp_dir)
+
+    data_manager_dict = {}
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get('alfa_indexes', [])
+
+    if options.ensembl_info:
+        kingdom, species_name = options.ensembl_info
+        species_name = standardize_species_name(species_name)
+        url = get_ensembl_url_root(kingdom)
+        species_name, species_line = test_ensembl_species_exists(kingdom, url, species_name)
+        gtf_archive_name, chr_gtf_archive_name = get_ensembl_gtf_archive(kingdom, url, species_name, species_line)
+        data_table_entry = get_data_table_new_entry(gtf_archive_name)
+        gtf_file_name = '%s.gtf' % data_table_entry['prefix']
+        uncompress_gz(gtf_archive_name, gtf_file_name)
+        generate_alfa_indexes(path_to_alfa, gtf_file_name)
+        stranded_index_name = '%s.stranded.index' % data_table_entry['prefix']
+        unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
+        data_manager_dict['data_tables']['alfa_indexes'].append(data_table_entry)
+        if chr_gtf_archive_name:
+            data_table_entry = chr_get_data_table_new_entry(chr_gtf_archive_name)
+            chr_gtf_file_name = '%s.gtf' % data_table_entry['prefix']
+            uncompress_gz(chr_gtf_archive_name, chr_gtf_file_name)
+            generate_alfa_indexes(path_to_alfa, chr_gtf_file_name)
+            chr_stranded_index_name = '%s.stranded.index' % data_table_entry['prefix']
+            chr_unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
+            data_manager_dict['data_tables']['alfa_indexes'].append(data_table_entry)
+
+        print("____________________________________________________________")
+        print("*** General Info")
+        print("URL ROOT:\t%s" % url)
+        print("SPECIES:\t%s" % data_table_entry['species'])
+        print("VERSION:\t%s" % data_table_entry['version'])
+        print("RELEASE:\t%s" % data_table_entry['release'])
+        print("VALUE:\t%s" % data_table_entry['value'])
+        print("DBKEY:\t%s" % data_table_entry['dbkey'])
+        print("NAME:\t%s" % data_table_entry['name'])
+        print("PREFIX:\t%s" % data_table_entry['prefix'])
+
+        shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name))
+        shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))
+
+        if chr_gtf_archive_name:
+            # Keep the '.chr' prefix in the destination file names so these
+            # indexes do not overwrite the genome-wide ones copied just above
+            shutil.copyfile(chr_stranded_index_name, os.path.join(target_directory, chr_stranded_index_name))
+            shutil.copyfile(chr_unstranded_index_name, os.path.join(target_directory, chr_unstranded_index_name))
+
+    cleanup_before_exit(tmp_dir)
+
+    open(output_filename, 'wb').write(to_json_string(data_manager_dict))
+
+main()
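To make the chr branch concrete: for a vertebrate release that ships both a genome-wide and a chromosome-only gtf archive, a successful run leaves four index files in the dataset's extra_files_path and appends two rows to the alfa_indexes table. An illustrative layout (names modelled on a hypothetical Homo sapiens release 92, purely for illustration):

    dataset_42_files/
        Homo_sapiens.GRCh38.92.stranded.index
        Homo_sapiens.GRCh38.92.unstranded.index
        Homo_sapiens.GRCh38.92.chr.stranded.index
        Homo_sapiens.GRCh38.92.chr.unstranded.index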
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/data_manager/tool_dependencies.xml Wed May 16 09:56:43 2018 -0400 @@ -0,0 +1,12 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="bedtools" version="2.24"> + <repository changeset_revision="3416a1d4a582" name="package_bedtools_2_24" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="samtools" version="1.2"> + <repository changeset_revision="f6ae3ba3f3c1" name="package_samtools_1_2" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> + <package name="matplotlib" version="1.4"> + <repository changeset_revision="f7424e1cf115" name="package_python_2_7_matplotlib_1_4" owner="iuc" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/data_manager_conf.xml Wed May 16 09:56:43 2018 -0400 @@ -0,0 +1,23 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/data_manager_build_alfa_indexes.xml" id="build_alfa_indexes"> + <data_table name="alfa_indexes"> + <output> + <column name="species" /> + <column name="version" /> + <column name="release" /> + <column name="value" /> + <column name="dbkey" /> + <column name="name" /> + <column name="prefix" output_ref="out_file"> + <move type="directory"> + <!-- <source>{prefix}</source> - out_file.extra_files_path is used as base by default --> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">alfa_indexes/${dbkey}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/alfa_indexes/${dbkey}/${species}.${version}.${release}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
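Putting the <move> target and the two <value_translation> steps above together, for a hypothetical dbkey Homo_sapiens_GRCh38_92 the index files are moved from the dataset's extra_files_path to

    <galaxy_data_manager_data_path>/alfa_indexes/Homo_sapiens_GRCh38_92/

and the prefix column recorded in alfa_indexes.loc resolves to the absolute path

    <galaxy_data_manager_data_path>/alfa_indexes/Homo_sapiens_GRCh38_92/Homo_sapiens.GRCh38.92

which is the stem to which a consuming tool appends .stranded.index or .unstranded.index.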
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample Wed May 16 09:56:43 2018 -0400 @@ -0,0 +1,2 @@ +#<species> <version> <release> <value> <dbkey> <name> <prefix> +#Dictyostelium_discoideum dicty_2 7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum: dicty_2 (release 7) <path_to_dicty_indexes_dir>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample Wed May 16 09:56:43 2018 -0400 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of all alfa indexes --> + <table name="alfa_indexes" comment_char="#" allow_duplicate_entries="False"> + <columns>species, version, release, value, dbkey, name, prefix</columns> + <file path="tool-data/alfa_indexes.loc" /> + </table> +</tables>
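For completeness (not part of this changeset): a consuming tool such as alfa would typically expose this data table through a select parameter. A minimal sketch using standard Galaxy data table syntax, with a hypothetical parameter name:

    <param name="built_in_index" type="select" label="Built-in ALFA index">
        <options from_data_table="alfa_indexes">
            <validator type="no_options" message="No ALFA indexes are available; ask the Galaxy admin to run this data manager."/>
        </options>
    </param>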