0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Reads an interval or gene BED and a MAF Source.
|
|
5 Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates
|
|
6
|
|
7 Alignment blocks are layered ontop of each other based upon score.
|
|
8
|
|
9 usage: %prog maf_file [options]
|
|
10 -d, --dbkey=d: Database key, ie hg17
|
|
11 -c, --chromCol=c: Column of Chr
|
|
12 -s, --startCol=s: Column of Start
|
|
13 -e, --endCol=e: Column of End
|
|
14 -S, --strandCol=S: Column of Strand
|
|
15 -G, --geneBED: Input is a Gene BED file, process and join exons as one region
|
|
16 -t, --mafSourceType=t: Type of MAF source to use
|
|
17 -m, --mafSource=m: Path of source MAF file, if not using cached version
|
|
18 -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
|
|
19 -i, --interval_file=i: Input interval file
|
|
20 -o, --output_file=o: Output MAF file
|
|
21 -p, --species=p: Species to include in output
|
|
22 -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
|
|
23 -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
|
|
24
|
|
25 usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
|
|
26 """
|
|
27
|
|
28 #Dan Blankenberg
|
|
29 from galaxy import eggs
|
|
30 from galaxy.tools.util import maf_utilities
|
|
31 import pkg_resources; pkg_resources.require( "bx-python" )
|
|
32 from bx.cookbook import doc_optparse
|
|
33 import bx.intervals.io
|
|
34 import sys
|
|
35
|
|
36 assert sys.version_info[:2] >= ( 2, 4 )
|
|
37
|
|
38 def stop_err( msg ):
|
|
39 sys.stderr.write( msg )
|
|
40 sys.exit()
|
|
41
|
|
42 def __main__():
|
|
43
|
|
44 #Parse Command Line
|
|
45 options, args = doc_optparse.parse( __doc__ )
|
|
46 mincols = 0
|
|
47 strand_col = -1
|
|
48
|
|
49 if options.dbkey:
|
|
50 primary_species = options.dbkey
|
|
51 else:
|
|
52 primary_species = None
|
|
53 if primary_species in [None, "?", "None"]:
|
|
54 stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
|
|
55
|
|
56 include_primary = True
|
|
57 secondary_species = maf_utilities.parse_species_option( options.species )
|
|
58 if secondary_species:
|
|
59 species = list( secondary_species ) # make copy of species list
|
|
60 if primary_species in secondary_species:
|
|
61 secondary_species.remove( primary_species )
|
|
62 else:
|
|
63 include_primary = False
|
|
64 else:
|
|
65 species = None
|
|
66
|
|
67 if options.interval_file:
|
|
68 interval_file = options.interval_file
|
|
69 else:
|
|
70 stop_err( "Input interval file has not been specified." )
|
|
71
|
|
72 if options.output_file:
|
|
73 output_file = options.output_file
|
|
74 else:
|
|
75 stop_err( "Output file has not been specified." )
|
|
76
|
|
77 if not options.geneBED:
|
|
78 if options.chromCol:
|
|
79 chr_col = int( options.chromCol ) - 1
|
|
80 else:
|
|
81 stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
|
|
82
|
|
83 if options.startCol:
|
|
84 start_col = int( options.startCol ) - 1
|
|
85 else:
|
|
86 stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
|
|
87
|
|
88 if options.endCol:
|
|
89 end_col = int( options.endCol ) - 1
|
|
90 else:
|
|
91 stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
|
|
92
|
|
93 if options.strandCol:
|
|
94 strand_col = int( options.strandCol ) - 1
|
|
95
|
|
96 mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir
|
|
97
|
|
98 overwrite_with_gaps = True
|
|
99 if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
|
|
100 overwrite_with_gaps = False
|
|
101
|
|
102 #Finish parsing command line
|
|
103
|
|
104 #get index for mafs based on type
|
|
105 index = index_filename = None
|
|
106 #using specified uid for locally cached
|
|
107 if options.mafSourceType.lower() in ["cached"]:
|
|
108 index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile )
|
|
109 if index is None:
|
|
110 stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) )
|
|
111 elif options.mafSourceType.lower() in ["user"]:
|
|
112 #index maf for use here, need to remove index_file when finished
|
|
113 index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] )
|
|
114 if index is None:
|
|
115 stop_err( "Your MAF file appears to be malformed." )
|
|
116 else:
|
|
117 stop_err( "Invalid MAF source type specified." )
|
|
118
|
|
119 #open output file
|
|
120 output = open( output_file, "w" )
|
|
121
|
|
122 if options.geneBED:
|
|
123 region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() )
|
|
124 else:
|
|
125 region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) )
|
|
126
|
|
127 #Step through intervals
|
|
128 regions_extracted = 0
|
|
129 line_count = 0
|
|
130 for line_count, line in region_enumerator:
|
|
131 try:
|
|
132 if options.geneBED: #Process as Gene BED
|
|
133 try:
|
|
134 starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line )
|
|
135 #create spliced alignment object
|
|
136 alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
|
|
137 primary_name = secondary_name = fields[3]
|
|
138 alignment_strand = fields[5]
|
|
139 except Exception, e:
|
|
140 print "Error loading exon positions from input line %i: %s" % ( line_count, e )
|
|
141 continue
|
|
142 else: #Process as standard intervals
|
|
143 try:
|
|
144 #create spliced alignment object
|
|
145 alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps )
|
|
146 primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end )
|
|
147 secondary_name = ""
|
|
148 alignment_strand = line.strand
|
|
149 except Exception, e:
|
|
150 print "Error loading region positions from input line %i: %s" % ( line_count, e )
|
|
151 continue
|
|
152
|
|
153 #Write alignment to output file
|
|
154 #Output primary species first, if requested
|
|
155 if include_primary:
|
|
156 output.write( ">%s.%s\n" %( primary_species, primary_name ) )
|
|
157 if alignment_strand == "-":
|
|
158 output.write( alignment.get_sequence_reverse_complement( primary_species ) )
|
|
159 else:
|
|
160 output.write( alignment.get_sequence( primary_species ) )
|
|
161 output.write( "\n" )
|
|
162 #Output all remainging species
|
|
163 for spec in secondary_species or alignment.get_species_names( skip = primary_species ):
|
|
164 if secondary_name:
|
|
165 output.write( ">%s.%s\n" % ( spec, secondary_name ) )
|
|
166 else:
|
|
167 output.write( ">%s\n" % ( spec ) )
|
|
168 if alignment_strand == "-":
|
|
169 output.write( alignment.get_sequence_reverse_complement( spec ) )
|
|
170 else:
|
|
171 output.write( alignment.get_sequence( spec ) )
|
|
172 output.write( "\n" )
|
|
173
|
|
174 output.write( "\n" )
|
|
175
|
|
176 regions_extracted += 1
|
|
177
|
|
178 except Exception, e:
|
|
179 print "Unexpected error from input line %i: %s" % ( line_count, e )
|
|
180 continue
|
|
181
|
|
182 #close output file
|
|
183 output.close()
|
|
184
|
|
185 #remove index file if created during run
|
|
186 maf_utilities.remove_temp_index_file( index_filename )
|
|
187
|
|
188 #Print message about success for user
|
|
189 if regions_extracted > 0:
|
|
190 print "%i regions were processed successfully." % ( regions_extracted )
|
|
191 else:
|
|
192 print "No regions were processed successfully."
|
|
193 if line_count > 0 and options.geneBED:
|
|
194 print "This tool requires your input file to conform to the 12 column BED standard."
|
|
195
|
|
196 if __name__ == "__main__": __main__()
|