Mercurial > repos > xuebing > sharplabtool
comparison tools/maf/interval_maf_to_merged_fasta.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Reads an interval or gene BED and a MAF Source. | |
5 Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates | |
6 | |
7 Alignment blocks are layered ontop of each other based upon score. | |
8 | |
9 usage: %prog maf_file [options] | |
10 -d, --dbkey=d: Database key, ie hg17 | |
11 -c, --chromCol=c: Column of Chr | |
12 -s, --startCol=s: Column of Start | |
13 -e, --endCol=e: Column of End | |
14 -S, --strandCol=S: Column of Strand | |
15 -G, --geneBED: Input is a Gene BED file, process and join exons as one region | |
16 -t, --mafSourceType=t: Type of MAF source to use | |
17 -m, --mafSource=m: Path of source MAF file, if not using cached version | |
18 -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version | |
19 -i, --interval_file=i: Input interval file | |
20 -o, --output_file=o: Output MAF file | |
21 -p, --species=p: Species to include in output | |
22 -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species. | |
23 -z, --mafIndexFileDir=z: Directory of local maf_index.loc file | |
24 | |
25 usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR | |
26 """ | |
27 | |
28 #Dan Blankenberg | |
29 from galaxy import eggs | |
30 from galaxy.tools.util import maf_utilities | |
31 import pkg_resources; pkg_resources.require( "bx-python" ) | |
32 from bx.cookbook import doc_optparse | |
33 import bx.intervals.io | |
34 import sys | |
35 | |
36 assert sys.version_info[:2] >= ( 2, 4 ) | |
37 | |
38 def stop_err( msg ): | |
39 sys.stderr.write( msg ) | |
40 sys.exit() | |
41 | |
42 def __main__(): | |
43 | |
44 #Parse Command Line | |
45 options, args = doc_optparse.parse( __doc__ ) | |
46 mincols = 0 | |
47 strand_col = -1 | |
48 | |
49 if options.dbkey: | |
50 primary_species = options.dbkey | |
51 else: | |
52 primary_species = None | |
53 if primary_species in [None, "?", "None"]: | |
54 stop_err( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." ) | |
55 | |
56 include_primary = True | |
57 secondary_species = maf_utilities.parse_species_option( options.species ) | |
58 if secondary_species: | |
59 species = list( secondary_species ) # make copy of species list | |
60 if primary_species in secondary_species: | |
61 secondary_species.remove( primary_species ) | |
62 else: | |
63 include_primary = False | |
64 else: | |
65 species = None | |
66 | |
67 if options.interval_file: | |
68 interval_file = options.interval_file | |
69 else: | |
70 stop_err( "Input interval file has not been specified." ) | |
71 | |
72 if options.output_file: | |
73 output_file = options.output_file | |
74 else: | |
75 stop_err( "Output file has not been specified." ) | |
76 | |
77 if not options.geneBED: | |
78 if options.chromCol: | |
79 chr_col = int( options.chromCol ) - 1 | |
80 else: | |
81 stop_err( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." ) | |
82 | |
83 if options.startCol: | |
84 start_col = int( options.startCol ) - 1 | |
85 else: | |
86 stop_err( "Start column not set, click the pencil icon in the history item to set the metadata attributes." ) | |
87 | |
88 if options.endCol: | |
89 end_col = int( options.endCol ) - 1 | |
90 else: | |
91 stop_err( "End column not set, click the pencil icon in the history item to set the metadata attributes." ) | |
92 | |
93 if options.strandCol: | |
94 strand_col = int( options.strandCol ) - 1 | |
95 | |
96 mafIndexFile = "%s/maf_index.loc" % options.mafIndexFileDir | |
97 | |
98 overwrite_with_gaps = True | |
99 if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false': | |
100 overwrite_with_gaps = False | |
101 | |
102 #Finish parsing command line | |
103 | |
104 #get index for mafs based on type | |
105 index = index_filename = None | |
106 #using specified uid for locally cached | |
107 if options.mafSourceType.lower() in ["cached"]: | |
108 index = maf_utilities.maf_index_by_uid( options.mafSource, mafIndexFile ) | |
109 if index is None: | |
110 stop_err( "The MAF source specified (%s) appears to be invalid." % ( options.mafSource ) ) | |
111 elif options.mafSourceType.lower() in ["user"]: | |
112 #index maf for use here, need to remove index_file when finished | |
113 index, index_filename = maf_utilities.open_or_build_maf_index( options.mafSource, options.mafIndex, species = [primary_species] ) | |
114 if index is None: | |
115 stop_err( "Your MAF file appears to be malformed." ) | |
116 else: | |
117 stop_err( "Invalid MAF source type specified." ) | |
118 | |
119 #open output file | |
120 output = open( output_file, "w" ) | |
121 | |
122 if options.geneBED: | |
123 region_enumerator = maf_utilities.line_enumerator( open( interval_file, "r" ).readlines() ) | |
124 else: | |
125 region_enumerator = enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chr_col, start_col = start_col, end_col = end_col, strand_col = strand_col, fix_strand = True, return_header = False, return_comments = False ) ) | |
126 | |
127 #Step through intervals | |
128 regions_extracted = 0 | |
129 line_count = 0 | |
130 for line_count, line in region_enumerator: | |
131 try: | |
132 if options.geneBED: #Process as Gene BED | |
133 try: | |
134 starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed( line ) | |
135 #create spliced alignment object | |
136 alignment = maf_utilities.get_spliced_region_alignment( index, primary_species, fields[0], starts, ends, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) | |
137 primary_name = secondary_name = fields[3] | |
138 alignment_strand = fields[5] | |
139 except Exception, e: | |
140 print "Error loading exon positions from input line %i: %s" % ( line_count, e ) | |
141 continue | |
142 else: #Process as standard intervals | |
143 try: | |
144 #create spliced alignment object | |
145 alignment = maf_utilities.get_region_alignment( index, primary_species, line.chrom, line.start, line.end, strand = '+', species = species, mincols = mincols, overwrite_with_gaps = overwrite_with_gaps ) | |
146 primary_name = "%s(%s):%s-%s" % ( line.chrom, line.strand, line.start, line.end ) | |
147 secondary_name = "" | |
148 alignment_strand = line.strand | |
149 except Exception, e: | |
150 print "Error loading region positions from input line %i: %s" % ( line_count, e ) | |
151 continue | |
152 | |
153 #Write alignment to output file | |
154 #Output primary species first, if requested | |
155 if include_primary: | |
156 output.write( ">%s.%s\n" %( primary_species, primary_name ) ) | |
157 if alignment_strand == "-": | |
158 output.write( alignment.get_sequence_reverse_complement( primary_species ) ) | |
159 else: | |
160 output.write( alignment.get_sequence( primary_species ) ) | |
161 output.write( "\n" ) | |
162 #Output all remainging species | |
163 for spec in secondary_species or alignment.get_species_names( skip = primary_species ): | |
164 if secondary_name: | |
165 output.write( ">%s.%s\n" % ( spec, secondary_name ) ) | |
166 else: | |
167 output.write( ">%s\n" % ( spec ) ) | |
168 if alignment_strand == "-": | |
169 output.write( alignment.get_sequence_reverse_complement( spec ) ) | |
170 else: | |
171 output.write( alignment.get_sequence( spec ) ) | |
172 output.write( "\n" ) | |
173 | |
174 output.write( "\n" ) | |
175 | |
176 regions_extracted += 1 | |
177 | |
178 except Exception, e: | |
179 print "Unexpected error from input line %i: %s" % ( line_count, e ) | |
180 continue | |
181 | |
182 #close output file | |
183 output.close() | |
184 | |
185 #remove index file if created during run | |
186 maf_utilities.remove_temp_index_file( index_filename ) | |
187 | |
188 #Print message about success for user | |
189 if regions_extracted > 0: | |
190 print "%i regions were processed successfully." % ( regions_extracted ) | |
191 else: | |
192 print "No regions were processed successfully." | |
193 if line_count > 0 and options.geneBED: | |
194 print "This tool requires your input file to conform to the 12 column BED standard." | |
195 | |
196 if __name__ == "__main__": __main__() |