comparison interval_maf_to_merged_fasta.py @ 0:f24a9ff28d3c draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genebed_maf_to_fasta/ commit 17e2194066ca843f6b2391a9632ea9de67d39351"
author iuc
date Fri, 21 Aug 2020 15:10:13 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f24a9ff28d3c
1 #!/usr/bin/env python
2 """
3 Reads an interval or gene BED and a MAF Source.
4 Produces a FASTA file containing the aligned intervals/gene sequences, based upon the provided coordinates
5
6 Alignment blocks are layered on top of each other based upon score.
7
8 usage: %prog maf_file [options]
9 -d, --dbkey=d: Database key, ie hg17
10 -c, --chromCol=c: Column of Chr
11 -s, --startCol=s: Column of Start
12 -e, --endCol=e: Column of End
13 -S, --strandCol=S: Column of Strand
14 -G, --geneBED: Input is a Gene BED file, process and join exons as one region
15 -t, --mafSourceType=t: Type of MAF source to use
16 -m, --mafSource=m: Path of source MAF file, if not using cached version
17 -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
18 -i, --interval_file=i: Input interval file
19 -o, --output_file=o: Output FASTA file
20 -p, --species=p: Species to include in output
21 -O, --overwrite_with_gaps=O: Overwrite bases found in a lower-scoring block with gaps interior to the sequence for a species.
22 -z, --mafIndexFileDir=z: Directory of local maf_index.loc file
23
24 usage: %prog dbkey_of_BED comma_separated_list_of_additional_dbkeys_to_extract comma_separated_list_of_indexed_maf_files input_gene_bed_file output_fasta_file cached|user GALAXY_DATA_INDEX_DIR
25 """
26 # Dan Blankenberg
27 from __future__ import print_function
28
29 import sys
30
31 import bx.intervals.io
32 from bx.cookbook import doc_optparse
33 from galaxy.tools.util import maf_utilities
34
35
def stop_err(msg):
    """Abort the script with *msg* as the exit status.

    A string exit status is printed to stderr by the interpreter and the
    process exits with a non-zero code.
    """
    raise SystemExit(msg)
38
39
def _write_alignment(output, alignment, primary_species, primary_name,
                     secondary_name, alignment_strand, include_primary,
                     secondary_species):
    """Write the sequences of one region's alignment to *output* as FASTA.

    The primary species record (when *include_primary* is true) is written
    first, followed by one record per remaining species and a blank line
    terminating the region. Sequences are reverse complemented when
    *alignment_strand* is "-".
    """
    def sequence_for(spec):
        if alignment_strand == "-":
            return alignment.get_sequence_reverse_complement(spec)
        return alignment.get_sequence(spec)

    # Output primary species first, if requested
    if include_primary:
        output.write(">%s.%s\n" % (primary_species, primary_name))
        output.write(sequence_for(primary_species))
        output.write("\n")
    # Output all remaining species; fall back to every species found in the
    # alignment when no explicit secondary species were requested.
    for spec in secondary_species or alignment.get_species_names(skip=primary_species):
        if secondary_name:
            output.write(">%s.%s\n" % (spec, secondary_name))
        else:
            output.write(">%s\n" % (spec))
        output.write(sequence_for(spec))
        output.write("\n")

    output.write("\n")


def __main__():
    """Extract aligned sequences for each input region and write them as FASTA.

    Options are parsed from the module docstring. For every region of the
    interval file (or every gene of a gene BED file when --geneBED is set),
    the alignment is fetched from the selected MAF index and one FASTA record
    per species is written to the output file. Regions whose alignment cannot
    be loaded are reported on stdout and skipped. Any other error is reported
    and re-raised. Exits via stop_err() when a required option is missing or
    the MAF source cannot be opened.
    """
    # Parse Command Line
    options, args = doc_optparse.parse(__doc__)
    mincols = 0
    strand_col = -1

    primary_species = options.dbkey if options.dbkey else None
    if primary_species in [None, "?", "None"]:
        stop_err("You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file.")

    include_primary = True
    secondary_species = maf_utilities.parse_species_option(options.species)
    if secondary_species:
        species = list(secondary_species)  # make copy of species list
        if primary_species in secondary_species:
            secondary_species.remove(primary_species)
        else:
            include_primary = False
    else:
        species = None

    if not options.interval_file:
        stop_err("Input interval file has not been specified.")
    interval_file = options.interval_file

    if not options.output_file:
        stop_err("Output file has not been specified.")
    output_file = options.output_file

    if not options.geneBED:
        if options.chromCol:
            chr_col = int(options.chromCol) - 1
        else:
            stop_err("Chromosome column not set, click the pencil icon in the history item to set the metadata attributes.")

        if options.startCol:
            start_col = int(options.startCol) - 1
        else:
            stop_err("Start column not set, click the pencil icon in the history item to set the metadata attributes.")

        if options.endCol:
            end_col = int(options.endCol) - 1
        else:
            stop_err("End column not set, click the pencil icon in the history item to set the metadata attributes.")

        if options.strandCol:
            strand_col = int(options.strandCol) - 1

    mafIndexFile = "%s/maf_indexes.loc" % options.mafIndexFileDir

    overwrite_with_gaps = True
    if options.overwrite_with_gaps and options.overwrite_with_gaps.lower() == 'false':
        overwrite_with_gaps = False

    # Finish parsing command line

    # get index for mafs based on type
    index = index_filename = None
    # A missing source type is treated like an unknown one instead of
    # failing with AttributeError on None.lower().
    maf_source_type = (options.mafSourceType or "").lower()
    if maf_source_type == "cached":
        # using specified uid for locally cached
        index = maf_utilities.maf_index_by_uid(options.mafSource, mafIndexFile)
        if index is None:
            stop_err("The MAF source specified (%s) appears to be invalid." % (options.mafSource))
    elif maf_source_type == "user":
        # index maf for use here, need to remove index_file when finished
        index, index_filename = maf_utilities.open_or_build_maf_index(options.mafSource, options.mafIndex, species=[primary_species])
        if index is None:
            stop_err("Your MAF file appears to be malformed.")
    else:
        stop_err("Invalid MAF source type specified.")

    regions_extracted = 0
    line_count = 0
    try:
        # Both files are closed on every exit path, including errors.
        with open(output_file, "w") as output, open(interval_file, "r") as interval_handle:
            if options.geneBED:
                region_enumerator = maf_utilities.line_enumerator(interval_handle.readlines())
            else:
                region_enumerator = enumerate(bx.intervals.io.NiceReaderWrapper(
                    interval_handle, chrom_col=chr_col, start_col=start_col,
                    end_col=end_col, strand_col=strand_col, fix_strand=True,
                    return_header=False, return_comments=False))

            # Step through intervals
            for line_count, line in region_enumerator:
                try:
                    if options.geneBED:  # Process as Gene BED
                        try:
                            starts, ends, fields = maf_utilities.get_starts_ends_fields_from_gene_bed(line)
                            # create spliced alignment object
                            alignment = maf_utilities.get_spliced_region_alignment(
                                index, primary_species, fields[0], starts, ends,
                                strand='+', species=species, mincols=mincols,
                                overwrite_with_gaps=overwrite_with_gaps)
                            primary_name = secondary_name = fields[3]
                            alignment_strand = fields[5]
                        except Exception as e:
                            # Skip the line: continuing would write a stale
                            # alignment left over from a previous line.
                            print("Error loading exon positions from input line %i: %s" % (line_count, e))
                            continue
                    else:  # Process as standard intervals
                        try:
                            # create region alignment object
                            alignment = maf_utilities.get_region_alignment(
                                index, primary_species, line.chrom, line.start,
                                line.end, strand='+', species=species, mincols=mincols,
                                overwrite_with_gaps=overwrite_with_gaps)
                            primary_name = "%s(%s):%s-%s" % (line.chrom, line.strand, line.start, line.end)
                            secondary_name = ""
                            alignment_strand = line.strand
                        except Exception as e:
                            print("Error loading region positions from input line %i: %s" % (line_count, e))
                            continue

                    # Write alignment to output file
                    _write_alignment(
                        output, alignment, primary_species, primary_name,
                        secondary_name, alignment_strand, include_primary,
                        secondary_species)
                    regions_extracted += 1
                except Exception as e:
                    print("Unexpected error from input line %i: %s\n%s" % (line_count, e, line))
                    raise
    finally:
        # remove index file if created during run, even when extraction failed
        maf_utilities.remove_temp_index_file(index_filename)

    # Print message about success for user
    if regions_extracted > 0:
        print("%i regions were processed successfully." % (regions_extracted))
    else:
        print("No regions were processed successfully.")
        if line_count > 0 and options.geneBED:
            print("This tool requires your input file to conform to the 12 column BED standard.")
202
203
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()