annotate tools/maf/interval2maf.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 Reads a list of intervals and a maf. Produces a new maf containing the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 blocks or parts of blocks in the original that overlapped the intervals.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 If a MAF file, not UID, is provided the MAF file is indexed before being processed.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 NOTE: If two intervals overlap the same block it will be written twice.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 usage: %prog maf_file [options]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 -d, --dbkey=d: Database key, ie hg17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 -c, --chromCol=c: Column of Chr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 -s, --startCol=s: Column of Start
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 -e, --endCol=e: Column of End
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 -S, --strandCol=S: Column of Strand
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 -t, --mafType=t: Type of MAF source to use
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 -m, --mafFile=m: Path of source MAF file, if not using cached version
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 -I, --mafIndex=I: Path of precomputed source MAF file index, if not using cached version
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 -i, --interval_file=i: Input interval file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 -o, --output_file=o: Output MAF file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 -p, --species=p: Species to include in output
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 -P, --split_blocks_by_species=P: Split blocks by species
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 -r, --remove_all_gap_columns=r: Remove all Gap columns
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 -l, --indexLocation=l: Override default maf_index.loc file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 -z, --mafIndexFile=z: Directory of local maf index file ( maf_index.loc or maf_pairwise.loc )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 #Dan Blankenberg
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 import pkg_resources; pkg_resources.require( "bx-python" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 from bx.cookbook import doc_optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 import bx.align.maf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 import bx.intervals.io
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 from galaxy.tools.util import maf_utilities
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 index = index_filename = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 mincols = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 #Parse Command Line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 options, args = doc_optparse.parse( __doc__ )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 if options.dbkey: dbkey = options.dbkey
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 else: dbkey = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 if dbkey in [None, "?"]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 maf_utilities.tool_fail( "You must specify a proper build in order to extract alignments. You can specify your genome build by clicking on the pencil icon associated with your interval file." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 species = maf_utilities.parse_species_option( options.species )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 if options.chromCol: chromCol = int( options.chromCol ) - 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 maf_utilities.tool_fail( "Chromosome column not set, click the pencil icon in the history item to set the metadata attributes." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 if options.startCol: startCol = int( options.startCol ) - 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 maf_utilities.tool_fail( "Start column not set, click the pencil icon in the history item to set the metadata attributes." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 if options.endCol: endCol = int( options.endCol ) - 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 maf_utilities.tool_fail( "End column not set, click the pencil icon in the history item to set the metadata attributes." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 if options.strandCol: strandCol = int( options.strandCol ) - 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 strandCol = -1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 if options.interval_file: interval_file = options.interval_file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 maf_utilities.tool_fail( "Input interval file has not been specified." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 if options.output_file: output_file = options.output_file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 maf_utilities.tool_fail( "Output file has not been specified." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 split_blocks_by_species = remove_all_gap_columns = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 if options.split_blocks_by_species and options.split_blocks_by_species == 'split_blocks_by_species':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 split_blocks_by_species = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 if options.remove_all_gap_columns and options.remove_all_gap_columns == 'remove_all_gap_columns':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 remove_all_gap_columns = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 remove_all_gap_columns = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 #Finish parsing command line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 #Open indexed access to MAFs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 if options.mafType:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 if options.indexLocation:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 index = maf_utilities.maf_index_by_uid( options.mafType, options.indexLocation )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 index = maf_utilities.maf_index_by_uid( options.mafType, options.mafIndexFile )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 if index is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 maf_utilities.tool_fail( "The MAF source specified (%s) appears to be invalid." % ( options.mafType ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 elif options.mafFile:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 index, index_filename = maf_utilities.open_or_build_maf_index( options.mafFile, options.mafIndex, species = [dbkey] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 if index is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 maf_utilities.tool_fail( "Your MAF file appears to be malformed." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 maf_utilities.tool_fail( "Desired source MAF type has not been specified." )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 #Create MAF writter
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 out = bx.align.maf.Writer( open(output_file, "w") )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 #Iterate over input regions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 num_blocks = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 num_regions = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 for num_regions, region in enumerate( bx.intervals.io.NiceReaderWrapper( open( interval_file, 'r' ), chrom_col = chromCol, start_col = startCol, end_col = endCol, strand_col = strandCol, fix_strand = True, return_header = False, return_comments = False ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 src = maf_utilities.src_merge( dbkey, region.chrom )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 for block in index.get_as_iterator( src, region.start, region.end ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 if split_blocks_by_species:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 blocks = [ new_block for new_block in maf_utilities.iter_blocks_split_by_species( block ) if maf_utilities.component_overlaps_region( new_block.get_component_by_src_start( dbkey ), region ) ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 blocks = [ block ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 for block in blocks:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 block = maf_utilities.chop_block_by_region( block, src, region )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 if block is not None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 if species is not None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 block = block.limit_to_species( species )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 block = maf_utilities.orient_block_by_region( block, src, region )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 if remove_all_gap_columns:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 block.remove_all_gap_columns()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 out.write( block )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 num_blocks += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 #Close output MAF
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 out.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 #remove index file if created during run
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 maf_utilities.remove_temp_index_file( index_filename )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 if num_blocks:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 print "%i MAF blocks extracted for %i regions." % ( num_blocks, ( num_regions + 1 ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 elif num_regions is not None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 print "No MAF blocks could be extracted for %i regions." % ( num_regions + 1 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 print "No valid regions have been provided."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 if __name__ == "__main__": __main__()