0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Estimate INDELs for pair-wise alignments.
|
|
5
|
|
6 usage: %prog maf_input out_file1 out_file2
|
|
7 """
|
|
8
|
|
9 from __future__ import division
|
|
10 from galaxy import eggs
|
|
11 import pkg_resources
|
|
12 pkg_resources.require( "bx-python" )
|
|
13 try:
|
|
14 pkg_resources.require("numpy")
|
|
15 except:
|
|
16 pass
|
|
17 import psyco_full
|
|
18 import sys
|
|
19 from bx.cookbook import doc_optparse
|
|
20 from galaxy.tools.exception_handling import *
|
|
21 import bx.align.maf
|
|
22
|
|
23 assert sys.version_info[:2] >= ( 2, 4 )
|
|
24
|
|
25 def main():
|
|
26 # Parsing Command Line here
|
|
27 options, args = doc_optparse.parse( __doc__ )
|
|
28
|
|
29 try:
|
|
30 inp_file, out_file1 = args
|
|
31 except:
|
|
32 print >> sys.stderr, "Tool initialization error."
|
|
33 sys.exit()
|
|
34
|
|
35 try:
|
|
36 fin = open(inp_file,'r')
|
|
37 except:
|
|
38 print >> sys.stderr, "Unable to open input file"
|
|
39 sys.exit()
|
|
40 try:
|
|
41 fout1 = open(out_file1,'w')
|
|
42 #fout2 = open(out_file2,'w')
|
|
43 except:
|
|
44 print >> sys.stderr, "Unable to open output file"
|
|
45 sys.exit()
|
|
46
|
|
47 try:
|
|
48 maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
|
|
49 except:
|
|
50 print >>sys.stderr, "Your MAF file appears to be malformed."
|
|
51 sys.exit()
|
|
52 maf_count = 0
|
|
53
|
|
54 print >>fout1, "#Block\tSource\tSeq1_Start\tSeq1_End\tSeq2_Start\tSeq2_End\tIndel_length"
|
|
55 for block_ind, block in enumerate(maf_reader):
|
|
56 if len(block.components) < 2:
|
|
57 continue
|
|
58 seq1 = block.components[0].text
|
|
59 src1 = block.components[0].src
|
|
60 start1 = block.components[0].start
|
|
61 if len(block.components) == 2:
|
|
62 seq2 = block.components[1].text
|
|
63 src2 = block.components[1].src
|
|
64 start2 = block.components[1].start
|
|
65 #for pos in range(len(seq1)):
|
|
66 nt_pos1 = start1-1 #position of the nucleotide (without counting gaps)
|
|
67 nt_pos2 = start2-1
|
|
68 pos = 0 #character column position
|
|
69 gaplen1 = 0
|
|
70 gaplen2 = 0
|
|
71 prev_pos_gap1 = 0
|
|
72 prev_pos_gap2 = 0
|
|
73 while pos < len(seq1):
|
|
74 if prev_pos_gap1 == 0:
|
|
75 gaplen1 = 0
|
|
76 if prev_pos_gap2 == 0:
|
|
77 gaplen2 = 0
|
|
78
|
|
79 if seq1[pos] == '-':
|
|
80 if seq2[pos] != '-':
|
|
81 nt_pos2 += 1
|
|
82 gaplen1 += 1
|
|
83 prev_pos_gap1 = 1
|
|
84 #write 2
|
|
85 if prev_pos_gap2 == 1:
|
|
86 prev_pos_gap2 = 0
|
|
87 print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1,nt_pos1+1,nt_pos2-1,nt_pos2-1+gaplen2,gaplen2)
|
|
88 if pos == len(seq1)-1:
|
|
89 print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1,nt_pos1+1,nt_pos2+1-gaplen1,nt_pos2+1,gaplen1)
|
|
90 else:
|
|
91 prev_pos_gap1 = 0
|
|
92 prev_pos_gap2 = 0
|
|
93 """
|
|
94 if prev_pos_gap1 == 1:
|
|
95 prev_pos_gap1 = 0
|
|
96 print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,gaplen1)
|
|
97 elif prev_pos_gap2 == 1:
|
|
98 prev_pos_gap2 = 0
|
|
99 print >>fout1,"%d\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos2-1,nt_pos2,gaplen2)
|
|
100 """
|
|
101 else:
|
|
102 nt_pos1 += 1
|
|
103 if seq2[pos] != '-':
|
|
104 nt_pos2 += 1
|
|
105 #write both
|
|
106 if prev_pos_gap1 == 1:
|
|
107 prev_pos_gap1 = 0
|
|
108 print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2-gaplen1,nt_pos2,gaplen1)
|
|
109 elif prev_pos_gap2 == 1:
|
|
110 prev_pos_gap2 = 0
|
|
111 print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1-gaplen2,nt_pos1,nt_pos2-1,nt_pos2,gaplen2)
|
|
112 else:
|
|
113 gaplen2 += 1
|
|
114 prev_pos_gap2 = 1
|
|
115 #write 1
|
|
116 if prev_pos_gap1 == 1:
|
|
117 prev_pos_gap1 = 0
|
|
118 print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src1,nt_pos1-1,nt_pos1,nt_pos2,nt_pos2+gaplen1,gaplen1)
|
|
119 if pos == len(seq1)-1:
|
|
120 print >>fout1,"%d\t%s\t%s\t%s\t%s\t%s\t%s" %(block_ind+1,src2,nt_pos1+1-gaplen2,nt_pos1+1,nt_pos2,nt_pos2+1,gaplen2)
|
|
121 pos += 1
|
|
122 if __name__ == "__main__":
|
|
123 main()
|