0
|
1 #!/usr/bin/env python
|
|
2 #guruprasad Ananda
|
|
3 """
|
|
4 Estimates substitution rates from pairwise alignments using JC69 model.
|
|
5 """
|
|
6
|
|
7 from galaxy import eggs
|
|
8 from galaxy.tools.util.galaxyops import *
|
|
9 from galaxy.tools.util import maf_utilities
|
|
10 import bx.align.maf
|
|
11 import sys, fileinput
|
|
12
|
|
13 def stop_err(msg):
|
|
14 sys.stderr.write(msg)
|
|
15 sys.exit()
|
|
16
|
|
17 if len(sys.argv) < 3:
|
|
18 stop_err("Incorrect number of arguments.")
|
|
19
|
|
20 inp_file = sys.argv[1]
|
|
21 out_file = sys.argv[2]
|
|
22 fout = open(out_file, 'w')
|
|
23 int_file = sys.argv[3]
|
|
24 if int_file != "None": #The user has specified an interval file
|
|
25 dbkey_i = sys.argv[4]
|
|
26 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
|
|
27
|
|
28
|
|
29 def rateEstimator(block):
|
|
30 global alignlen, mismatches
|
|
31
|
|
32 src1 = block.components[0].src
|
|
33 sequence1 = block.components[0].text
|
|
34 start1 = block.components[0].start
|
|
35 end1 = block.components[0].end
|
|
36 len1 = int(end1)-int(start1)
|
|
37 len1_withgap = len(sequence1)
|
|
38 mismatch = 0.0
|
|
39
|
|
40 for seq in range (1,len(block.components)):
|
|
41 src2 = block.components[seq].src
|
|
42 sequence2 = block.components[seq].text
|
|
43 start2 = block.components[seq].start
|
|
44 end2 = block.components[seq].end
|
|
45 len2 = int(end2)-int(start2)
|
|
46 for nt in range(len1_withgap):
|
|
47 if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
|
|
48 if sequence1[nt].upper() != sequence2[nt].upper():
|
|
49 mismatch += 1
|
|
50
|
|
51 if int_file == "None":
|
|
52 p = mismatch/min(len1,len2)
|
|
53 print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" %(src1,start1,end1,src2,start2,end2,min(len1,len2),mismatch,p)
|
|
54 else:
|
|
55 mismatches += mismatch
|
|
56 alignlen += min(len1,len2)
|
|
57
|
|
58 def main():
|
|
59 skipped = 0
|
|
60 not_pairwise = 0
|
|
61
|
|
62 if int_file == "None":
|
|
63 try:
|
|
64 maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
|
|
65 except:
|
|
66 stop_err("Your MAF file appears to be malformed.")
|
|
67 print >>fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp"
|
|
68 for block in maf_reader:
|
|
69 if len(block.components) != 2:
|
|
70 not_pairwise += 1
|
|
71 continue
|
|
72 try:
|
|
73 rateEstimator(block)
|
|
74 except:
|
|
75 skipped += 1
|
|
76 else:
|
|
77 index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] )
|
|
78 if index is None:
|
|
79 print >> sys.stderr, "Your MAF file appears to be malformed."
|
|
80 sys.exit()
|
|
81 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
|
|
82 chrom_col=chr_col_i,
|
|
83 start_col=start_col_i,
|
|
84 end_col=end_col_i,
|
|
85 strand_col=strand_col_i,
|
|
86 fix_strand=True)
|
|
87 species=None
|
|
88 mincols = 0
|
|
89 global alignlen, mismatches
|
|
90
|
|
91 for interval in win:
|
|
92 alignlen = 0
|
|
93 mismatches = 0.0
|
|
94 src = "%s.%s" % ( dbkey_i, interval.chrom )
|
|
95 for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ):
|
|
96 if len(block.components) != 2:
|
|
97 not_pairwise += 1
|
|
98 continue
|
|
99 try:
|
|
100 rateEstimator(block)
|
|
101 except:
|
|
102 skipped += 1
|
|
103 if alignlen:
|
|
104 p = mismatches/alignlen
|
|
105 else:
|
|
106 p = 'NA'
|
|
107 interval.fields.append(str(alignlen))
|
|
108 interval.fields.append(str(mismatches))
|
|
109 interval.fields.append(str(p))
|
|
110 print >>fout, "\t".join(interval.fields)
|
|
111 #num_blocks += 1
|
|
112
|
|
113 if not_pairwise:
|
|
114 print "Skipped %d non-pairwise blocks" %(not_pairwise)
|
|
115 if skipped:
|
|
116 print "Skipped %d blocks as invalid" %(skipped)
|
|
117 if __name__ == "__main__":
|
|
118 main()
|