annotate tools/regVariation/substitution_rates.py @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 #guruprasad Ananda
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 Estimates substitution rates from pairwise alignments using JC69 model.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 from galaxy.tools.util.galaxyops import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 from galaxy.tools.util import maf_utilities
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 import bx.align.maf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 import sys, fileinput
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 def stop_err(msg):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 sys.stderr.write(msg)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 if len(sys.argv) < 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 stop_err("Incorrect number of arguments.")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 inp_file = sys.argv[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 out_file = sys.argv[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 fout = open(out_file, 'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 int_file = sys.argv[3]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 if int_file != "None": #The user has specified an interval file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 dbkey_i = sys.argv[4]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 def rateEstimator(block):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 global alignlen, mismatches
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 src1 = block.components[0].src
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 sequence1 = block.components[0].text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 start1 = block.components[0].start
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 end1 = block.components[0].end
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 len1 = int(end1)-int(start1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 len1_withgap = len(sequence1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 mismatch = 0.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 for seq in range (1,len(block.components)):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 src2 = block.components[seq].src
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 sequence2 = block.components[seq].text
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 start2 = block.components[seq].start
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 end2 = block.components[seq].end
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 len2 = int(end2)-int(start2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 for nt in range(len1_withgap):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 if sequence1[nt] not in '-#$^*?' and sequence2[nt] not in '-#$^*?': #Not a gap or masked character
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 if sequence1[nt].upper() != sequence2[nt].upper():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 mismatch += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 if int_file == "None":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 p = mismatch/min(len1,len2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 print >>fout, "%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%.4f" %(src1,start1,end1,src2,start2,end2,min(len1,len2),mismatch,p)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 mismatches += mismatch
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 alignlen += min(len1,len2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 skipped = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 not_pairwise = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 if int_file == "None":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 maf_reader = bx.align.maf.Reader( open(inp_file, 'r') )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 stop_err("Your MAF file appears to be malformed.")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 print >>fout, "#Seq1\tStart1\tEnd1\tSeq2\tStart2\tEnd2\tL\tN\tp"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 for block in maf_reader:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 if len(block.components) != 2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 not_pairwise += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 rateEstimator(block)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 skipped += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 index, index_filename = maf_utilities.build_maf_index( inp_file, species = [dbkey_i] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 if index is None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 print >> sys.stderr, "Your MAF file appears to be malformed."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 chrom_col=chr_col_i,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 start_col=start_col_i,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 end_col=end_col_i,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 strand_col=strand_col_i,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 fix_strand=True)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 species=None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 mincols = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 global alignlen, mismatches
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 for interval in win:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 alignlen = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 mismatches = 0.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 src = "%s.%s" % ( dbkey_i, interval.chrom )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 for block in maf_utilities.get_chopped_blocks_for_region( index, src, interval, species, mincols ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 if len(block.components) != 2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 not_pairwise += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 rateEstimator(block)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 skipped += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 if alignlen:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 p = mismatches/alignlen
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 p = 'NA'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 interval.fields.append(str(alignlen))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 interval.fields.append(str(mismatches))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 interval.fields.append(str(p))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 print >>fout, "\t".join(interval.fields)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 #num_blocks += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 if not_pairwise:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 print "Skipped %d non-pairwise blocks" %(not_pairwise)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 if skipped:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 print "Skipped %d blocks as invalid" %(skipped)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 main()