annotate tools/taxonomy/lca.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 #Guruprasad Ananda
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 Least Common Ancestor tool.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 import sys, string, re, commands, tempfile, random
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 def stop_err(msg):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 sys.stderr.write(msg)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 inputfile = sys.argv[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 outfile = sys.argv[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 rank_bound = int( sys.argv[3] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 Mapping of ranks:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 root :2,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 superkingdom:3,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 kingdom :4,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 subkingdom :5,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 superphylum :6,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 phylum :7,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 subphylum :8,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 superclass :9,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 class :10,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 subclass :11,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 superorder :12,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 order :13,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 suborder :14,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 superfamily :15,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 family :16,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 subfamily :17,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 tribe :18,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 subtribe :19,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 genus :20,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 subgenus :21,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 species :22,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 subspecies :23,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 stop_err("Syntax error: Use correct syntax: program infile outfile")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 fin = open(sys.argv[1],'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 for j, line in enumerate( fin ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 elems = line.strip().split('\t')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 if len(elems) < 24:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 stop_err("The format of the input dataset is incorrect. Taxonomy datatype should contain at least 24 columns.")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 if j > 30:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 cols = range(1,len(elems))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 fin.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 group_col = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 tmpfile = tempfile.NamedTemporaryFile()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 The -k option for the Posix sort command is as follows:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 -k, --key=POS1[,POS2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 start a key at POS1, end it at POS2 (origin 1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 In other words, column positions start at 1 rather than 0, so
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 we need to add 1 to group_col.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 command_line = "sort -f -k " + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 except Exception, exc:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 stop_err( 'Initialization error -> %s' %str(exc) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 error_code, stdout = commands.getstatusoutput(command_line)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 if error_code != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 prev_item = ""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 prev_vals = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 remaining_vals = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 skipped_lines = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 fout = open(outfile, "w")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 block_valid = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 for ii, line in enumerate( file( tmpfile.name )):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 if line and not line.startswith( '#' ) and len(line.split('\t')) >= 24: #Taxonomy datatype should have at least 24 columns
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 fields = line.split("\t")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 item = fields[group_col]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 if prev_item != "":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 # At this level, we're grouping on values (item and prev_item) in group_col
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 if item == prev_item:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 # Keep iterating and storing values until a new value is encountered.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 if block_valid:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 for i, col in enumerate(cols):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 if col >= 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 prev_vals[i].append(fields[col].strip())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 if len(set(prev_vals[i])) > 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 block_valid = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 When a new value is encountered, write the previous value and the
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 corresponding aggregate values into the output file. This works
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 due to the sort on group_col we've applied to the data above.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 out_list = ['']*24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 out_list[0] = str(prev_item)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 out_list[1] = str(prev_vals[0][0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 out_list[2] = str(prev_vals[1][0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 for k, col in enumerate(cols):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 if col >= 3 and col < 24:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 if len(set(prev_vals[k])) == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 out_list[col] = prev_vals[k][0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 while k < 23:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 out_list[k+1] = 'n'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 k += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 j = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 out_list.append(str(prev_vals[23+j][0]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 j += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 if rank_bound == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 print >>fout, '\t'.join(out_list).strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 print >>fout, '\t'.join(out_list).strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 block_valid = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 prev_item = item
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 prev_vals = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 for col in cols:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 val_list = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 val_list.append(fields[col].strip())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 prev_vals.append(val_list)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 # This only occurs once, right at the start of the iteration.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 block_valid = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 prev_item = item #groupby item
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 for col in cols: #everyting else
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 val_list = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 val_list.append(fields[col].strip())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 prev_vals.append(val_list)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 skipped_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 skipped_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 # Handle the last grouped value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 out_list = ['']*24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161 out_list[0] = str(prev_item)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 out_list[1] = str(prev_vals[0][0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 out_list[2] = str(prev_vals[1][0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 for k, col in enumerate(cols):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 if col >= 3 and col < 24:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167 if len(set(prev_vals[k])) == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 out_list[col] = prev_vals[k][0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171 while k < 23:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 out_list[k+1] = 'n'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173 k += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 j = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 out_list.append(str(prev_vals[23+j][0]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179 j += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183 if rank_bound == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184 print >>fout, '\t'.join(out_list).strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 print >>fout, '\t'.join(out_list).strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 if skipped_lines > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 print "Skipped %d invalid lines." % ( skipped_lines )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193 main()