comparison tools/taxonomy/lca.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2 #Guruprasad Ananda
3 """
4 Least Common Ancestor tool.
5 """
6 import sys, string, re, commands, tempfile, random
7
8 def stop_err(msg):
9 sys.stderr.write(msg)
10 sys.exit()
11
12 def main():
13 try:
14 inputfile = sys.argv[1]
15 outfile = sys.argv[2]
16 rank_bound = int( sys.argv[3] )
17 """
18 Mapping of ranks:
19 root :2,
20 superkingdom:3,
21 kingdom :4,
22 subkingdom :5,
23 superphylum :6,
24 phylum :7,
25 subphylum :8,
26 superclass :9,
27 class :10,
28 subclass :11,
29 superorder :12,
30 order :13,
31 suborder :14,
32 superfamily :15,
33 family :16,
34 subfamily :17,
35 tribe :18,
36 subtribe :19,
37 genus :20,
38 subgenus :21,
39 species :22,
40 subspecies :23,
41 """
42 except:
43 stop_err("Syntax error: Use correct syntax: program infile outfile")
44
45 fin = open(sys.argv[1],'r')
46 for j, line in enumerate( fin ):
47 elems = line.strip().split('\t')
48 if len(elems) < 24:
49 stop_err("The format of the input dataset is incorrect. Taxonomy datatype should contain at least 24 columns.")
50 if j > 30:
51 break
52 cols = range(1,len(elems))
53 fin.close()
54
55 group_col = 0
56 tmpfile = tempfile.NamedTemporaryFile()
57
58 try:
59 """
60 The -k option for the Posix sort command is as follows:
61 -k, --key=POS1[,POS2]
62 start a key at POS1, end it at POS2 (origin 1)
63 In other words, column positions start at 1 rather than 0, so
64 we need to add 1 to group_col.
65 if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
66 """
67 command_line = "sort -f -k " + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile
68 except Exception, exc:
69 stop_err( 'Initialization error -> %s' %str(exc) )
70
71 error_code, stdout = commands.getstatusoutput(command_line)
72
73 if error_code != 0:
74 stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
75
76 prev_item = ""
77 prev_vals = []
78 remaining_vals = []
79 skipped_lines = 0
80 fout = open(outfile, "w")
81 block_valid = False
82
83
84 for ii, line in enumerate( file( tmpfile.name )):
85 if line and not line.startswith( '#' ) and len(line.split('\t')) >= 24: #Taxonomy datatype should have at least 24 columns
86 line = line.rstrip( '\r\n' )
87 try:
88 fields = line.split("\t")
89 item = fields[group_col]
90 if prev_item != "":
91 # At this level, we're grouping on values (item and prev_item) in group_col
92 if item == prev_item:
93 # Keep iterating and storing values until a new value is encountered.
94 if block_valid:
95 for i, col in enumerate(cols):
96 if col >= 3:
97 prev_vals[i].append(fields[col].strip())
98 if len(set(prev_vals[i])) > 1:
99 block_valid = False
100 break
101
102 else:
103 """
104 When a new value is encountered, write the previous value and the
105 corresponding aggregate values into the output file. This works
106 due to the sort on group_col we've applied to the data above.
107 """
108 out_list = ['']*24
109 out_list[0] = str(prev_item)
110 out_list[1] = str(prev_vals[0][0])
111 out_list[2] = str(prev_vals[1][0])
112
113 for k, col in enumerate(cols):
114 if col >= 3 and col < 24:
115 if len(set(prev_vals[k])) == 1:
116 out_list[col] = prev_vals[k][0]
117 else:
118 break
119 while k < 23:
120 out_list[k+1] = 'n'
121 k += 1
122
123 j = 0
124 while True:
125 try:
126 out_list.append(str(prev_vals[23+j][0]))
127 j += 1
128 except:
129 break
130
131 if rank_bound == 0:
132 print >>fout, '\t'.join(out_list).strip()
133 else:
134 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
135 print >>fout, '\t'.join(out_list).strip()
136
137 block_valid = True
138 prev_item = item
139 prev_vals = []
140 for col in cols:
141 val_list = []
142 val_list.append(fields[col].strip())
143 prev_vals.append(val_list)
144
145 else:
146 # This only occurs once, right at the start of the iteration.
147 block_valid = True
148 prev_item = item #groupby item
149 for col in cols: #everyting else
150 val_list = []
151 val_list.append(fields[col].strip())
152 prev_vals.append(val_list)
153
154 except:
155 skipped_lines += 1
156 else:
157 skipped_lines += 1
158
159 # Handle the last grouped value
160 out_list = ['']*24
161 out_list[0] = str(prev_item)
162 out_list[1] = str(prev_vals[0][0])
163 out_list[2] = str(prev_vals[1][0])
164
165 for k, col in enumerate(cols):
166 if col >= 3 and col < 24:
167 if len(set(prev_vals[k])) == 1:
168 out_list[col] = prev_vals[k][0]
169 else:
170 break
171 while k < 23:
172 out_list[k+1] = 'n'
173 k += 1
174
175 j = 0
176 while True:
177 try:
178 out_list.append(str(prev_vals[23+j][0]))
179 j += 1
180 except:
181 break
182
183 if rank_bound == 0:
184 print >>fout, '\t'.join(out_list).strip()
185 else:
186 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
187 print >>fout, '\t'.join(out_list).strip()
188
189 if skipped_lines > 0:
190 print "Skipped %d invalid lines." % ( skipped_lines )
191
192 if __name__ == "__main__":
193 main()