0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
|
|
5 finds reads that only hit one taxonomic group. For example, consider the folliowing example:
|
|
6
|
|
7 read1 mammalia
|
|
8 read1 insecta
|
|
9 read2 insecta
|
|
10
|
|
11 in this case only read2 will be selected becuase it stays within insecta
|
|
12
|
|
13 This program takes the following options:
|
|
14
|
|
15 file in taxonomy format - dataset that complies with Galaxy's taxonomy format
|
|
16 id column - integer specifying the number of column containing seq id (starting with 1)
|
|
17 taxonomic ranks - a comma separated list of ranks from this list:
|
|
18
|
|
19 superkingdom
|
|
20 kingdom
|
|
21 subkingdom
|
|
22 superphylum
|
|
23 phylum
|
|
24 subphylum
|
|
25 superclass
|
|
26 class
|
|
27 subclass
|
|
28 superorder
|
|
29 order
|
|
30 suborder
|
|
31 superfamily
|
|
32 family
|
|
33 subfamily
|
|
34 tribe
|
|
35 subtribe
|
|
36 genus
|
|
37 subgenus
|
|
38 species
|
|
39 subspecies
|
|
40
|
|
41 output format - reads or counts
|
|
42
|
|
43 """
|
|
44
|
|
45 from galaxy import eggs
|
|
46 import pkg_resources
|
|
47 pkg_resources.require( 'pysqlite' )
|
|
48 from pysqlite2 import dbapi2 as sqlite
|
|
49 import string, sys, tempfile
|
|
50
|
|
51 # This dictionary maps taxonomic ranks to fields of Taxonomy file
|
|
52 taxRank = {
|
|
53 'root' :2,
|
|
54 'superkingdom':3,
|
|
55 'kingdom' :4,
|
|
56 'subkingdom' :5,
|
|
57 'superphylum' :6,
|
|
58 'phylum' :7,
|
|
59 'subphylum' :8,
|
|
60 'superclass' :9,
|
|
61 'class' :10,
|
|
62 'subclass' :11,
|
|
63 'superorder' :12,
|
|
64 'ord' :13,
|
|
65 'suborder' :14,
|
|
66 'superfamily' :15,
|
|
67 'family' :16,
|
|
68 'subfamily' :17,
|
|
69 'tribe' :18,
|
|
70 'subtribe' :19,
|
|
71 'genus' :20,
|
|
72 'subgenus' :21,
|
|
73 'species' :22,
|
|
74 'subspecies' :23,
|
|
75 'order' :13
|
|
76 }
|
|
77
|
|
78
|
|
79 def stop_err(msg):
|
|
80 sys.stderr.write(msg)
|
|
81 sys.exit()
|
|
82
|
|
83
|
|
84 db = tempfile.NamedTemporaryFile('w')
|
|
85
|
|
86 try:
|
|
87 con = sqlite.connect(db.name)
|
|
88 cur = con.cursor()
|
|
89 except:
|
|
90 stop_err('Cannot connect to %s\n') % db.name
|
|
91
|
|
92 try:
|
|
93 tax_file = open(sys.argv[1], 'r')
|
|
94 id_col = int(sys.argv[2]) - 1
|
|
95 taxa = string.split(sys.argv[3].rstrip(),',')
|
|
96
|
|
97 if sys.argv[4] == 'reads':
|
|
98 out_format = True
|
|
99 elif sys.argv[4] == 'counts':
|
|
100 out_format = False
|
|
101 else:
|
|
102 stop_err('Please specify "reads" or "counts" for output format\n')
|
|
103 out_file = open(sys.argv[5], 'w')
|
|
104
|
|
105 except:
|
|
106 stop_err('Check arguments\n')
|
|
107
|
|
108 if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
|
|
109
|
|
110 sql = ""
|
|
111 for i in range(len(taxa)):
|
|
112 if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
|
|
113 sql += '%s text, ' % taxa[i]
|
|
114
|
|
115 sql = sql.strip(', ')
|
|
116 sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
|
|
117
|
|
118
|
|
119 cur.execute(sql)
|
|
120
|
|
121 invalid_line_number = 0
|
|
122
|
|
123 try:
|
|
124 for line in tax_file:
|
|
125 fields = string.split(line.rstrip(), '\t')
|
|
126 if len(fields) < 24:
|
|
127 invalid_line_number += 1
|
|
128 continue # Skipping malformed taxonomy lines
|
|
129
|
|
130 val_string = '"' + fields[id_col] + '", '
|
|
131
|
|
132 for rank in taxa:
|
|
133 taxon = fields[taxRank[rank]]
|
|
134 val_string += '"%s", ' % taxon
|
|
135
|
|
136 val_string = val_string.strip(', ')
|
|
137 val_string = "insert into tax values(" + val_string + ")"
|
|
138 cur.execute(val_string)
|
|
139 except Exception, e:
|
|
140 stop_err('%s\n' % e)
|
|
141
|
|
142 tax_file.close()
|
|
143
|
|
144 try:
|
|
145 for rank in taxa:
|
|
146 cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank )
|
|
147 cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
|
|
148 cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
|
|
149 cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
|
|
150
|
|
151 if rank == 'ord':
|
|
152 rankName = 'order'
|
|
153 else:
|
|
154 rankName = rank
|
|
155
|
|
156 if out_format:
|
|
157 cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
|
|
158 for item in cur.fetchall():
|
|
159 out_string = '%s\t%s\t' % ( item[0], item[1] )
|
|
160 out_string += rankName
|
|
161 print >>out_file, out_string
|
|
162 else:
|
|
163 cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
|
|
164 for item in cur.fetchall():
|
|
165 out_string = '%s\t%s\t' % ( item[0], item[1] )
|
|
166 out_string += rankName
|
|
167 print >>out_file, out_string
|
|
168 except Exception, e:
|
|
169 stop_err("%s\n" % e)
|
|
170
|