Mercurial > repos > devteam > find_diag_hits
comparison find_diag_hits.py @ 0:acf51ff24c7d draft default tip
Imported from capsule None
author | devteam |
---|---|
date | Mon, 27 Jan 2014 09:25:21 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acf51ff24c7d |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file> | |
5 finds reads that only hit one taxonomic group. For example, consider the folliowing example: | |
6 | |
7 read1 mammalia | |
8 read1 insecta | |
9 read2 insecta | |
10 | |
11 in this case only read2 will be selected becuase it stays within insecta | |
12 | |
13 This program takes the following options: | |
14 | |
15 file in taxonomy format - dataset that complies with Galaxy's taxonomy format | |
16 id column - integer specifying the number of column containing seq id (starting with 1) | |
17 taxonomic ranks - a comma separated list of ranks from this list: | |
18 | |
19 superkingdom | |
20 kingdom | |
21 subkingdom | |
22 superphylum | |
23 phylum | |
24 subphylum | |
25 superclass | |
26 class | |
27 subclass | |
28 superorder | |
29 order | |
30 suborder | |
31 superfamily | |
32 family | |
33 subfamily | |
34 tribe | |
35 subtribe | |
36 genus | |
37 subgenus | |
38 species | |
39 subspecies | |
40 | |
41 output format - reads or counts | |
42 | |
43 """ | |
44 | |
45 from galaxy import eggs | |
46 import pkg_resources | |
47 pkg_resources.require( 'pysqlite' ) | |
48 from pysqlite2 import dbapi2 as sqlite | |
49 import string, sys, tempfile | |
50 | |
51 # This dictionary maps taxonomic ranks to fields of Taxonomy file | |
52 taxRank = { | |
53 'root' :2, | |
54 'superkingdom':3, | |
55 'kingdom' :4, | |
56 'subkingdom' :5, | |
57 'superphylum' :6, | |
58 'phylum' :7, | |
59 'subphylum' :8, | |
60 'superclass' :9, | |
61 'class' :10, | |
62 'subclass' :11, | |
63 'superorder' :12, | |
64 'ord' :13, | |
65 'suborder' :14, | |
66 'superfamily' :15, | |
67 'family' :16, | |
68 'subfamily' :17, | |
69 'tribe' :18, | |
70 'subtribe' :19, | |
71 'genus' :20, | |
72 'subgenus' :21, | |
73 'species' :22, | |
74 'subspecies' :23, | |
75 'order' :13 | |
76 } | |
77 | |
78 | |
79 def stop_err(msg): | |
80 sys.stderr.write(msg) | |
81 sys.exit() | |
82 | |
83 | |
84 db = tempfile.NamedTemporaryFile('w') | |
85 | |
86 try: | |
87 con = sqlite.connect(db.name) | |
88 cur = con.cursor() | |
89 except: | |
90 stop_err('Cannot connect to %s\n') % db.name | |
91 | |
92 try: | |
93 tax_file = open(sys.argv[1], 'r') | |
94 id_col = int(sys.argv[2]) - 1 | |
95 taxa = string.split(sys.argv[3].rstrip(),',') | |
96 | |
97 if sys.argv[4] == 'reads': | |
98 out_format = True | |
99 elif sys.argv[4] == 'counts': | |
100 out_format = False | |
101 else: | |
102 stop_err('Please specify "reads" or "counts" for output format\n') | |
103 out_file = open(sys.argv[5], 'w') | |
104 | |
105 except: | |
106 stop_err('Check arguments\n') | |
107 | |
108 if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n') | |
109 | |
110 sql = "" | |
111 for i in range(len(taxa)): | |
112 if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order' | |
113 sql += '%s text, ' % taxa[i] | |
114 | |
115 sql = sql.strip(', ') | |
116 sql = 'create table tax (name varchar(50) not null, ' + sql + ')' | |
117 | |
118 | |
119 cur.execute(sql) | |
120 | |
121 invalid_line_number = 0 | |
122 | |
123 try: | |
124 for line in tax_file: | |
125 fields = string.split(line.rstrip(), '\t') | |
126 if len(fields) < 24: | |
127 invalid_line_number += 1 | |
128 continue # Skipping malformed taxonomy lines | |
129 | |
130 val_string = '"' + fields[id_col] + '", ' | |
131 | |
132 for rank in taxa: | |
133 taxon = fields[taxRank[rank]] | |
134 val_string += '"%s", ' % taxon | |
135 | |
136 val_string = val_string.strip(', ') | |
137 val_string = "insert into tax values(" + val_string + ")" | |
138 cur.execute(val_string) | |
139 except Exception, e: | |
140 stop_err('%s\n' % e) | |
141 | |
142 tax_file.close() | |
143 | |
144 try: | |
145 for rank in taxa: | |
146 cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank ) | |
147 cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) ) | |
148 cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank) | |
149 cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) ) | |
150 | |
151 if rank == 'ord': | |
152 rankName = 'order' | |
153 else: | |
154 rankName = rank | |
155 | |
156 if out_format: | |
157 cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank) | |
158 for item in cur.fetchall(): | |
159 out_string = '%s\t%s\t' % ( item[0], item[1] ) | |
160 out_string += rankName | |
161 print >>out_file, out_string | |
162 else: | |
163 cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank) | |
164 for item in cur.fetchall(): | |
165 out_string = '%s\t%s\t' % ( item[0], item[1] ) | |
166 out_string += rankName | |
167 print >>out_file, out_string | |
168 except Exception, e: | |
169 stop_err("%s\n" % e) | |
170 |