comparison find_diag_hits.py @ 0:acf51ff24c7d draft default tip

Imported from capsule None
author devteam
date Mon, 27 Jan 2014 09:25:21 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acf51ff24c7d
1 #!/usr/bin/env python
2
3 """
4 tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
5 finds reads that only hit one taxonomic group. For example, consider the folliowing example:
6
7 read1 mammalia
8 read1 insecta
9 read2 insecta
10
11 in this case only read2 will be selected becuase it stays within insecta
12
13 This program takes the following options:
14
15 file in taxonomy format - dataset that complies with Galaxy's taxonomy format
16 id column - integer specifying the number of column containing seq id (starting with 1)
17 taxonomic ranks - a comma separated list of ranks from this list:
18
19 superkingdom
20 kingdom
21 subkingdom
22 superphylum
23 phylum
24 subphylum
25 superclass
26 class
27 subclass
28 superorder
29 order
30 suborder
31 superfamily
32 family
33 subfamily
34 tribe
35 subtribe
36 genus
37 subgenus
38 species
39 subspecies
40
41 output format - reads or counts
42
43 """
44
45 from galaxy import eggs
46 import pkg_resources
47 pkg_resources.require( 'pysqlite' )
48 from pysqlite2 import dbapi2 as sqlite
49 import string, sys, tempfile
50
51 # This dictionary maps taxonomic ranks to fields of Taxonomy file
52 taxRank = {
53 'root' :2,
54 'superkingdom':3,
55 'kingdom' :4,
56 'subkingdom' :5,
57 'superphylum' :6,
58 'phylum' :7,
59 'subphylum' :8,
60 'superclass' :9,
61 'class' :10,
62 'subclass' :11,
63 'superorder' :12,
64 'ord' :13,
65 'suborder' :14,
66 'superfamily' :15,
67 'family' :16,
68 'subfamily' :17,
69 'tribe' :18,
70 'subtribe' :19,
71 'genus' :20,
72 'subgenus' :21,
73 'species' :22,
74 'subspecies' :23,
75 'order' :13
76 }
77
78
79 def stop_err(msg):
80 sys.stderr.write(msg)
81 sys.exit()
82
83
84 db = tempfile.NamedTemporaryFile('w')
85
86 try:
87 con = sqlite.connect(db.name)
88 cur = con.cursor()
89 except:
90 stop_err('Cannot connect to %s\n') % db.name
91
92 try:
93 tax_file = open(sys.argv[1], 'r')
94 id_col = int(sys.argv[2]) - 1
95 taxa = string.split(sys.argv[3].rstrip(),',')
96
97 if sys.argv[4] == 'reads':
98 out_format = True
99 elif sys.argv[4] == 'counts':
100 out_format = False
101 else:
102 stop_err('Please specify "reads" or "counts" for output format\n')
103 out_file = open(sys.argv[5], 'w')
104
105 except:
106 stop_err('Check arguments\n')
107
108 if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
109
110 sql = ""
111 for i in range(len(taxa)):
112 if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
113 sql += '%s text, ' % taxa[i]
114
115 sql = sql.strip(', ')
116 sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
117
118
119 cur.execute(sql)
120
121 invalid_line_number = 0
122
123 try:
124 for line in tax_file:
125 fields = string.split(line.rstrip(), '\t')
126 if len(fields) < 24:
127 invalid_line_number += 1
128 continue # Skipping malformed taxonomy lines
129
130 val_string = '"' + fields[id_col] + '", '
131
132 for rank in taxa:
133 taxon = fields[taxRank[rank]]
134 val_string += '"%s", ' % taxon
135
136 val_string = val_string.strip(', ')
137 val_string = "insert into tax values(" + val_string + ")"
138 cur.execute(val_string)
139 except Exception, e:
140 stop_err('%s\n' % e)
141
142 tax_file.close()
143
144 try:
145 for rank in taxa:
146 cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank )
147 cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
148 cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
149 cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
150
151 if rank == 'ord':
152 rankName = 'order'
153 else:
154 rankName = rank
155
156 if out_format:
157 cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
158 for item in cur.fetchall():
159 out_string = '%s\t%s\t' % ( item[0], item[1] )
160 out_string += rankName
161 print >>out_file, out_string
162 else:
163 cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
164 for item in cur.fetchall():
165 out_string = '%s\t%s\t' % ( item[0], item[1] )
166 out_string += rankName
167 print >>out_file, out_string
168 except Exception, e:
169 stop_err("%s\n" % e)
170