Mercurial > repos > jjohnson > fastq_seq_count
comparison fastq_seq_count.py @ 0:27c39155d53b draft default tip
"planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/fastq_seq_count commit 7fcdca778df4012c93cb4aec26c2ff056817afee-dirty"
author | jjohnson |
---|---|
date | Tue, 26 Oct 2021 14:39:00 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:27c39155d53b |
---|---|
1 #!/usr/bin/env python | |
2 from __future__ import print_function | |
3 | |
4 import argparse | |
5 import multiprocessing as mp | |
6 import subprocess | |
7 from operator import itemgetter | |
8 | |
9 | |
# Module-level accumulator for worker results; appended to by the
# process_seq_count() pool callback (which runs in the parent process).
gresults = []
11 | |
12 | |
def revcomp(seq):
    """Return the reverse complement of a nucleotide sequence.

    Handles upper- and lower-case bases plus the IUPAC ambiguity codes
    R/Y, M/K, V/B and H/D; any other character is passed through unchanged.
    """
    complement = str.maketrans('ACGTacgtRYMKrymkVBHDvbhd',
                               'TGCAtgcaYRKMyrkmBVDHbvdh')
    return ''.join(reversed(seq.translate(complement)))
16 | |
17 | |
def get_count(pat, fastq):
    """Return the number of lines in *fastq* matching extended regex *pat*.

    Equivalent to ``grep -E PAT FILE | wc -l`` but runs grep directly with
    an argument list (shell=False) instead of interpolating *pat* and
    *fastq* into a ``bash -c`` string, which broke (and allowed command
    injection) whenever either contained quotes or shell metacharacters.
    ``grep -c`` prints the matching-line count itself, so no ``wc`` pipe
    is needed, and subprocess.run waits for and reaps the child (the old
    Popen was never waited on).

    Returns 0 when nothing matches or grep produced no output
    (e.g. the file does not exist).
    """
    proc = subprocess.run(['grep', '-c', '-E', pat, fastq],
                          stdout=subprocess.PIPE,
                          universal_newlines=True)
    output = proc.stdout.strip()
    return int(output) if output else 0
25 | |
26 | |
def get_seq_count(path, name, ln, seq, label, strand):
    """Count lines of *path* matching extended regex *seq*; runs in a pool worker.

    Parameters echo the query tuple built in main(): the fastq *path*, its
    display *name*, the 1-based query line number *ln*, the search pattern
    *seq*, its *label* (e.g. mutant/normal) and *strand* ('+' or '-').

    Returns the input tuple with the match count appended:
    (path, name, ln, seq, label, strand, n).

    Uses grep with an argument list (shell=False) instead of the previous
    string-built ``bash -c`` command, which mis-quoted (and allowed shell
    injection through) patterns or paths containing metacharacters;
    ``grep -c`` replaces the ``| wc -l`` pipe and subprocess.run reaps the
    child, which the old un-waited Popen leaked.
    """
    proc = subprocess.run(['grep', '-c', '-E', seq, path],
                          stdout=subprocess.PIPE,
                          universal_newlines=True)
    output = proc.stdout.strip()
    n = int(output) if output else 0
    return (path, name, ln, seq, label, strand, n)
36 | |
37 | |
def process_seq_count(result):
    """Pool callback: collect one result tuple from get_seq_count().

    apply_async callbacks execute in the parent process, so appending to
    the module-level ``gresults`` list is safe — no cross-process sharing
    is involved.

    Fixes: the previous body declared ``global results`` and ``global csh``,
    neither of which exists anywhere in the module (the real names are
    ``gresults`` and ``cfh``), and carried a dead commented-out block that
    even spelled the handle two different ways (csh/cfh). A ``global``
    statement is unnecessary for mutating (appending to) a module-level
    list, so both declarations and the dead code are removed.
    """
    gresults.append(result)
47 | |
48 | |
def main():
    """Count query-sequence occurrences across a set of fastq files.

    Reads a tab-separated query file (one search sequence per line, with
    optional identifier and comparison-sequence columns) and a
    tab-separated path file (path<TAB>name per line), greps every fastq
    for every query — optionally also its reverse complement and a
    comparison sequence — in a multiprocessing pool, then writes a
    per-query summary file and, optionally, a raw per-search counts file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--path_file', required=True,
                        help='file files paths and names')
    parser.add_argument('-i', '--query_file', required=True,
                        help='queries file')
    parser.add_argument('-s', '--summary', required=True,
                        help='summary report file')
    parser.add_argument('-n', '--counts', required=False,
                        help='counts')
    parser.add_argument('-I', '--id_col', required=False, default=None,
                        help='identifier column indices')
    parser.add_argument('-q', '--q_col', type=int, required=True,
                        help='column in queries for search sequence')
    parser.add_argument('-Q', '--q_label', required=False, default='mutant',
                        help='label for search sequence column')
    parser.add_argument('-c', '--c_col', type=int,
                        default=None, required=False,
                        help='column in queries for comparison sequence')
    parser.add_argument('-C', '--c_label', required=False, default='normal',
                        help='label for comparison sequence column')
    parser.add_argument('-r', '--reverse_complement',
                        action='store_true', default=False,
                        help='Also search for reverse complements')
    parser.add_argument('-P', '--per_file', action='store_true', default=False,
                        help='report per file')
    parser.add_argument('-T', '--threads', type=int, default=4, required=False,
                        help='threads')
    args = parser.parse_args()

    # --id_col is a comma-separated list of 0-based column indices used to
    # build each query's display identifier.
    id_col_list = []
    if args.id_col:
        id_col_list = [int(x) for x in str(args.id_col).split(',')]
    strands = ['+', '-'] if args.reverse_complement else ['+']
    labels = [args.q_label]
    if args.c_col is not None:
        labels.append(args.c_label)
    # Keyed by 0-based query-file line number (ln).
    ids = dict()
    qseqs = dict()
    cseqs = dict()
    # files maps display name -> fastq path.
    files = dict()
    with open(args.query_file, 'r') as fh:
        for ln, line in enumerate(fh):
            qnum = ln + 1
            fields = str(line).rstrip().split('\t')
            # Identifier = 1-based line number plus the selected id columns.
            # NOTE(review): 'id' shadows the builtin; kept as-is here.
            id = '\t'.join([str(qnum)] +
                           [fields[i] for i in id_col_list])
            ids[ln] = id
            qseqs[ln] = fields[args.q_col]
            if args.c_col is not None:
                cseqs[ln] = fields[args.c_col]

    with open(str(args.path_file), 'r') as fh:
        for ln, line in enumerate(fh):
            # Each line: path<TAB>name. Raises ValueError on malformed lines.
            path, name = str(line).rstrip().split('\t')
            files[name] = path

    # Build one work item per (file, query, label, strand) combination.
    queries = []
    for name, path in files.items():
        for i in range(len(qseqs)):
            ln = i + 1
            seq = qseqs[i]
            label = args.q_label
            strand = '+'
            queries.append((path, name, ln, seq, label, strand))
            if args.reverse_complement:
                strand = '-'
                queries.append((path, name, ln, revcomp(seq), label, strand))
            if i in cseqs:
                seq = cseqs[i]
                label = args.c_label
                strand = '+'
                queries.append((path, name, ln, seq, label, strand))
                if args.reverse_complement:
                    strand = '-'
                    queries.append((path, name, ln, revcomp(seq),
                                    label, strand))

    # Fan the greps out to a worker pool; process_seq_count (run in this
    # process) appends each result tuple to the module-level gresults list.
    pool = mp.Pool(args.threads)
    for i, query in enumerate(queries):
        # NOTE(review): args=(query) is NOT a one-element tuple — the parens
        # are redundant, so this is args=query. It works only because query
        # is itself a 6-tuple that apply_async unpacks into the six
        # get_seq_count parameters. Clearer would be args=query.
        pool.apply_async(get_seq_count,
                         args=(query),
                         callback=process_seq_count)
    pool.close()
    pool.join()

    # Optional raw counts file: one line per search, dropping the path
    # column (result[0]) from each result tuple.
    if args.counts:
        with open(args.counts, 'w') as cfh:
            for result in gresults:
                print('\t'.join([str(x) for x in result[1:]]), file=cfh)

    # count ln name label
    # Initialize counts[query_index][name][label][strand] = 0 so every
    # combination exists even if a worker result is missing.
    counts = dict()
    for i in range(len(ids)):
        counts[i] = dict()
        for name, path in files.items():
            counts[i][name] = dict()
            for j, label in enumerate(labels):
                counts[i][name][label] = dict()
                for strand in strands:
                    counts[i][name][label][strand] = 0

    # Sort by (query line, file name, label, strand) for deterministic
    # processing order, then record each count. ln in the result tuple is
    # 1-based, hence the ln-1 index into counts.
    results = sorted(gresults, key=itemgetter(2, 1, 4, 5))
    for i, result in enumerate(results):
        (path, name, ln, seq, label, strand, n) = result
        counts[ln-1][name][label][strand] = n

    # Summary report: one row per query — identifier, optional per-file
    # columns (count per label plus query fraction), then total counts per
    # label and the overall query-label fraction.
    with open(args.summary, 'w') as ofh:
        labels = [args.q_label]
        if args.c_col is not None:
            labels.append(args.c_label)
        for i in range(len(ids)):
            tcnts = [0, 0]
            print(ids[i], end='\t', file=ofh)
            for name, path in files.items():
                frac = 1.
                cnts = [0, 0]
                for j, label in enumerate(labels):
                    # Sum both strands for this file/label.
                    for strand in strands:
                        cnts[j] += counts[i][name][label][strand]
                    if args.per_file:
                        print(cnts[j], end='\t', file=ofh)
                if args.per_file:
                    # Fraction of hits carrying the query (first) label;
                    # reported as 1.0 when the file had no hits at all.
                    cnt = sum(cnts)
                    frac = float(cnts[0])/cnt if cnt else 1.0
                    print(frac, end='\t', file=ofh)
                tcnts[0] += cnts[0]
                tcnts[1] += cnts[1]
            for k, label in enumerate(labels):
                print(tcnts[k], end='\t', file=ofh)
            tcnt = sum(tcnts)
            frac = float(tcnts[0])/tcnt if tcnt else 1.0
            print(frac, end='\n', file=ofh)
182 | |
183 | |
184 if __name__ == "__main__": | |
185 main() |