Mercurial > repos > xuebing > sharplabtool
comparison tools/ilmn_pacbio/quake.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 from optparse import OptionParser, SUPPRESS_HELP | |
3 import os, random, sys | |
4 import cov_model | |
5 | |
6 ############################################################ | |
7 # quake.py | |
8 # | |
9 # Launch pipeline to correct errors in Illumina sequencing | |
10 # reads. | |
11 ############################################################ | |
12 | |
13 #r_dir = '/nfshomes/dakelley/research/error_correction/bin' | |
14 quake_dir = os.path.abspath(os.path.dirname(sys.argv[0])) | |
15 | |
16 ############################################################ | |
17 # main | |
18 ############################################################ | |
19 def main(): | |
20 usage = 'usage: %prog [options]' | |
21 parser = OptionParser(usage) | |
22 parser.add_option('-r', dest='readsf', help='Fastq file of reads') | |
23 parser.add_option('-f', dest='reads_listf', help='File containing fastq file names, one per line or two per line for paired end reads.') | |
24 parser.add_option('-k', dest='k', type='int', help='Size of k-mers to correct') | |
25 parser.add_option('-p', dest='proc', type='int', default=4, help='Number of processes [default: %default]') | |
26 parser.add_option('-q', dest='quality_scale', type='int', default=-1, help='Quality value ascii scale, generally 64 or 33. If not specified, it will guess.') | |
27 parser.add_option('--no_count', dest='no_count', action='store_true', default=False, help='Kmers are already counted and in expected file [reads file].qcts or [reads file].cts [default: %default]') | |
28 parser.add_option('--no_cut', dest='no_cut', action='store_true', default=False, help='Coverage model is optimized and cutoff was printed to expected file cutoff.txt [default: %default]') | |
29 parser.add_option('--int', dest='counted_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') | |
30 parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff. Generally set between 10-1000 with lower numbers suggesting a lower threshold. [default: %default]') | |
31 # help='Model kmer coverage as a function of GC content of kmers [default: %default]' | |
32 parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) | |
33 parser.add_option('--headers', action='store_true', default=False, help='Output original read headers (i.e. pass --headers to correct)' ) | |
34 (options, args) = parser.parse_args() | |
35 | |
36 if not options.readsf and not options.reads_listf: | |
37 parser.error('Must provide fastq file of reads with -r or file with list of fastq files of reads with -f') | |
38 if not options.k: | |
39 parser.error('Must provide k-mer size with -k') | |
40 if options.quality_scale == -1: | |
41 options.quality_scale = guess_quality_scale(options.readsf, options.reads_listf) | |
42 | |
43 if options.counted_kmers: | |
44 cts_suf = 'cts' | |
45 else: | |
46 cts_suf = 'qcts' | |
47 if options.readsf: | |
48 ctsf = '%s.%s' % (os.path.splitext( os.path.split(options.readsf)[1] )[0], cts_suf) | |
49 reads_str = '-r %s' % options.readsf | |
50 else: | |
51 ctsf = '%s.%s' % (os.path.split(options.reads_listf)[1], cts_suf) | |
52 reads_str = '-f %s' % options.reads_listf | |
53 | |
54 if not options.no_count and not options.no_cut: | |
55 count_kmers(options.readsf, options.reads_listf, options.k, ctsf, options.quality_scale) | |
56 | |
57 if not options.no_cut: | |
58 # model coverage | |
59 if options.counted_kmers: | |
60 cov_model.model_cutoff(ctsf, options.ratio) | |
61 else: | |
62 if options.model_gc: | |
63 cov_model.model_q_gc_cutoffs(ctsf, 10000, options.ratio) | |
64 else: | |
65 cov_model.model_q_cutoff(ctsf, 25000, options.ratio) | |
66 | |
67 | |
68 if options.model_gc: | |
69 # run correct C++ code | |
70 os.system('%s/correct %s -k %d -m %s -a cutoffs.gc.txt -p %d -q %d' % (quake_dir,reads_str, options.k, ctsf, options.proc, options.quality_scale)) | |
71 | |
72 else: | |
73 cutoff = open('cutoff.txt').readline().rstrip() | |
74 | |
75 # run correct C++ code | |
76 headers = '--headers' if options.headers else '' | |
77 os.system('%s/correct %s %s -k %d -m %s -c %s -p %d -q %d' % (quake_dir,headers, reads_str, options.k, ctsf, cutoff, options.proc, options.quality_scale)) | |
78 | |
79 | |
80 ################################################################################ | |
81 # guess_quality_scale | |
82 # Guess at ascii scale of quality values by examining | |
83 # a bunch of reads and looking for quality values < 64, | |
84 # in which case we set it to 33. | |
85 ################################################################################ | |
86 def guess_quality_scale(readsf, reads_listf): | |
87 reads_to_check = 1000 | |
88 if not readsf: | |
89 readsf = open(reads_listf).readline().split()[0] | |
90 | |
91 fqf = open(readsf) | |
92 reads_checked = 0 | |
93 header = fqf.readline() | |
94 while header and reads_checked < reads_to_check: | |
95 seq = fqf.readline() | |
96 mid = fqf.readline() | |
97 qual = fqf.readline().rstrip() | |
98 reads_checked += 1 | |
99 for q in qual: | |
100 if ord(q) < 64: | |
101 print 'Guessing quality values are on ascii 33 scale' | |
102 return 33 | |
103 header = fqf.readline() | |
104 | |
105 print 'Guessing quality values are on ascii 64 scale' | |
106 return 64 | |
107 | |
108 | |
109 | |
110 ############################################################ | |
111 # count_kmers | |
112 # | |
113 # Count kmers in the reads file using AMOS count-kmers or | |
114 # count-qmers | |
115 ############################################################ | |
116 def count_kmers(readsf, reads_listf, k, ctsf, quality_scale): | |
117 # find files | |
118 fq_files = [] | |
119 if readsf: | |
120 fq_files.append(readsf) | |
121 else: | |
122 for line in open(reads_listf): | |
123 for fqf in line.split(): | |
124 fq_files.append(fqf) | |
125 | |
126 if ctsf[-4:] == 'qcts': | |
127 os.system('cat %s | %s/count-qmers -k %d -q %d > %s' % (' '.join(fq_files), quake_dir, k, quality_scale, ctsf)) | |
128 else: | |
129 os.system('cat %s | %s/count-kmers -k %d > %s' % (' '.join(fq_files), quake_dir, k, ctsf)) | |
130 | |
131 | |
132 ############################################################ | |
133 # __main__ | |
134 ############################################################ | |
135 if __name__ == '__main__': | |
136 main() |