Mercurial > repos > xuebing > sharplabtool
comparison tools/ilmn_pacbio/quake.py @ 0:9071e359b9a3
Uploaded
| author | xuebing | 
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:9071e359b9a3 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from optparse import OptionParser, SUPPRESS_HELP | |
| 3 import os, random, sys | |
| 4 import cov_model | |
| 5 | |
| 6 ############################################################ | |
| 7 # quake.py | |
| 8 # | |
| 9 # Launch pipeline to correct errors in Illumina sequencing | |
| 10 # reads. | |
| 11 ############################################################ | |
| 12 | |
| 13 #r_dir = '/nfshomes/dakelley/research/error_correction/bin' | |
| 14 quake_dir = os.path.abspath(os.path.dirname(sys.argv[0])) | |
| 15 | |
| 16 ############################################################ | |
| 17 # main | |
| 18 ############################################################ | |
| 19 def main(): | |
| 20 usage = 'usage: %prog [options]' | |
| 21 parser = OptionParser(usage) | |
| 22 parser.add_option('-r', dest='readsf', help='Fastq file of reads') | |
| 23 parser.add_option('-f', dest='reads_listf', help='File containing fastq file names, one per line or two per line for paired end reads.') | |
| 24 parser.add_option('-k', dest='k', type='int', help='Size of k-mers to correct') | |
| 25 parser.add_option('-p', dest='proc', type='int', default=4, help='Number of processes [default: %default]') | |
| 26 parser.add_option('-q', dest='quality_scale', type='int', default=-1, help='Quality value ascii scale, generally 64 or 33. If not specified, it will guess.') | |
| 27 parser.add_option('--no_count', dest='no_count', action='store_true', default=False, help='Kmers are already counted and in expected file [reads file].qcts or [reads file].cts [default: %default]') | |
| 28 parser.add_option('--no_cut', dest='no_cut', action='store_true', default=False, help='Coverage model is optimized and cutoff was printed to expected file cutoff.txt [default: %default]') | |
| 29 parser.add_option('--int', dest='counted_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') | |
| 30 parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff. Generally set between 10-1000 with lower numbers suggesting a lower threshold. [default: %default]') | |
| 31 # help='Model kmer coverage as a function of GC content of kmers [default: %default]' | |
| 32 parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) | |
| 33 parser.add_option('--headers', action='store_true', default=False, help='Output original read headers (i.e. pass --headers to correct)' ) | |
| 34 (options, args) = parser.parse_args() | |
| 35 | |
| 36 if not options.readsf and not options.reads_listf: | |
| 37 parser.error('Must provide fastq file of reads with -r or file with list of fastq files of reads with -f') | |
| 38 if not options.k: | |
| 39 parser.error('Must provide k-mer size with -k') | |
| 40 if options.quality_scale == -1: | |
| 41 options.quality_scale = guess_quality_scale(options.readsf, options.reads_listf) | |
| 42 | |
| 43 if options.counted_kmers: | |
| 44 cts_suf = 'cts' | |
| 45 else: | |
| 46 cts_suf = 'qcts' | |
| 47 if options.readsf: | |
| 48 ctsf = '%s.%s' % (os.path.splitext( os.path.split(options.readsf)[1] )[0], cts_suf) | |
| 49 reads_str = '-r %s' % options.readsf | |
| 50 else: | |
| 51 ctsf = '%s.%s' % (os.path.split(options.reads_listf)[1], cts_suf) | |
| 52 reads_str = '-f %s' % options.reads_listf | |
| 53 | |
| 54 if not options.no_count and not options.no_cut: | |
| 55 count_kmers(options.readsf, options.reads_listf, options.k, ctsf, options.quality_scale) | |
| 56 | |
| 57 if not options.no_cut: | |
| 58 # model coverage | |
| 59 if options.counted_kmers: | |
| 60 cov_model.model_cutoff(ctsf, options.ratio) | |
| 61 else: | |
| 62 if options.model_gc: | |
| 63 cov_model.model_q_gc_cutoffs(ctsf, 10000, options.ratio) | |
| 64 else: | |
| 65 cov_model.model_q_cutoff(ctsf, 25000, options.ratio) | |
| 66 | |
| 67 | |
| 68 if options.model_gc: | |
| 69 # run correct C++ code | |
| 70 os.system('%s/correct %s -k %d -m %s -a cutoffs.gc.txt -p %d -q %d' % (quake_dir,reads_str, options.k, ctsf, options.proc, options.quality_scale)) | |
| 71 | |
| 72 else: | |
| 73 cutoff = open('cutoff.txt').readline().rstrip() | |
| 74 | |
| 75 # run correct C++ code | |
| 76 headers = '--headers' if options.headers else '' | |
| 77 os.system('%s/correct %s %s -k %d -m %s -c %s -p %d -q %d' % (quake_dir,headers, reads_str, options.k, ctsf, cutoff, options.proc, options.quality_scale)) | |
| 78 | |
| 79 | |
| 80 ################################################################################ | |
| 81 # guess_quality_scale | |
| 82 # Guess at ascii scale of quality values by examining | |
| 83 # a bunch of reads and looking for quality values < 64, | |
| 84 # in which case we set it to 33. | |
| 85 ################################################################################ | |
| 86 def guess_quality_scale(readsf, reads_listf): | |
| 87 reads_to_check = 1000 | |
| 88 if not readsf: | |
| 89 readsf = open(reads_listf).readline().split()[0] | |
| 90 | |
| 91 fqf = open(readsf) | |
| 92 reads_checked = 0 | |
| 93 header = fqf.readline() | |
| 94 while header and reads_checked < reads_to_check: | |
| 95 seq = fqf.readline() | |
| 96 mid = fqf.readline() | |
| 97 qual = fqf.readline().rstrip() | |
| 98 reads_checked += 1 | |
| 99 for q in qual: | |
| 100 if ord(q) < 64: | |
| 101 print 'Guessing quality values are on ascii 33 scale' | |
| 102 return 33 | |
| 103 header = fqf.readline() | |
| 104 | |
| 105 print 'Guessing quality values are on ascii 64 scale' | |
| 106 return 64 | |
| 107 | |
| 108 | |
| 109 | |
| 110 ############################################################ | |
| 111 # count_kmers | |
| 112 # | |
| 113 # Count kmers in the reads file using AMOS count-kmers or | |
| 114 # count-qmers | |
| 115 ############################################################ | |
| 116 def count_kmers(readsf, reads_listf, k, ctsf, quality_scale): | |
| 117 # find files | |
| 118 fq_files = [] | |
| 119 if readsf: | |
| 120 fq_files.append(readsf) | |
| 121 else: | |
| 122 for line in open(reads_listf): | |
| 123 for fqf in line.split(): | |
| 124 fq_files.append(fqf) | |
| 125 | |
| 126 if ctsf[-4:] == 'qcts': | |
| 127 os.system('cat %s | %s/count-qmers -k %d -q %d > %s' % (' '.join(fq_files), quake_dir, k, quality_scale, ctsf)) | |
| 128 else: | |
| 129 os.system('cat %s | %s/count-kmers -k %d > %s' % (' '.join(fq_files), quake_dir, k, ctsf)) | |
| 130 | |
| 131 | |
| 132 ############################################################ | |
| 133 # __main__ | |
| 134 ############################################################ | |
| 135 if __name__ == '__main__': | |
| 136 main() | 
