Mercurial > repos > iuc > khmer_partition
comparison filter-below-abund.py @ 5:766ec3dad7ff draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/khmer commit e0cd7ae10ce97bed51594e7cc0b969a803d698b7
| author | iuc |
|---|---|
| date | Fri, 07 Sep 2018 11:02:11 -0400 |
| parents | c393687bd857 |
| children |
comparison
equal
deleted
inserted
replaced
| 4:c393687bd857 | 5:766ec3dad7ff |
|---|---|
| 36 from __future__ import print_function | 36 from __future__ import print_function |
| 37 | 37 |
| 38 import os | 38 import os |
| 39 import sys | 39 import sys |
| 40 | 40 |
| 41 import khmer | 41 import screed |
| 42 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_fasta_iter | 42 from khmer import Countgraph, ReadParser |
| 43 from khmer.utils import (broken_paired_reader, write_record) | |
| 43 | 44 |
| 44 WORKER_THREADS = 8 | |
| 45 GROUPSIZE = 100 | |
| 46 CUTOFF = 50 | 45 CUTOFF = 50 |
| 47 | 46 |
| 48 | 47 |
| 49 def main(): | 48 def main(): |
| 50 counting_ht = sys.argv[1] | 49 counting_ht = sys.argv[1] |
| 51 infiles = sys.argv[2:] | 50 infiles = sys.argv[2:] |
| 52 | 51 |
| 53 print('file with ht: %s' % counting_ht) | 52 print('file with ht: %s' % counting_ht) |
| 54 print('-- settings:') | |
| 55 print('N THREADS', WORKER_THREADS) | |
| 56 print('--') | |
| 57 | 53 |
| 58 print('making hashtable') | 54 print('making hashtable') |
| 59 ht = khmer.load_countgraph(counting_ht) | 55 ht = Countgraph.load(counting_ht) |
| 60 K = ht.ksize() | 56 K = ht.ksize() |
| 61 | 57 |
| 62 for infile in infiles: | 58 for infile in infiles: |
| 63 print('filtering', infile) | 59 print('filtering', infile) |
| 64 outfile = os.path.basename(infile) + '.below' | 60 outfile = os.path.basename(infile) + '.below' |
| 65 | 61 |
| 66 outfp = open(outfile, 'w') | 62 outfp = open(outfile, 'w') |
| 67 | 63 |
| 68 def process_fn(record, ht=ht): | 64 paired_iter = broken_paired_reader(ReadParser(infile), min_length=K, |
| 69 name = record['name'] | 65 force_single=True) |
| 70 seq = record['sequence'] | 66 for n, is_pair, read1, read2 in paired_iter: |
| 67 name = read1.name | |
| 68 seq = read1.sequence | |
| 71 if 'N' in seq: | 69 if 'N' in seq: |
| 72 return None, None | 70 return None, None |
| 73 | 71 |
| 74 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) | 72 trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF) |
| 75 | 73 |
| 76 if trim_at >= K: | 74 if trim_at >= K: |
| 77 return name, trim_seq | 75 write_record(screed.Record(name=name, sequence=trim_seq), outfp) |
| 78 | |
| 79 return None, None | |
| 80 | |
| 81 tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) | |
| 82 | |
| 83 tsp.start(verbose_fasta_iter(infile), outfp) | |
| 84 | 76 |
| 85 | 77 |
| 86 if __name__ == '__main__': | 78 if __name__ == '__main__': |
| 87 main() | 79 main() |
