Mercurial > repos > pjbriggs > pal_finder
comparison fastq_subset.py @ 8:4e625d3672ba draft
Pal_finder tool version 0.02.04.7: add detection/reporting of bad ranges; enable subset of reads to be used; check n-mers.
author | pjbriggs |
---|---|
date | Wed, 16 May 2018 07:39:16 -0400 |
parents | |
children | 52dbe2089d14 |
comparison
equal
deleted
inserted
replaced
7:5e133b7b79a6 | 8:4e625d3672ba |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import random | |
5 from Bio.SeqIO.QualityIO import FastqGeneralIterator | |
6 | |
7 def count_reads(fastq): | |
8 """ | |
9 Count number of reads in a Fastq file | |
10 """ | |
11 n = 0 | |
12 with open(fastq,'r') as fq: | |
13 while True: | |
14 buf = fq.read() | |
15 n += buf.count('\n') | |
16 if buf == "": break | |
17 return n/4 | |
18 | |
19 def fastq_subset(fastq_in,fastq_out,indices): | |
20 """ | |
21 Output a subset of reads from a Fastq file | |
22 | |
23 The reads to output are specifed by a list | |
24 of integer indices; only reads at those | |
25 positions in the input file will be written | |
26 to the output. | |
27 """ | |
28 with open(fastq_in,'r') as fq_in: | |
29 fq_out = open(fastq_out,'w') | |
30 i = 0 | |
31 for title,seq,qual in FastqGeneralIterator(fq_in): | |
32 if i in indices: | |
33 fq_out.write("@%s\n%s\n+\n%s\n" % (title, | |
34 seq, | |
35 qual)) | |
36 i += 1 | |
37 fq_out.close() | |
38 | |
39 if __name__ == "__main__": | |
40 | |
41 p = argparse.ArgumentParser() | |
42 p.add_argument("fastq_r1") | |
43 p.add_argument("fastq_r2") | |
44 p.add_argument("-n", | |
45 dest="subset_size", | |
46 default=None, | |
47 help="subset size") | |
48 p.add_argument("-s", | |
49 dest="seed", | |
50 type=int, | |
51 default=None, | |
52 help="seed for random number generator") | |
53 args = p.parse_args() | |
54 | |
55 print "Processing fastq pair:" | |
56 print "\t%s" % args.fastq_r1 | |
57 print "\t%s" % args.fastq_r2 | |
58 | |
59 nreads = count_reads(args.fastq_r1) | |
60 print "Counted %d reads in %s" % (nreads,args.fastq_r1) | |
61 | |
62 if args.subset_size is not None: | |
63 subset_size = float(args.subset_size) | |
64 if subset_size < 1.0: | |
65 subset_size = int(nreads*subset_size) | |
66 else: | |
67 subset_size = int(subset_size) | |
68 print "Extracting subset of reads: %s" % subset_size | |
69 if args.seed is not None: | |
70 print "Random number generator seed: %d" % args.seed | |
71 random.seed(args.seed) | |
72 subset = random.sample(xrange(nreads),subset_size) | |
73 fastq_subset(args.fastq_r1,"subset_r1.fq",subset) | |
74 fastq_subset(args.fastq_r2,"subset_r2.fq",subset) |