comparison tools/fastq_filter_by_id/fastq_filter_by_id.py @ 3:e0041942a12d draft default tip

v0.0.5 - galaxy_sequence_utils dependency and other cleanups inc using MIT license
author peterjc
date Fri, 03 Feb 2017 05:34:18 -0500
parents
children
comparison
equal deleted inserted replaced
2:d570cc324779 3:e0041942a12d
1 #!/usr/bin/env python
2 """Filter a FASTQ file with IDs from a tabular file, e.g. from BLAST.
3
4 NOTE - This script is now OBSOLETE, having been replaced by a new verion
5 which handles FASTA, FASTQ and SFF all in one.
6
7 Takes five command line options, tabular filename, ID column numbers
8 (comma separated list using one based counting), input FASTA filename, and
9 two output FASTA filenames (for records with and without the given IDs).
10
11 If either output filename is just a minus sign, that file is not created.
12 This is intended to allow output for just the matched (or just the non-matched)
13 records.
14
15 Note in the default NCBI BLAST+ tabular output, the query sequence ID is
16 in column one, and the ID of the match from the database is in column two.
17 Here sensible values for the column numbers would therefore be "1" or "2".
18
19 This tool is copyright 2010-2017 by Peter Cock, The James Hutton Institute
20 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
21 See accompanying text file for licence details (MIT license).
22 """
23 import sys
24
25 if "-v" in sys.argv or "--version" in sys.argv:
26 print "v0.0.5"
27 sys.exit(0)
28
29 from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
30
31 # Parse Command Line
32 try:
33 tabular_file, cols_arg, in_file, out_positive_file, out_negative_file = sys.argv[1:]
34 except ValueError:
35 sys.exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
36 try:
37 columns = [int(arg)-1 for arg in cols_arg.split(",")]
38 except ValueError:
39 sys.exit("Expected list of columns (comma separated integers), got %s" % cols_arg)
40
41 # Read tabular file and record all specified identifiers
42 ids = set()
43 handle = open(tabular_file, "rU")
44 if len(columns) > 1:
45 # General case of many columns
46 for line in handle:
47 if line.startswith("#"):
48 # Ignore comments
49 continue
50 parts = line.rstrip("\n").split("\t")
51 for col in columns:
52 ids.add(parts[col])
53 print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns))
54 else:
55 # Single column, special case speed up
56 col = columns[0]
57 for line in handle:
58 if not line.startswith("#"):
59 ids.add(line.rstrip("\n").split("\t")[col])
60 print "Using %i IDs from tabular file" % (len(ids))
61 handle.close()
62
63 # Write filtered FASTQ file based on IDs from tabular file
64 reader = fastqReader(open(in_file, "rU"))
65 if out_positive_file != "-" and out_negative_file != "-":
66 print "Generating two FASTQ files"
67 positive_writer = fastqWriter(open(out_positive_file, "w"))
68 negative_writer = fastqWriter(open(out_negative_file, "w"))
69 for record in reader:
70 # The [1:] is because the fastaReader leaves the @ on the identifer.
71 if record.identifier and record.identifier.split()[0][1:] in ids:
72 positive_writer.write(record)
73 else:
74 negative_writer.write(record)
75 positive_writer.close()
76 negative_writer.close()
77 elif out_positive_file != "-":
78 print "Generating matching FASTQ file"
79 positive_writer = fastqWriter(open(out_positive_file, "w"))
80 for record in reader:
81 # The [1:] is because the fastaReader leaves the @ on the identifer.
82 if record.identifier and record.identifier.split()[0][1:] in ids:
83 positive_writer.write(record)
84 positive_writer.close()
85 elif out_negative_file != "-":
86 print "Generating non-matching FASTQ file"
87 negative_writer = fastqWriter(open(out_negative_file, "w"))
88 for record in reader:
89 # The [1:] is because the fastaReader leaves the @ on the identifer.
90 if not record.identifier or record.identifier.split()[0][1:] not in ids:
91 negative_writer.write(record)
92 negative_writer.close()
93 else:
94 sys.exit("Neither output file requested")
95 reader.close()