Mercurial > repos > peterjc > seq_select_by_id
view tools/filters/seq_select_by_id.py @ 1:50a8a6917a9c draft
Uploaded update (v0.0.3) to ignore blank lines in the ID file
author | peterjc |
---|---|
date | Fri, 18 May 2012 12:25:12 -0400 |
parents | 838b9bebfa3c |
children | 28d52478ace9 |
line wrap: on
line source
#!/usr/bin/env python """Select FASTA, QUAL, FASTQ or SSF sequences by IDs from a tabular file. Takes five command line options, tabular filename, ID column number (using one based counting), input filename, input type (e.g. FASTA or SFF) and the output filename (same format as input sequence file). When selecting from an SFF file, any Roche XML manifest in the input file is preserved in both output files. This tool is a short Python script which requires Biopython 1.54 or later for SFF file support. If you use this tool in scientific work leading to a publication, please cite the Biopython application note: Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This script is copyright 2011-2012 by Peter Cock, The James Hutton Institute UK. All rights reserved. See accompanying text file for licence details (MIT/BSD style). This is version 0.0.3 of the script. """ import sys def stop_err(msg, err=1): sys.stderr.write(msg.rstrip() + "\n") sys.exit(err) #Parse Command Line try: tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:] except ValueError: stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv))) try: if col_arg.startswith("c"): column = int(col_arg[1:])-1 else: column = int(col_arg)-1 except ValueError: stop_err("Expected column number, got %s" % col_arg) if seq_format == "fastqcssanger": stop_err("Colorspace FASTQ not supported.") elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]: seq_format = seq_format.lower() elif seq_format.lower().startswith("fastq"): #We don't care how the qualities are encoded seq_format = "fastq" elif seq_format.lower().startswith("qual"): #We don't care what the scores are seq_format = "qual" else: stop_err("Unrecognised file format %r" % seq_format) try: from Bio import SeqIO except ImportError: stop_err("Biopython 1.54 or later is required") def parse_ids(tabular_file, col): """Read tabular file and record all specified identifiers.""" handle = open(tabular_file, "rU") for line in handle: if line.strip() and not line.startswith("#"): yield line.rstrip("\n").split("\t")[col].strip() handle.close() #Index the sequence file. #If very big, could use SeqIO.index_db() to avoid memory bottleneck... records = SeqIO.index(in_file, seq_format) print "Indexed %i sequences" % len(records) if seq_format.lower()=="sff": #Special case to try to preserve the XML manifest try: from Bio.SeqIO.SffIO import SffIterator, SffWriter except ImportError: stop_err("Requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: #Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest in_handle = open(in_file, "rb") #must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None in_handle.close() out_handle = open(out_file, "wb") writer = SffWriter(out_handle, xml=manifest) count = 0 #This does have the overhead of parsing into SeqRecord objects, #but doing the header and index at the low level is too fidly. iterator = (records[name] for name in parse_ids(tabular_file, column)) try: count = writer.write_file(iterator) except KeyError, err: out_handle.close() if name not in records: stop_err("Identifier %r not found in sequence file" % name) else: raise err out_handle.close() else: #Avoid overhead of parsing into SeqRecord objects, #just re-use the original formatting from the input file. out_handle = open(out_file, "w") count = 0 for name in parse_ids(tabular_file, column): try: out_handle.write(records.get_raw(name)) except KeyError: out_handle.close() stop_err("Identifier %r not found in sequence file" % name) count += 1 out_handle.close() print "Selected %i sequences by ID" % count