seq_filter_by_id: tools/seq_filter_by_id/seq_filter_by_id.py

comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 10:4a7d8ad2a983 draft

Bump Biopython dependency

author	peterjc
date	Thu, 30 Nov 2023 09:50:34 +0000
parents	141612f8c3e3
children	85ef5f5a0562

comparison

Legend: equal, deleted, inserted, replaced

-:141612f8c3e3
+:4a7d8ad2a983
 If you use this tool in scientific work leading to a publication, please
 cite the Biopython application note:
 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
 This script is copyright 2010-2017 by Peter Cock, The James Hutton Institute
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT license).
 Multiple tabular files and column numbers may be given, or replaced with
 the -t or --text option.
 """
 parser = OptionParser(usage=usage)
-parser.add_option('-i', '--input', dest='input',
+parser.add_option(
-default=None, help='Input sequences filename',
+"-i",
-metavar="FILE")
+"--input",
-parser.add_option('-f', '--format', dest='format',
+dest="input",
 default=None,
-help='Input sequence format (e.g. fasta, fastq, sff)')
+help="Input sequences filename",
-parser.add_option('-t', '--text', dest='id_list',
+metavar="FILE",
-default=None, help="Lists of white space separated IDs (instead of a tabular file)")
+)
-parser.add_option('-p', '--positive', dest='output_positive',
+parser.add_option(
-default=None,
+"-f",
-help='Output filename for matches',
+"--format",
-metavar="FILE")
+dest="format",
-parser.add_option('-n', '--negative', dest='output_negative',
+default=None,
-default=None,
+help="Input sequence format (e.g. fasta, fastq, sff)",
-help='Output filename for non-matches',
+)
-metavar="FILE")
+parser.add_option(
-parser.add_option("-l", "--logic", dest="logic",
+"-t",
-default="UNION",
+"--text",
-help="How to combined multiple ID columns (UNION or INTERSECTION)")
+dest="id_list",
-parser.add_option("-s", "--suffix", dest="suffix",
+default=None,
-action="store_true",
+help="Lists of white space separated IDs (instead of a tabular file)",
-help="Ignore pair-read suffices for matching names")
+)
-parser.add_option("-v", "--version", dest="version",
+parser.add_option(
-default=False, action="store_true",
+"-p",
-help="Show version and quit")
+"--positive",
+dest="output_positive",
+default=None,
+help="Output filename for matches",
+metavar="FILE",
+)
+parser.add_option(
+"-n",
+"--negative",
+dest="output_negative",
+default=None,
+help="Output filename for non-matches",
+metavar="FILE",
+)
+parser.add_option(
+"-l",
+"--logic",
+dest="logic",
+default="UNION",
+help="How to combined multiple ID columns (UNION or INTERSECTION)",
+)
+parser.add_option(
+"-s",
+"--suffix",
+dest="suffix",
+action="store_true",
+help="Ignore pair-read suffixes for matching names",
+)
+parser.add_option(
+"-v",
+"--version",
+dest="version",
+default=False,
+action="store_true",
+help="Show version and quit",
+)
 options, args = parser.parse_args()
 if options.version:
 print("v0.2.7")
 in_file = options.input
 seq_format = options.format
 out_positive_file = options.output_positive
 out_negative_file = options.output_negative
 logic = options.logic
-drop_suffices = bool(options.suffix)
+drop_suffixes = bool(options.suffix)
 if in_file is None or not os.path.isfile(in_file):
 sys.exit("Missing input file: %r" % in_file)
 if out_positive_file is None and out_negative_file is None:
 sys.exit("Neither output file requested")
 if not os.path.isfile(tabular_file):
 sys.exit("Missing tabular identifier file %r" % tabular_file)
 try:
 columns = [int(arg) - 1 for arg in cols_arg.split(",")]
 except ValueError:
-sys.exit("Expected list of columns (comma separated integers), got %r" % cols_arg)
+sys.exit(
+"Expected list of columns (comma separated integers), got %r" % cols_arg
+)
 if min(columns) < 0:
-sys.exit("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
+sys.exit(
+"Expect one-based column numbers (not zero-based counting), got %r"
+% cols_arg
+)
 identifiers.append((tabular_file, columns))
 name_warn = False
 def check_white_space(name):
 """Check identifier for white space, take first word only."""
 parts = name.split(None, 1)
 global name_warn
 if not name_warn and len(parts) > 1:
-name_warn = "WARNING: Some of your identifiers had white space in them, " + \
+name_warn = (
-"using first word only. e.g.:\n%s\n" % name
+"WARNING: Some of your identifiers had white space in them, "
++ "using first word only. e.g.:\n%s\n" % name
+)
 return parts[0]
-if drop_suffices:
+if drop_suffixes:
 def clean_name(name):
 """Remove suffix."""
 name = check_white_space(name)
 match = re_suffix.search(name)
 if match:
 # Use the fact this is a suffix, and regular expression will be
 # anchored to the end of the name:
-return name[:match.start()]
+return name[: match.start()]
 else:
 # Nothing to do
 return name
 assert clean_name("foo/1") == "foo"
 assert clean_name("foo/2") == "foo"
 assert clean_name("bar.f") == "bar"
 assert clean_name("bar.r") == "bar"
 assert clean_name("baz.p1") == "baz"
 # Just check the white space
 clean_name = check_white_space
 mapped_chars = {
-'>': '__gt__',
+">": "__gt__",
-'<': '__lt__',
+"<": "__lt__",
-"'": '__sq__',
+"'": "__sq__",
-'"': '__dq__',
+'"': "__dq__",
-'[': '__ob__',
+"[": "__ob__",
-']': '__cb__',
+"]": "__cb__",
-'{': '__oc__',
+"{": "__oc__",
-'}': '__cc__',
+"}": "__cc__",
-'@': '__at__',
+"@": "__at__",
-'\n': '__cn__',
+"\n": "__cn__",
-'\r': '__cr__',
+"\r": "__cr__",
-'\t': '__tc__',
+"\t": "__tc__",
-'#': '__pd__',
+"#": "__pd__",
 }
 # Read tabular file(s) and record all specified identifiers
 ids = None  # Will be a set
 if options.id_list:
 continue
 if not line.startswith("#"):
 name = clean_name(line.rstrip("\n").split("\t")[col])
 if name:
 file_ids.add(name)
-print("Using %i IDs from column %s in tabular file" % (len(file_ids), ", ".join(str(col + 1) for col in columns)))
+print(
+"Using %i IDs from column %s in tabular file"
+% (len(file_ids), ", ".join(str(col + 1) for col in columns))
+)
 if ids is None:
 ids = file_ids
 if logic == "UNION":
 ids.update(file_ids)
 else:
 ids.intersection_update(file_ids)
 handle.close()
 if len(identifiers) > 1:
 if logic == "UNION":
-print("Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)))
+print(
+"Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))
+)
 else:
-print("Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)))
+print(
+"Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
+)
 if name_warn:
 sys.stderr.write(name_warn)
 def crude_fasta_iterator(handle):
-"""Yields tuples, record ID and the full record as a string."""
+"""Parse FASTA file yielding tuples of (name, sequence)."""
 while True:
 line = handle.readline()
 if line == "":
 return  # Premature end of file, or just empty?
 if line[0] == ">":
 break
 no_id_warned = False
 while True:
 if line[0] != ">":
-raise ValueError(
+raise ValueError("Records in Fasta files should start with '>' character")
-"Records in Fasta files should start with '>' character")
 try:
 id = line[1:].split(None, 1)[0]
 except IndexError:
 if not no_id_warned:
 sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
 def fastq_filter(in_file, pos_file, neg_file, wanted):
 """FASTQ filter."""
 from Bio.SeqIO.QualityIO import FastqGeneralIterator
 handle = open(in_file, "r")
 if pos_file is not None and neg_file is not None:
 print("Generating two FASTQ files")
 positive_handle = open(pos_file, "w")
 negative_handle = open(neg_file, "w")
 pos_count = neg_count = 0
 if pos_file is not None:
 out_handle = open(pos_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
 in_handle.seek(0)  # start again after getting manifest
-pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
+pos_count = writer.write_file(
+rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted
+)
 out_handle.close()
 if neg_file is not None:
 out_handle = open(neg_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
 in_handle.seek(0)  # start again
-neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
+neg_count = writer.write_file(
+rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted
+)
 out_handle.close()
 # And we're done
 in_handle.close()
 # At the time of writing, Galaxy doesn't show SFF file read counts,
 # so it is useful to put them in stdout and thus shown in job info.
 return pos_count, neg_count
 if seq_format.lower() == "sff":
 # Now write filtered SFF file based on IDs wanted
-pos_count, neg_count = sff_filter(in_file, out_positive_file, out_negative_file, ids)
+pos_count, neg_count = sff_filter(
+in_file, out_positive_file, out_negative_file, ids
+)
 # At the time of writing, Galaxy doesn't show SFF file read counts,
 # so it is useful to put them in stdout and thus shown in job info.
 elif seq_format.lower() == "fasta":
 # Write filtered FASTA file based on IDs from tabular file
-pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
+pos_count, neg_count = fasta_filter(
+in_file, out_positive_file, out_negative_file, ids
+)
 print("%i with and %i without specified IDs" % (pos_count, neg_count))
 elif seq_format.lower().startswith("fastq"):
 # Write filtered FASTQ file based on IDs from tabular file
 fastq_filter(in_file, out_positive_file, out_negative_file, ids)
 # This does not currently track the counts

Mercurial > repos > peterjc > seq_filter_by_id

comparison tools/seq_filter_by_id/seq_filter_by_id.py @ 10:4a7d8ad2a983 draft