Mercurial > repos > peterjc > predictnls
diff tools/protein_analysis/predictnls.py @ 0:6e26c5a48e9a draft
Uploaded v0.0.4, first public release.
author | peterjc |
---|---|
date | Wed, 20 Feb 2013 11:39:06 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/predictnls.py Wed Feb 20 11:39:06 2013 -0500 @@ -0,0 +1,167 @@ +#!/usr/bin/env python + +#Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK +# +#Licenced under the GPL (GNU General Public Licence) version 3. +# +#Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later +#versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair +#(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab, +#Columbia University http://rostlab.org/ + +"""Batch mode predictNLS, for finding nuclear localization signals + +This is a Python script re-implementing the predictNLS method, originally +written in Perl, described here: + +Murat Cokol, Rajesh Nair, and Burkhard Rost. +Finding nuclear localization signals. +EMBO reports 1(5), 411-415, 2000 + +http://dx.doi.org/10.1093/embo-reports/kvd092 + +The original Perl script was designed to work on a single sequence at a time, +but offers quite detailed output, including HTML (webpage). + +This Python version is designed to work on a single FASTA file containing +multiple sequences, and produces a single tabular output file, with one line +per NLS found (i.e. zero or more rows per query sequence). + +It takes either two or three command line arguments: + +predictNLS_batch input_file output_file [nls_motif_file] + +The input file should be protein sequences in FASTA format, the output file +is tab separated plain text, and the NLS motif file defaults to using the +plain text My_NLS_list file located next to the script file, or in a data +subdirectory. + +For convience if using this outside Galaxy, the input filename can be '-' +to mean stdin, and likewise the output filename can be '-' to mean stdout. + +Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to +predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in +August 2010, see the change log included in those tar-balls). + +The Rost Lab provide source code tar balls for predictNLS on the FTP site +ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they +recommend their package repository instead, +https://rostlab.org/owiki/index.php/Packages +""" + +import os +import sys +import re + +def stop_err(msg, return_code=1): + sys.stderr.write(msg.rstrip() + "\n") + sys.exit(return_code) + +if len(sys.argv) == 4: + fasta_filename, tabular_filename, re_filename = sys.argv[1:] +elif len(sys.argv) == 3: + fasta_filename, tabular_filename = sys.argv[1:] + #Use os.path.realpath(...) to handle being called via a symlink + #Try under subdirectory data: + re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), + "data", "My_NLS_list") + if not os.path.isfile(re_filename): + #Try in same directory as this script: + re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), + "My_NLS_list") +else: + stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file") + +if not os.path.isfile(fasta_filename): + stop_err("Could not find FASTA input file: %s" % fasta_filename) + +if not os.path.isfile(re_filename): + stop_err("Could not find NLS motif file: %s" % re_filename) + +def load_re(filename): + """Parse the 5+ column tabular NLS motif file.""" + handle = open(filename, "rU") + for line in handle: + line = line.rstrip("\n") + if not line: + continue + parts = line.split("\t") + assert 5 <= len(parts), parts + regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5] + try: + regex = re.compile(regex) + p_count = int(p_count) + except ValueError: + stop_err("Bad data in line: %s" % line) + if 6 <= len(parts): + proteins = parts[5] + assert p_count == len(proteins.split(",")), line + else: + proteins = "" + assert p_count == 0 + if 7 <= len(parts): + domains = parts[6] + assert int(p_count) == len(domains.split(",")), line + else: + domains = "" + assert p_count == 0 + #There can be further columns (DNA binding?), but we don't use them. + yield regex, evidence, p_count, percent_nuc, proteins, domains + handle.close() + +def fasta_iterator(filename): + """Simple FASTA parser yielding tuples of (name, upper case sequence).""" + if filename == "-": + handle = sys.stdin + else: + handle = open(filename) + name, seq = "", "" + for line in handle: + if line.startswith(">"): + if name: + yield name, seq + #Take the first word only as the name: + name = line[1:].rstrip().split(None,1)[0] + seq = "" + elif name: + #Simple way would leave in any internal white space, + #seq += line.strip().upper() + seq += "".join(line.strip().upper().split()) + elif not line.strip(): + #Ignore blank lines before first record + pass + else: + raise ValueError("Bad FASTA line %r" % line) + if filename != "-": + handle.close() + if name: + yield name, seq + raise StopIteration + +motifs = list(load_re(re_filename)) +print "Looking for %i NLS motifs" % len(motifs) + +if tabular_filename == "-": + out_handle = sys.stdout +else: + out_handle = open(tabular_filename, "w") +out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n") +count = 0 +nls = 0 +for idn, seq in fasta_iterator(fasta_filename): + for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs: + #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40 + #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches + for match in regex.finditer(seq): + #Perl predictnls v1.0.17 (and older) return NLS start position with zero + #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38 + #We therefore also use one based couting, hence the start+1 here: + out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \ + % (idn, match.start()+1, match.group(), + regex.pattern, evidence, p_count, + percent_nuc_prot, proteins, domains)) + nls += 1 + count += 1 +if tabular_filename != "-": + out_handle.close() +print "Found %i NLS motifs in %i sequences" % (nls, count)