diff tools/protein_analysis/predictnls.py @ 0:6e26c5a48e9a draft

Uploaded v0.0.4, first public release.
author peterjc
date Wed, 20 Feb 2013 11:39:06 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/predictnls.py	Wed Feb 20 11:39:06 2013 -0500
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+
+#Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK
+#
+#Licenced under the GPL (GNU General Public Licence) version 3.
+#
+#Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later
+#versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair
+#(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab,
+#Columbia University http://rostlab.org/
+
+"""Batch mode predictNLS, for finding nuclear localization signals
+
+This is a Python script re-implementing the predictNLS method, originally
+written in Perl, described here:
+
+Murat Cokol, Rajesh Nair, and Burkhard Rost.
+Finding nuclear localization signals.
+EMBO reports 1(5), 411-415, 2000
+
+http://dx.doi.org/10.1093/embo-reports/kvd092
+
+The original Perl script was designed to work on a single sequence at a time,
+but offers quite detailed output, including HTML (webpage).
+
+This Python version is designed to work on a single FASTA file containing
+multiple sequences, and produces a single tabular output file, with one line
+per NLS found (i.e. zero or more rows per query sequence).
+
+It takes either two or three command line arguments:
+
+predictNLS_batch input_file output_file [nls_motif_file]
+
+The input file should be protein sequences in FASTA format, the output file
+is tab separated plain text, and the NLS motif file defaults to using the
+plain text My_NLS_list file located next to the script file, or in a data
+subdirectory.
+
+For convience if using this outside Galaxy, the input filename can be '-'
+to mean stdin, and likewise the output filename can be '-' to mean stdout.
+
+Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to
+predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in
+August 2010, see the change log included in those tar-balls).
+
+The Rost Lab provide source code tar balls for predictNLS on the FTP site
+ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they
+recommend their package repository instead,
+https://rostlab.org/owiki/index.php/Packages
+"""
+
+import os
+import sys
+import re
+
+def stop_err(msg, return_code=1):
+    sys.stderr.write(msg.rstrip() + "\n")
+    sys.exit(return_code)
+
+if len(sys.argv) == 4:
+    fasta_filename, tabular_filename, re_filename = sys.argv[1:]
+elif len(sys.argv) == 3:
+    fasta_filename, tabular_filename = sys.argv[1:]
+    #Use os.path.realpath(...) to handle being called via a symlink
+    #Try under subdirectory data:
+    re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
+                               "data", "My_NLS_list")
+    if not os.path.isfile(re_filename):
+        #Try in same directory as this script:
+        re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
+                                                   "My_NLS_list")
+else:
+    stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file")
+
+if not os.path.isfile(fasta_filename):
+    stop_err("Could not find FASTA input file: %s" % fasta_filename)
+
+if not os.path.isfile(re_filename):
+    stop_err("Could not find NLS motif file: %s" % re_filename)
+
+def load_re(filename):
+    """Parse the 5+ column tabular NLS motif file."""
+    handle = open(filename, "rU")
+    for line in handle:
+        line = line.rstrip("\n")
+        if not line:
+            continue
+        parts = line.split("\t")
+        assert 5 <= len(parts), parts
+        regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5]
+        try:
+            regex = re.compile(regex)
+            p_count = int(p_count)
+        except ValueError:
+            stop_err("Bad data in line: %s" % line)
+        if 6 <= len(parts):
+            proteins = parts[5]
+            assert p_count == len(proteins.split(",")), line
+        else:
+            proteins = ""
+            assert p_count == 0
+        if 7 <= len(parts):
+            domains = parts[6]
+            assert int(p_count) == len(domains.split(",")), line
+        else:
+            domains = ""
+            assert p_count == 0
+        #There can be further columns (DNA binding?), but we don't use them.
+        yield regex, evidence, p_count, percent_nuc, proteins, domains
+    handle.close()
+
+def fasta_iterator(filename):
+    """Simple FASTA parser yielding tuples of (name, upper case sequence)."""
+    if filename == "-":
+        handle = sys.stdin
+    else:
+        handle = open(filename)
+    name, seq = "", ""
+    for line in handle:
+        if line.startswith(">"):
+            if name:
+                yield name, seq
+            #Take the first word only as the name:
+            name = line[1:].rstrip().split(None,1)[0]
+            seq = ""
+        elif name:
+            #Simple way would leave in any internal white space,
+            #seq += line.strip().upper()
+            seq += "".join(line.strip().upper().split())
+        elif not line.strip():
+            #Ignore blank lines before first record
+            pass
+        else:
+            raise ValueError("Bad FASTA line %r" % line)
+    if filename != "-":
+        handle.close()
+    if name:
+        yield name, seq
+    raise StopIteration
+
+motifs = list(load_re(re_filename))
+print "Looking for %i NLS motifs" % len(motifs)
+
+if tabular_filename == "-":
+    out_handle = sys.stdout
+else:
+    out_handle = open(tabular_filename, "w")
+out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n")
+count = 0
+nls = 0
+for idn, seq in fasta_iterator(fasta_filename):
+    for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs:
+        #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40
+        #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches
+        for match in regex.finditer(seq):
+            #Perl predictnls v1.0.17 (and older) return NLS start position with zero
+            #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38
+            #We therefore also use one based couting, hence the start+1 here:
+            out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \
+                             % (idn, match.start()+1, match.group(),
+                                regex.pattern, evidence, p_count,
+                                percent_nuc_prot, proteins, domains))
+            nls += 1
+    count += 1
+if tabular_filename != "-":
+    out_handle.close()
+print "Found %i NLS motifs in %i sequences" % (nls, count)