0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 #Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK
|
|
4 #
|
|
5 #Licenced under the GPL (GNU General Public Licence) version 3.
|
|
6 #
|
|
7 #Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later
|
|
8 #versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair
|
|
9 #(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab,
|
|
10 #Columbia University http://rostlab.org/
|
|
11
|
|
12 """Batch mode predictNLS, for finding nuclear localization signals
|
|
13
|
|
14 This is a Python script re-implementing the predictNLS method, originally
|
|
15 written in Perl, described here:
|
|
16
|
|
17 Murat Cokol, Rajesh Nair, and Burkhard Rost.
|
|
18 Finding nuclear localization signals.
|
|
19 EMBO reports 1(5), 411-415, 2000
|
|
20
|
|
21 http://dx.doi.org/10.1093/embo-reports/kvd092
|
|
22
|
|
23 The original Perl script was designed to work on a single sequence at a time,
|
|
24 but offers quite detailed output, including HTML (webpage).
|
|
25
|
|
26 This Python version is designed to work on a single FASTA file containing
|
|
27 multiple sequences, and produces a single tabular output file, with one line
|
|
28 per NLS found (i.e. zero or more rows per query sequence).
|
|
29
|
|
30 It takes either two or three command line arguments:
|
|
31
|
|
32 predictNLS_batch input_file output_file [nls_motif_file]
|
|
33
|
|
34 The input file should be protein sequences in FASTA format, the output file
|
|
35 is tab separated plain text, and the NLS motif file defaults to using the
|
|
36 plain text My_NLS_list file located next to the script file, or in a data
|
|
37 subdirectory.
|
|
38
|
|
39 For convience if using this outside Galaxy, the input filename can be '-'
|
|
40 to mean stdin, and likewise the output filename can be '-' to mean stdout.
|
|
41
|
|
42 Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to
|
|
43 predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in
|
|
44 August 2010, see the change log included in those tar-balls).
|
|
45
|
|
46 The Rost Lab provide source code tar balls for predictNLS on the FTP site
|
|
47 ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they
|
|
48 recommend their package repository instead,
|
|
49 https://rostlab.org/owiki/index.php/Packages
|
|
50 """
|
|
51
|
|
52 import os
|
|
53 import sys
|
|
54 import re
|
|
55
|
|
56 def stop_err(msg, return_code=1):
|
|
57 sys.stderr.write(msg.rstrip() + "\n")
|
|
58 sys.exit(return_code)
|
|
59
|
|
60 if len(sys.argv) == 4:
|
|
61 fasta_filename, tabular_filename, re_filename = sys.argv[1:]
|
|
62 elif len(sys.argv) == 3:
|
|
63 fasta_filename, tabular_filename = sys.argv[1:]
|
|
64 #Use os.path.realpath(...) to handle being called via a symlink
|
|
65 #Try under subdirectory data:
|
|
66 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
|
|
67 "data", "My_NLS_list")
|
|
68 if not os.path.isfile(re_filename):
|
|
69 #Try in same directory as this script:
|
|
70 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
|
|
71 "My_NLS_list")
|
|
72 else:
|
|
73 stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file")
|
|
74
|
|
75 if not os.path.isfile(fasta_filename):
|
|
76 stop_err("Could not find FASTA input file: %s" % fasta_filename)
|
|
77
|
|
78 if not os.path.isfile(re_filename):
|
|
79 stop_err("Could not find NLS motif file: %s" % re_filename)
|
|
80
|
|
81 def load_re(filename):
|
|
82 """Parse the 5+ column tabular NLS motif file."""
|
|
83 handle = open(filename, "rU")
|
|
84 for line in handle:
|
|
85 line = line.rstrip("\n")
|
|
86 if not line:
|
|
87 continue
|
|
88 parts = line.split("\t")
|
|
89 assert 5 <= len(parts), parts
|
|
90 regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5]
|
|
91 try:
|
|
92 regex = re.compile(regex)
|
|
93 p_count = int(p_count)
|
|
94 except ValueError:
|
|
95 stop_err("Bad data in line: %s" % line)
|
|
96 if 6 <= len(parts):
|
|
97 proteins = parts[5]
|
|
98 assert p_count == len(proteins.split(",")), line
|
|
99 else:
|
|
100 proteins = ""
|
|
101 assert p_count == 0
|
|
102 if 7 <= len(parts):
|
|
103 domains = parts[6]
|
|
104 assert int(p_count) == len(domains.split(",")), line
|
|
105 else:
|
|
106 domains = ""
|
|
107 assert p_count == 0
|
|
108 #There can be further columns (DNA binding?), but we don't use them.
|
|
109 yield regex, evidence, p_count, percent_nuc, proteins, domains
|
|
110 handle.close()
|
|
111
|
|
112 def fasta_iterator(filename):
|
|
113 """Simple FASTA parser yielding tuples of (name, upper case sequence)."""
|
|
114 if filename == "-":
|
|
115 handle = sys.stdin
|
|
116 else:
|
|
117 handle = open(filename)
|
|
118 name, seq = "", ""
|
|
119 for line in handle:
|
|
120 if line.startswith(">"):
|
|
121 if name:
|
|
122 yield name, seq
|
|
123 #Take the first word only as the name:
|
|
124 name = line[1:].rstrip().split(None,1)[0]
|
|
125 seq = ""
|
|
126 elif name:
|
|
127 #Simple way would leave in any internal white space,
|
|
128 #seq += line.strip().upper()
|
|
129 seq += "".join(line.strip().upper().split())
|
|
130 elif not line.strip():
|
|
131 #Ignore blank lines before first record
|
|
132 pass
|
|
133 else:
|
|
134 raise ValueError("Bad FASTA line %r" % line)
|
|
135 if filename != "-":
|
|
136 handle.close()
|
|
137 if name:
|
|
138 yield name, seq
|
|
139 raise StopIteration
|
|
140
|
|
141 motifs = list(load_re(re_filename))
|
|
142 print "Looking for %i NLS motifs" % len(motifs)
|
|
143
|
|
144 if tabular_filename == "-":
|
|
145 out_handle = sys.stdout
|
|
146 else:
|
|
147 out_handle = open(tabular_filename, "w")
|
|
148 out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n")
|
|
149 count = 0
|
|
150 nls = 0
|
|
151 for idn, seq in fasta_iterator(fasta_filename):
|
|
152 for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs:
|
|
153 #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40
|
|
154 #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches
|
|
155 for match in regex.finditer(seq):
|
|
156 #Perl predictnls v1.0.17 (and older) return NLS start position with zero
|
|
157 #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38
|
|
158 #We therefore also use one based couting, hence the start+1 here:
|
|
159 out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \
|
|
160 % (idn, match.start()+1, match.group(),
|
|
161 regex.pattern, evidence, p_count,
|
|
162 percent_nuc_prot, proteins, domains))
|
|
163 nls += 1
|
|
164 count += 1
|
|
165 if tabular_filename != "-":
|
|
166 out_handle.close()
|
|
167 print "Found %i NLS motifs in %i sequences" % (nls, count)
|