Mercurial > repos > peterjc > predictnls
comparison tools/protein_analysis/predictnls.py @ 0:6e26c5a48e9a draft
Uploaded v0.0.4, first public release.
author | peterjc |
---|---|
date | Wed, 20 Feb 2013 11:39:06 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6e26c5a48e9a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 #Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK | |
4 # | |
5 #Licenced under the GPL (GNU General Public Licence) version 3. | |
6 # | |
7 #Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later | |
8 #versions up to predictnls v1.0.20 (copright 2012), by Rajesh Nair | |
9 #(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab, | |
10 #Columbia University http://rostlab.org/ | |
11 | |
12 """Batch mode predictNLS, for finding nuclear localization signals | |
13 | |
14 This is a Python script re-implementing the predictNLS method, originally | |
15 written in Perl, described here: | |
16 | |
17 Murat Cokol, Rajesh Nair, and Burkhard Rost. | |
18 Finding nuclear localization signals. | |
19 EMBO reports 1(5), 411-415, 2000 | |
20 | |
21 http://dx.doi.org/10.1093/embo-reports/kvd092 | |
22 | |
23 The original Perl script was designed to work on a single sequence at a time, | |
24 but offers quite detailed output, including HTML (webpage). | |
25 | |
26 This Python version is designed to work on a single FASTA file containing | |
27 multiple sequences, and produces a single tabular output file, with one line | |
28 per NLS found (i.e. zero or more rows per query sequence). | |
29 | |
30 It takes either two or three command line arguments: | |
31 | |
32 predictNLS_batch input_file output_file [nls_motif_file] | |
33 | |
34 The input file should be protein sequences in FASTA format, the output file | |
35 is tab separated plain text, and the NLS motif file defaults to using the | |
36 plain text My_NLS_list file located next to the script file, or in a data | |
37 subdirectory. | |
38 | |
39 For convience if using this outside Galaxy, the input filename can be '-' | |
40 to mean stdin, and likewise the output filename can be '-' to mean stdout. | |
41 | |
42 Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to | |
43 predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in | |
44 August 2010, see the change log included in those tar-balls). | |
45 | |
46 The Rost Lab provide source code tar balls for predictNLS on the FTP site | |
47 ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they | |
48 recommend their package repository instead, | |
49 https://rostlab.org/owiki/index.php/Packages | |
50 """ | |
51 | |
52 import os | |
53 import sys | |
54 import re | |
55 | |
56 def stop_err(msg, return_code=1): | |
57 sys.stderr.write(msg.rstrip() + "\n") | |
58 sys.exit(return_code) | |
59 | |
60 if len(sys.argv) == 4: | |
61 fasta_filename, tabular_filename, re_filename = sys.argv[1:] | |
62 elif len(sys.argv) == 3: | |
63 fasta_filename, tabular_filename = sys.argv[1:] | |
64 #Use os.path.realpath(...) to handle being called via a symlink | |
65 #Try under subdirectory data: | |
66 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | |
67 "data", "My_NLS_list") | |
68 if not os.path.isfile(re_filename): | |
69 #Try in same directory as this script: | |
70 re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | |
71 "My_NLS_list") | |
72 else: | |
73 stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file") | |
74 | |
75 if not os.path.isfile(fasta_filename): | |
76 stop_err("Could not find FASTA input file: %s" % fasta_filename) | |
77 | |
78 if not os.path.isfile(re_filename): | |
79 stop_err("Could not find NLS motif file: %s" % re_filename) | |
80 | |
81 def load_re(filename): | |
82 """Parse the 5+ column tabular NLS motif file.""" | |
83 handle = open(filename, "rU") | |
84 for line in handle: | |
85 line = line.rstrip("\n") | |
86 if not line: | |
87 continue | |
88 parts = line.split("\t") | |
89 assert 5 <= len(parts), parts | |
90 regex, evidence, p_count, percent_nuc, precent_non_nuc = parts[0:5] | |
91 try: | |
92 regex = re.compile(regex) | |
93 p_count = int(p_count) | |
94 except ValueError: | |
95 stop_err("Bad data in line: %s" % line) | |
96 if 6 <= len(parts): | |
97 proteins = parts[5] | |
98 assert p_count == len(proteins.split(",")), line | |
99 else: | |
100 proteins = "" | |
101 assert p_count == 0 | |
102 if 7 <= len(parts): | |
103 domains = parts[6] | |
104 assert int(p_count) == len(domains.split(",")), line | |
105 else: | |
106 domains = "" | |
107 assert p_count == 0 | |
108 #There can be further columns (DNA binding?), but we don't use them. | |
109 yield regex, evidence, p_count, percent_nuc, proteins, domains | |
110 handle.close() | |
111 | |
112 def fasta_iterator(filename): | |
113 """Simple FASTA parser yielding tuples of (name, upper case sequence).""" | |
114 if filename == "-": | |
115 handle = sys.stdin | |
116 else: | |
117 handle = open(filename) | |
118 name, seq = "", "" | |
119 for line in handle: | |
120 if line.startswith(">"): | |
121 if name: | |
122 yield name, seq | |
123 #Take the first word only as the name: | |
124 name = line[1:].rstrip().split(None,1)[0] | |
125 seq = "" | |
126 elif name: | |
127 #Simple way would leave in any internal white space, | |
128 #seq += line.strip().upper() | |
129 seq += "".join(line.strip().upper().split()) | |
130 elif not line.strip(): | |
131 #Ignore blank lines before first record | |
132 pass | |
133 else: | |
134 raise ValueError("Bad FASTA line %r" % line) | |
135 if filename != "-": | |
136 handle.close() | |
137 if name: | |
138 yield name, seq | |
139 raise StopIteration | |
140 | |
141 motifs = list(load_re(re_filename)) | |
142 print "Looking for %i NLS motifs" % len(motifs) | |
143 | |
144 if tabular_filename == "-": | |
145 out_handle = sys.stdout | |
146 else: | |
147 out_handle = open(tabular_filename, "w") | |
148 out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n") | |
149 count = 0 | |
150 nls = 0 | |
151 for idn, seq in fasta_iterator(fasta_filename): | |
152 for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs: | |
153 #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40 | |
154 #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches | |
155 for match in regex.finditer(seq): | |
156 #Perl predictnls v1.0.17 (and older) return NLS start position with zero | |
157 #but changed to one based counting in v1.0.18 (June 2011) onwards, Bug #38 | |
158 #We therefore also use one based couting, hence the start+1 here: | |
159 out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n" \ | |
160 % (idn, match.start()+1, match.group(), | |
161 regex.pattern, evidence, p_count, | |
162 percent_nuc_prot, proteins, domains)) | |
163 nls += 1 | |
164 count += 1 | |
165 if tabular_filename != "-": | |
166 out_handle.close() | |
167 print "Found %i NLS motifs in %i sequences" % (nls, count) |