tools/protein_analysis/predictnls.py @ 0:6e26c5a48e9a draft
Uploaded v0.0.4, first public release.
author  peterjc
date    Wed, 20 Feb 2013 11:39:06 -0500
#!/usr/bin/env python
#Copyright 2011-2013 by Peter Cock, James Hutton Institute (formerly SCRI), UK
#
#Licenced under the GPL (GNU General Public Licence) version 3.
#
#Based on Perl script predictNLS v1.3, copyright 2001-2005 and the later
#versions up to predictnls v1.0.20 (copyright 2012), by Rajesh Nair
#(nair@rostlab.org) and Burkhard Rost (rost@rostlab.org), Rost Lab,
#Columbia University http://rostlab.org/
"""Batch mode predictNLS, for finding nuclear localization signals

This is a Python script re-implementing the predictNLS method, originally
written in Perl, described here:

Murat Cokol, Rajesh Nair, and Burkhard Rost.
Finding nuclear localization signals.
EMBO reports 1(5), 411-415, 2000
http://dx.doi.org/10.1093/embo-reports/kvd092

The original Perl script was designed to work on a single sequence at a
time, but offers quite detailed output, including HTML (webpage).

This Python version is designed to work on a single FASTA file containing
multiple sequences, and produces a single tabular output file, with one
line per NLS found (i.e. zero or more rows per query sequence).

It takes either two or three command line arguments:

predictNLS_batch input_file output_file [nls_motif_file]

The input file should be protein sequences in FASTA format, the output
file is tab separated plain text, and the NLS motif file defaults to using
the plain text My_NLS_list file located next to the script file, or in a
data subdirectory.

For convenience if using this outside Galaxy, the input filename can be
'-' to mean stdin, and likewise the output filename can be '-' to mean
stdout.

Tested with the My_NLS_list file included with predictnls-1.0.7.tar.gz to
predictnls-1.0.20.tar.gz inclusive (the list was extended in v1.0.7 in
August 2010, see the change log included in those tar-balls).

The Rost Lab provide source code tar balls for predictNLS on the FTP site
ftp://rostlab.org/predictnls/ but for Debian or RedHat based Linux they
recommend their package repository instead,
https://rostlab.org/owiki/index.php/Packages
"""
import os
import sys
import re


def stop_err(msg, return_code=1):
    sys.stderr.write(msg.rstrip() + "\n")
    sys.exit(return_code)


if len(sys.argv) == 4:
    fasta_filename, tabular_filename, re_filename = sys.argv[1:]
elif len(sys.argv) == 3:
    fasta_filename, tabular_filename = sys.argv[1:]
    #Use os.path.realpath(...) to handle being called via a symlink
    #Try under subdirectory data:
    re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
                               "data", "My_NLS_list")
    if not os.path.isfile(re_filename):
        #Try in same directory as this script:
        re_filename = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
                                   "My_NLS_list")
else:
    stop_err("Expect 2 or 3 arguments: input FASTA file, output tabular file, and NLS motif file")

if not os.path.isfile(fasta_filename):
    stop_err("Could not find FASTA input file: %s" % fasta_filename)

if not os.path.isfile(re_filename):
    stop_err("Could not find NLS motif file: %s" % re_filename)


def load_re(filename):
    """Parse the 5+ column tabular NLS motif file."""
    handle = open(filename, "rU")
    for line in handle:
        line = line.rstrip("\n")
        if not line:
            continue
        parts = line.split("\t")
        assert 5 <= len(parts), parts
        regex, evidence, p_count, percent_nuc, percent_non_nuc = parts[0:5]
        try:
            regex = re.compile(regex)
            p_count = int(p_count)
        except ValueError:
            stop_err("Bad data in line: %s" % line)
        if 6 <= len(parts):
            proteins = parts[5]
            assert p_count == len(proteins.split(",")), line
        else:
            proteins = ""
            assert p_count == 0
        if 7 <= len(parts):
            domains = parts[6]
            assert int(p_count) == len(domains.split(",")), line
        else:
            domains = ""
            assert p_count == 0
        #There can be further columns (DNA binding?), but we don't use them.
        yield regex, evidence, p_count, percent_nuc, proteins, domains
    handle.close()


def fasta_iterator(filename):
    """Simple FASTA parser yielding tuples of (name, upper case sequence)."""
    if filename == "-":
        handle = sys.stdin
    else:
        handle = open(filename)
    name, seq = "", ""
    for line in handle:
        if line.startswith(">"):
            if name:
                yield name, seq
            #Take the first word only as the name:
            name = line[1:].rstrip().split(None, 1)[0]
            seq = ""
        elif name:
            #Simple way would leave in any internal white space,
            #seq += line.strip().upper()
            seq += "".join(line.strip().upper().split())
        elif not line.strip():
            #Ignore blank lines before first record
            pass
        else:
            raise ValueError("Bad FASTA line %r" % line)
    if filename != "-":
        handle.close()
    if name:
        yield name, seq
    raise StopIteration


motifs = list(load_re(re_filename))
print "Looking for %i NLS motifs" % len(motifs)

if tabular_filename == "-":
    out_handle = sys.stdout
else:
    out_handle = open(tabular_filename, "w")
out_handle.write("#ID\tNLS start\tNLS seq\tNLS pattern\tType\tProtCount\t%NucProt\tProtList\tProtLoci\n")

count = 0
nls = 0
for idn, seq in fasta_iterator(fasta_filename):
    for regex, evidence, p_count, percent_nuc_prot, proteins, domains in motifs:
        #Perl predictnls v1.0.17 (and older) take right most hit only, Bug #40
        #This has been fixed (v1.0.18 onwards, June 2011), so we return all the matches
        for match in regex.finditer(seq):
            #Perl predictnls v1.0.17 (and older) return NLS start position with
            #zero based counting, but changed to one based counting in v1.0.18
            #(June 2011) onwards, Bug #38
            #We therefore also use one based counting, hence the start+1 here:
            out_handle.write("%s\t%i\t%s\t%s\t%s\t%i\t%s\t%s\t%s\n"
                             % (idn, match.start() + 1, match.group(),
                                regex.pattern, evidence, p_count,
                                percent_nuc_prot, proteins, domains))
            nls += 1
    count += 1
if tabular_filename != "-":
    out_handle.close()
print "Found %i NLS motifs in %i sequences" % (nls, count)
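
As an illustration of the core matching step in the script (not part of the
uploaded file), the minimal sketch below applies one motif regular expression
to a protein sequence with finditer() and reports one based start positions,
as the script does. The command line, motif pattern, and sequence shown are
made-up examples, not taken from the My_NLS_list file.

#Illustrative sketch only; the motif and sequence are hypothetical.
#Example command line (file names are arbitrary):
#  python predictnls.py input.fasta output.tabular
import re

motif = re.compile("KRKR")    #hypothetical NLS pattern, not from My_NLS_list
seq = "MAKRKRSTDEVVKRKRLLA"   #made-up protein sequence
for match in motif.finditer(seq):
    #As in the script above, start()+1 converts to one based counting:
    print("Found %s at position %i" % (match.group(), match.start() + 1))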