Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/rxlr_motifs.py @ 16:7de64c8b258d draft
Uploaded v0.2.5, MIT licence, RST for README, citation information, development moved to GitHub
author | peterjc |
---|---|
date | Wed, 18 Sep 2013 06:16:58 -0400 |
parents | a290c6d4e658 |
children | eb6ac44d4b8e |
comparison
equal
deleted
inserted
replaced
15:6abd809cefdd | 16:7de64c8b258d |
---|---|
109 "whisson_et_al_rxlr_eer_cropped.hmm") | 109 "whisson_et_al_rxlr_eer_cropped.hmm") |
110 if not os.path.isfile(hmm_file): | 110 if not os.path.isfile(hmm_file): |
111 stop_err("Missing HMM file for Whisson et al. (2007)") | 111 stop_err("Missing HMM file for Whisson et al. (2007)") |
112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | 112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): |
113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) | 113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) |
114 #I've left the code to handle HMMER 3 in situ, in case | 114 |
115 #we revisit the choice to insist on HMMER 2. | |
116 hmmer3 = (3 == get_hmmer_version(hmmer_search)) | |
117 #Using zero (or 5.6?) for bitscore threshold | |
118 if hmmer3: | |
119 #The HMMER3 table output is easy to parse | |
120 #In HMMER3 can't use both -T and -E | |
121 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ | |
122 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) | |
123 else: | |
124 #For HMMER2 we are stuck with parsing stdout | |
125 #Put 1e6 to effectively have no expectation threshold (otherwise | |
126 #HMMER defaults to 10 and the calculated e-value depends on the | |
127 #input FASTA file, and we can loose hits of interest). | |
128 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | |
129 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | |
130 return_code = os.system(cmd) | |
131 if return_code: | |
132 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd)) | |
133 hmm_hits = set() | 115 hmm_hits = set() |
134 valid_ids = set() | 116 valid_ids = set() |
135 for title, seq in fasta_iterator(fasta_file): | 117 for title, seq in fasta_iterator(fasta_file): |
136 name = title.split(None,1)[0] | 118 name = title.split(None,1)[0] |
137 if name in valid_ids: | 119 if name in valid_ids: |
138 stop_err("Duplicated identifier %r" % name) | 120 stop_err("Duplicated identifier %r" % name) |
139 else: | 121 else: |
140 valid_ids.add(name) | 122 valid_ids.add(name) |
141 handle = open(hmm_output_file) | 123 if not valid_ids: |
142 for line in handle: | 124 #Special case, don't need to run HMMER if there are no sequences |
143 if not line.strip(): | 125 pass |
144 #We expect blank lines in the HMMER2 stdout | 126 else: |
145 continue | 127 #I've left the code to handle HMMER 3 in situ, in case |
146 elif line.startswith("#"): | 128 #we revisit the choice to insist on HMMER 2. |
147 #Header | 129 hmmer3 = (3 == get_hmmer_version(hmmer_search)) |
148 continue | 130 #Using zero (or 5.6?) for bitscore threshold |
131 if hmmer3: | |
132 #The HMMER3 table output is easy to parse | |
133 #In HMMER3 can't use both -T and -E | |
134 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ | |
135 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) | |
149 else: | 136 else: |
150 name = line.split(None,1)[0] | 137 #For HMMER2 we are stuck with parsing stdout |
151 #Should be a sequence name in the HMMER3 table output. | 138 #Put 1e6 to effectively have no expectation threshold (otherwise |
152 #Could be anything in the HMMER2 stdout. | 139 #HMMER defaults to 10 and the calculated e-value depends on the |
153 if name in valid_ids: | 140 #input FASTA file, and we can loose hits of interest). |
154 hmm_hits.add(name) | 141 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ |
155 elif hmmer3: | 142 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) |
156 stop_err("Unexpected identifer %r in hmmsearch output" % name) | 143 return_code = os.system(cmd) |
157 handle.close() | 144 if return_code: |
158 #if hmmer3: | 145 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) |
159 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 146 |
160 #else: | 147 handle = open(hmm_output_file) |
161 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 148 for line in handle: |
162 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | 149 if not line.strip(): |
163 os.remove(hmm_output_file) | 150 #We expect blank lines in the HMMER2 stdout |
151 continue | |
152 elif line.startswith("#"): | |
153 #Header | |
154 continue | |
155 else: | |
156 name = line.split(None,1)[0] | |
157 #Should be a sequence name in the HMMER3 table output. | |
158 #Could be anything in the HMMER2 stdout. | |
159 if name in valid_ids: | |
160 hmm_hits.add(name) | |
161 elif hmmer3: | |
162 stop_err("Unexpected identifer %r in hmmsearch output" % name) | |
163 handle.close() | |
164 #if hmmer3: | |
165 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | |
166 #else: | |
167 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | |
168 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | |
169 os.remove(hmm_output_file) | |
164 del valid_ids | 170 del valid_ids |
165 | 171 |
166 | 172 |
167 #Prepare short list of candidates containing RXLR to pass to SignalP | 173 #Prepare short list of candidates containing RXLR to pass to SignalP |
168 assert min_rxlr_start > 0, "Min value one, since zero based counting" | 174 assert min_rxlr_start > 0, "Min value one, since zero based counting" |