comparison ffp_phylogeny.py @ 0:d31a1bd74e63 draft

Uploaded first version
author damion
date Sun, 09 Aug 2015 16:05:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d31a1bd74e63
1 #!/usr/bin/python
2 import optparse
3 import re
4 import time
5 import os
6 import tempfile
7 import sys
8 import shlex, subprocess
9 from string import maketrans
10
11 VERSION_NUMBER = "0.1.03"
12
13 class MyParser(optparse.OptionParser):
14 """
15 From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
16 Provides a better class for displaying formatted help info in epilog() portion of optParse; allows for carriage returns.
17 """
18 def format_epilog(self, formatter):
19 return self.epilog
20
21
22 def stop_err( msg ):
23 sys.stderr.write("%s\n" % msg)
24 sys.exit(1)
25
26 def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
27 """
28 Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
29 This may also include names for each fasta sequence found within a file if the
30 "-m" multiple option is provided. Default is to use the file names rather than fasta id's inside the files.
31 NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
32 EACH NAME ENTRY IS TRIMMED AND MADE UNIQUE
33
34 @param type string ['text','amino','nucleotide']
35 @param multiple boolean Flag indicates to look within files for labels
36 @param abbreviate boolean Flag indicates to shorten labels
37 @filenames array original input file names as user selected them
38 @filepaths array resulting galaxy dataset file .dat paths
39
40 """
41 # Take off prefix/suffix whitespace/comma :
42 taxonomy = filenames.strip().strip(',').split(',')
43 names=[]
44 ptr = 0
45
46 for file in filepaths:
47 # Trim labels to 50 characters max. ffpjsd kneecaps a taxonomy label to 10 characters if it is greater than 50 chars.
48 taxonomyitem = taxonomy[ptr].strip()[:50] #.translate(translations)
49 # Convert non-alphanumeric characters to underscore in taxonomy names. ffprwn IS VERY SENSITIVE ABOUT THIS.
50 taxonomyitem = re.sub('[^0-9a-zA-Z]+', '_', taxonomyitem)
51
52 if (not type in 'text') and multiple:
53 #Must read each fasta file, looking for all lines beginning ">"
54 with open(file) as fastafile:
55 lineptr = 0
56 for line in fastafile:
57 if line[0] == '>':
58 name = line[1:].split(None,1)[0].strip()[:50]
59 # Odd case where no fasta description found
60 if name == '': name = taxonomyitem + '.' + str(lineptr)
61 names.append(name)
62 lineptr += 1
63 else:
64
65 names.append(taxonomyitem)
66
67 ptr += 1
68
69 if abbreviate:
70 names = trimCommonPrefixes(names)
71 names = trimCommonPrefixes(names, True) # reverse = Suffixes.
72
73 return names
74
75 def trimCommonPrefixes(names, reverse=False):
76 """
77 Examines sorted array of names. Trims off prefix of each subsequent pair.
78
79 @param names array of textual labels (file names or fasta taxonomy ids)
80 @param reverse boolean whether to reverse array strings before doing prefix trimming.
81 """
82 wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
83
84 if reverse:
85 names = map(lambda name: name[::-1], names) #reverses characters in names
86
87 sortednames = sorted(names)
88 ptr = 0
89 sortedlen = len(sortednames)
90 oldprefixlen=0
91 prefixlen=0
92 for name in sortednames:
93 ptr += 1
94
95 #If we're not at the very last item, reevaluate prefixlen
96 if ptr < sortedlen:
97
98 # Skip first item in an any duplicate pair. Leave duplicate name in full.
99 if name == sortednames[ptr]:
100 if reverse:
101 continue
102 else:
103 names[names.index(name)] = 'DupLabel-' + name
104 continue
105
106 # See http://stackoverflow.com/questions/9114402/regexp-finding-longest-common-prefix-of-two-strings
107 prefixlen = len( name[:([x[0]==x[1] for x in zip(name, sortednames[ptr])]+[0]).index(0)] )
108
109 if prefixlen <= oldprefixlen:
110 newprefix = name[:oldprefixlen]
111 else:
112 newprefix = name[:prefixlen]
113 # Expands label to include any preceeding characters that were probably part of it.
114 newprefix = newprefix.rstrip(wordybits)
115 newname = name[len(newprefix):]
116 # Some tree visualizers don't show numeric labels?!?!
117 if not reverse and newname.replace('.','',1).isdigit():
118 newname = 'id_' + newname
119 names[names.index(name)] = newname #extract name after prefix part; has nl in it
120 oldprefixlen = prefixlen
121
122 if reverse:
123 names = map(lambda name: name[::-1], names) #now back to original direction
124
125 return names
126
127 def getTaxonomyFile(names):
128 """
129 FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
130 given taxon labels
131
132 @param names array of datafile names or fasta sequence ids
133 """
134
135 try:
136 temp = tempfile.NamedTemporaryFile(mode='w+t',delete=False)
137 taxonomyTempFile = temp.name
138 temp.writelines(name + '\n' for name in names)
139
140 except:
141 stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + taxonomyTempFile)
142
143 finally:
144 temp.close()
145
146 return taxonomyTempFile
147
148
149 def check_output(command):
150 """
151 Execute a command line containing a series of pipes; and handle error cases by exiting at first error case. This is a substitute for Python 2.7 subprocess.check_output() - allowing piped commands without shell=True call . Based on Python subprocess docs 17.1.4.2
152
153 ISSUE: warnings on stderr are given with no exit code 0:
154 ffpry: Warning: No keys of length 6 found.
155 ffpcol: (null): Not a key valued FFP.
156
157 Can't use communicate() because this closes processes' stdout
158 file handle even without errors because of read to end of stdout:
159 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
160
161 """
162 commands = command.split("|")
163 processes = []
164 ptr = 0
165 substantive = re.compile('[a-zA-Z0-9]+')
166
167 for command_line in commands:
168 print command_line.strip()
169 args = shlex.split(command_line.strip())
170 if ptr == 0:
171 proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
172 processes.append(proc)
173 else:
174
175 #this has to come before error processing?
176 newProcess = subprocess.Popen(args, stdin=processes[ptr-1].stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
177
178 # It seems the act of reading standard error output is enough to trigger
179 # error code signal for that process, i.e. so that retcode returns a code.
180 retcode = processes[ptr-1].poll()
181 stderrdata = processes[ptr-1].stderr.read()
182 #Issue with ffptree is it outputs ---- ... ---- on stderr even when ok.
183 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
184 stop_err(stderrdata)
185
186 processes.append(newProcess)
187 processes[ptr-1].stdout.close() # Allow prev. process to receive a SIGPIPE if current process exits.
188
189 ptr += 1
190
191 retcode = processes[ptr-1].poll()
192 (stdoutdata, stderrdata) = processes[ptr-1].communicate()
193 if retcode or (len(stderrdata) > 0 and substantive.search(stderrdata)):
194 stop_err(stderrdata)
195
196 return stdoutdata
197
198
199 class ReportEngine(object):
200
201 def __init__(self): pass
202
203 def __main__(self):
204
205
206 ## *************************** Parse Command Line *****************************
207 parser = MyParser(
208 description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
209 usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
210 epilog="""Details:
211
212 FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
213
214 """)
215
216 parser.set_defaults(row_limit=0)
217 # Don't use "-h" , it is reserved for --help!
218
219 parser.add_option('-t', '--type', type='choice', dest='type', default='text',
220 choices=['amino','nucleotide','text'],
221 help='Choice of Amino acid, nucleotide or plain text sequences to find features in')
222
223 parser.add_option('-l', '--length', type='int', dest='length', default=6,
224 help='Features (any string of valid characters found in data) of this length will be counted. Synonyms: l-mer, k-mer, n-gram, k-tuple')
225
226 #parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
227 # help='Normalize counts into relative frequency')
228
229 parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
230 help='By default all sequences in a fasta file be treated as 1 sequence to profile. This option enables each sequence found in a fasta file to have its own profile.')
231
232 parser.add_option('-M', '--metric', type='string', dest='metric',
233 help='Various metrics to measure count distances by.')
234
235 parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
236 help='Taxanomic label for each profile/sequence.')
237
238 parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
239 help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted. Disable this to treat individual characters as distinct.')
240
241 parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
242 help='Shorten tree taxonomy labels as much as possible.')
243
244 parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
245 help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')
246
247 parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
248 choices=['none','count','f','n','e','freq','norm','evd'],
249 help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')
250
251 parser.add_option('-L', '--lower', type='float', dest='lower',
252 help='Filter lower bound is a 0.00 percentages')
253
254 parser.add_option('-U', '--upper', type='float', dest='upper',
255 help='Filter upper bound is a 0.00 percentages')
256
257 parser.add_option('-o', '--output', type='string', dest='output',
258 help='Path of output file to create')
259
260 parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')
261
262 parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')
263
264 # Could also have -D INT decimal precision included for ffprwn .
265
266 options, args = parser.parse_args()
267
268 if options.version:
269 print VERSION_NUMBER
270 return
271
272 import time
273 time_start = time.time()
274
275 try:
276 in_files = args[:]
277
278 except:
279 stop_err("Expecting at least 1 input data file.")
280
281
282 #ffptxt / ffpaa / ffpry
283 if options.type in 'text':
284 command = 'ffptxt'
285
286 else:
287 if options.type == 'amino':
288 command = 'ffpaa'
289 else:
290 command = 'ffpry'
291
292 if options.disable:
293 command += ' -d'
294
295 if options.multiple:
296 command += ' -m'
297
298 command += ' -l ' + str(options.length)
299
300 if len(in_files): #Note: app isn't really suited to stdio
301 command += ' "' + '" "'.join(in_files) + '"'
302
303 #ffpcol / ffpfilt
304 if options.filter != 'none':
305 command += ' | ffpfilt'
306 if options.filter != 'count':
307 command += ' -' + options.filter
308 if options.lower > 0:
309 command += ' --lower ' + str(options.lower)
310 if options.upper > 0:
311 command += ' --upper ' + str(options.upper)
312
313 else:
314 command += ' | ffpcol'
315
316 if options.type in 'text':
317 command += ' -t'
318
319 else:
320
321 if options.type == 'amino':
322 command += ' -a'
323
324 if options.disable:
325 command += ' -d'
326
327 #if options.normalize:
328 command += ' | ffprwn'
329
330 #Now create a taxonomy label file, ensuring a name exists for each profile.
331 taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
332 taxonomyTempFile = getTaxonomyFile(taxonomyNames)
333
334 # -p = Include phylip format 'infile' of the taxon names to use. Very simple, just a list of fasta identifier names.
335 command += ' | ffpjsd -p ' + taxonomyTempFile
336
337 if options.metric and len(options.metric) >0 :
338 command += ' --' + options.metric
339 if options.similarity:
340 command += ' -s'
341
342 # Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
343 if options.tree:
344 if len(taxonomyNames) > 2:
345 command += ' | ffptree -q'
346 else:
347 stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")
348
349 #print command
350
351 result = check_output(command)
352 with open(options.output,'w') as fw:
353 fw.writelines(result)
354 os.remove(taxonomyTempFile)
355
356 if __name__ == '__main__':
357
358 time_start = time.time()
359
360 reportEngine = ReportEngine()
361 reportEngine.__main__()
362
363 print('Execution time (seconds): ' + str(int(time.time()-time_start)))
364