annotate ffp_phylogeny.py @ 0:d31a1bd74e63 draft

Uploaded first version
author damion
date Sun, 09 Aug 2015 16:05:40 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
1 #!/usr/bin/python
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
2 import optparse
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
3 import re
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
4 import time
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
5 import os
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
6 import tempfile
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
7 import sys
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
8 import shlex, subprocess
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
9 from string import maketrans
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
10
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
11 VERSION_NUMBER = "0.1.03"
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
12
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
class MyParser(optparse.OptionParser):
    """
    OptionParser variant that prints its epilog verbatim.

    From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output
    The stock optparse implementation re-wraps the epilog text, which
    destroys carriage returns; this subclass returns it untouched so that
    pre-formatted help text keeps its layout.
    """
    def format_epilog(self, formatter):
        """
        Return the epilog exactly as it was supplied.

        @param formatter optparse help formatter (unused; kept for interface compatibility)
        @return string the raw epilog, or '' when no epilog was configured
        """
        # format_help() joins this value with other strings, so it must
        # never be None even when the parser was built without an epilog.
        return self.epilog or ""
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
20
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
21
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
def stop_err( msg ):
    """
    Report a fatal problem and terminate the program.

    @param msg value written to standard error, followed by a newline
    Exits with status 1 by raising SystemExit; never returns.
    """
    text = "%s\n" % msg
    sys.stderr.write(text)
    raise SystemExit(1)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
25
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
def getTaxonomyNames(type, multiple, abbreviate, filepaths, filenames):
    """
    Returns a taxonomic list of names corresponding to each file being analyzed by ffp.
    This may also include names for each fasta sequence found within a file if the
    "-m" multiple option is provided. Default is to use the file names rather than
    fasta id's inside the files.
    NOTE: THIS DOES NOT (MUST NOT) REORDER NAMES IN NAME ARRAY.
    Each name is trimmed to at most 50 characters; file-derived names also have
    every run of non-alphanumeric characters replaced by a single underscore.

    @param type string ['text','amino','nucleotide']
    @param multiple boolean Flag indicates to look within files for labels
    @param abbreviate boolean Flag indicates to shorten labels
    @param filepaths array resulting galaxy dataset file .dat paths
    @param filenames string comma separated original input file names as user selected them
    @return array of labels, one per file (or one per fasta record when
            multiple is set and type is not 'text')
    Raises IndexError when filenames holds fewer entries than filepaths.
    """
    # Take off prefix/suffix whitespace/comma :
    taxonomy = filenames.strip().strip(',').split(',')
    names = []

    # Fasta headers are only consulted for sequence data. Compare for
    # equality: a substring test would also treat '' or 'ex' as 'text'.
    scan_fasta = multiple and type != 'text'

    for ptr, path in enumerate(filepaths):
        # Trim labels to 50 characters max. ffpjsd kneecaps a taxonomy label to 10 characters if it is greater than 50 chars.
        taxonomyitem = taxonomy[ptr].strip()[:50]
        # Convert non-alphanumeric characters to underscore in taxonomy names. ffprwn IS VERY SENSITIVE ABOUT THIS.
        taxonomyitem = re.sub('[^0-9a-zA-Z]+', '_', taxonomyitem)

        if not scan_fasta:
            names.append(taxonomyitem)
            continue

        # Must read each fasta file, looking for all lines beginning ">"
        with open(path) as fastafile:
            lineptr = 0  # counts fasta records seen so far in this file
            for line in fastafile:
                if not line.startswith('>'):
                    continue
                # First whitespace-delimited token of the header is the id.
                # A header that is only ">" yields no tokens at all, so guard
                # the lookup instead of indexing blindly.
                fields = line[1:].split(None, 1)
                name = fields[0][:50] if fields else ''
                # Odd case where no fasta description found
                if name == '':
                    name = taxonomyitem + '.' + str(lineptr)
                names.append(name)
                lineptr += 1

    if abbreviate:
        names = trimCommonPrefixes(names)
        names = trimCommonPrefixes(names, True)  # reverse = Suffixes.

    return names
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
74
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
def _common_prefix_len(first, second):
    """Return how many leading characters first and second have in common."""
    count = 0
    for left, right in zip(first, second):
        if left != right:
            break
        count += 1
    return count


def trimCommonPrefixes(names, reverse=False):
    """
    Shortens labels by removing the prefix each one shares with its
    neighbours in sorted order. Output order matches input order, and the
    list passed in is left unmodified.

    A shared prefix is only cut back to the last character that is not a
    "wordy" one (letters, digits, '.', '|'), so a label is never split in
    the middle of a word. In forward mode every member of a run of identical
    names except the last is marked with a 'DupLabel-' prefix and kept in
    full, and labels that end up purely numeric get an 'id_' prefix. In
    reverse mode duplicates are left untouched and no 'id_' prefix is added.

    @param names array of textual labels (file names or fasta taxonomy ids)
    @param reverse boolean whether to reverse array strings before doing prefix trimming
                   (i.e. trim common suffixes instead).
    @return array new list of shortened labels
    """
    wordybits = '|.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

    if reverse:
        labels = [name[::-1] for name in names]  # reverses characters in names
    else:
        labels = list(names)

    # Walk positions in sorted-label order instead of looking each value up
    # again in the list being rewritten: a value lookup can land on a slot
    # that already holds a trimmed label equal to a later, untrimmed name.
    # The tie-break keeps which member of a duplicate run is left for normal
    # trimming: the last occurrence going forward, the first in reverse.
    if reverse:
        order = sorted(range(len(labels)), key=lambda pos: (labels[pos], -pos))
    else:
        order = sorted(range(len(labels)), key=lambda pos: (labels[pos], pos))

    result = list(labels)
    final_rank = len(order) - 1
    oldprefixlen = 0
    prefixlen = 0

    for rank, pos in enumerate(order):
        name = labels[pos]

        # If we're not at the very last item, reevaluate prefixlen
        if rank < final_rank:
            following = labels[order[rank + 1]]

            # Skip first item in any duplicate pair. Leave duplicate name in full.
            if name == following:
                if not reverse:
                    result[pos] = 'DupLabel-' + name
                continue

            prefixlen = _common_prefix_len(name, following)

        # Use whichever is longer: the prefix shared with the previous
        # neighbour or the one shared with the next.
        newprefix = name[:max(prefixlen, oldprefixlen)]
        # Expands label to include any preceding characters that were probably part of it.
        newprefix = newprefix.rstrip(wordybits)
        newname = name[len(newprefix):]
        # Some tree visualizers don't show numeric labels?!?!
        if not reverse and newname.replace('.', '', 1).isdigit():
            newname = 'id_' + newname
        result[pos] = newname
        oldprefixlen = prefixlen

    if reverse:
        result = [name[::-1] for name in result]  # now back to original direction

    return result
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
126
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
def getTaxonomyFile(names):
    """
    FFP's ffpjsd -p [taxon file of labels] option creates a phylip tree with
    given taxon labels. This writes those labels, one per line, to a new
    temporary file that is NOT deleted on close, so it can be handed to ffpjsd.

    @param names array of datafile names or fasta sequence ids
    @return string path of the temporary taxonomy file
    On an I/O failure the problem is reported through stop_err().
    """
    taxonomyTempFile = None

    try:
        # The with-block guarantees the handle is closed (and flushed) even
        # when a write fails part way through.
        with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as temp:
            taxonomyTempFile = temp.name
            temp.writelines(name + '\n' for name in names)

    except (IOError, OSError):
        # taxonomyTempFile is still None when the file could not even be
        # created, so never concatenate it blindly into the message.
        stop_err("Galaxy configuration error for ffp_phylogeny tool. Unable to write taxonomy file " + (taxonomyTempFile or "(temporary file could not be created)"))

    return taxonomyTempFile
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
147
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
148
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
def check_output(command):
    """
    Execute a command line containing a series of pipes, and exit via
    stop_err() on the first stage (in pipeline order) that failed. This is a
    substitute for Python 2.7 subprocess.check_output() - allowing piped
    commands without a shell=True call. Based on Python subprocess docs 17.1.4.2

    A stage counts as failed when it exits with a non-zero status or when it
    wrote anything containing letters or digits to stderr. FFP tools report
    some problems on stderr with exit code 0, e.g.:
        ffpry: Warning: No keys of length 6 found.
        ffpcol: (null): Not a key valued FFP.
    while ffptree prints a harmless ---- ... ---- ruler on stderr when ok.

    Each stage's stderr is collected in a temporary file rather than a pipe,
    so no stage can stall on a full stderr pipe while the pipeline runs, and
    exit codes are read only after every stage has finished.

    NOTE: the command is split on every "|" character, so a pipe symbol
    inside a quoted argument is not supported.

    @param command string one or more commands joined by "|"
    @return stdout data of the last command in the pipeline
    """
    commands = command.split("|")
    # Bytes pattern: data read back from the capture files is binary.
    substantive = re.compile(b'[a-zA-Z0-9]+')
    processes = []
    errfiles = []

    try:
        for command_line in commands:
            command_line = command_line.strip()
            sys.stdout.write(command_line + "\n")
            args = shlex.split(command_line)

            upstream = processes[-1].stdout if processes else None
            errfile = tempfile.TemporaryFile()
            errfiles.append(errfile)
            processes.append(subprocess.Popen(args, stdin=upstream, stdout=subprocess.PIPE, stderr=errfile))
            if upstream is not None:
                # Allow prev. process to receive a SIGPIPE if current process exits.
                upstream.close()

        # Only the final stage's stdout is read here; earlier stages feed
        # each other directly.
        stdoutdata = processes[-1].communicate()[0]

        for proc, errfile in zip(processes, errfiles):
            retcode = proc.wait()
            errfile.seek(0)
            stderrdata = errfile.read()
            if retcode or substantive.search(stderrdata):
                if not isinstance(stderrdata, str):
                    stderrdata = stderrdata.decode('utf-8', 'replace')
                stop_err(stderrdata)

    finally:
        for errfile in errfiles:
            errfile.close()

    return stdoutdata
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
197
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
198
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
199 class ReportEngine(object):
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
200
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
201 def __init__(self): pass
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
202
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
203 def __main__(self):
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
204
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
205
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
206 ## *************************** Parse Command Line *****************************
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
207 parser = MyParser(
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
208 description = 'FFP (Feature frequency profile) is an alignment free comparison tool',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
209 usage = 'python ffp_phylogeny.py [input_files] [output file] [options]',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
210 epilog="""Details:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
211
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
212 FFP (Feature frequency profile) is an alignment free comparison tool for phylogenetic analysis and text comparison. It can be applied to nucleotide sequences, complete genomes, proteomes and even used for text comparison.
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
213
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
214 """)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
215
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
216 parser.set_defaults(row_limit=0)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
217 # Don't use "-h" , it is reserved for --help!
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
218
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
219 parser.add_option('-t', '--type', type='choice', dest='type', default='text',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
220 choices=['amino','nucleotide','text'],
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
221 help='Choice of Amino acid, nucleotide or plain text sequences to find features in')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
222
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
223 parser.add_option('-l', '--length', type='int', dest='length', default=6,
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
224 help='Features (any string of valid characters found in data) of this length will be counted. Synonyms: l-mer, k-mer, n-gram, k-tuple')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
225
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
226 #parser.add_option('-n', '--normalize', dest='normalize', default=True, action='store_true',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
227 # help='Normalize counts into relative frequency')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
228
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
229 parser.add_option('-m', '--multiple', dest='multiple', default=False, action='store_true',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
230 help='By default all sequences in a fasta file be treated as 1 sequence to profile. This option enables each sequence found in a fasta file to have its own profile.')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
231
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
232 parser.add_option('-M', '--metric', type='string', dest='metric',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
233 help='Various metrics to measure count distances by.')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
234
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
235 parser.add_option('-x', '--taxonomy', type='string', dest='taxonomy',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
236 help='Taxanomic label for each profile/sequence.')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
237
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
238 parser.add_option('-d', '--disable', dest='disable', default=False, action='store_true',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
239 help='By default amino acid and nucleotide characters are grouped by functional category (protein or purine/pyrimidine group) before being counted. Disable this to treat individual characters as distinct.')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
240
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
241 parser.add_option('-a', '--abbreviate', dest='abbreviate', default=False, action='store_true',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
242 help='Shorten tree taxonomy labels as much as possible.')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
243
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
244 parser.add_option('-s', '--similarity', dest='similarity', default=False, action='store_true',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
245 help='Enables pearson correlation coefficient matrix and any of the binary distance measures to be turned into similarity matrixes.')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
246
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
247 parser.add_option('-f', '--filter', type='choice', dest='filter', default='none',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
248 choices=['none','count','f','n','e','freq','norm','evd'],
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
249 help='Choice of [f=raw frequency|n=normal|e=extreme value (Gumbel)] distribution: Features are trimmed from the data based on lower/upper cutoff points according to the given distribution.')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
250
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
251 parser.add_option('-L', '--lower', type='float', dest='lower',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
252 help='Filter lower bound is a 0.00 percentages')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
253
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
254 parser.add_option('-U', '--upper', type='float', dest='upper',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
255 help='Filter upper bound is a 0.00 percentages')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
256
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
257 parser.add_option('-o', '--output', type='string', dest='output',
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
258 help='Path of output file to create')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
259
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
260 parser.add_option('-T', '--tree', dest='tree', default=False, action='store_true', help='Generate Phylogenetic Tree output file')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
261
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
262 parser.add_option('-v', '--version', dest='version', default=False, action='store_true', help='Version number')
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
263
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
264 # Could also have -D INT decimal precision included for ffprwn .
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
265
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
266 options, args = parser.parse_args()
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
267
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
268 if options.version:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
269 print VERSION_NUMBER
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
270 return
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
271
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
272 import time
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
273 time_start = time.time()
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
274
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
275 try:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
276 in_files = args[:]
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
277
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
278 except:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
279 stop_err("Expecting at least 1 input data file.")
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
280
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
281
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
282 #ffptxt / ffpaa / ffpry
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
283 if options.type in 'text':
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
284 command = 'ffptxt'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
285
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
286 else:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
287 if options.type == 'amino':
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
288 command = 'ffpaa'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
289 else:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
290 command = 'ffpry'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
291
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
292 if options.disable:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
293 command += ' -d'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
294
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
295 if options.multiple:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
296 command += ' -m'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
297
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
298 command += ' -l ' + str(options.length)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
299
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
300 if len(in_files): #Note: app isn't really suited to stdio
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
301 command += ' "' + '" "'.join(in_files) + '"'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
302
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
303 #ffpcol / ffpfilt
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
304 if options.filter != 'none':
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
305 command += ' | ffpfilt'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
306 if options.filter != 'count':
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
307 command += ' -' + options.filter
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
308 if options.lower > 0:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
309 command += ' --lower ' + str(options.lower)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
310 if options.upper > 0:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
311 command += ' --upper ' + str(options.upper)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
312
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
313 else:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
314 command += ' | ffpcol'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
315
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
316 if options.type in 'text':
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
317 command += ' -t'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
318
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
319 else:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
320
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
321 if options.type == 'amino':
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
322 command += ' -a'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
323
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
324 if options.disable:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
325 command += ' -d'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
326
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
327 #if options.normalize:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
328 command += ' | ffprwn'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
329
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
330 #Now create a taxonomy label file, ensuring a name exists for each profile.
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
331 taxonomyNames = getTaxonomyNames(options.type, options.multiple, options.abbreviate, in_files, options.taxonomy)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
332 taxonomyTempFile = getTaxonomyFile(taxonomyNames)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
333
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
334 # -p = Include phylip format 'infile' of the taxon names to use. Very simple, just a list of fasta identifier names.
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
335 command += ' | ffpjsd -p ' + taxonomyTempFile
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
336
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
337 if options.metric and len(options.metric) >0 :
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
338 command += ' --' + options.metric
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
339 if options.similarity:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
340 command += ' -s'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
341
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
342 # Generate Newick (.nhx) formatted tree if we have at least 3 taxonomy items:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
343 if options.tree:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
344 if len(taxonomyNames) > 2:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
345 command += ' | ffptree -q'
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
346 else:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
347 stop_err("For a phylogenetic tree display, one must have at least 3 ffp profiles.")
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
348
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
349 #print command
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
350
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
351 result = check_output(command)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
352 with open(options.output,'w') as fw:
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
353 fw.writelines(result)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
354 os.remove(taxonomyTempFile)
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
355
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: build the report engine, run it, and report how
    # long the whole run took in whole seconds (wall-clock).
    time_start = time.time()

    reportEngine = ReportEngine()
    reportEngine.__main__()

    # Truncate to an integer number of seconds before printing.
    elapsed_seconds = int(time.time() - time_start)
    print('Execution time (seconds): ' + str(elapsed_seconds))
d31a1bd74e63 Uploaded first version
damion
parents:
diff changeset
364