Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison alignment/phytab_ssr.py @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import optparse | |
5 import subprocess | |
6 from multiprocessing import Pool | |
7 from functools import partial | |
8 | |
# Module-level settings shared by the helpers below.

# Working data directory; main() overwrites this with "<path>/data".
directory = ''
# Name of the combined output file written inside `directory`.
results = 'results.data'
# Suffix of the FASTA files that are handed to the SimilarSequenceRemover jar.
extension = '.fs'
# Suffix appended to an input path to form the jar's output path.
slim_extension = '.slim'
# Location of the jar that is launched with `java -jar`.
jarpath = '/home/galaxy/pkgs/SSR/SimilarSequenceRemover.jar'
14 | |
def unescape(string):
    """Replace ``__xx__`` escape tokens in *string* with the characters they stand for.

    The token table below covers quotes, brackets, braces, '@', '#', '<', '>'
    and the newline / carriage-return / tab characters.
    (Presumably these tokens come from Galaxy's parameter sanitising -- verify
    against the tool wrapper.)

    :param string: text that may contain escape tokens.
    :returns: the text with every known token replaced by its character.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__',
    }

    # items() instead of the Python-2-only iteritems(): works on both
    # Python 2 and Python 3.
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string
36 | |
37 | |
def isTabular(file):
    """Report whether *file* looks tabular rather than FASTA.

    :param file: path of the file to inspect.
    :returns: False as soon as any line begins with '>', True when no line
        does (an empty file therefore counts as tabular).
    """
    with open(file) as handle:
        # Lines yielded by file iteration are never empty, so row[0] is safe.
        has_header_line = any(row[0] == '>' for row in handle)
    return not has_header_line
44 | |
45 | |
def toData(text):
    """Convert FASTA-style *text* into one tab-separated record per header.

    Every line containing '>' starts a new output record: the "> " marker is
    removed, a tab is appended, and the following lines (up to the next
    header) are concatenated onto the same record.  Spaces are turned into
    tabs.

    :param text: contents of a FASTA-style file.
    :returns: the records joined by newlines, without a leading newline.
    """
    pieces = []
    for line in text.split('\n'):
        if '>' in line:
            # Header line: begin a new record on its own output line.
            line = '\n' + line.replace('> ', "") + '\t'
        # NOTE(review): the original indentation was not recoverable from
        # this view; the space-to-tab replacement is applied to every line --
        # confirm against the repository copy.
        line = line.replace(" ", "\t")
        pieces.append(line)

    # join() avoids the quadratic cost of repeated string concatenation.
    result = ''.join(pieces)

    # Drop the newline that precedes the first record, but only when it is
    # really there: text that does not begin with a header line must not
    # lose its first character.
    if result.startswith('\n'):
        result = result[1:]
    return result
55 | |
def toDataSingle(text):
    """Return *text* with every line terminated by a newline.

    The text is split on '\\n' and each piece is written back followed by
    '\\n', so the result is the input plus one trailing newline.

    :param text: contents of a FASTA-style file.
    :returns: the re-terminated text, without a leading newline.
    """
    result = ''.join(line + '\n' for line in text.split('\n'))

    # The previous code always sliced off the first character ("index past
    # the first newline"), but nothing here prepends a newline, so it removed
    # the first real character of the data (typically the '>' of the first
    # header).  Only strip a newline that is actually present.
    if result.startswith('\n'):
        result = result[1:]
    return result
63 | |
def runSimilarSequenceRemover(input):
    """Run the SimilarSequenceRemover jar on one file inside ``directory``.

    Reads the module globals ``directory``, ``percentage`` and
    ``alignallornot`` (assigned by main()) as well as ``jarpath`` and
    ``slim_extension``.  The jar is asked to write its output next to the
    input, at the input path plus ``slim_extension``.

    :param input: file name (relative to ``directory``) to process.
    :returns: the exit status of the java process, so callers can detect a
        failed run instead of the status being silently discarded.
    """
    input_name = directory + os.sep + input
    outfasta = input_name + slim_extension
    return subprocess.call([
        'java', '-jar', jarpath,
        '-i', input_name,
        '-o', outfasta,
        's=' + percentage,
        '-h', alignallornot,
    ])
68 | |
class Sequence:
    """One whitespace-separated record: species, family, name, ..., sequence."""

    def __init__(self, string):
        """Split *string* on whitespace and keep the individual fields.

        The first three fields become ``species``, ``family`` and ``name``;
        the last field is ``sequence``; everything before the last field,
        joined by single spaces, is ``header``.  Raises IndexError when fewer
        than three fields are present.
        """
        fields = string.split()
        self.species, self.family, self.name = fields[0], fields[1], fields[2]
        self.header = ' '.join(fields[:-1])
        self.sequence = fields[-1]
        # Keep the raw, unsplit record as well.
        self.string = string

    def printFASTA(self):
        """Return the record as a two-line FASTA entry ("> header", sequence)."""
        return '> {0}\n{1}\n'.format(self.header, self.sequence)
81 | |
82 | |
def saveMulti(tabFile):
    """Split a tabular file into one FASTA file per family.

    Each non-blank line is parsed as a ``Sequence`` and appended, in FASTA
    form, to ``<directory>/<family><extension>``.

    :param tabFile: path of the tabular input file.
    """
    with open(tabFile) as f:
        for line in f:
            # A blank line (e.g. a trailing one) has no fields and used to
            # crash Sequence() with IndexError; skip it instead.
            if not line.strip():
                continue
            seq = Sequence(line)
            # Append mode: records of one family may be scattered through
            # the input, so the file is extended each time.
            with open(directory + os.sep + seq.family + extension, "a") as p:
                p.write(seq.printFASTA())
89 | |
90 | |
def saveSingle(fastaFile):
    """Copy a FASTA file into ``<directory>/fasta<extension>``.

    The destination is opened once in append mode and the input is streamed
    into it line by line (previously the destination was reopened for every
    single input line).  An empty input now leaves an empty destination file
    rather than none; main() only takes this path for files that contain a
    '>' line, so that case does not occur there.

    :param fastaFile: path of the FASTA input file.
    """
    with open(fastaFile) as f:
        with open(directory + os.sep + "fasta" + extension, "a") as p:
            for line in f:
                p.write(line)
96 | |
97 | |
def main():
    """Command-line entry point.

    Steps:
      1. Parse the options and publish the settings the worker function
         reads through module globals.
      2. Create ``<path>/data`` and write the input into it as FASTA files
         (one per family for tabular input, a single file otherwise).
      3. Run SimilarSequenceRemover on every FASTA file via a process pool.
      4. Merge every ``.slim`` output into the results file, converting back
         to tabular form when the input was tabular.

    Exits through ``parser.error`` when a required option is missing.
    os.mkdir raises if the data directory already exists.
    """
    global directory
    global percentage
    global alignallornot

    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-d', '--directory',
        metavar="PATH",
        dest='path',
        default='.',
        help='Path to working directory.')

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-a', '--alignall',
        dest='alignall',
        action='store',
        type='string', help='t or f. t only aligns first 100 sites instead of entire seq')

    parser.add_option(
        '-s', '--similarity',
        dest='similarity',
        action='store',
        type='float',
        help='Percentage similarity as cutoff (eg 0.99).')

    options, _ = parser.parse_args()

    # Without these the run cannot work: a missing input used to die with an
    # AttributeError in unescape(), and a missing -a value with a TypeError
    # inside a worker when None was put on the java command line.
    if options.input is None:
        parser.error('an input file is required (-i/--in)')
    if options.alignall is None:
        parser.error('the align-all flag is required (-a/--alignall t|f)')

    inputFile = unescape(options.input)
    directory = unescape(options.path) + os.sep + "data"
    percentage = str(options.similarity)
    alignallornot = options.alignall

    os.mkdir(directory)

    # Decide once; the same answer drives both the split and the merge.
    tabular = isTabular(inputFile)
    if tabular:
        saveMulti(inputFile)
    else:
        saveSingle(inputFile)

    list_of_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extension))

    # The pool is created after the globals are assigned so that forked
    # workers see them; always shut it down, even if a worker raises.
    pool = Pool()
    try:
        pool.map(runSimilarSequenceRemover, list_of_files)
    finally:
        pool.close()
        pool.join()

    # Sorted so the merged output does not depend on os.listdir() order.
    result = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(slim_extension))

    with open(directory + os.sep + results, "a") as f:
        for name in result:
            with open(directory + os.sep + name, "r") as r:
                content = r.read()
            if tabular:
                content = toData(content)
            f.write(content + "\n")
166 | |
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
169 | |
170 |