comparison alignment/phytab_ssr.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/env python
2
3 import os
4 import optparse
5 import subprocess
6 from multiprocessing import Pool
7 from functools import partial
8
# Directory holding the per-family FASTA files; main() rebinds it to
# "<working dir>/data" before any file is written.
directory = ""

# Name of the combined output file that main() writes inside `directory`.
results = "results.data"

# Suffix of the FASTA files that are handed to SimilarSequenceRemover.
extension = ".fs"

# Suffix appended to an input file name to form the tool's output file name.
slim_extension = ".slim"

# Jar that runSimilarSequenceRemover() launches through `java -jar`.
jarpath = "/home/galaxy/pkgs/SSR/SimilarSequenceRemover.jar"
14
def unescape(string):
    """Replace ``__xx__`` escape tokens with the characters they stand for.

    Every token listed in the table below (for example ``__gt__``) is
    substituted by its literal character (``>``); all other text is left
    untouched.

    :param string: text that may contain escape tokens.
    :returns: the text with every known token replaced.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__',
    }

    # items() exists on both Python 2 and Python 3; iteritems() is Python 2
    # only and raises AttributeError on Python 3.
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string
36
37
def isTabular(file):
    """Report whether *file* contains no line starting with ``>``.

    :param file: path of the text file to inspect.
    :returns: ``False`` as soon as a line beginning with ``>`` is found,
        ``True`` when no line does (which includes an empty file).
    """
    with open(file) as handle:
        # any() stops reading at the first header-looking line.
        found_header = any(row.startswith('>') for row in handle)
    return not found_header
44
45
def toData(text):
    """Convert FASTA-style text into tab-separated records, one per line.

    Every line containing ``>`` starts a new record: the ``'> '`` marker is
    removed and a tab is appended, so the sequence line(s) that follow end up
    in the last column.  Spaces are converted to tabs.

    :param text: FASTA-style text whose header lines look like ``> a b c``.
    :returns: the records separated by newlines, with no leading newline.
    """
    pieces = []
    for line in text.split('\n'):
        if '>' in line:
            # The leading newline separates this record from the previous one.
            line = '\n' + line.replace('> ', "") + '\t'
        # NOTE(review): the space-to-tab conversion is applied to every line,
        # not only to headers -- confirm against the original indentation.
        pieces.append(line.replace(" ", "\t"))

    result = ''.join(pieces)
    # Strip the separator in front of the first record only when it is there;
    # an unconditional result[1:] would eat a real character whenever the
    # text does not begin with a header line.
    if result.startswith('\n'):
        return result[1:]
    return result
55
def toDataSingle(text):
    """Return *text* with every line terminated by a newline.

    The text is split on ``'\\n'`` and each piece is re-emitted followed by a
    newline.  A single leading newline, if present, is removed.

    :param text: text to normalise.
    :returns: the re-joined text.
    """
    result = ''.join(line + '\n' for line in text.split('\n'))
    # Only skip a leading newline that is really there: the loop above never
    # adds one, so an unconditional result[1:] would drop the first character
    # of the data.
    if result.startswith('\n'):
        return result[1:]
    return result
63
def runSimilarSequenceRemover(input):
    """Run SimilarSequenceRemover on one file located in ``directory``.

    The tool's output path is the input path with ``slim_extension``
    appended.  Besides its argument, the function reads the module globals
    ``directory``, ``jarpath``, ``percentage`` and ``alignallornot``.

    :param input: name of the input file, relative to ``directory``.
    :returns: the exit status of the ``java`` process, so that callers can
        detect a failed run (it used to be discarded).
    """
    input_name = directory + os.sep + input
    outfasta = input_name + slim_extension
    # NOTE(review): 's=' is passed without a leading dash, unlike the other
    # flags -- confirm against the jar's command-line syntax.
    return subprocess.call([
        'java', '-jar', jarpath,
        '-i', input_name,
        '-o', outfasta,
        's=' + percentage,
        '-h', alignallornot,
    ])
68
class Sequence:
    """One whitespace-separated record: species, family, name, ..., sequence."""

    def __init__(self, string):
        """Split *string* on whitespace and store its parts.

        :param string: record with at least three fields; the last field is
            taken as the sequence and everything before it as the header.
        :raises IndexError: when fewer than three fields are present.
        """
        fields = string.split()
        self.species, self.family, self.name = fields[0], fields[1], fields[2]
        # The header is every field except the trailing sequence.
        header_fields = fields[:-1]
        self.header = ' '.join(header_fields)
        self.sequence = fields[-1]
        self.string = string

    def printFASTA(self):
        """Return the record as a two-line FASTA entry (``> header``, sequence)."""
        return ''.join(['> ', self.header, '\n', self.sequence, '\n'])
81
82
def saveMulti(tabFile):
    """Split a tabular file into one FASTA file per family.

    Each non-blank line is parsed as a ``Sequence`` and appended, in FASTA
    form, to ``<directory>/<family><extension>``.

    :param tabFile: path of the tabular input file.
    """
    with open(tabFile) as f:
        for line in f:
            # A blank line (e.g. a trailing one) has no fields and would make
            # Sequence() fail with IndexError, so skip it.
            if not line.strip():
                continue
            seq = Sequence(line)
            with open(directory + os.sep + seq.family + extension, "a") as p:
                p.write(seq.printFASTA())
89
90
def saveSingle(fastaFile):
    """Append the contents of *fastaFile* to ``<directory>/fasta<extension>``.

    The destination is opened once, in append mode, instead of being
    reopened for every input line.  As a consequence it is created even
    when the input file is empty.

    :param fastaFile: path of the FASTA input file.
    """
    target = directory + os.sep + "fasta" + extension
    with open(fastaFile) as f:
        with open(target, "a") as p:
            for line in f:
                p.write(line)
96
97
def main():
    """Command-line entry point.

    Parses the options, creates ``<path>/data``, splits the input into FASTA
    files there (one per family for tabular input, a single file otherwise),
    runs SimilarSequenceRemover on each of them in a process pool and finally
    appends every ``slim_extension`` output to ``<path>/data/<results>``.

    Exits through ``parser.error`` when no input file is given.
    """
    global directory
    global percentage
    global alignallornot

    usage = """%prog [options]
options (listed below) default to 'None' if omitted
"""
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-d', '--directory',
        metavar="PATH",
        dest='path',
        default='.',
        help='Path to working directory.')

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-a', '--alignall',
        dest='alignall',
        action='store',
        type='string', help='t or f. t only aligns first 100 sites instead of entire seq')

    parser.add_option(
        '-s', '--similarity',
        dest='similarity',
        action='store',
        type='float',
        help='Percentage similarity as cutoff (eg 0.99).')

    options, args = parser.parse_args()

    # Without this check a missing -i fails later with an AttributeError on
    # None inside unescape(); report it as a usage error instead.
    if options.input is None:
        parser.error('an input file is required (-i/--in)')

    inputFile = unescape(options.input)
    # These globals are read by the worker function; they are assigned before
    # the pool is created (this relies on workers inheriting module state).
    directory = unescape(options.path) + os.sep + "data"
    percentage = str(options.similarity)
    alignallornot = options.alignall

    os.mkdir(directory)

    # Scan the input once and reuse the answer for both the split and the
    # merge step.
    tabular = isTabular(inputFile)
    if tabular:
        saveMulti(inputFile)
    else:
        saveSingle(inputFile)

    list_of_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extension))

    pool = Pool()
    try:
        pool.map(runSimilarSequenceRemover, list_of_files)
    finally:
        # Release the worker processes even when a task raised.
        pool.close()
        pool.join()

    # Sorted so the order of the merged output does not depend on the
    # arbitrary order returned by os.listdir().
    slim_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(slim_extension))

    with open(directory + os.sep + results, "a") as f:
        for name in slim_files:
            with open(directory + os.sep + name, "r") as r:
                content = r.read()
            # Tabular input is converted back to tabular form; FASTA input is
            # copied through unchanged.
            if tabular:
                content = toData(content)
            f.write(content + "\n")
166
# Run the command-line entry point only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
169
170