comparison alignment/phytab_muscle.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 import os
2 import optparse
3 import subprocess
4 from multiprocessing import Pool
5
# Working directory holding the per-family FASTA files; main() rebinds this
# global before any file is written.
directory = ''
# Name of the combined output file written inside the working directory.
results = 'results.data'
# Suffix of the unaligned FASTA files that are handed to muscle.
extension = '.fs'
# Suffix muscle() appends to an input path to form the alignment output path.
aligned_extension = '.afa'
10
11
def unescape(string):
    """Replace ``__xx__`` escape tokens with the characters they stand for.

    Args:
        string: Text in which special characters were substituted by tokens
            such as ``__gt__`` or ``__sq__``.

    Returns:
        The text with every known token replaced by its literal character.
        Text without tokens is returned unchanged.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__',
    }

    # items() is available on both Python 2 and Python 3, whereas
    # iteritems() only exists on Python 2.
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string
33
34
def isTabular(file):
    """Tell whether the file at ``file`` contains no line starting with '>'.

    Args:
        file: Path of the file to inspect.

    Returns:
        False as soon as a line beginning with '>' is found, True when the
        whole file was read without finding one (including an empty file).
    """
    with open(file) as handle:
        # any() stops reading at the first line that starts with '>'.
        has_header_line = any(row[0] == '>' for row in handle)
    return not has_header_line
41
42
def toData(text):
    """Convert FASTA-style text into tab-separated rows, one record per row.

    A line containing '>' starts a new row: its "> " marker is removed and a
    tab is appended. Spaces on every line become tabs, and the lines that
    follow a header are appended to its row without any separator.

    Args:
        text: The text to convert, with lines separated by '\\n'.

    Returns:
        The rows joined by newlines, without a leading newline. An empty
        string yields an empty string.
    """
    pieces = []
    for line in text.split('\n'):
        if '>' in line:
            line = '\n' + line.replace('> ', "") + '\t'
        # Spaces become tabs on every line, header or not.
        pieces.append(line.replace(" ", "\t"))

    # Collect the pieces and join once instead of growing a string with +=.
    result = ''.join(pieces)

    # Only strip the newline introduced by a leading header; slicing
    # unconditionally would delete a real character when the text does not
    # begin with a header line.
    if result.startswith('\n'):
        return result[1:]
    return result
52
def toDataSingle(text):
    """Return ``text`` with a newline appended after each of its lines.

    The text is split on '\\n', so a text that already ends with a newline
    gains one extra empty line at the end.

    Args:
        text: The text to re-terminate.

    Returns:
        The re-terminated text, minus one leading newline if the text began
        with an empty line.
    """
    result = ''.join(line + '\n' for line in text.split('\n'))

    # Only a leading newline is skipped. An unconditional result[1:] removed
    # the first character of the first line (the '>' of a FASTA header),
    # because no newline is ever prepended here.
    if result.startswith('\n'):
        return result[1:]
    return result
60
def muscle(input):
    """Run the external ``muscle`` program on one file of the working directory.

    The alignment is written to the input path with ``aligned_extension``
    appended. Afterwards ``pwd`` is run as a second external command. The
    exit status of both commands is ignored.

    Args:
        input: File name, relative to the module-level ``directory``.
    """
    source_path = os.sep.join([directory, input])
    target_path = source_path + aligned_extension

    # subprocess.call starts the command and blocks until it has exited.
    subprocess.call(['muscle', "-in", source_path, "-out", target_path])

    subprocess.call(['pwd'])
68
69
class Sequence:
    """One whitespace-separated record: species, family, name, ..., sequence."""

    def __init__(self, string):
        """Split ``string`` on whitespace and store its fields.

        The first three fields become ``species``, ``family`` and ``name``,
        the last field becomes ``sequence``, and all fields except the last,
        joined by single spaces, form ``header``. The unparsed input is kept
        in ``string``.

        Raises:
            IndexError: If ``string`` holds fewer than three fields.
        """
        fields = string.split()
        self.species, self.family, self.name = fields[0], fields[1], fields[2]
        self.header = ' '.join(fields[:-1])
        self.sequence = fields[-1]
        self.string = string

    def printFASTA(self):
        """Return the record as FASTA text: a "> header" line, then the sequence."""
        return ''.join(['> ', self.header, '\n', self.sequence, '\n'])
82
83
def saveMulti(tabFile):
    """Split a tabular sequence file into one FASTA file per family.

    Every non-blank line is parsed as a ``Sequence`` and appended, in FASTA
    form, to the file ``<directory>/<family><extension>``.

    Args:
        tabFile: Path of the tabular input file.
    """
    with open(tabFile) as f:
        for line in f:
            # A blank line (for example a trailing newline at the end of the
            # file) carries no record; Sequence would raise IndexError on it.
            if not line.strip():
                continue
            seq = Sequence(line)
            # Append mode: records of one family accumulate in the same file.
            with open(directory + os.sep + seq.family + extension, "a") as p:
                p.write(seq.printFASTA())
90
91
def saveSingle(fastaFile):
    """Append the contents of a FASTA file to ``<directory>/fasta<extension>``.

    The output file is opened once for the whole copy. An empty input
    produces no output file.

    Args:
        fastaFile: Path of the FASTA input file.
    """
    target = directory + os.sep + "fasta" + extension
    with open(fastaFile) as f:
        first_line = f.readline()
        # Nothing to copy: return before the output file would be created.
        if not first_line:
            return
        with open(target, "a") as p:
            p.write(first_line)
            # Stream the remaining lines without reopening the output.
            p.writelines(f)
97
98
def main():
    """Parse the command line, align the input with muscle and collect the results.

    Steps:
        1. Read ``-d/--directory`` (default '.') and ``-i/--in`` and unescape
           both values.
        2. Create ``<directory option>/data`` and store it in the module-level
           ``directory``.
        3. Write the input into that directory: one FASTA file per family for
           tabular input, a single FASTA file otherwise.
        4. Run ``muscle`` on every file ending in ``extension`` using a
           process pool.
        5. Convert every file ending in ``aligned_extension`` and append it to
           the ``results`` file in the working directory.

    Raises:
        SystemExit: Through ``parser.error`` when no input file is given.
        OSError: From ``os.mkdir`` when the working directory cannot be
            created, for instance because it already exists.
    """
    global directory

    usage = """%prog [options]
options (listed below) default to 'None' if omitted
"""
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-d', '--directory',
        metavar="PATH",
        dest='path',
        default='.',
        help='Path to working directory.')

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    options, args = parser.parse_args()

    # Without this check a missing -i reaches unescape(None) and fails with
    # an AttributeError instead of a usage message.
    if options.input is None:
        parser.error("an input file is required (-i/--in)")

    inputFile = unescape(options.input)
    directory = unescape(options.path) + os.sep + "data"

    os.mkdir(directory)

    # The input file is not modified below, so its format is checked once.
    tabular = isTabular(inputFile)
    if tabular:
        saveMulti(inputFile)
    else:
        saveSingle(inputFile)

    list_of_files = [name for name in os.listdir(directory)
                     if name.lower().endswith(extension)]
    pool = Pool()
    try:
        pool.map(muscle, list_of_files)
    finally:
        # Release the worker processes even when an alignment task fails.
        pool.close()
        pool.join()

    result = [name for name in os.listdir(directory)
              if name.lower().endswith(aligned_extension)]
    # Tabular input is converted back to tab-separated rows; FASTA input is
    # passed through toDataSingle.
    convert = toData if tabular else toDataSingle
    with open(directory + os.sep + results, "a") as f:
        for name in result:
            with open(directory + os.sep + name, "r") as r:
                f.write(convert(r.read()) + "\n")


if __name__ == '__main__':
    main()