comparison alignment/phytab_prank.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 import os
2 import optparse
3 import subprocess
4 from multiprocessing import Pool
5
# Working directory for per-family FASTA files; main() rebinds it to
# "<path>/data" before any file is written.
directory = ""

# Name of the combined tabular output written inside `directory`.
results = "results.data"

# Suffix of the FASTA files handed to prank.
extension = ".fs"

# Suffix appended to the input path to form prank's "-o=" output prefix.
aligned_extension = ".afa"

# Suffix of the files main() collects into `results` after prank has run.
output_extension = ".afa.2.fas"
11
12
def unescape(string):
    """Replace ``__xx__`` placeholder tokens with the characters they stand for.

    Each token listed in ``mapped_chars`` (for example ``__gt__``) is
    substituted by its literal character (``>``). Text without any token is
    returned unchanged.

    :param string: text that may contain placeholder tokens.
    :returns: the text with every known token replaced.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__'
    }

    # items() is available on both Python 2 and Python 3, whereas
    # iteritems() only exists on Python 2.
    for key, value in mapped_chars.items():
        string = string.replace(value, key)

    return string
34
35
def isTabular(file):
    """Tell whether *file* contains no line beginning with ``>``.

    :param file: path of the file to inspect.
    :returns: ``False`` as soon as a line starting with ``>`` is found,
        ``True`` if the whole file (including an empty one) has none.
    """
    with open(file) as handle:
        return not any(row.startswith('>') for row in handle)
42
43
def toData(text):
    """Convert FASTA-like *text* with ``__XX__``-joined headers to tabular rows.

    Every line containing ``>`` starts a new output row: the ``>__XX__``
    prefix is removed, the remaining ``__XX__`` separators become tabs and a
    trailing tab is added. The lines that follow are appended to that row
    without separators, so a sequence wrapped over several lines ends up in
    one column.

    :param text: complete contents of an alignment file.
    :returns: the rows joined by newlines, without a leading or trailing
        newline.
    """
    pieces = []
    for line in text.split('\n'):
        if '>' in line:
            # A header opens a new row; the newline terminates the previous one.
            line = '\n' + line.replace('>__XX__', "") + '\t'
        line = line.replace("__XX__", "\t")
        pieces.append(line)

    result = ''.join(pieces)
    # Strip only the newline introduced by the first header. Text that does
    # not begin with a header must keep its first character.
    if result.startswith('\n'):
        return result[1:]
    return result
53
54
def prank(input):
    """Run the ``prank`` executable on one file of the working directory.

    The file is looked up under the module-level ``directory``; prank is
    given ``<file> + aligned_extension`` as its ``-o=`` value. The exit
    statuses of the child processes are not inspected.

    :param input: file name relative to ``directory``.
    """
    target = directory + os.sep + input
    # Echo the current working directory to stdout before aligning.
    subprocess.call(['pwd'])
    subprocess.call(['prank', "-d=" + target, "-o=" + target + aligned_extension, "-quiet"])
61
class Sequence:
    """One whitespace-separated record: species, family, name, ..., sequence."""

    def __init__(self, string):
        """Split *string* on whitespace and store its fields.

        The first three fields become ``species``, ``family`` and ``name``;
        the last field is the sequence. Raises ``IndexError`` when fewer
        than three fields are present.
        """
        fields = string.split()
        self.string = string
        self.species = fields[0]
        self.family = fields[1]
        self.name = fields[2]
        self.sequence = fields[-1]
        # prank replaces spaces with "_", so the header fields are joined
        # with a "__XX__" marker instead of spaces (unlike the muscle tool).
        self.header = '__XX__'.join(fields[:-1])

    def printFASTA(self):
        """Return the record as a two-line FASTA entry ending in a newline."""
        return '>__XX__%s\n%s\n' % (self.header, self.sequence)
74
75
def saveMulti(tabFile):
    """Split a tabular file into one FASTA file per family.

    Each non-blank line of *tabFile* is parsed as a ``Sequence`` and its
    FASTA form is appended to ``<directory>/<family><extension>``.

    :param tabFile: path of the tabular input file.
    """
    with open(tabFile) as f:
        for line in f:
            # A blank or whitespace-only line has no fields and would make
            # Sequence() fail with IndexError, so skip it.
            if not line.strip():
                continue
            seq = Sequence(line)
            with open(directory + os.sep + seq.family + extension, "a") as p:
                p.write(seq.printFASTA())
82
83
def saveSingle(fastaFile):
    """Append the contents of *fastaFile* to ``<directory>/fasta<extension>``.

    :param fastaFile: path of the FASTA input file.
    """
    target = directory + os.sep + "fasta" + extension
    with open(fastaFile) as source:
        for record_line in source:
            # The destination is opened in append mode for each line, so an
            # empty input never creates the target file.
            with open(target, "a") as sink:
                sink.write(record_line)
89
90
def _init_worker(path):
    """Set the module-level working directory inside a pool worker process."""
    global directory
    directory = path


def main():
    """Align the input with prank and collect the results in tabular form.

    Steps:
      1. Parse ``-d/--directory`` and ``-i/--in`` and unescape their values.
      2. Create ``<path>/data`` and fill it with FASTA files: one per family
         for tabular input, a single ``fasta`` file otherwise.
      3. Run ``prank`` on every FASTA file using a process pool.
      4. Append every produced alignment, converted by ``toData``, to
         ``<path>/data/results.data``.

    Exits through ``parser.error`` when no input file is given. ``os.mkdir``
    raises ``OSError`` if the data directory already exists.
    """
    global directory

    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-d', '--directory',
        metavar="PATH",
        dest='path',
        default='.',
        help='Path to working directory.')

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    options, args = parser.parse_args()

    # Without this check a missing -i surfaces as an AttributeError on None.
    if options.input is None:
        parser.error("an input file is required (-i/--in)")

    inputFile = unescape(options.input)
    directory = unescape(options.path) + os.sep + "data"

    os.mkdir(directory)

    if isTabular(inputFile):
        saveMulti(inputFile)
    else:
        saveSingle(inputFile)

    # Sorted so the order of the rows in the results file is reproducible.
    list_of_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extension))

    # Workers read the module-level `directory`. Passing it through an
    # initializer keeps that working when worker processes are spawned
    # rather than forked and therefore do not inherit this assignment.
    pool = Pool(initializer=_init_worker, initargs=(directory,))
    try:
        pool.map(prank, list_of_files)
    finally:
        pool.close()
        pool.join()

    aligned = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(output_extension))
    with open(directory + os.sep + results, "a") as f:
        for name in aligned:
            with open(directory + os.sep + name, "r") as r:
                f.write(toData(r.read()) + "\n")


if __name__ == '__main__':
    main()