comparison alignment/phytab_aliscorecut.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 import os
2 import optparse
3 import subprocess
4 from multiprocessing import Pool
5 import shutil
6
# Working directory (relative to the current directory) that receives the
# per-family FASTA files and everything the Perl tools write next to them.
results_dir = "./data"

# Name of the combined output file created inside results_dir.
results = "results.data"

# Suffix of the per-family alignment files.
fasta_extension = ".afa"

# Only files in results_dir whose names start with this prefix are merged
# into the combined output (presumably the files ALICUT produces -- confirm).
alicut_prefix = "ALICUT_"

# Not referenced by any code visible in this file.
familyList = []

# Directory holding the Aliscore / ALICUT Perl scripts.
galaxy_tool_dir = "/home/galaxy/bin/"

# Characters replaced in FASTA headers before the files are handed to the
# Perl tools, paired with the placeholder token that stands in for each one.
# The same table is used in reverse to restore the headers afterwards.
forbidden_chars = dict((
    ('(', '__rb__'),
    (')', '__lb__'),
    (':', '__co__'),
    (';', '__sc__'),
    (',', '__cm__'),
    ('--', '__dd__'),
    ('*', '__st__'),
    ('|', '__pi__'),
    (' ', '__sp__'),
))
24
25
def unescape(string):
    """Replace ``__xx__`` placeholder tokens with the characters they encode.

    The tokens handled here look like the substitutions Galaxy applies to
    special characters in tool parameters (assumption -- confirm against the
    Galaxy version in use).

    :param string: text that may contain placeholder tokens.
    :returns: ``string`` with every known token replaced by its character.
    """
    # A tuple of pairs (rather than a dict) keeps the replacement order fixed
    # on every interpreter, and avoids dict.iteritems(), which does not exist
    # on Python 3.
    mapped_chars = (
        ('>', '__gt__'),
        ('<', '__lt__'),
        ("'", '__sq__'),
        ('"', '__dq__'),
        ('[', '__ob__'),
        (']', '__cb__'),
        ('{', '__oc__'),
        ('}', '__cc__'),
        ('@', '__at__'),
        ('\n', '__cn__'),
        ('\r', '__cr__'),
        ('\t', '__tc__'),
        ('#', '__pd__'),
    )

    for char, token in mapped_chars:
        string = string.replace(token, char)

    return string
47
48
def unpackData(families):
    """Split a tab-delimited sequence table into one FASTA file per family.

    Every non-blank line of ``families`` is parsed as a ``Sequence`` and
    appended, in FASTA form, to ``<results_dir>/<family><fasta_extension>``.

    :param families: path of the tab-delimited input file.
    """
    with open(families) as table:
        for line in table:
            # A blank line (typically a trailing one) has no fields and would
            # make Sequence raise IndexError, so skip it.
            if not line.strip():
                continue
            seq = Sequence(line)
            target = results_dir + os.sep + seq.family + fasta_extension
            # Append mode: records of one family may be scattered through
            # the input, and each must land in the same file.
            with open(target, "a") as fasta:
                fasta.write(seq.printFASTA())
55
56
class Sequence(object):
    """One record of the tab-delimited input table.

    Attributes:
        species:  first field of the line.
        family:   second field of the line.
        name:     third field of the line.
        header:   every field except the last, joined by single spaces.
        sequence: last field of the line, without the line terminator.
        string:   the raw line exactly as it was passed in.
    """

    def __init__(self, string):
        """Parse one input line.

        :param string: a tab-delimited line; a trailing line terminator
            is ignored.
        :raises IndexError: if the line has fewer than three fields.
        """
        # Strip the line terminator first; otherwise it stays glued to the
        # last field and the FASTA output gets a blank line (or a stray
        # carriage return) after every sequence.
        fields = string.rstrip('\r\n').split('\t')
        self.species = fields[0]
        self.family = fields[1]
        self.name = fields[2]
        self.header = ' '.join(fields[:-1])
        self.sequence = fields[-1]
        self.string = string

    def escapedHeader(self):
        """Return the header with every forbidden character replaced by its token."""
        escaped = self.header
        # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
        for char, token in forbidden_chars.items():
            escaped = escaped.replace(char, token)
        return escaped

    def printFASTA(self):
        """Return the record as a two-line FASTA entry (escaped header, sequence)."""
        return '>' + self.escapedHeader() + '\n' + self.sequence + '\n'
75
76
def unescapeHeader(header):
    """Restore the forbidden characters in an escaped FASTA header.

    This is the inverse of the escaping driven by ``forbidden_chars``:
    every placeholder token is replaced by the character it stands for.

    :param header: header text containing placeholder tokens.
    :returns: the header with the original characters restored.
    """
    restored = header
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for char, token in forbidden_chars.items():
        restored = restored.replace(token, char)
    return restored
82
83
def toData(text):
    """Convert FASTA text back into tab-delimited rows, one row per record.

    Any line containing ``>`` is treated as a header and starts a new row:
    the ``>`` is removed, escaped characters are restored with
    ``unescapeHeader``, spaces become tabs and a tab is appended. All other
    lines are appended as they are with no separator, so a sequence wrapped
    over several lines ends up as one field.

    :param text: contents of a FASTA file.
    :returns: the rows joined by newlines, without a leading newline.
    """
    pieces = []
    for line in text.split('\n'):
        if '>' in line:
            line = '\n' + unescapeHeader(line.replace('>', "")) + '\t'
            # Header fields were joined with spaces when the FASTA file was
            # written; turn them back into tab separators.
            line = line.replace(" ", "\t")
        pieces.append(line)
    result = ''.join(pieces)
    # Every header row starts with a newline. Drop it only when it is really
    # there, so input that does not begin with a header keeps its first
    # character.
    if result.startswith('\n'):
        result = result[1:]
    return result
93
94
def aliscore(input):
    """Run the Aliscore Perl script on one file inside ``results_dir``.

    Blocks until the Perl process has exited; its exit status is not
    inspected.

    :param input: name of the file, relative to ``results_dir``.
    """
    target = results_dir + os.sep + input
    script = galaxy_tool_dir + "Aliscore.02.pl"
    # -I adds the tool directory to Perl's module search path.
    command = ["perl", "-I", galaxy_tool_dir, script, "-i", target]
    subprocess.Popen(command).wait()
100
101
def main():
    """Command-line entry point.

    Steps:
      1. Parse ``-i/--input`` and undo the placeholder escaping in its value.
      2. Create ``results_dir`` and split the input into per-family FASTA files.
      3. Run Aliscore on every FASTA file, in parallel worker processes.
      4. Copy the ALICUT script into ``results_dir`` and run it there.
      5. Convert every ``ALICUT_*`` FASTA file back to tab-delimited rows and
         append them to the combined results file.

    Raises OSError if ``results_dir`` already exists (``os.mkdir``).
    """
    usage = (
        "%prog [options]\n"
        "options (listed below) default to 'None' if omitted\n"
    )
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-i', '--input',
        dest='families',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input sequences.')

    options, args = parser.parse_args()

    # Without this check a missing -i surfaces as an AttributeError from
    # unescape(None); report it as a usage error instead.
    if not options.families:
        parser.error("an input file is required (-i/--input)")

    families = unescape(options.families)

    os.mkdir(results_dir)

    unpackData(families)

    list_of_files = [name for name in os.listdir(results_dir)
                     if name.lower().endswith(fasta_extension)]

    pool = Pool()
    try:
        pool.map(aliscore, list_of_files)
    finally:
        # Release the worker processes instead of leaving them to
        # interpreter shutdown.
        pool.close()
        pool.join()

    alicut = "ALICUT_V2.0_modified.pl"
    shutil.copy(galaxy_tool_dir + alicut, results_dir + os.sep + alicut)
    # Run the script with results_dir as the child's working directory.
    # Unlike a chdir()/chdir("../") pair, this never changes the directory
    # of this process, so nothing has to be restored if the launch fails.
    pop = subprocess.Popen(["perl", "./" + alicut], cwd=results_dir)
    pop.wait()

    # The copied ALICUT script also starts with alicut_prefix; the extension
    # check below keeps it out of the results.
    trimmed = [name for name in os.listdir(results_dir)
               if name.startswith(alicut_prefix)]
    with open(results_dir + os.sep + results, "a") as out:
        for name in trimmed:
            if name.endswith(fasta_extension):
                with open(results_dir + os.sep + name, "r") as fasta:
                    out.write(toData(fasta.read()) + "\n")


if __name__ == '__main__':
    main()