comparison phylogenies/phytab_clearcut.py @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 import os
2 import optparse
3 import subprocess
4 from multiprocessing import Pool
5
# Working directory for intermediate files; main() replaces this value.
directory = ""

# Name of the combined output file written inside the working directory.
results = "results.data"

# Suffix of the FASTA files that are handed to clearcut.
extension = ".fs"

# Suffix appended to a FASTA file name to form clearcut's output file name.
aligned_extension = ".tre"

# Placeholder; no code visible in this file reads it.
datatype = ""
11
# (literal character, escaped token) pairs. Kept as an ordered tuple so the
# replacements always run in the same order, whatever the interpreter version.
_ESCAPED_CHARS = (
    ('>', '__gt__'),
    ('<', '__lt__'),
    ("'", '__sq__'),
    ('"', '__dq__'),
    ('[', '__ob__'),
    (']', '__cb__'),
    ('{', '__oc__'),
    ('}', '__cc__'),
    ('@', '__at__'),
    ('\n', '__cn__'),
    ('\r', '__cr__'),
    ('\t', '__tc__'),
    ('#', '__pd__'),
)


def unescape(string):
    """Return ``string`` with each escaped token replaced by its character.

    Tokens have the form ``__xx__`` (for example ``__gt__`` for ``>``); the
    full list is in ``_ESCAPED_CHARS``. Text containing no token is returned
    unchanged.

    The lookup is iterated as a tuple rather than with ``dict.iteritems()``,
    which only exists on Python 2, so the function works on Python 2 and 3.

    :param string: text that may contain escaped tokens.
    :returns: the text with every known token replaced.
    """
    for char, token in _ESCAPED_CHARS:
        string = string.replace(token, char)
    return string
33
34
def isTabular(file):
    """Report whether ``file`` contains no line that begins with ``>``.

    :param file: path of the text file to inspect.
    :returns: ``False`` as soon as a line starting with ``>`` is found,
        ``True`` when the whole file has been read without finding one.
    """
    with open(file) as handle:
        # Scanning stops at the first line whose first character is '>'.
        has_marker_line = any(row[0] == '>' for row in handle)
    return not has_marker_line
41
def toData(text, name):
    """Build one output record: a label derived from ``name``, a tab, then ``text``.

    :param text: file contents; every space character is removed.
    :param name: file name; the substrings ``fasta`` and ``.fs.tre`` are
        removed to form the label.
    :returns: ``label + "\\t" + text-without-spaces``.
    """
    # The file name contains "fasta" when a single FASTA file was supplied.
    label = name.replace("fasta", "").replace(".fs.tre", "")
    compact_text = text.replace(" ", "")
    return "\t".join((label, compact_text))
46
47 #
48 #def toData(text):
49 # text = text.split('\n')
50 # result = ''
51 # for line in text:
52 # if '>' in line:
53 # line = '\n' + line.replace('>', "") + '\t'
54 # line = line.replace(" ", "\t")
55 # result += line
56 # return result[1:] # Index past the first newline char
57
def clearcut(input):
    """Run the external ``clearcut`` program on one file and wait for it.

    The input path is ``directory`` + separator + ``input``; the output path
    is the input path with ``aligned_extension`` appended. The module-level
    ``indata`` value is passed as the last command-line argument.

    :param input: file name, relative to the module-level ``directory``.
    :returns: ``None``; the child's exit status is not inspected.
    """
    source_path = directory + os.sep + input
    command = [
        'clearcut',
        "--in=" + source_path,
        "--out=" + source_path + aligned_extension,
        "--alignment",
        "-k",
        indata,
    ]
    # Block until the child process has exited.
    subprocess.Popen(command).wait()
62
class Sequence:
    """One whitespace-separated record whose last field is the sequence."""

    def __init__(self, string):
        """Split ``string`` on whitespace and store its fields.

        The first three fields become ``species``, ``family`` and ``name``;
        the last field becomes ``sequence``; all fields except the last,
        joined by single spaces, become ``header``. The raw text is kept in
        ``string``.

        :raises IndexError: if the record has fewer than three fields.
        """
        fields = string.split()
        self.species, self.family, self.name = fields[0], fields[1], fields[2]
        self.header = ' '.join(fields[:-1])
        self.sequence = fields[-1]
        self.string = string

    def printFASTA(self):
        """Return the record as a ``>header`` line followed by a sequence line."""
        return '>%s\n%s\n' % (self.header, self.sequence)
75
def saveMulti(tabFile):
    """Split a tabular sequence file into one FASTA file per family.

    Each non-blank line is parsed with ``Sequence`` and appended, in FASTA
    form, to ``<directory>/<family><extension>``.

    Blank or whitespace-only lines are skipped: they contain no fields, so
    ``Sequence`` would raise ``IndexError`` on them and abort the whole run.

    :param tabFile: path of the tabular input file.
    :raises IndexError: from ``Sequence`` when a non-blank line has fewer
        than three whitespace-separated fields.
    """
    with open(tabFile) as f:
        for line in f:
            if not line.strip():
                continue
            seq = Sequence(line)
            # Append mode: records of the same family accumulate in one file.
            with open(directory + os.sep + seq.family + extension, "a") as p:
                p.write(seq.printFASTA())
82
def saveSingle(fastaFile):
    """Append every line of ``fastaFile`` to ``<directory>/fasta<extension>``.

    :param fastaFile: path of the FASTA file to copy.
    """
    destination = directory + os.sep + "fasta" + extension
    with open(fastaFile) as source:
        for row in source:
            # The target is opened in append mode once per line, so an empty
            # input creates no output file.
            with open(destination, "a") as sink:
                sink.write(row)
88
def _init_worker(work_dir, type_flag):
    """Set the per-run globals that ``clearcut`` reads, inside a pool worker.

    Workers that are not created by ``fork`` re-import this module and would
    otherwise not see the values assigned in ``main``.
    """
    global directory
    global indata
    directory = work_dir
    indata = type_flag


def main():
    """Parse the command line, run clearcut on each FASTA file, collect the output.

    Steps:
      1. Read ``-d/--directory`` (default ``.``), ``-i/--in`` and
         ``-t/--type``; each value is passed through ``unescape``.
      2. Create ``<path>/data`` as the working directory (``os.mkdir``, so an
         already existing directory raises ``OSError``).
      3. Write the input into the working directory with ``saveMulti`` when
         ``isTabular`` reports tabular input, otherwise with ``saveSingle``.
      4. Run ``clearcut`` on every file ending in ``extension`` using a
         process pool.
      5. Append one ``toData`` record per file ending in
         ``aligned_extension`` to ``<working directory>/<results>``.

    Exits with a usage error when ``-i`` or ``-t`` is omitted.
    """
    global directory
    global indata

    usage = (
        "%prog [options]\n"
        "options (listed below) default to 'None' if omitted\n"
    )
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-d', '--directory',
        metavar="PATH",
        dest='path',
        default='.',
        help='Path to working directory.')

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-t', '--type',
        dest='datatype',
        action='store',
        type='string',
        help='-P for protein. -D for DNA.')

    options, args = parser.parse_args()

    # unescape() calls str.replace, so an omitted option (None) would fail
    # with AttributeError; report it as a usage error instead.
    if options.input is None:
        parser.error("an input file is required (-i/--in)")
    if options.datatype is None:
        parser.error("a data type is required (-t/--type)")

    inputFile = unescape(options.input)
    directory = unescape(options.path) + os.sep + "data"
    indata = "-" + unescape(options.datatype)

    os.mkdir(directory)

    if isTabular(inputFile):
        saveMulti(inputFile)
    else:
        saveSingle(inputFile)

    list_of_files = [
        entry for entry in os.listdir(directory)
        if entry.lower().endswith(extension)
    ]

    # Hand the run settings to every worker explicitly and always shut the
    # pool down, even when a task raises.
    pool = Pool(initializer=_init_worker, initargs=(directory, indata))
    try:
        pool.map(clearcut, list_of_files)
    finally:
        pool.close()
        pool.join()

    result = [
        entry for entry in os.listdir(directory)
        if entry.lower().endswith(aligned_extension)
    ]
    with open(directory + os.sep + results, "a") as f:
        for entry in result:
            with open(directory + os.sep + entry, "r") as r:
                f.write(toData(r.read(), entry))
141
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()