Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison alignment/phytab_mafft.py @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
| author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> | 
|---|---|
| date | Tue, 11 Mar 2014 12:19:13 -0700 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:5b9a38ec4a39 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import os | |
| 4 import optparse | |
| 5 import subprocess | |
| 6 from multiprocessing import Pool | |
| 7 | |
# Working directory holding the per-family FASTA files; main() rebinds this
# global before any file is written.
directory = ""
# Name of the combined output file created inside ``directory``.
results = "results.data"
# Suffix of the unaligned FASTA files that are handed to MAFFT.
extension = ".fs"
# Suffix appended to an input file name to form its alignment file name.
aligned_extension = ".afa"
| 12 | |
| 13 | |
def unescape(string):
    """Replace ``__xx__`` escape tokens with the characters they stand for.

    Each token in the table below (for example ``__gt__``) is substituted by
    its literal character (``>``). Text without tokens is returned unchanged.

    Args:
        string: Text that may contain escape tokens.

    Returns:
        The text with every known token replaced by its character.
    """
    # (character, token) pairs. A tuple keeps the replacement order fixed and
    # avoids dict.iteritems(), which only exists on Python 2.
    mapped_chars = (
        ('>', '__gt__'),
        ('<', '__lt__'),
        ("'", '__sq__'),
        ('"', '__dq__'),
        ('[', '__ob__'),
        (']', '__cb__'),
        ('{', '__oc__'),
        ('}', '__cc__'),
        ('@', '__at__'),
        ('\n', '__cn__'),
        ('\r', '__cr__'),
        ('\t', '__tc__'),
        ('#', '__pd__'),
    )

    for char, token in mapped_chars:
        string = string.replace(token, char)

    return string
| 35 | |
| 36 | |
def isTabular(file):
    """Report whether ``file`` contains no line starting with '>'.

    Args:
        file: Path of the text file to inspect.

    Returns:
        False if any line begins with '>', True otherwise (including for an
        empty file).
    """
    with open(file) as handle:
        # any() stops reading at the first header-style line it meets.
        has_header = any(row.startswith('>') for row in handle)
    return not has_header
| 43 | |
| 44 | |
def toData(text):
    """Convert FASTA-style text into one tab-separated row per record.

    A line containing '>' starts a new row: the '> ' marker is removed, the
    remaining spaces become tabs and a trailing tab is added. The lines that
    follow are appended to that row unchanged, so a sequence wrapped over
    several lines ends up as a single field.

    Args:
        text: Contents of a FASTA-style file.

    Returns:
        The rows joined by newlines, without a leading or trailing newline.
    """
    pieces = []
    for line in text.split('\n'):
        if '>' in line:
            line = '\n' + line.replace('> ', "") + '\t'
            # NOTE(review): spaces are turned into tabs on header lines only;
            # sequence lines are assumed to contain no spaces - confirm.
            line = line.replace(" ", "\t")
        pieces.append(line)

    # Joining once avoids the quadratic cost of repeated string concatenation.
    result = ''.join(pieces)

    # Only strip the separator added before the first header; text that does
    # not begin with a header must not lose its first character.
    if result.startswith('\n'):
        return result[1:]
    return result
| 54 | |
def toDataSingle(text):
    """Return ``text`` with a newline appended after its final line.

    The text is passed through otherwise untouched. Earlier code sliced off
    the first character of the result, which removed the '>' of the first
    FASTA header; nothing is dropped here.

    Args:
        text: Contents of an alignment file.

    Returns:
        An empty string for empty input, otherwise ``text`` followed by one
        newline.
    """
    if not text:
        return ''
    # Splitting on '\n' and re-terminating every piece is the same as
    # appending a single newline to the whole text.
    return text + '\n'
| 62 | |
def _run_mafft(command, name):
    """Run one MAFFT command on a file located in ``directory``.

    The alignment is written next to the input, under the input file name
    with ``aligned_extension`` appended.

    Args:
        command: List holding the executable name and its strategy flags.
        name: File name relative to the module-level ``directory``.

    Returns:
        The exit status returned by ``subprocess.call``.
    """
    file_name = directory + os.sep + name
    aln = file_name + aligned_extension
    return subprocess.call(list(command) + ['--out', aln, file_name])


def mafftauto(input):
    """Align ``input`` with ``mafft --auto``; return the exit status."""
    return _run_mafft(['mafft', '--auto'], input)


def mafft1(input):
    """Align ``input`` with ``mafft --retree 1``; return the exit status."""
    return _run_mafft(['mafft', '--retree', '1'], input)


def mafft2(input):
    """Align ``input`` with ``mafft --retree 2``; return the exit status."""
    return _run_mafft(['mafft', '--retree', '2'], input)


def maffti(input):
    """Align ``input`` with ``mafft-fftnsi``; return the exit status."""
    return _run_mafft(['mafft-fftnsi'], input)


def maffteinsi(input):
    """Align ``input`` with ``mafft-einsi``; return the exit status."""
    return _run_mafft(['mafft-einsi'], input)


def mafftlinsi(input):
    """Align ``input`` with ``mafft-linsi``; return the exit status."""
    return _run_mafft(['mafft-linsi'], input)


def mafftginsi(input):
    """Align ``input`` with ``mafft-ginsi``; return the exit status."""
    return _run_mafft(['mafft-ginsi'], input)


def mafftqinsi(input):
    """Align ``input`` with ``mafft-qinsi``; return the exit status."""
    return _run_mafft(['mafft-qinsi'], input)
| 102 | |
| 103 | |
class Sequence:
    """A record parsed from one whitespace-separated line of text."""

    def __init__(self, string):
        """Split ``string`` on whitespace and keep its fields.

        Args:
            string: One line whose first three fields are species, family
                and name, and whose last field is the sequence.

        Raises:
            IndexError: If the line has fewer than three fields.
        """
        fields = string.split()
        # First three columns identify the record.
        self.species, self.family, self.name = fields[0], fields[1], fields[2]
        # Everything except the last column, re-joined with single spaces.
        self.header = ' '.join(fields[:-1])
        # The last column holds the sequence itself.
        self.sequence = fields[-1]
        # The unmodified input line.
        self.string = string

    def printFASTA(self):
        """Return the record as a '> header' line followed by the sequence."""
        return '> %s\n%s\n' % (self.header, self.sequence)
| 116 | |
| 117 | |
def saveMulti(tabFile):
    """Split a tabular file into one FASTA file per family.

    Every non-blank line is parsed as a ``Sequence`` and appended, in FASTA
    form, to ``<directory>/<family><extension>``.

    Args:
        tabFile: Path of the tabular input file.
    """
    with open(tabFile) as f:
        for line in f:
            if not line.strip():
                # A blank line holds no record; parsing it would raise
                # IndexError inside Sequence.
                continue
            seq = Sequence(line)
            with open(directory + os.sep + seq.family + extension, "a") as p:
                p.write(seq.printFASTA())
| 124 | |
| 125 | |
def saveSingle(fastaFile):
    """Append the contents of ``fastaFile`` to ``<directory>/fasta<extension>``.

    The output file is opened once rather than once per input line. When the
    input is empty no output file is created.

    Args:
        fastaFile: Path of the FASTA input file.
    """
    target = directory + os.sep + "fasta" + extension
    with open(fastaFile) as f:
        lines = iter(f)
        first = next(lines, None)
        if first is None:
            # Empty input: leave the working directory untouched.
            return
        with open(target, "a") as p:
            p.write(first)
            p.writelines(lines)
| 131 | |
| 132 | |
def _set_directory(path):
    """Point the module-level ``directory`` at ``path``.

    Used as the pool initializer so that worker processes which do not
    inherit the parent's globals still see the working directory.
    """
    global directory
    directory = path


def main():
    """Parse the command line, align every input file and collect results.

    The input is split into FASTA files under ``<path>/data``, the selected
    MAFFT strategy is run on each of them in a process pool, and the
    alignments are converted and appended to ``<path>/data/results.data``.

    Exits through ``parser.error`` when the input file or strategy is
    missing, or when the strategy name is not recognised.
    """
    global directory

    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-d', '--directory',
        metavar="PATH",
        dest='path',
        default='.',
        help='Path to working directory.')

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    parser.add_option(
        '-s', '--strat',
        dest='strategy',
        action='store',
        type='string',
        help='Alignement algorithm to use.')

    options, args = parser.parse_args()

    # Strategy name -> worker function handed to the process pool.
    strategies = {
        'Auto': mafftauto,
        'FFT-NS-1': mafft1,
        'FFT-NS-2': mafft2,
        'FFT-NS-i': maffti,
        'E-INS-i': maffteinsi,
        'L-INS-i': mafftlinsi,
        'G-INS-i': mafftginsi,
        'Q-INS-i': mafftqinsi,
    }

    # Validate before touching the file system so a bad invocation leaves
    # nothing behind.
    if options.input is None:
        parser.error("an input file is required (-i/--in)")
    if options.strategy is None:
        parser.error("an alignment strategy is required (-s/--strat)")

    inputFile = unescape(options.input)
    strategy = unescape(options.strategy)
    if strategy not in strategies:
        parser.error("unknown strategy %r; choose one of: %s"
                     % (strategy, ', '.join(sorted(strategies))))

    directory = unescape(options.path) + os.sep + "data"
    os.mkdir(directory)

    tabular = isTabular(inputFile)
    if tabular:
        saveMulti(inputFile)
    else:
        saveSingle(inputFile)

    list_of_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extension))

    # The initializer hands the working directory to every worker process.
    pool = Pool(initializer=_set_directory, initargs=(directory,))
    try:
        pool.map(strategies[strategy], list_of_files)
    finally:
        pool.close()
        pool.join()

    # Sorted so the order of records in the results file is reproducible.
    result = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(aligned_extension))

    convert = toData if tabular else toDataSingle
    with open(directory + os.sep + results, "a") as f:
        for name in result:
            with open(directory + os.sep + name, "r") as r:
                f.write(convert(r.read()) + "\n")


if __name__ == '__main__':
    main()
| 216 | 
