Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
diff alignment/phytab_mafft.py @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment/phytab_mafft.py Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,216 @@ +#!/usr/bin/env python + +import os +import optparse +import subprocess +from multiprocessing import Pool + +directory = "" +results = "results.data" +extension = ".fs" +aligned_extension = ".afa" + + +def unescape(string): + mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__' + } + + for key, value in mapped_chars.iteritems(): + string = string.replace(value, key) + + return string + + +def isTabular(file): + with open(file) as f: + for line in f: + if line[0] == '>': + return False + return True + + +def toData(text): + text = text.split('\n') + result = '' + for line in text: + if '>' in line: + line = '\n' + line.replace('> ', "") + '\t' + line = line.replace(" ", "\t") + result += line + return result[1:] # Index past the first newline char + +def toDataSingle(text): + text = text.split('\n') + result = '' + for line in text: + line = line + '\n' + result += line + return result[1:] # Index past the first newline char + +def mafftauto(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft', '--auto', '--out', aln, file_name]) + +def mafft1(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft', '--retree', '1', '--out', aln, file_name]) + +def mafft2(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft', '--retree', '2', '--out', aln, file_name]) + +def maffti(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-fftnsi', '--out', aln, file_name]) + +def maffteinsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-einsi', '--out', aln, file_name]) + +def mafftlinsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-linsi', '--out', aln, file_name]) + +def mafftginsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-ginsi', '--out', aln, file_name]) + +def mafftqinsi(input): + file_name = directory + os.sep + input + aln = file_name + aligned_extension + call = subprocess.call(['mafft-qinsi', '--out', aln, file_name]) + + +class Sequence: + def __init__(self, string): + lis = string.split() + self.species = lis[0] + self.family = lis[1] + self.name = lis[2] + self.header = ' '.join(lis[:-1]) + self.sequence = lis[-1] + self.string = string + + def printFASTA(self): + return '> ' + self.header + '\n' + self.sequence + '\n' + + +def saveMulti(tabFile): + with open(tabFile) as f: + for line in f: + seq = Sequence(line) + with open(directory + os.sep + seq.family + extension, "a") as p: + p.write(seq.printFASTA()) + + +def saveSingle(fastaFile): + with open(fastaFile) as f: + for line in f: + with open(directory + os.sep + "fasta" + extension, "a") as p: + p.write(line) + + +def main(): + usage = """%prog [options] +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-d', '--directory', + metavar="PATH", + dest='path', + default='.', + help='Path to working directory.') + + parser.add_option( + '-i', '--in', + dest='input', + action='store', + type='string', + metavar="FILE", + help='Name of input data.') + + parser.add_option( + '-s', '--strat', + dest='strategy', + action='store', + type='string', + help='Alignement algorithm to use.') + + options, args = parser.parse_args() + + global directory + inputFile = unescape(options.input) + directory = unescape(options.path) + os.sep + "data" + strategy = unescape(options.strategy) + + os.mkdir(directory) + + if isTabular(inputFile): + saveMulti(inputFile) + else: + saveSingle(inputFile) + + pool = Pool() + list_of_files = [file for file in os.listdir(directory) if file.lower().endswith(extension)] + list_of_files = sorted(list_of_files) + if strategy == 'Auto': + pool.map(mafftauto, list_of_files) + + elif strategy == 'FFT-NS-1': + pool.map(mafft1, list_of_files) + + elif strategy == 'FFT-NS-2': + pool.map(mafft2, list_of_files) + + elif strategy == 'FFT-NS-i': + pool.map(maffti, list_of_files) + + elif strategy == 'E-INS-i': + pool.map(maffteinsi, list_of_files) + + elif strategy == 'L-INS-i': + pool.map(mafftlinsi, list_of_files) + + elif strategy == 'G-INS-i': + pool.map(mafftginsi, list_of_files) + + elif strategy == 'Q-INS-i': + pool.map(mafftqinsi, list_of_files) + + result = [file for file in os.listdir(directory) if file.lower().endswith(aligned_extension)] + if isTabular(inputFile): + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toData(r.read()) + "\n") + else: + with open(directory + os.sep + results, "a") as f: + for file in result: + with open(directory + os.sep + file, "r") as r: + f.write(toDataSingle(r.read()) + "\n") + +if __name__ == '__main__': + main() +