Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
annotate orthologs/hmmbuild.py @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
| author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> | 
|---|---|
| date | Tue, 11 Mar 2014 12:19:13 -0700 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 0 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 1 import os | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 2 import optparse | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 3 import subprocess | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 4 from multiprocessing import Pool | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 5 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
# Working directory that receives the per-family alignment files and the
# model files built from them; main() creates it.
directory = "./data"
# Name of the combined output file that main() writes inside ``directory``.
results = "results.data"
# Suffix of the per-family alignment files written by main().
extension = ".afa"
# Suffix used to pick up the model files once hmmbuild has run.
model_extension = ".hmm"
# Path of the input table; filled in by main() from the -i/--in option.
inputFile = ""
# Position, among the whitespace-separated tokens of a model file, of the
# token that toData() uses as the model name.
index_of_name_in_hmm = 6
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 12 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 13 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
def unescape(string):
    """Replace ``__xx__`` placeholder tokens in *string* by their characters.

    Every token listed in the table below (for example ``__gt__``) is
    replaced by the literal character it stands for (``>``).

    Args:
        string: Text that may contain placeholder tokens.

    Returns:
        The text with every known placeholder replaced.
    """
    mapped_chars = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
        '#': '__pd__',
    }

    # items() instead of iteritems(): the latter exists only on Python 2
    # dictionaries, while items() works on both Python 2 and Python 3.
    for char, token in mapped_chars.items():
        string = string.replace(token, char)

    return string
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 35 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 36 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
def toData(text):
    """Turn the contents of one model file into a one-line record.

    The record is the model name, a tab, and then *text* with every newline
    replaced by the two characters backslash + ``n`` so it fits on one line.

    Args:
        text: Complete contents of a model file.

    Returns:
        The single-line record described above (without a trailing newline).

    Raises:
        IndexError: If *text* has no ``NAME`` line and too few tokens for the
            positional fallback.
    """
    name = None
    # HMMER3 save files carry the model name on a "NAME <name>" line. Reading
    # that line does not depend on how many tokens the banner line in front
    # of it happens to contain.
    for line in text.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0] == "NAME":
            name = fields[1]
            break

    if name is None:
        # No NAME line found: fall back to the fixed token position.
        name = text.split()[index_of_name_in_hmm]

    return name + "\t" + text.replace("\n", "\\n")
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 42 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 43 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
def hmmbuild(input):
    """Run the external ``hmmbuild`` program on one alignment file.

    The alignment ``<directory>/<input>`` is passed to ``hmmbuild`` with
    ``--informat afa`` (aligned FASTA), and the model is written next to it
    under the same name with ``model_extension`` appended.

    Args:
        input: File name of the alignment, relative to ``directory``.

    Returns:
        The exit status of the ``hmmbuild`` process, so that callers can
        detect a failed build (the status used to be discarded).
    """
    file_name = directory + os.sep + input
    command = [
        'hmmbuild',
        "--informat", "afa",
        file_name + model_extension,  # output model file
        file_name,                    # input alignment
    ]
    process = subprocess.Popen(command)
    # Block until the program has finished; each pool worker therefore runs
    # one hmmbuild at a time.
    return process.wait()
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 50 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 51 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
class Sequence:
    """One record (line) of the tab-separated input table.

    Attributes set by the constructor:
        species:  first field
        family:   second field
        name:     third field
        header:   every field except the last, joined by single spaces
        sequence: last field, without the line terminator
        string:   the line exactly as it was passed in
    """

    def __init__(self, string):
        """Split *string* on tabs and store its fields.

        Args:
            string: One line of the input table; a trailing newline is
                allowed.

        Raises:
            IndexError: If the line has fewer than three tab-separated
                fields.
        """
        # Strip the line terminator before splitting. Otherwise the last
        # field keeps its "\n" (or "\r\n") and printFASTA() emits a blank
        # line, or a stray carriage return, after every sequence.
        lis = string.rstrip('\r\n').split('\t')
        self.species = lis[0]
        self.family = lis[1]
        self.name = lis[2]
        self.header = ' '.join(lis[:-1])
        self.sequence = lis[-1]
        self.string = string

    def printFASTA(self):
        """Return the record as FASTA text: a ``> header`` line, then the sequence line."""
        return '> ' + self.header + '\n' + self.sequence + '\n'
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 65 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 66 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
def main():
    """Build one HMM per sequence family and collect all models in one file.

    Steps:
      1. Read the tab-separated table named by ``-i/--in``.
      2. Append every record, as FASTA, to ``<directory>/<family><extension>``.
      3. Run ``hmmbuild`` on every alignment file, in parallel.
      4. Write one line per model file to ``<directory>/<results>``.

    Raises:
        OSError: If ``directory`` already exists (raised by ``os.mkdir``).
    """
    global inputFile, directory

    usage = """%prog [options]
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)

    parser.add_option(
        '-i', '--in',
        dest='input',
        action='store',
        type='string',
        metavar="FILE",
        help='Name of input data.')

    options, args = parser.parse_args()

    # Without -i, options.input is None and unescape() would fail with an
    # unhelpful AttributeError; report a usage error instead.
    if not options.input:
        parser.error("an input file is required (-i FILE)")

    inputFile = unescape(options.input)

    # os.mkdir raises if the directory already exists. That is kept as is,
    # because the files below are opened in append mode and leftovers from
    # an earlier run would otherwise be mixed into this one.
    os.mkdir(directory)

    with open(inputFile) as table:
        for line in table:
            # A blank line has no fields and would make Sequence raise
            # IndexError; skip it.
            if not line.strip():
                continue
            seq = Sequence(line)
            with open(directory + os.sep + seq.family + extension, "a") as alignment:
                alignment.write(seq.printFASTA())

    # Sorted so that the run does not depend on the order in which
    # os.listdir happens to return the entries.
    list_of_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(extension))

    pool = Pool()
    try:
        pool.map(hmmbuild, list_of_files)
    finally:
        # Release the worker processes even if a build raised.
        pool.close()
        pool.join()

    result = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(model_extension))
    with open(directory + os.sep + results, "a") as out:
        for name in result:
            with open(directory + os.sep + name, "r") as model:
                out.write(toData(model.read()) + "\n")
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
changeset | 104 | 
| 
5b9a38ec4a39
First commit of old repositories
 osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> parents: diff
# Run the tool only when the file is executed as a script, not when it is
# imported (which also happens in pool workers on platforms that spawn them).
if __name__ == '__main__':
    main()
