comparison split_by_allele.py @ 1:4e03573653fe draft default tip

planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
author nml
date Tue, 19 Sep 2017 16:34:57 -0400
parents fc0f15ca12e0
children
comparison
equal deleted inserted replaced
0:fc0f15ca12e0 1:4e03573653fe
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import getopt 2 import getopt
3 import os
3 import sys 4 import sys
4 import os 5
5 from Bio import SeqIO 6 from Bio import SeqIO
6 7
7 def split_allele_file(alleles,profiles): 8 ERROR_MSG = "Error could not parse out allele name and number from '%s'"
8 9
10
11 def split_allele_file(alleles, profiles):
12
9 writers = {} 13 writers = {}
10 14
11 handle = open(alleles, "rU") 15 handle = open(alleles, "rU")
12 for record in SeqIO.parse(handle, "fasta"): 16 for record in SeqIO.parse(handle, "fasta"):
13 17
14 seqid=record.id 18 seqid = record.id
15 19
16 #split out the allele name from the version number 20 # split out the allele name from the version number
17 #attempting to split based on '_' first, if that fails, then '-' 21 # attempting to split based on '_' first, if that fails, then '-'
18 result = seqid.split('_') 22 result = seqid.split('_')
19 23
20 if len(result) !=2: 24 if len(result) != 2:
21 result = seqid.split('-') 25 result = seqid.split('-')
22 if len(result) ==2: 26 if len(result) == 2:
23 newid = '_'.join(result) 27 newid = '_'.join(result)
24 record.id = newid 28 record.id = newid
25 else: 29 else:
26 print "Error could not parse out allele name and number from '%s'" % seqid 30 print(ERROR_MSG % seqid)
27 exit(0) 31 exit(0)
28
29
30 name,num = result
31 32
33 name, num = result
32 34
33 #if writer exist, then write to that current fasta file 35 # if writer exist, then write to that current fasta file
34 if name in writers: 36 if name in writers:
35 SeqIO.write(record, writers[name], "fasta") 37 SeqIO.write(record, writers[name], "fasta")
36 else: 38 else:
37 #new allele found, create new writer and add the first record 39 # new allele found, create new writer and add the first record
38 file_name = name + '.fasta' 40 file_name = name + '.fasta'
39 output_fh = open(file_name, "w") 41 output_fh = open(file_name, "w")
40 SeqIO.write(record, output_fh, "fasta") 42 SeqIO.write(record, output_fh, "fasta")
41 writers[name] = output_fh 43 writers[name] = output_fh
42 44
43 handle.close() 45 handle.close()
44 46
45 #creat config file based on the alleles found 47 # create config file based on the alleles found
46 with open('config.txt','w') as cfile: 48 with open('config.txt', 'w') as cfile:
47 cfile.write("[loci]\n") 49 cfile.write("[loci]\n")
48 for name, writer in writers.iteritems() : 50 for name, writer in writers.items():
49 path = os.path.realpath(writer.name) 51 path = os.path.realpath(writer.name)
50 cfile.write("%s\t%s\n" % (name,path)) 52 cfile.write("%s\t%s\n" % (name, path))
51 cfile.write("[profile]\n") 53 cfile.write("[profile]\n")
52 cfile.write("profile\t%s\n" % profiles) 54 cfile.write("profile\t%s\n" % profiles)
53 55
54
55 return 56 return
56 57
57 58
58 alleles=None 59 alleles = None
59 profiles=None 60 profiles = None
60 61
61 """Input arguments""" 62 """Input arguments"""
62 options, remainder = getopt.getopt(sys.argv[1:], '', [ 63 options, remainder = getopt.getopt(sys.argv[1:], '', [
63 'alleles=', 64 'alleles=',
64 'profiles=' 65 'profiles='
69 alleles = arg 70 alleles = arg
70 elif opt in ('--profiles'): 71 elif opt in ('--profiles'):
71 profiles = arg 72 profiles = arg
72 73
73 if alleles and profiles: 74 if alleles and profiles:
74 split_allele_file(alleles,profiles) 75 split_allele_file(alleles, profiles)
75