comparison split_by_allele.py @ 0:fc0f15ca12e0 draft

planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
author nml
date Mon, 24 Oct 2016 13:15:20 -0400
parents
children 4e03573653fe
comparison
equal deleted inserted replaced
-1:000000000000 0:fc0f15ca12e0
1 #!/usr/bin/env python
2 import getopt
3 import sys
4 import os
5 from Bio import SeqIO
6
def split_allele_file(alleles,profiles):
    """Split a multi-locus allele FASTA into one FASTA file per locus.

    Every record id in ``alleles`` must consist of a locus name and an
    allele number joined by a single '_' or a single '-'.  Ids using '-'
    are rewritten to the '_' form before being written out.  Records are
    appended to ``<locus name>.fasta`` in the current working directory,
    and a ``config.txt`` is then written there with a ``[loci]`` section
    (locus name and resolved path of its FASTA file, tab separated) and a
    ``[profile]`` section pointing at ``profiles``.

    Parameters:
        alleles:  path of the input FASTA file.
        profiles: value written verbatim as the ``profile`` entry of
                  ``config.txt``.

    Raises:
        SystemExit: with the error message (non-zero exit status) when a
            record id cannot be split into exactly a name and a number.
            Per-locus files created before the failure are left on disk
            and ``config.txt`` is not written.
    """
    writers = {}

    try:
        # Plain "r" instead of "rU": the 'U' flag is deprecated on
        # Python 3 and rejected from 3.11 on.
        with open(alleles, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seqid = record.id

                # Split the allele name from the version number: try '_'
                # first and fall back to '-' if that does not yield
                # exactly two parts.
                result = seqid.split('_')
                if len(result) != 2:
                    result = seqid.split('-')
                    if len(result) != 2:
                        # Report on stderr with a failing exit status so
                        # the caller can tell the split did not succeed.
                        sys.exit("Error could not parse out allele name and number from '%s'" % seqid)
                    # Normalise 'name-num' ids to 'name_num'.
                    record.id = '_'.join(result)

                name = result[0]

                # NOTE(review): name comes straight from the FASTA header
                # and is used as a file name unchecked - confirm inputs
                # are trusted.
                if name not in writers:
                    # New allele found: create its output file.
                    writers[name] = open(name + '.fasta', "w")
                SeqIO.write(record, writers[name], "fasta")
    finally:
        # Close (and thereby flush) every per-locus file, including when
        # parsing aborts part-way through.
        for output_fh in writers.values():
            output_fh.close()

    # Create the config file based on the alleles found.
    with open('config.txt', 'w') as cfile:
        cfile.write("[loci]\n")
        for name, writer in writers.items():
            path = os.path.realpath(writer.name)
            cfile.write("%s\t%s\n" % (name, path))
        cfile.write("[profile]\n")
        cfile.write("profile\t%s\n" % profiles)

    return
56
57
# Command-line values; both stay None until parsed from sys.argv below.
alleles = None
profiles = None

if __name__ == "__main__":
    usage = "usage: %s --alleles <alleles fasta> --profiles <profiles file>" % sys.argv[0]

    # Input arguments: two long options, each taking a value.
    try:
        options, remainder = getopt.getopt(sys.argv[1:], '', [
            'alleles=',
            'profiles='
        ])
    except getopt.GetoptError as err:
        sys.exit("%s\n%s" % (err, usage))

    for opt, arg in options:
        # Compare with ==; "opt in ('--alleles')" is a substring test on
        # a plain string because the parentheses do not make a tuple.
        if opt == '--alleles':
            alleles = arg
        elif opt == '--profiles':
            profiles = arg

    # Fail loudly rather than exiting successfully without any output.
    if not (alleles and profiles):
        sys.exit(usage)

    split_allele_file(alleles, profiles)
75