Mercurial > repos > nml > stringmlst
annotate split_by_allele.py @ 1:4e03573653fe draft default tip
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
| author | nml |
|---|---|
| date | Tue, 19 Sep 2017 16:34:57 -0400 |
| parents | fc0f15ca12e0 |
| children |
| rev | line source |
|---|---|
|
0
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
2 import getopt |
|
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
3 import os |
|
0
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
4 import sys |
|
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
5 |
|
0
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
6 from Bio import SeqIO |
|
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
7 |
|
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
# Message template reported when a FASTA record id cannot be split into an
# allele name and number; '%s' is filled in with the offending record id.
ERROR_MSG = "Error could not parse out allele name and number from '%s'"
|
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
9 |
|
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
10 |
|
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
def split_allele_file(alleles, profiles):
    """Split a combined allele FASTA file into one FASTA file per allele name.

    Every record id must consist of an allele name and a number joined by
    a single '_' or a single '-' (e.g. ``abc_1`` or ``abc-1``).  Each record
    is appended to ``<name>.fasta`` in the current working directory; ids
    that use '-' are rewritten to use '_' before being written.  Afterwards
    a ``config.txt`` is written to the current working directory with a
    ``[loci]`` section (allele name and resolved path of its FASTA file,
    tab separated) and a ``[profile]`` section containing ``profiles``.

    Args:
        alleles: Path of the combined FASTA file to read.
        profiles: Value written verbatim after ``profile`` in the
            ``[profile]`` section of ``config.txt``.

    Raises:
        SystemExit: With a non-zero status and ERROR_MSG on stderr when a
            record id cannot be split into exactly a name and a number.
    """
    # allele name -> open output file handle for '<name>.fasta'
    writers = {}

    try:
        # Default text mode is used on purpose: the old "rU" mode was
        # removed in Python 3.11 and universal newlines are already the
        # default for text files on Python 3.
        with open(alleles) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                seqid = record.id

                # Split the allele name from the version number: try '_'
                # first and fall back to '-' if that does not yield
                # exactly two parts.
                result = seqid.split('_')
                if len(result) != 2:
                    result = seqid.split('-')
                    if len(result) != 2:
                        # Report on stderr and exit non-zero so callers
                        # can detect the failure.
                        sys.exit(ERROR_MSG % seqid)
                    # Normalise the id to the '<name>_<number>' form.
                    record.id = '_'.join(result)

                name = result[0]

                # Reuse the writer for a known allele; otherwise create
                # the output file for the newly found allele first.
                output_fh = writers.get(name)
                if output_fh is None:
                    output_fh = open(name + '.fasta', "w")
                    writers[name] = output_fh
                SeqIO.write(record, output_fh, "fasta")
    finally:
        # Always flush and close the per-allele files, including on the
        # error path, so no handle is leaked and all records are on disk
        # before the config file refers to them.
        for output_fh in writers.values():
            output_fh.close()

    # Create the config file based on the alleles found.  The file name
    # stays available on a closed handle.
    with open('config.txt', 'w') as cfile:
        cfile.write("[loci]\n")
        for name, output_fh in writers.items():
            path = os.path.realpath(output_fh.name)
            cfile.write("%s\t%s\n" % (name, path))
        cfile.write("[profile]\n")
        cfile.write("profile\t%s\n" % profiles)
|
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
57 |
|
fc0f15ca12e0
planemo upload commit 0366addb646f1ddea484915abdeda939d7d49bd5
nml
parents:
diff
changeset
|
58 |
|
1
4e03573653fe
planemo upload commit 008f4667b70be22e9ddf496738b3f74bb942ed28
nml
parents:
0
diff
changeset
|
alleles = None
profiles = None

# Input arguments: two long options, each of which takes a value.
options, remainder = getopt.getopt(sys.argv[1:], '', [
    'alleles=',
    'profiles=',
])

for opt, arg in options:
    # Compare with '==': the previous "opt in ('--alleles')" was a substring
    # test against a plain string (the parentheses do not make a tuple).
    if opt == '--alleles':
        alleles = arg
    elif opt == '--profiles':
        profiles = arg

# The split is only performed when both options were supplied.
if alleles and profiles:
    split_allele_file(alleles, profiles)
