comparison xmfa2gff3.py @ 0:74093fb62bdf draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author iuc
date Wed, 17 Aug 2016 14:46:55 -0400
parents
children bca52822843e
comparison
equal deleted inserted replaced
-1:000000000000 0:74093fb62bdf
1 #!/usr/bin/env python
2 import sys
3 from Bio import SeqIO
4 from Bio.Seq import Seq
5 from Bio.SeqRecord import SeqRecord
6 from Bio.SeqFeature import SeqFeature, FeatureLocation
7 import argparse
8 from BCBio import GFF
9 import logging
10 logging.basicConfig(level=logging.INFO)
11 log = logging.getLogger(__name__)
12
13
14 def parse_xmfa(xmfa):
15 """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
16 """
17 current_lcb = []
18 current_seq = {}
19 for line in xmfa.readlines():
20 if line.startswith('#'):
21 continue
22
23 if line.strip() == '=':
24 if 'id' in current_seq:
25 current_lcb.append(current_seq)
26 current_seq = {}
27 yield current_lcb
28 current_lcb = []
29 else:
30 line = line.strip()
31 if line.startswith('>'):
32 if 'id' in current_seq:
33 current_lcb.append(current_seq)
34 current_seq = {}
35 data = line.strip().split()
36 id, loc = data[1].split(':')
37 start, end = loc.split('-')
38 current_seq = {
39 'rid': '_'.join(data[1:]),
40 'id': id,
41 'start': int(start),
42 'end': int(end),
43 'strand': 1 if data[2] == '+' else -1,
44 'seq': ''
45 }
46 else:
47 current_seq['seq'] += line.strip()
48
49
50 def _percent_identity(a, b):
51 """Calculate % identity, ignoring gaps in the host sequence
52 """
53 match = 0
54 mismatch = 0
55 for char_a, char_b in zip(list(a), list(b)):
56 if char_a == '-':
57 continue
58 if char_a == char_b:
59 match += 1
60 else:
61 mismatch += 1
62
63 if match + mismatch == 0:
64 return 0
65 return 100 * float(match) / (match + mismatch)
66
67
68 def _id_tn_dict(sequences):
69 """Figure out sequence IDs
70 """
71 label_convert = {}
72 if sequences is not None:
73 if len(sequences) == 1:
74 for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')):
75 label_convert[str(i + 1)] = record.id
76 else:
77 for i, sequence in enumerate(sequences):
78 for record in SeqIO.parse(sequence, 'fasta'):
79 label_convert[str(i + 1)] = record.id
80 continue
81 return label_convert
82
83
84 def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000):
85 label_convert = _id_tn_dict(sequences)
86
87 lcbs = parse_xmfa(xmfa_file)
88
89 records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))]
90 for lcb in lcbs:
91 ids = [seq['id'] for seq in lcb]
92
93 # Doesn't match part of our sequence
94 if relative_to not in ids:
95 continue
96
97 # Skip sequences that are JUST our "relative_to" genome
98 if len(ids) == 1:
99 continue
100
101 parent = [seq for seq in lcb if seq['id'] == relative_to][0]
102 others = [seq for seq in lcb if seq['id'] != relative_to]
103
104 for other in others:
105 other['feature'] = SeqFeature(
106 FeatureLocation(parent['start'], parent['end'] + 1),
107 type="match", strand=parent['strand'],
108 qualifiers={
109 "source": "progressiveMauve",
110 "target": label_convert.get(other['id'], other['id']),
111 "ID": label_convert.get(other['id'], 'xmfa_' + other['rid'])
112 }
113 )
114
115 for i in range(0, len(lcb[0]['seq']), window_size):
116 block_seq = parent['seq'][i:i + window_size]
117 real_window_size = len(block_seq)
118 real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i
119 real_end = real_start + real_window_size - block_seq.count('-')
120
121 if (real_end - real_start) < 10:
122 continue
123
124 if parent['start'] < 0:
125 strand = -1
126 else:
127 strand = 1
128
129 for other in others:
130 pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size])
131 # Ignore 0% identity sequences
132 if pid == 0:
133 continue
134 other['feature'].sub_features.append(
135 SeqFeature(
136 FeatureLocation(real_start, real_end),
137 type="match_part", strand=strand,
138 qualifiers={
139 "source": "progressiveMauve",
140 'score': pid
141 }
142 )
143 )
144
145 for other in others:
146 records[0].features.append(other['feature'])
147 return records
148
149
150 if __name__ == '__main__':
151 parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3')
152 parser.add_argument('xmfa_file', type=file, help='XMFA File')
153 parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000)
154 parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1')
155 parser.add_argument('--sequences', type=file, nargs='+',
156 help='Fasta files (in same order) passed to parent for reconstructing proper IDs')
157 parser.add_argument('--version', action='version', version='%(prog)s 1.0')
158
159 args = parser.parse_args()
160
161 result = convert_xmfa_to_gff3(**vars(args))
162 GFF.write(result, sys.stdout)