Mercurial > repos > iuc > progressivemauve
diff xmfa2gff3.py @ 0:74093fb62bdf draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author | iuc |
---|---|
date | Wed, 17 Aug 2016 14:46:55 -0400 |
parents | |
children | bca52822843e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xmfa2gff3.py Wed Aug 17 14:46:55 2016 -0400 @@ -0,0 +1,162 @@ +#!/usr/bin/env python +import sys +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation +import argparse +from BCBio import GFF +import logging +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) + + +def parse_xmfa(xmfa): + """Simple XMFA parser until https://github.com/biopython/biopython/pull/544 + """ + current_lcb = [] + current_seq = {} + for line in xmfa.readlines(): + if line.startswith('#'): + continue + + if line.strip() == '=': + if 'id' in current_seq: + current_lcb.append(current_seq) + current_seq = {} + yield current_lcb + current_lcb = [] + else: + line = line.strip() + if line.startswith('>'): + if 'id' in current_seq: + current_lcb.append(current_seq) + current_seq = {} + data = line.strip().split() + id, loc = data[1].split(':') + start, end = loc.split('-') + current_seq = { + 'rid': '_'.join(data[1:]), + 'id': id, + 'start': int(start), + 'end': int(end), + 'strand': 1 if data[2] == '+' else -1, + 'seq': '' + } + else: + current_seq['seq'] += line.strip() + + +def _percent_identity(a, b): + """Calculate % identity, ignoring gaps in the host sequence + """ + match = 0 + mismatch = 0 + for char_a, char_b in zip(list(a), list(b)): + if char_a == '-': + continue + if char_a == char_b: + match += 1 + else: + mismatch += 1 + + if match + mismatch == 0: + return 0 + return 100 * float(match) / (match + mismatch) + + +def _id_tn_dict(sequences): + """Figure out sequence IDs + """ + label_convert = {} + if sequences is not None: + if len(sequences) == 1: + for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')): + label_convert[str(i + 1)] = record.id + else: + for i, sequence in enumerate(sequences): + for record in SeqIO.parse(sequence, 'fasta'): + label_convert[str(i + 1)] = record.id + continue + return label_convert + + +def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000): + label_convert = _id_tn_dict(sequences) + + lcbs = parse_xmfa(xmfa_file) + + records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))] + for lcb in lcbs: + ids = [seq['id'] for seq in lcb] + + # Doesn't match part of our sequence + if relative_to not in ids: + continue + + # Skip sequences that are JUST our "relative_to" genome + if len(ids) == 1: + continue + + parent = [seq for seq in lcb if seq['id'] == relative_to][0] + others = [seq for seq in lcb if seq['id'] != relative_to] + + for other in others: + other['feature'] = SeqFeature( + FeatureLocation(parent['start'], parent['end'] + 1), + type="match", strand=parent['strand'], + qualifiers={ + "source": "progressiveMauve", + "target": label_convert.get(other['id'], other['id']), + "ID": label_convert.get(other['id'], 'xmfa_' + other['rid']) + } + ) + + for i in range(0, len(lcb[0]['seq']), window_size): + block_seq = parent['seq'][i:i + window_size] + real_window_size = len(block_seq) + real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i + real_end = real_start + real_window_size - block_seq.count('-') + + if (real_end - real_start) < 10: + continue + + if parent['start'] < 0: + strand = -1 + else: + strand = 1 + + for other in others: + pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size]) + # Ignore 0% identity sequences + if pid == 0: + continue + other['feature'].sub_features.append( + SeqFeature( + FeatureLocation(real_start, real_end), + type="match_part", strand=strand, + qualifiers={ + "source": "progressiveMauve", + 'score': pid + } + ) + ) + + for other in others: + records[0].features.append(other['feature']) + return records + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3') + parser.add_argument('xmfa_file', type=file, help='XMFA File') + parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000) + parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1') + parser.add_argument('--sequences', type=file, nargs='+', + help='Fasta files (in same order) passed to parent for reconstructing proper IDs') + parser.add_argument('--version', action='version', version='%(prog)s 1.0') + + args = parser.parse_args() + + result = convert_xmfa_to_gff3(**vars(args)) + GFF.write(result, sys.stdout)