diff xmfa2gff3.py @ 0:74093fb62bdf draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author iuc
date Wed, 17 Aug 2016 14:46:55 -0400
parents
children bca52822843e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xmfa2gff3.py	Wed Aug 17 14:46:55 2016 -0400
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+import sys
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+import argparse
+from BCBio import GFF
+import logging
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def parse_xmfa(xmfa):
+    """Simple XMFA parser until https://github.com/biopython/biopython/pull/544
+    """
+    current_lcb = []
+    current_seq = {}
+    for line in xmfa.readlines():
+        if line.startswith('#'):
+            continue
+
+        if line.strip() == '=':
+            if 'id' in current_seq:
+                current_lcb.append(current_seq)
+                current_seq = {}
+            yield current_lcb
+            current_lcb = []
+        else:
+            line = line.strip()
+            if line.startswith('>'):
+                if 'id' in current_seq:
+                    current_lcb.append(current_seq)
+                    current_seq = {}
+                data = line.strip().split()
+                id, loc = data[1].split(':')
+                start, end = loc.split('-')
+                current_seq = {
+                    'rid': '_'.join(data[1:]),
+                    'id': id,
+                    'start': int(start),
+                    'end': int(end),
+                    'strand': 1 if data[2] == '+' else -1,
+                    'seq': ''
+                }
+            else:
+                current_seq['seq'] += line.strip()
+
+
+def _percent_identity(a, b):
+    """Calculate % identity, ignoring gaps in the host sequence
+    """
+    match = 0
+    mismatch = 0
+    for char_a, char_b in zip(list(a), list(b)):
+        if char_a == '-':
+            continue
+        if char_a == char_b:
+            match += 1
+        else:
+            mismatch += 1
+
+    if match + mismatch == 0:
+        return 0
+    return 100 * float(match) / (match + mismatch)
+
+
+def _id_tn_dict(sequences):
+    """Figure out sequence IDs
+    """
+    label_convert = {}
+    if sequences is not None:
+        if len(sequences) == 1:
+            for i, record in enumerate(SeqIO.parse(sequences[0], 'fasta')):
+                label_convert[str(i + 1)] = record.id
+        else:
+            for i, sequence in enumerate(sequences):
+                for record in SeqIO.parse(sequence, 'fasta'):
+                    label_convert[str(i + 1)] = record.id
+                    continue
+    return label_convert
+
+
+def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000):
+    label_convert = _id_tn_dict(sequences)
+
+    lcbs = parse_xmfa(xmfa_file)
+
+    records = [SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))]
+    for lcb in lcbs:
+        ids = [seq['id'] for seq in lcb]
+
+        # Doesn't match part of our sequence
+        if relative_to not in ids:
+            continue
+
+        # Skip sequences that are JUST our "relative_to" genome
+        if len(ids) == 1:
+            continue
+
+        parent = [seq for seq in lcb if seq['id'] == relative_to][0]
+        others = [seq for seq in lcb if seq['id'] != relative_to]
+
+        for other in others:
+            other['feature'] = SeqFeature(
+                FeatureLocation(parent['start'], parent['end'] + 1),
+                type="match", strand=parent['strand'],
+                qualifiers={
+                    "source": "progressiveMauve",
+                    "target": label_convert.get(other['id'], other['id']),
+                    "ID": label_convert.get(other['id'], 'xmfa_' + other['rid'])
+                }
+            )
+
+        for i in range(0, len(lcb[0]['seq']), window_size):
+            block_seq = parent['seq'][i:i + window_size]
+            real_window_size = len(block_seq)
+            real_start = abs(parent['start']) - parent['seq'][0:i].count('-') + i
+            real_end = real_start + real_window_size - block_seq.count('-')
+
+            if (real_end - real_start) < 10:
+                continue
+
+            if parent['start'] < 0:
+                strand = -1
+            else:
+                strand = 1
+
+            for other in others:
+                pid = _percent_identity(block_seq, other['seq'][i:i + real_window_size])
+                # Ignore 0% identity sequences
+                if pid == 0:
+                    continue
+                other['feature'].sub_features.append(
+                    SeqFeature(
+                        FeatureLocation(real_start, real_end),
+                        type="match_part", strand=strand,
+                        qualifiers={
+                            "source": "progressiveMauve",
+                            'score': pid
+                        }
+                    )
+                )
+
+        for other in others:
+            records[0].features.append(other['feature'])
+    return records
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Convert XMFA alignments to gff3', prog='xmfa2gff3')
+    parser.add_argument('xmfa_file', type=file, help='XMFA File')
+    parser.add_argument('--window_size', type=int, help='Window size for analysis', default=1000)
+    parser.add_argument('--relative_to', type=str, help='Index of the parent sequence in the MSA', default='1')
+    parser.add_argument('--sequences', type=file, nargs='+',
+                        help='Fasta files (in same order) passed to parent for reconstructing proper IDs')
+    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
+
+    args = parser.parse_args()
+
+    result = convert_xmfa_to_gff3(**vars(args))
+    GFF.write(result, sys.stdout)