diff transtermhp.py @ 0:c28817831a24 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/transtermhp commit 799339e22181d28cb2b145454d353d6025779636
author iuc
date Fri, 09 Oct 2015 09:22:42 -0400
parents
children 1a1ec22a7e28
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/transtermhp.py	Fri Oct 09 09:22:42 2015 -0400
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import sys
+import re
+import subprocess
+from Bio import SeqIO
+from BCBio import GFF
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+
+
+def main(expterm, fasta, gff3):
+    with open(fasta, 'r') as handle:
+        seq_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
+
+    # Build coords file
+    with open(gff3, 'r') as handle:
+        for rec in GFF.parse(handle, base_dict=seq_dict):
+            with open('tmp.coords', 'w') as coords:
+                for feat in rec.features:
+                    if feat.type == 'gene':
+                        coords.write('\t'.join([
+                            feat.id,
+                            str(feat.location.start + 1),
+                            str(feat.location.end),
+                            rec.id,
+                        ]) + '\n')
+            with open('tmp.fasta', 'w') as fasta_handle:
+                SeqIO.write(rec, fasta_handle, 'fasta')
+
+            cmd = ['transterm', '-p', expterm, fasta, 'tmp.coords']
+            output = subprocess.check_output(cmd)
+            #   TERM 1         4342 - 4366     + F    93 -11.5 -3.22878 | opp_overlap 4342, overlap 4340 4357
+            ttre = re.compile(
+                '^  (?P<name>.*) (?P<start>\d+) - (?P<end>\d+)\s+'
+                '(?P<strand>[-+])\s+(?P<loc>[GFRTHNgfr]+)\s+'
+                '(?P<conf>\d+)\s+(?P<hp>[0-9.-]+)\s+(?P<tail>[0-9.-]+)'
+            )
+
+            rec.features = []
+            batches = output.split('SEQUENCE ')
+            for batch in batches[1:]:
+                batch_lines = batch.split('\n')
+                # Strip the header
+                interesting = batch_lines[2:]
+                unformatted = [x for x in interesting if x.startswith('  ')][0::2]
+                for terminator in unformatted:
+                    m = ttre.match(terminator)
+                    if m:
+                        start = int(m.group('start')) - 1
+                        end = int(m.group('end'))
+                        if m.group('strand') == '+':
+                            strand = 1
+                        else:
+                            strand = 0
+
+                        feature = SeqFeature(
+                            FeatureLocation(start, end),
+                            type="terminator",
+                            strand=strand,
+                            qualifiers={
+                                "source": "TransTermHP_2.09",
+                                "score": m.group('conf'),
+                                "ID": m.group('name'),
+                            }
+                        )
+                        rec.features.append(feature)
+            yield rec
+
+if __name__ == '__main__':
+    for record in main(*sys.argv[1:4]):
+        GFF.write([record], sys.stdout)