diff make_unique_id.py @ 0:a3a09dd8d09a draft

"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/make_unique_id commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author brinkmanlab
date Fri, 24 Jan 2020 17:38:28 -0500
parents
children 061c3402a977
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_unique_id.py	Fri Jan 24 17:38:28 2020 -0500
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+import sys
+from collections import defaultdict
+
+from Bio import SeqIO
+
+usage = """
+make_unique_id
+Makes all record ids unique across all input data.
+All input data must be the same format.
+
+Use: make_unique_id.py [-v] <format> <input1> <output1> [<input2> <output2> ... <inputn> <outputn>]
+\t-v Print version and exit 
+
+Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt,
+nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual
+"""
+
+if __name__ == '__main__':
+    if '-v' in sys.argv:
+        print('1.0')
+        sys.exit(0)
+
+    # After the format, arguments must come in <input> <output> pairs, so a
+    # valid argv always has an even length; a dangling input would otherwise
+    # be dropped silently by the pairwise zip below.
+    if len(sys.argv) < 4 or len(sys.argv) % 2:
+        print("Missing arguments", file=sys.stderr)
+        print(usage, file=sys.stderr)
+        sys.exit(1)
+
+    seq_format = sys.argv[1]
+    # How often each id has been seen or handed out so far. Shared by all
+    # input files so that ids end up unique across the whole run.
+    ids = defaultdict(int)
+
+    def makeUnique(seq):
+        """Return seq, renamed in place if its id has already been used.
+
+        The first record with a given id is left untouched. Each later one
+        gets "_<n>" appended to its id and name, and the old and new id
+        are printed to stdout separated by a tab.
+        """
+        original = seq.id
+        count = ids[original]
+        ids[original] += 1
+        if not count:
+            return seq
+
+        seqlenlen = len(str(len(seq)))
+        while True:
+            suffix = "_" + str(count)
+            newid = original
+            if len(newid) + len(suffix) + 1 + seqlenlen > 28:
+                # Genbank has a max length for the id and sequence length
+                # number, truncate the sequence id if too long
+                newid = newid[:max(0, 27 - seqlenlen - len(suffix))]
+            candidate = newid + suffix
+            # The suffixed (and possibly truncated) id can itself already be
+            # in use, e.g. inputs "a", "a_1", "a"; move on to the next number.
+            if candidate not in ids:
+                break
+            count += 1
+
+        # Reserve the new id so a later record that really is called that gets
+        # renamed too, and resume numbering for this id after the one used.
+        ids[candidate] += 1
+        ids[original] = count + 1
+
+        print(f"{original}\t{candidate}")
+        seq.id = candidate
+        # NOTE(review): name receives the suffix but is never shortened; confirm
+        # whether the GenBank writer also needs the length limit applied to it.
+        seq.name += suffix
+        return seq
+
+    # Zipping one iterator with itself consumes the paths two at a time.
+    paths = iter(sys.argv[2:])
+    for in_path, out_path in zip(paths, paths):
+        SeqIO.write(
+            map(makeUnique, SeqIO.parse(in_path, seq_format)),
+            out_path,
+            seq_format
+        )