# HG changeset patch
# User brinkmanlab
# Date 1579905508 18000
# Node ID a3a09dd8d09aba3da66bd0cfe28a6b8293cec5d0
# "planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/make_unique_id commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
#
# diff -r 000000000000 -r a3a09dd8d09a make_unique_id.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/make_unique_id.py Fri Jan 24 17:38:28 2020 -0500
# @@ -0,0 +1,58 @@
#!/usr/bin/env python
"""Rewrite sequence files so that every record id is unique across all inputs."""
import sys
from collections import defaultdict

usage = """
make_unique_id
Makes all record ids unique across all input data.
All input data must be the same format.

Use: make_unique_id.py [-v] FORMAT INPUT OUTPUT [INPUT OUTPUT ...]
\t-v Print version and exit

Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt,
nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual
"""

VERSION = '1.0'

# Combined budget for a renamed id plus the decimal digits of the sequence
# length.  The original script notes that Genbank limits the width of the id
# and sequence length number, and truncates renamed ids to fit this budget.
ID_WIDTH_BUDGET = 27


def make_unique(record, counts, used):
    """Give *record* an id that has not been emitted before.

    The first record seen with a given id keeps it.  Later records with the
    same id get a ``_N`` suffix, and their ``name`` gets the same suffix.  If
    the suffixed id would exceed ``ID_WIDTH_BUDGET`` the original id is
    truncated to make room.  The counter keeps increasing until the resulting
    id is not already in *used*, so generated ids can never collide with ids
    that occur naturally in the data or with other truncated ids.

    Each rename is reported on stdout as ``old_id<TAB>new_id``.

    Args:
        record: object with ``id`` and ``name`` string attributes that
            supports ``len()`` (a Biopython SeqRecord in normal use).
        counts: mutable mapping of original id -> next suffix number; updated
            in place.
        used: mutable set of every id already emitted; updated in place.

    Returns:
        The same *record* object, possibly with ``id`` and ``name`` modified.
    """
    original_id = record.id
    count = counts.get(original_id, 0)
    new_id = original_id

    if count or original_id in used:
        # A count of 0 here means this id was never read before but an
        # earlier rename already produced it; start suffixing from _1.
        count = max(count, 1)
        length_digits = len(str(len(record)))
        while True:
            suffix = "_" + str(count)
            # Clamp at zero: a negative slice bound would strip characters
            # from the end instead of truncating to the budget.
            room = max(ID_WIDTH_BUDGET - length_digits - len(suffix), 0)
            new_id = original_id[:room] + suffix
            if new_id not in used:
                break
            count += 1

        print(f"{original_id}\t{new_id}")
        record.id = new_id
        record.name += suffix

    counts[original_id] = count + 1
    used.add(new_id)
    return record


def main(argv=None):
    """Run the command line tool and return its exit status.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success (or after printing the version), 1 on a usage error.
    """
    argv = sys.argv if argv is None else argv

    if '-v' in argv:
        print(VERSION)
        return 0

    if len(argv) < 4:
        print("Missing arguments", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    paths = argv[2:]
    if len(paths) % 2:
        # Pairing the paths would otherwise silently ignore the last input.
        print("Each input path requires a matching output path", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    # Imported lazily so that -v and usage errors work without Biopython.
    from Bio import SeqIO

    seq_format = argv[1]
    counts = defaultdict(int)
    used = set()

    def rename(record):
        return make_unique(record, counts, used)

    # counts/used are shared by every file so ids are unique across all inputs.
    for in_path, out_path in zip(paths[::2], paths[1::2]):
        SeqIO.write(map(rename, SeqIO.parse(in_path, seq_format)), out_path, seq_format)

    return 0


if __name__ == '__main__':
    sys.exit(main())

# diff -r 000000000000 -r a3a09dd8d09a make_unique_id.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/make_unique_id.xml Fri Jan 24 17:38:28 2020 -0500
# @@ -0,0 +1,53 @@
# + + Makes all record ids unique across all
input data + + topic_3345 + topic_3489 + topic_0091 + + + operation_3282 + + + python + biopython + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.5281/zenodo.3364789 + + diff -r 000000000000 -r a3a09dd8d09a test-data/two_records.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/two_records.fastq Fri Jan 24 17:38:28 2020 -0500 @@ -0,0 +1,8 @@ +@HANNIBAL_1_FC302VTAAXX:2:1:228:167 +GAATTGATCAGGACATAGGACAACTGTAGGCACCAT ++HANNIBAL_1_FC302VTAAXX:2:1:228:167 +40 40 40 40 35 40 40 40 25 40 40 26 40 9 33 11 40 35 17 40 40 33 40 7 9 15 3 22 15 30 11 17 9 4 9 4 +@HANNIBAL_1_FC302VTAAXX:2:1:156:340 +GAGTTCTCGTCGCCTGTAGGCACCATCAATCGTATG ++HANNIBAL_1_FC302VTAAXX:2:1:156:340 +40 15 40 17 6 36 40 40 40 25 40 9 35 33 40 14 14 18 15 17 19 28 31 4 24 18 27 14 15 18 2 8 12 8 11 9 \ No newline at end of file