Mercurial > repos > brinkmanlab > make_unique_id
comparison make_unique_id.py @ 0:a3a09dd8d09a draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/make_unique_id commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
| author | brinkmanlab |
|---|---|
| date | Fri, 24 Jan 2020 17:38:28 -0500 |
| parents | |
| children | 061c3402a977 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a3a09dd8d09a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 from Bio import SeqIO | |
| 4 from collections import defaultdict | |
| 5 | |
usage = """
make_unique_id
Makes all record ids unique across all input data.
All input data must be the same format.

Use: make_unique_id.py [-v] <format> <input1> <output1> [<input2> <output2> ... <inputn> <outputn>]
\t-v Print version and exit

Valid formats: clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, genbank, gb, imgt,
nexus, phd, phylip, pir, seqxml, sff, stockholm, tab, qual
"""

# Character budget shared by the record id, one separating space and the
# digits of the sequence length (GenBank limit, per the original comment).
_MAX_ID_AND_LENGTH = 28

# Every id emitted so far, mapped to the next numeric suffix to try for it.
# Shared across all input files so ids are unique over the whole run.
ids = defaultdict(int)


def _assign_unique_id(record_id, seq_len, seen):
    """Reserve and return an id that has not been emitted before.

    Args:
        record_id: The id carried by the incoming record.
        seq_len: Length of the record's sequence; its digit count eats into
            the id length budget.
        seen: Mapping of already emitted ids to their next suffix counter.
            It is updated in place.

    Returns:
        A ``(new_id, suffix)`` tuple. ``suffix`` is ``""`` when ``record_id``
        was still free, otherwise it is the ``"_<n>"`` tail of ``new_id``.
    """
    count = seen.get(record_id, 0)
    if not count:
        seen[record_id] = 1
        return record_id, ""

    length_digits = len(str(seq_len))
    while True:
        suffix = "_" + str(count)
        # Room left for the stem once the suffix, the separating space and
        # the length digits are accounted for. Clamped so that a negative
        # value cannot turn into a slice from the end of the string.
        room = max(_MAX_ID_AND_LENGTH - 1 - length_digits - len(suffix), 0)
        # Slicing is a no-op when the stem already fits.
        candidate = record_id[:room] + suffix
        # The candidate may already be taken, either by a record that
        # originally carried this id or by another id truncated to the same
        # stem. Keep counting until a free one is found.
        if candidate not in seen:
            break
        count += 1

    seen[record_id] = count + 1
    # Register the generated id so later records cannot reuse it.
    seen[candidate] = 1
    return candidate, suffix


def makeUnique(seq):
    """Rename ``seq`` in place if its id was already seen, and return it.

    The first record with a given id is returned untouched. Later duplicates
    get a ``_<n>`` suffix on both ``id`` and ``name``, and the
    ``old id<TAB>new id`` mapping is printed to stdout.

    Args:
        seq: A record exposing ``id``, ``name`` and ``len()``; in this script
            it is whatever ``SeqIO.parse`` yields.

    Returns:
        The same ``seq`` object.
    """
    new_id, suffix = _assign_unique_id(seq.id, len(seq), ids)
    if suffix:
        print(f"{seq.id}\t{new_id}")
        seq.id = new_id
        # NOTE(review): only the id is length-limited; the name just gets the
        # suffix. Biopython's GenBank writer appears to take the LOCUS field
        # from the name, so the name may need truncating too - confirm.
        seq.name += suffix
    return seq


def main(argv=None):
    """Run the command line tool and return its exit status.

    Args:
        argv: Full argument vector including the program name. Defaults to
            ``sys.argv``.

    Returns:
        ``0`` on success or after printing the version, ``1`` when arguments
        are missing or an input path has no matching output path.
    """
    if argv is None:
        argv = sys.argv

    if '-v' in argv:
        print('1.0')
        return 0

    if len(argv) < 4:
        print("Missing arguments", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    seq_format = argv[1]
    paths = argv[2:]
    if len(paths) % 2:
        # Without this check the trailing input would be silently skipped.
        print("Missing arguments: every input needs a matching output", file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    for in_path, out_path in zip(paths[::2], paths[1::2]):
        # map() keeps the pipeline lazy: records are renamed as they stream
        # from the parser into the writer.
        SeqIO.write(
            map(makeUnique, SeqIO.parse(in_path, seq_format)),
            out_path,
            seq_format
        )
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
| 57 | |
| 58 |
