Mercurial > repos > brinkmanlab > mauve_contig_mover
comparison stitch.py @ 0:c14690ec0c9f draft
"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/mauve_contig_mover commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
| author | brinkmanlab |
|---|---|
| date | Fri, 24 Jan 2020 17:43:19 -0500 |
| parents | |
| children | 71831ead9e16 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c14690ec0c9f |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import csv | |
| 4 import getopt | |
| 5 | |
| 6 from Bio import SeqIO, Alphabet | |
| 7 from Bio.Seq import Seq | |
| 8 | |
| 9 usage = """ | |
| 10 Mauve Contig Mover - Stitch | |
| 11 Stitch contigs into a single contig. | |
| 12 Compliments reversed sequences and rewrites all feature coordinates. | |
| 13 | |
| 14 Use: stitch.py [-v] [-s 'final sequence id'] <padding length> <draft file path> <draft file format> [MauveCM contigs.tab path] | |
| 15 \t-v Print version and exit | |
| 16 \t-s Provide an ID for the final sequence, the first sequence ID will be used otherwise | |
| 17 Valid draft file formats: | |
| 18 abi, abi-trim, ace, cif-atom, cif-seqres, clustal, embl, fasta, fasta-2line, fastq-sanger, fastq, fastq-solexa, fastq-illumina, | |
| 19 genbank, gb, ig, imgt, nexus, pdb-seqres, pdb-atom, phd, phylip, pir, seqxml, sff, sff-trim, stockholm, swiss, tab, qual, uniprot-xml, gff3 | |
| 20 """ | |
| 21 | |
| 22 | |
| 23 def getOrder(path): | |
| 24 """ | |
| 25 Parse MCM contig order file and iterate rows after "Ordered Contigs" | |
| 26 :param path: path to MCM *_contig.tab | |
| 27 :return: tuple(type, label, contig_type, strand, left_end, right_end) | |
| 28 """ | |
| 29 with open(path, "r") as alignment_file: | |
| 30 alignments = iter(csv.reader(alignment_file, delimiter="\t")) | |
| 31 try: | |
| 32 alignment = next(alignments) | |
| 33 | |
| 34 # Jog to beginning of ordered alignments | |
| 35 while not (len(alignment) and "Ordered Contigs" == alignment[0]): | |
| 36 alignment = next(alignments) | |
| 37 | |
| 38 # Skip column header | |
| 39 next(alignments) | |
| 40 | |
| 41 while True: | |
| 42 yield next(alignments) | |
| 43 except StopIteration: | |
| 44 return | |
| 45 | |
| 46 | |
| 47 def stitch(pad, contigs, order): | |
| 48 """ | |
| 49 Reduce contigs to single contig by concatenation. | |
| 50 Compliments reversed sequences and rewrites all feature coordinates. | |
| 51 :param pad: Seq or SeqRecord instance to insert between contigs | |
| 52 :param contigs: dict of SeqRecords keyed on the record name | |
| 53 :param order: iterable of tuples containing sequence names and orientation | |
| 54 :return: concatentated SeqRecord | |
| 55 """ | |
| 56 result = None | |
| 57 # Concat in order with padding | |
| 58 for alignment in order: | |
| 59 if len(alignment) < 4: continue | |
| 60 try: | |
| 61 contig = contigs.pop(alignment[1]) # type: SeqIO.SeqRecord | |
| 62 if alignment[3] == "complement": | |
| 63 contig = contig.reverse_complement() | |
| 64 if result: | |
| 65 # A lot is happening in the background here. Biopython handles the feature coordinates implicitly. | |
| 66 result += pad + contig | |
| 67 else: | |
| 68 result = contig | |
| 69 pad.alphabet = result.seq.alphabet | |
| 70 except KeyError: | |
| 71 pass | |
| 72 | |
| 73 # Concat remaining in arbitrary order | |
| 74 for unordered in contigs.values(): | |
| 75 if result: | |
| 76 result += pad + unordered | |
| 77 else: | |
| 78 result = unordered | |
| 79 | |
| 80 return result | |
| 81 | |
| 82 | |
| 83 if __name__ == '__main__': | |
| 84 seqid = None | |
| 85 # Parse arguments | |
| 86 try: | |
| 87 opts, args = getopt.gnu_getopt(sys.argv[1:], 'vs:iq:') | |
| 88 for opt, val in opts: | |
| 89 if opt == '-v': | |
| 90 print('1.0') | |
| 91 exit(0) | |
| 92 elif opt == '-s': | |
| 93 seqid = val | |
| 94 except getopt.GetoptError as err: | |
| 95 print("Argument error(" + str(err.opt) + "): " + err.msg, file=sys.stderr) | |
| 96 args = [] | |
| 97 | |
| 98 # Check for minimum number of arguments | |
| 99 if len(args) < 3: | |
| 100 print(usage, file=sys.stderr) | |
| 101 exit(1) | |
| 102 | |
| 103 pad_len = int(args[0]) | |
| 104 if pad_len < 0: | |
| 105 print("Padding length must be >= 0", file=sys.stderr) | |
| 106 print(help, file=sys.stderr) | |
| 107 exit(1) | |
| 108 | |
| 109 draft_path = args[1] | |
| 110 draft_format = args[2] | |
| 111 | |
| 112 if len(args) < 4: | |
| 113 order = () | |
| 114 else: | |
| 115 order = getOrder(args[3]) | |
| 116 | |
| 117 pad = Seq('N'*pad_len) | |
| 118 contigs = {seq.name: seq for seq in SeqIO.parse(draft_path, draft_format)} | |
| 119 | |
| 120 result = stitch(pad, contigs, order) | |
| 121 | |
| 122 if result: | |
| 123 # Ensure there is only one 'source' feature | |
| 124 # TODO | |
| 125 pass | |
| 126 | |
| 127 if result and seqid: | |
| 128 result.id = seqid | |
| 129 result.description = "" | |
| 130 | |
| 131 seqlenlen = len(str(len(result))) | |
| 132 if len(result.id) + 1 + seqlenlen > 28: | |
| 133 # Genbank has a max length for the id and sequence length number, truncate the sequence id ov too long | |
| 134 result.id = result.id[:27 - seqlenlen] | |
| 135 | |
| 136 result.seq.alphabet = Alphabet.generic_dna # TODO Investigate why this is required for some datasets | |
| 137 SeqIO.write(result, sys.stdout, draft_format) |
