Mercurial > repos > iuc > concoct_merge_cut_up_clustering
comparison cut_up_fasta.py @ 0:b546422c9128 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/concoct commit 40a09cbfd6052f7b0295946621db1bdf58228b09"
| author | iuc |
|---|---|
| date | Sun, 13 Mar 2022 08:44:34 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b546422c9128 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import argparse | |
| 4 import gzip | |
| 5 from functools import partial | |
| 6 | |
| 7 from Bio import SeqIO | |
| 8 | |
| 9 | |
| 10 def cut_up_fasta(input_fasta, chunk_size, overlap, merge_last, output_fasta, output_bed, gzipped): | |
| 11 if gzipped: | |
| 12 _open = partial(gzip.open, mode='rt') | |
| 13 else: | |
| 14 _open = open | |
| 15 | |
| 16 fasta_fh = open(output_fasta, 'w') | |
| 17 | |
| 18 if output_bed is not None: | |
| 19 bed_fh = open(output_bed, 'w') | |
| 20 | |
| 21 with _open(input_fasta) as input_fh: | |
| 22 for record in SeqIO.parse(input_fh, "fasta"): | |
| 23 if (not merge_last and len(record.seq) > chunk_size) or (merge_last and len(record.seq) >= 2 * chunk_size): | |
| 24 for index, split_seq in enumerate(chunks(record.seq, chunk_size, overlap, merge_last)): | |
| 25 fasta_fh.write(f">{record.id}.concoct_part_{index}\n{split_seq}\n") | |
| 26 if output_bed is not None: | |
| 27 bed_fh.write(f"{record.id}\t{chunk_size * index}\t{chunk_size * index + len(split_seq)}\t{record.id}.concoct_part_{index}\n") | |
| 28 else: | |
| 29 fasta_fh.write(f">{record.id}.concoct_part_0\n{record.seq}\n") | |
| 30 if output_bed is not None: | |
| 31 bed_fh.write(f"{record.id}\t0\t{len(record.seq)}\t{record.id}.concoct_part_0\n") | |
| 32 if output_bed is not None: | |
| 33 bed_fh.close() | |
| 34 | |
| 35 | |
| 36 def chunks(seq, chunk_size, overlap_size, merge_last): | |
| 37 # Yield successive chunk_size-sized chunks from seq | |
| 38 # with given overlap overlap_size between the chunks. | |
| 39 assert chunk_size > overlap_size | |
| 40 if merge_last: | |
| 41 for i in range(0, len(seq) - chunk_size + 1, chunk_size - overlap_size): | |
| 42 yield seq[i:i + chunk_size] if i + chunk_size + chunk_size - overlap_size <= len(seq) else seq[i:] | |
| 43 else: | |
| 44 for i in range(0, len(seq), chunk_size - overlap_size): | |
| 45 yield seq[i:i + chunk_size] | |
| 46 | |
| 47 | |
| 48 parser = argparse.ArgumentParser() | |
| 49 parser.add_argument("--input_fasta", action="store", dest="input_fasta", help="Fasta files with contigs") | |
| 50 parser.add_argument("--gzipped", action="store_true", dest="gzipped", help="Input file is gzipped") | |
| 51 parser.add_argument("--chunk_size", action="store", dest="chunk_size", type=int, help="Chunk size\n") | |
| 52 parser.add_argument("--overlap_size", action="store", dest="overlap_size", type=int, help="Overlap size\n") | |
| 53 parser.add_argument("--merge_last", default=False, action="store_true", dest="merge_last", help="Concatenate final part to last contig\n") | |
| 54 parser.add_argument("--output_bed", action="store", dest="output_bed", default=None, help="BED file to be created with exact regions of the original contigs corresponding to the newly created contigs") | |
| 55 parser.add_argument("--output_fasta", action="store", dest="output_fasta", help="Output fasta file with cut contigs") | |
| 56 | |
| 57 args = parser.parse_args() | |
| 58 cut_up_fasta(args.input_fasta, args.chunk_size, args.overlap_size, args.merge_last, args.output_fasta, args.output_bed, args.gzipped) |
