Mercurial > repos > rnateam > splitfasta
comparison split_fasta.py @ 6:7521d865e770 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
| author | bgruening |
|---|---|
| date | Tue, 14 Jan 2025 21:52:36 +0000 |
| parents | 733ca84b21ee |
| children |
comparison
equal
deleted
inserted
replaced
| 5:733ca84b21ee | 6:7521d865e770 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| | 3 import argparse |
| 3 import os | 4 import os |
| 4 import sys | 5 |
| 5 from Bio import SeqIO | 6 from Bio import SeqIO |
| 6 | 7 |
| 7 num_chunks = 0 | 8 parser = argparse.ArgumentParser() |
| 8 if len(sys.argv) == 3: | 9 parser.add_argument("--records", type=int, default=None) |
| 9 num_chunks = int(sys.argv[2]) | 10 parser.add_argument("--limit", type=int, default=None) |
| 10 input_filename = sys.argv[1] | 11 parser.add_argument("--num-chunks", type=int, default=0) |
| 11 elif len(sys.argv) == 2: | 12 parser.add_argument("input_file") |
| 12 input_filename = sys.argv[1] | 13 args = parser.parse_args() |
| 13 else: | |
| 14 exit("Usage: split_fasta.py <input_filename> [<num_chunks>]") | |
| 15 | 14 |
| 16 os.mkdir('splits') | 15 input_filename = args.input_file |
| | 16 num_chunks = args.num_chunks |
| | 17 record_count = args.records |
| | 18 record_limit = args.limit |
| 17 | 19 |
| 18 if num_chunks != 0: | 20 os.mkdir("splits") |
| 19 # if splitting into chunks we need to count how many records are in the | 21 |
| 20 # input file | 22 if record_limit and num_chunks > record_limit: |
| | 23 exit(f"ERROR: Requested number of chunks {num_chunks} exceeds limit {record_limit}") |
| | 24 |
| | 25 if not record_count and (num_chunks != 0 or record_limit): |
| | 26 # if no count is provided and if splitting into chunks or a limit is set, we need to count how many records are in the input file |
| 21 record_count = 0 | 27 record_count = 0 |
| 22 with open(input_filename) as input_file: | 28 with open(input_filename) as input_file: |
| 23 for line in input_file: | 29 for line in input_file: |
| 24 if line.lstrip().startswith('>'): | 30 if line.lstrip().startswith(">"): |
| 25 record_count += 1 | 31 record_count += 1 |
| 26 | 32 |
| | 33 if num_chunks != 0: |
| 27 records_per_chunk = round(float(record_count) / num_chunks) | 34 records_per_chunk = round(float(record_count) / num_chunks) |
| | 35 |
| | 36 if record_limit and record_count > record_limit: |
| | 37 exit(f"ERROR: Number of sequences {record_count} exceeds limit {record_limit}") |
| 28 | 38 |
| 29 count = 1 | 39 count = 1 |
| 30 with open(input_filename) as input_file: | 40 with open(input_filename) as input_file: |
| 31 | 41 |
| 32 chunk_record_count = 0 # how many lines have we written to the output file | 42 chunk_record_count = 0 # how many lines have we written to the output file |
| 33 records = [] | 43 records = [] |
| 34 for record in SeqIO.parse(input_file, 'fasta'): | 44 for record in SeqIO.parse(input_file, "fasta"): |
| 35 records.append(record) | 45 records.append(record) |
| 36 if num_chunks == 0 or (count < num_chunks and | 46 if num_chunks == 0 or ( |
| 37 len(records) >= records_per_chunk): | 47 count < num_chunks and len(records) >= records_per_chunk |
| | 48 ): |
| 38 if num_chunks == 0: | 49 if num_chunks == 0: |
| 39 output_filename = os.path.join('splits', record.id) | 50 output_filename = os.path.join("splits", record.id) |
| 40 else: | 51 else: |
| 41 output_filename = os.path.join('splits', 'part{}'.format(count)) | 52 output_filename = os.path.join("splits", "part{}".format(count)) |
| 42 SeqIO.write(records, output_filename, 'fasta') | 53 SeqIO.write(records, output_filename, "fasta") |
| 43 count += 1 | 54 count += 1 |
| 44 records = [] | 55 records = [] |
| 45 | 56 |
| 46 if records: | 57 if records: |
| 47 # this only applies for the mode where input file is | 58 # this only applies for the mode where input file is |
| 48 # split into chunks | 59 # split into chunks |
| 49 output_filename = os.path.join('splits', 'part{}'.format(count)) | 60 output_filename = os.path.join("splits", "part{}".format(count)) |
| 50 SeqIO.write(records, output_filename, 'fasta') | 61 SeqIO.write(records, output_filename, "fasta") |
