Mercurial > repos > rnateam > splitfasta
comparison split_fasta.py @ 6:7521d865e770 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
author | bgruening |
---|---|
date | Tue, 14 Jan 2025 21:52:36 +0000 |
parents | 733ca84b21ee |
children |
comparison
equal
deleted
inserted
replaced
5:733ca84b21ee | 6:7521d865e770 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
| | 3 import argparse |
3 import os | 4 import os |
4 import sys | 5 |
5 from Bio import SeqIO | 6 from Bio import SeqIO |
6 | 7 |
7 num_chunks = 0 | 8 parser = argparse.ArgumentParser() |
8 if len(sys.argv) == 3: | 9 parser.add_argument("--records", type=int, default=None) |
9 num_chunks = int(sys.argv[2]) | 10 parser.add_argument("--limit", type=int, default=None) |
10 input_filename = sys.argv[1] | 11 parser.add_argument("--num-chunks", type=int, default=0) |
11 elif len(sys.argv) == 2: | 12 parser.add_argument("input_file") |
12 input_filename = sys.argv[1] | 13 args = parser.parse_args() |
13 else: | |
14 exit("Usage: split_fasta.py <input_filename> [<num_chunks>]") | |
15 | 14 |
16 os.mkdir('splits') | 15 input_filename = args.input_file |
| | 16 num_chunks = args.num_chunks |
| | 17 record_count = args.records |
| | 18 record_limit = args.limit |
17 | 19 |
18 if num_chunks != 0: | 20 os.mkdir("splits") |
19 # if splitting into chunks we need to count how many records are in the | 21 |
20 # input file | 22 if record_limit and num_chunks > record_limit: |
| | 23 exit(f"ERROR: Requested number of chunks {num_chunks} exceeds limit {record_limit}") |
| | 24 |
| | 25 if not record_count and (num_chunks != 0 or record_limit): |
| | 26 # if no count is provided and if splitting into chunks or a limit is set, we need to count how many records are in the input file |
21 record_count = 0 | 27 record_count = 0 |
22 with open(input_filename) as input_file: | 28 with open(input_filename) as input_file: |
23 for line in input_file: | 29 for line in input_file: |
24 if line.lstrip().startswith('>'): | 30 if line.lstrip().startswith(">"): |
25 record_count += 1 | 31 record_count += 1 |
26 | 32 |
| | 33 if num_chunks != 0: |
27 records_per_chunk = round(float(record_count) / num_chunks) | 34 records_per_chunk = round(float(record_count) / num_chunks) |
| | 35 |
| | 36 if record_limit and record_count > record_limit: |
| | 37 exit(f"ERROR: Number of sequences {record_count} exceeds limit {record_limit}") |
28 | 38 |
29 count = 1 | 39 count = 1 |
30 with open(input_filename) as input_file: | 40 with open(input_filename) as input_file: |
31 | 41 |
32 chunk_record_count = 0 # how many lines have we written to the output file | 42 chunk_record_count = 0 # how many lines have we written to the output file |
33 records = [] | 43 records = [] |
34 for record in SeqIO.parse(input_file, 'fasta'): | 44 for record in SeqIO.parse(input_file, "fasta"): |
35 records.append(record) | 45 records.append(record) |
36 if num_chunks == 0 or (count < num_chunks and | 46 if num_chunks == 0 or ( |
37 len(records) >= records_per_chunk): | 47 count < num_chunks and len(records) >= records_per_chunk |
| | 48 ): |
38 if num_chunks == 0: | 49 if num_chunks == 0: |
39 output_filename = os.path.join('splits', record.id) | 50 output_filename = os.path.join("splits", record.id) |
40 else: | 51 else: |
41 output_filename = os.path.join('splits', 'part{}'.format(count)) | 52 output_filename = os.path.join("splits", "part{}".format(count)) |
42 SeqIO.write(records, output_filename, 'fasta') | 53 SeqIO.write(records, output_filename, "fasta") |
43 count += 1 | 54 count += 1 |
44 records = [] | 55 records = [] |
45 | 56 |
46 if records: | 57 if records: |
47 # this only applies for the mode where input file is | 58 # this only applies for the mode where input file is |
48 # split into chunks | 59 # split into chunks |
49 output_filename = os.path.join('splits', 'part{}'.format(count)) | 60 output_filename = os.path.join("splits", "part{}".format(count)) |
50 SeqIO.write(records, output_filename, 'fasta') | 61 SeqIO.write(records, output_filename, "fasta") |