comparison split_fasta.py @ 6:7521d865e770 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
author bgruening
date Tue, 14 Jan 2025 21:52:36 +0000
parents 733ca84b21ee
children
comparison
equal deleted inserted replaced
5:733ca84b21ee 6:7521d865e770
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse
3 import os 4 import os
4 import sys 5
5 from Bio import SeqIO 6 from Bio import SeqIO
6 7
7 num_chunks = 0 8 parser = argparse.ArgumentParser()
8 if len(sys.argv) == 3: 9 parser.add_argument("--records", type=int, default=None)
9 num_chunks = int(sys.argv[2]) 10 parser.add_argument("--limit", type=int, default=None)
10 input_filename = sys.argv[1] 11 parser.add_argument("--num-chunks", type=int, default=0)
11 elif len(sys.argv) == 2: 12 parser.add_argument("input_file")
12 input_filename = sys.argv[1] 13 args = parser.parse_args()
13 else:
14 exit("Usage: split_fasta.py <input_filename> [<num_chunks>]")
15 14
16 os.mkdir('splits') 15 input_filename = args.input_file
16 num_chunks = args.num_chunks
17 record_count = args.records
18 record_limit = args.limit
17 19
18 if num_chunks != 0: 20 os.mkdir("splits")
19 # if splitting into chunks we need to count how many records are in the 21
20 # input file 22 if record_limit and num_chunks > record_limit:
23 exit(f"ERROR: Requested number of chunks {num_chunks} exceeds limit {record_limit}")
24
25 if not record_count and (num_chunks != 0 or record_limit):
26 # if no count is provided and if splitting into chunks or a limit is set, we need to count how many records are in the input file
21 record_count = 0 27 record_count = 0
22 with open(input_filename) as input_file: 28 with open(input_filename) as input_file:
23 for line in input_file: 29 for line in input_file:
24 if line.lstrip().startswith('>'): 30 if line.lstrip().startswith(">"):
25 record_count += 1 31 record_count += 1
26 32
33 if num_chunks != 0:
27 records_per_chunk = round(float(record_count) / num_chunks) 34 records_per_chunk = round(float(record_count) / num_chunks)
35
36 if record_limit and record_count > record_limit:
37 exit(f"ERROR: Number of sequences {record_count} exceeds limit {record_limit}")
28 38
29 count = 1 39 count = 1
30 with open(input_filename) as input_file: 40 with open(input_filename) as input_file:
31 41
32 chunk_record_count = 0 # how many lines have we written to the output file 42 chunk_record_count = 0 # how many lines have we written to the output file
33 records = [] 43 records = []
34 for record in SeqIO.parse(input_file, 'fasta'): 44 for record in SeqIO.parse(input_file, "fasta"):
35 records.append(record) 45 records.append(record)
36 if num_chunks == 0 or (count < num_chunks and 46 if num_chunks == 0 or (
37 len(records) >= records_per_chunk): 47 count < num_chunks and len(records) >= records_per_chunk
48 ):
38 if num_chunks == 0: 49 if num_chunks == 0:
39 output_filename = os.path.join('splits', record.id) 50 output_filename = os.path.join("splits", record.id)
40 else: 51 else:
41 output_filename = os.path.join('splits', 'part{}'.format(count)) 52 output_filename = os.path.join("splits", "part{}".format(count))
42 SeqIO.write(records, output_filename, 'fasta') 53 SeqIO.write(records, output_filename, "fasta")
43 count += 1 54 count += 1
44 records = [] 55 records = []
45 56
46 if records: 57 if records:
47 # this only applies for the mode where input file is 58 # this only applies for the mode where input file is
48 # split into chunks 59 # split into chunks
49 output_filename = os.path.join('splits', 'part{}'.format(count)) 60 output_filename = os.path.join("splits", "part{}".format(count))
50 SeqIO.write(records, output_filename, 'fasta') 61 SeqIO.write(records, output_filename, "fasta")