annotate split_fasta.py @ 6:7521d865e770 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
author bgruening
date Tue, 14 Jan 2025 21:52:36 +0000
parents 733ca84b21ee
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
2
6
7521d865e770 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents: 5
diff changeset
3 import argparse
5
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
4 import os
6
7521d865e770 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents: 5
diff changeset
5
5
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
6 from Bio import SeqIO
733ca84b21ee "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff changeset
7
6
7521d865e770 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents: 5
diff changeset
def _parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``records`` (int or None), ``limit`` (int or None),
        ``num_chunks`` (int, 0 = one file per record) and ``input_file``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--records", type=int, default=None)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--num-chunks", type=int, default=0)
    parser.add_argument("input_file")
    return parser.parse_args(argv)


def _count_records(input_filename):
    """Return the number of lines in *input_filename* that start with '>'."""
    with open(input_filename) as input_file:
        return sum(1 for line in input_file if line.lstrip().startswith(">"))


def _safe_name(record_id):
    """Return *record_id* with path separators replaced by underscores.

    Record ids come straight from the input file; an id containing a path
    separator would otherwise point into a non-existent sub-directory of
    ``splits`` or outside of it altogether.
    """
    for separator in (os.sep, os.altsep):
        if separator:
            record_id = record_id.replace(separator, "_")
    return record_id


def main(argv=None):
    """Split a FASTA file into files below a new ``splits`` directory.

    With ``--num-chunks 0`` (the default) every record is written to its own
    file named after the record id.  Otherwise the records are distributed
    over files ``part1``, ``part2``, ... with at most ``--num-chunks`` files.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: if ``--num-chunks`` is negative, or if the requested
            number of chunks or the number of records exceeds ``--limit``.
        FileExistsError: if the ``splits`` directory already exists.
    """
    args = _parse_args(argv)

    input_filename = args.input_file
    num_chunks = args.num_chunks
    record_count = args.records
    record_limit = args.limit

    # All validation happens before any output is created, so that a rejected
    # run does not leave an empty "splits" directory behind.
    if num_chunks < 0:
        raise SystemExit(f"ERROR: Number of chunks {num_chunks} must not be negative")

    if record_limit and num_chunks > record_limit:
        raise SystemExit(
            f"ERROR: Requested number of chunks {num_chunks} exceeds limit {record_limit}"
        )

    if not record_count and (num_chunks != 0 or record_limit):
        # No count was provided, but chunking or a limit needs one:
        # count the records in the input file ourselves.
        record_count = _count_records(input_filename)

    if record_limit and record_count > record_limit:
        raise SystemExit(
            f"ERROR: Number of sequences {record_count} exceeds limit {record_limit}"
        )

    os.mkdir("splits")

    if num_chunks == 0:
        # One output file per record, named after the record id.
        with open(input_filename) as input_file:
            for record in SeqIO.parse(input_file, "fasta"):
                output_filename = os.path.join("splits", _safe_name(record.id))
                SeqIO.write([record], output_filename, "fasta")
        return

    # NOTE: because of the rounding, fewer files than requested can be written
    # (e.g. 6 records in 4 chunks gives 3 files of 2 records) and the last
    # file can be larger than the others.
    records_per_chunk = round(record_count / num_chunks)

    count = 1  # number of the chunk currently being filled
    records = []
    with open(input_filename) as input_file:
        for record in SeqIO.parse(input_file, "fasta"):
            records.append(record)
            # The last chunk is never closed here: it collects all the
            # remaining records and is written after the loop.
            if count < num_chunks and len(records) >= records_per_chunk:
                output_filename = os.path.join("splits", f"part{count}")
                SeqIO.write(records, output_filename, "fasta")
                count += 1
                records = []

    if records:
        # Whatever is left over goes into the final chunk.
        output_filename = os.path.join("splits", f"part{count}")
        SeqIO.write(records, output_filename, "fasta")


if __name__ == "__main__":
    main()