Mercurial > repos > rnateam > splitfasta
annotate split_fasta.py @ 6:7521d865e770 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
author | bgruening |
---|---|
date | Tue, 14 Jan 2025 21:52:36 +0000 |
parents | 733ca84b21ee |
children |
rev | line source |
---|---|
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
1 #!/usr/bin/env python |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
2 |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
3 import argparse |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
4 import os |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
5 |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
6 from Bio import SeqIO |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
7 |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
8 parser = argparse.ArgumentParser() |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
9 parser.add_argument("--records", type=int, default=None) |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
10 parser.add_argument("--limit", type=int, default=None) |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
11 parser.add_argument("--num-chunks", type=int, default=0) |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
12 parser.add_argument("input_file") |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
13 args = parser.parse_args() |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
14 |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
15 input_filename = args.input_file |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
16 num_chunks = args.num_chunks |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
17 record_count = args.records |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
18 record_limit = args.limit |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
19 |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
20 os.mkdir("splits") |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
21 |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
22 if record_limit and num_chunks > record_limit: |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
23 exit(f"ERROR: Requested number of chunks {num_chunks} exceeds limit {record_limit}") |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
24 |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
25 if not record_count and (num_chunks != 0 or record_limit): |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
26 # if no count is provided and if splitting into chunks or a limit is set, we need to count how many records are in the input file |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
27 record_count = 0 |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
28 with open(input_filename) as input_file: |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
29 for line in input_file: |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
30 if line.lstrip().startswith(">"): |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
31 record_count += 1 |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
32 |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
33 if num_chunks != 0: |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
34 records_per_chunk = round(float(record_count) / num_chunks) |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
35 |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
36 if record_limit and record_count > record_limit: |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
37 exit(f"ERROR: Number of sequences {record_count} exceeds limit {record_limit}") |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
38 |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
39 count = 1 |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
40 with open(input_filename) as input_file: |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
41 |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
42 chunk_record_count = 0 # how many records have we written to the output file |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
43 records = [] |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
44 for record in SeqIO.parse(input_file, "fasta"): |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
45 records.append(record) |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
46 if num_chunks == 0 or ( |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
47 count < num_chunks and len(records) >= records_per_chunk |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
48 ): |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
49 if num_chunks == 0: |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
50 output_filename = os.path.join("splits", record.id) |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
51 else: |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
52 output_filename = os.path.join("splits", "part{}".format(count)) |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
53 SeqIO.write(records, output_filename, "fasta") |
5
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
54 count += 1 |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
55 records = [] |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
56 |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
57 if records: |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
58 # this only applies for the mode where the input file is |
733ca84b21ee
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 31945d5d8c5ebee64ebf29c6ea022fb831f47274"
rnateam
parents:
diff
changeset
|
59 # split into chunks |
6
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
60 output_filename = os.path.join("splits", "part{}".format(count)) |
7521d865e770
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/splitfasta commit 49709e680f90372edd2b8a2715d95e5949641afa
bgruening
parents:
5
diff
changeset
|
61 SeqIO.write(records, output_filename, "fasta") |