comparison gbk2fa.py @ 25:5c7b70713fb5 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit a4c17efb8ec4b3a315766f5b5602effa275fade3
author iuc
date Wed, 03 Aug 2022 16:33:45 +0000
parents cfcf33df7fc0
children ca2b512e8d7c
comparison
equal deleted inserted replaced
24:cfcf33df7fc0 25:5c7b70713fb5
1 import argparse 1 import argparse
2 import bz2 2 import bz2
3 import contextlib
4 import gzip 3 import gzip
5 import sys
6 4
7 import magic
8 from Bio import SeqIO 5 from Bio import SeqIO
9 6
7
def get_opener(gbk_filename):
    """Pick the ``open`` function matching the file's compression.

    The file is probed by attempting to read one decompressed byte, first
    as bzip2 and then as gzip. The first opener that reads without raising
    ``OSError`` is returned; if both fail, the builtin ``open`` is returned.

    Args:
        gbk_filename: Path of the (possibly compressed) GenBank file.

    Returns:
        ``bz2.open``, ``gzip.open`` or the builtin ``open``.
    """
    for opener in (bz2.open, gzip.open):
        try:
            # Probe inside ``with`` so the handle is closed deterministically,
            # whether the read succeeds or raises.
            with opener(gbk_filename) as probe:
                probe.read(1)
        except OSError:
            # Not this format (or the file could not be opened): try the next
            # candidate. Falling back to plain ``open`` lets the caller's own
            # open() report a missing or unreadable file.
            continue
        return opener
    return open
19
20
10 parser = argparse.ArgumentParser() 21 parser = argparse.ArgumentParser()
11 parser.add_argument("genbank_file", help="GenBank input file. Can be compressed with gzip or bzip2") 22 parser.add_argument(
12 parser.add_argument("fasta_file", help="FASTA output datset") 23 "genbank_file",
13 parser.add_argument("--remove_version", dest="remove_version", action="store_true", help="Remove version number from NCBI form formatted accession numbers. For example, this would convert 'B000657.2' to 'B000657'") 24 help="GenBank input file. Can be compressed with gzip or bzip2"
25 )
26 parser.add_argument(
27 "fasta_file", help="FASTA output datset"
28 )
29 parser.add_argument(
30 "--remove_version", action="store_true",
31 help="Remove version number from NCBI form formatted accession numbers. "
32 "For example, this would convert 'B000657.2' to 'B000657'"
33 )
14 args = parser.parse_args() 34 args = parser.parse_args()
15 35
16 gbk_filename = args.genbank_file
17 fa_filename = args.fasta_file
18 36
19 37 gbk_open = get_opener(args.genbank_file)
20 @contextlib.contextmanager 38 with gbk_open(args.genbank_file, 'rt') as input_handle, \
21 def get_file_handle(gbk_filename): 39 open(args.fasta_file, 'w') as output_handle:
22 f_type = magic.from_file(args.genbank_file, mime=True) 40 for seq_record in SeqIO.parse(input_handle, 'genbank'):
23 if f_type == 'text/plain':
24 input_handle = open(gbk_filename, "r")
25 elif f_type == 'application/gzip' or f_type == 'application/x-gzip':
26 input_handle = gzip.open(gbk_filename, "rt")
27 elif f_type == 'application/x-bzip2':
28 input_handle = bz2.open(gbk_filename, "rt")
29 else:
30 sys.exit("Cannot process file of type {}. Only plain, gzip'ed, and bzip2'ed genbank files are accepted ".format(f_type))
31 yield input_handle
32 input_handle.close()
33
34
35 with get_file_handle(gbk_filename) as input_handle, open(fa_filename, "w") as output_handle:
36
37 for seq_record in SeqIO.parse(input_handle, "genbank"):
38 if args.remove_version: 41 if args.remove_version:
39 seq_id = seq_record.id.split('.')[0] 42 seq_id = seq_record.id.split('.')[0]
40 else: 43 else:
41 seq_id = seq_record.id 44 seq_id = seq_record.id
42 print('Writing FASTA record: {}'.format(seq_id)) 45 print('Writing FASTA record: {}'.format(seq_id))
43 output_handle.write(">{}\n{}\n".format(seq_id, seq_record.seq)) 46 print('>' + seq_id, file=output_handle)
47 print(seq_record.seq, file=output_handle)