Mercurial > repos > iuc > snpeff
diff gbk2fa.py @ 10:5b4ac70948d2 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/snpeff commit eea43430ff90fe6b13b295f6d5efb2208401a7ef
author | iuc |
---|---|
date | Tue, 27 Mar 2018 09:44:18 -0400 |
parents | |
children | 479c4f2f4826 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk2fa.py Tue Mar 27 09:44:18 2018 -0400 @@ -0,0 +1,43 @@ +import argparse +import bz2 +import contextlib +import gzip +import sys + +import magic +from Bio import SeqIO + +parser = argparse.ArgumentParser() +parser.add_argument("genbank_file", help="GenBank input file. Can be compressed with gzip or bzip2") +parser.add_argument("fasta_file", help="FASTA output datset") +parser.add_argument("--remove_version", dest="remove_version", action="store_true", help="Remove version number from NCBI form formatted accession numbers. For example, this would convert 'B000657.2' to 'B000657'") +args = parser.parse_args() + +gbk_filename = args.genbank_file +fa_filename = args.fasta_file + + +@contextlib.contextmanager +def get_file_handle(gbk_filename): + f_type = magic.from_file(args.genbank_file, mime=True) + if f_type == 'text/plain': + input_handle = open(gbk_filename, "r") + elif f_type == 'application/gzip': + input_handle = gzip.open(gbk_filename, "rt") + elif f_type == 'application/x-bzip2': + input_handle = bz2.open(gbk_filename, "rt") + else: + sys.exit("Cannot process file of type {}. Only plain, gzip'ed, and bzip2'ed genbank files are accepted ".format(f_type)) + yield input_handle + input_handle.close() + + +with get_file_handle(gbk_filename) as input_handle, open(fa_filename, "w") as output_handle: + + for seq_record in SeqIO.parse(input_handle, "genbank"): + if args.remove_version: + seq_id = seq_record.id.split('.')[0] + else: + seq_id = seq_record.id + print('Writing FASTA record: {}'.format( seq_id )) + output_handle.write(">{}\n{}\n".format(seq_id, seq_record.seq))