Mercurial > repos > cpt > cpt_gbk_to_5col
diff gbk_to_five_col.py @ 1:1bdd481d5c25 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author | cpt |
---|---|
date | Mon, 05 Jun 2023 02:42:57 +0000 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
import BIO_FIX_TOPO  # NOQA
import argparse
import logging
from Bio import SeqIO

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()


# Read in Genbank file and parse features
# Output features into Five Column format

"""
>Feature SeqID
Line 1
    Column 1: Start location (first nucleotide) of a feature
    Column 2: Stop location (last nucleotide) of a feature
    Column 3: Feature name (for example, 'CDS' or 'mRNA' or 'rRNA' or 'gene' or 'exon')
Line 2:
    Column 4: Qualifier name (for example, 'product' or 'number' or 'gene' or 'note')
    Column 5: Qualifier value

Repeat for each feature in a seq
Repeat Line 2 for each qualifier in a feature
"""


def _part_coordinates(part):
    """Return the 1-based ``(start, stop)`` pair to print for one location part.

    ``part.start`` is shifted by one to turn the 0-based start into the
    1-based "first nucleotide" column; ``part.end`` is used as-is.

    Only a negative strand swaps the pair (stop before start). A strand of
    ``None`` or ``0`` is written in forward order: comparing ``None > 0``
    raises ``TypeError`` on Python 3, and an unknown strand should not be
    reported as the minus strand.
    """
    first = int(part.start) + 1
    last = int(part.end)
    if part.strand is not None and part.strand < 0:
        return last, first
    return first, last


def gbk_to_5col(genbank):
    """Converts genbank to BankIt five column format

    Parameters:
        genbank: file handle (or path) passed straight to
            ``SeqIO.parse(genbank, "genbank")``.

    Output is written to stdout with ``print``:
        * one ``>Feature <record.id>`` header per record;
        * for every feature except ``source``, one coordinate line per
          location part -- the first part also carries the feature type,
          later parts carry only start and stop;
        * one line per qualifier value, prefixed by three tabs so that the
          qualifier name and value fall in columns 4 and 5.

    Returns None.
    """
    for record in SeqIO.parse(genbank, "genbank"):
        print(">Feature %s" % record.id)
        for feature in record.features:
            # The source feature describes the whole record and is skipped.
            if feature.type == "source":
                continue
            for index, part in enumerate(feature.location.parts):
                start, end = _part_coordinates(part)
                if index == 0:
                    # Only the first interval of a feature names its type.
                    print("%d\t%d\t%s" % (start, end, feature.type))
                else:
                    print("%d\t%d" % (start, end))
            for qualifier, values in feature.qualifiers.items():
                for value in values:
                    print("\t\t\t%s\t%s" % (qualifier, value))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert a Genbank file into five column format"
    )
    parser.add_argument("genbank", type=argparse.FileType("r"), help="Genbank file")

    # The only argument is named "genbank", matching the function parameter.
    args = vars(parser.parse_args())
    gbk_to_5col(**args)