Mercurial > repos > lldelisle > fromgtftobed12
view fromgtfTobed12.py @ 0:418e4d0fe0bd draft
planemo upload for repository https://github.com/lldelisle/tools-lldelisle/tree/master/tools/fromgtfTobed12 commit 1aaffda5b95e0389e315179345642c0d005867c1
author | lldelisle |
---|---|
date | Fri, 04 Nov 2022 15:37:12 +0000 |
parents | |
children | 6fd4b3b90220 |
line wrap: on
line source
import argparse import sys import warnings import gffutils warnings.filterwarnings("ignore", message="It appears you have a gene feature" " in your GTF file. You may want to use the " "`disable_infer_genes` option to speed up database " "creation") warnings.filterwarnings("ignore", message="It appears you have a transcript " "feature in your GTF file. You may want to use the " "`disable_infer_transcripts` option to speed up " "database creation") # In gffutils v0.10 they changed the error message: warnings.filterwarnings("ignore", message="It appears you have a gene feature" " in your GTF file. You may want to use the " "`disable_infer_genes=True` option to speed up " "database creation") warnings.filterwarnings("ignore", message="It appears you have a transcript " "feature in your GTF file. You may want to use the " "`disable_infer_transcripts=True` option to speed up " "database creation") def convert_gtf_to_bed(fn, fo, useGene, mergeTranscripts, mergeTranscriptsAndOverlappingExons, ucsc): db = gffutils.create_db(fn, ':memory:') # For each transcript: prefered_name = "transcript_name" if useGene or mergeTranscripts or mergeTranscriptsAndOverlappingExons: prefered_name = "gene_name" if mergeTranscripts or mergeTranscriptsAndOverlappingExons: all_items = db.features_of_type("gene", order_by='start') else: all_items = db.features_of_type("transcript", order_by='start') for tr in all_items: # The name would be the name of the transcript/gene if exists try: # First try to have it directly on the feature trName = tr.attributes[prefered_name][0] except KeyError: # Else try to guess the name of the transcript/gene from exons: try: trName = set([e.attributes[prefered_name][0] for e in db.children(tr, featuretype='exon', order_by='start')]).pop() except KeyError: # Else take the transcript id trName = tr.id # If the cds is defined in the gtf, # use it to define the thick start and end # The gtf is 1-based closed intervalls and # bed are 0-based half-open so: # I need to remove one from each start try: # In case of multiple CDS (when there is one entry per gene) # I use the first one to get the start # and the last one to get the end (order_by=-start) cds_start = next(db.children(tr, featuretype='CDS', order_by='start')).start - 1 cds_end = next(db.children(tr, featuretype='CDS', order_by='-start')).end except StopIteration: # If the CDS is not defined, then it is set to the start # as proposed here: # https://genome.ucsc.edu/FAQ/FAQformat.html#format1 cds_start = tr.start - 1 cds_end = tr.start - 1 # Get all exons starts and lengths if mergeTranscriptsAndOverlappingExons: # We merge overlapping exons: exons_starts = [] exons_length = [] current_start = -1 current_end = None for e in db.children(tr, featuretype='exon', order_by='start'): if current_start == -1: current_start = e.start - 1 current_end = e.end else: if e.start > current_end: # This is a non-overlapping exon # We store the previous exon: exons_starts.append(current_start) exons_length.append(current_end - current_start) # We set the current: current_start = e.start - 1 current_end = e.end else: # This is an overlapping exon # We update current_end if necessary current_end = max(current_end, e.end) if current_start != -1: # There is a last exon to store: exons_starts.append(current_start) exons_length.append(current_end - current_start) else: exons_starts = [e.start - 1 for e in db.children(tr, featuretype='exon', order_by='start')] exons_length = [len(e) for e in db.children(tr, featuretype='exon', order_by='start')] # Rewrite the chromosome name if needed: chrom = tr.chrom if ucsc and chrom[0:3] != 'chr': chrom = 'chr' + chrom fo.write("%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\n" % (chrom, tr.start - 1, tr.end, trName, 0, tr.strand, cds_start, cds_end, "0", len(exons_starts), ",".join([str(ex_l) for ex_l in exons_length]), ",".join([str(s - (tr.start - 1)) for s in exons_starts]))) argp = argparse.ArgumentParser( description=("Convert a gtf to a bed12 with one entry" " per transcript/gene")) argp.add_argument('input', default=None, help="Input gtf file (can be gzip).") argp.add_argument('--output', default=sys.stdout, type=argparse.FileType('w'), help="Output bed12 file.") argp.add_argument('--useGene', action="store_true", help="Use the gene name instead of the " "transcript name.") argp.add_argument('--ucscformat', action="store_true", help="If you want that all chromosome names " "begin with 'chr'.") group = argp.add_mutually_exclusive_group() group.add_argument('--mergeTranscripts', action="store_true", help="Merge all transcripts into a single " "entry to have one line per gene.") group.add_argument('--mergeTranscriptsAndOverlappingExons', action="store_true", help="Merge all transcripts into a single " "entry to have one line per gene and merge" " overlapping exons.") args = argp.parse_args() convert_gtf_to_bed(args.input, args.output, args.useGene, args.mergeTranscripts, args.mergeTranscriptsAndOverlappingExons, args.ucscformat)