Mercurial > repos > vipints > fml_gff3togtf
view fml_gff_converter_programs/scripts/gff3_to_bed_converter.py @ 0:ed53dca1c6ff
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
author | vipints |
---|---|
date | Tue, 07 Jun 2011 17:26:20 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society # Copyright (C) 2010 Max Planck Society # # Description : Convert a genome annotation in GFF3 format to UCSC 12 column Wiggle BED format. BED format typically represents the transcript models. import re, sys def WriteBED(tinfo, einfo): for contig_id, features in tinfo.items(): for tid, tloc in features.items(): if tid in einfo: # get corresponding exon info exon_cnt, exon_len, exon_cod, fex, rstart = 0, '', '', 0, None if tloc[-1] == '-': if einfo[tid][0][1] > einfo[tid][-1][1]:einfo[tid].sort() for ex_ele in einfo[tid]: if ex_ele[0] != contig_id:continue exon_cnt += 1 exon_len += str(int(ex_ele[2])-int(ex_ele[1])+1) + ',' if fex == 0: # calculate the relative exon start exon_cod += '0,' fex = 1 rstart = int(ex_ele[1]) else: exon_cod += str(int(ex_ele[1])-rstart) + ',' if exon_len: # display bed line print contig_id + '\t' + tloc[0] + '\t' + tloc[1] + '\t' + tid + '\t' + tloc[2] + '\t' + tloc[-1] + '\t' + tloc[0] + '\t' + tloc[1] + '\t0\t' + str(exon_cnt) + '\t' + exon_len + '\t' + exon_cod def ParseAnno(gff_fh): tinfo, einfo = dict(), dict() for gff_line in gff_fh: gff_line = gff_line.strip('\n\r').split('\t') if re.match(r'#', gff_line[0]):continue if re.match(r'>', gff_line[0]):continue if len(gff_line) == 1: if re.search(r'\w+', gff_line[0]):continue## GFF files with FASTA sequence together if len(gff_line) != 9:sys.stderr.write('Warning: Found invalid GFF line\n' + '\t'.join(gff_line) + '\n');continue if gff_line[3] == '' and gff_line[4] == '':sys.stderr.write('Warning: Found invalid coordinates in GFF line: ' + '\t'.join(gff_line) + '\n');continue if gff_line[2] == 'transcript' or gff_line[2] == 'scRNA' or gff_line[2] == "mRNA" or gff_line[2] == 'ncRNA' or gff_line[2] == 'miRNA' or gff_line[2] == 'rRNA' or gff_line[2] == 'snoRNA' or gff_line[2] == 'snRNA' or gff_line[2] == 'tRNA' or gff_line[2] == 'pseudogenic_transcript': col9 = gff_line[-1].split(';') tid = None for ele in col9: if re.search(r'ID=', ele):tid = re.search(r'ID=(.+)', ele).group(1);break if gff_line[0] in tinfo: tinfo[gff_line[0]][tid] = (gff_line[3], gff_line[4], gff_line[5], gff_line[6]) else: tinfo[gff_line[0]] = {tid:(gff_line[3], gff_line[4], gff_line[5], gff_line[6])} if gff_line[2] == 'exon': col9 = gff_line[-1].split(';') pid = None for ele in col9: if re.search(r'Parent=', ele):pid = re.search(r'Parent=(.+)', ele).group(1);break if pid in einfo: einfo[pid].append((gff_line[0], int(gff_line[3]), int(gff_line[4]))) else: einfo[pid] = [(gff_line[0], int(gff_line[3]), int(gff_line[4]))] gff_fh.close() return tinfo, einfo if __name__ == "__main__": try: gff_fh = open(sys.argv[1], 'rU') except: sys.stderr.write('GFF format file fail to open, Cannot continue...\n') sys.stderr.write('USAGE: gff3_to_bed_converter.py <gff file> > *.bed\n') sys.exit(-1) ## get transcript annotation tinfo, einfo = ParseAnno(gff_fh) ## write into bed format WriteBED(tinfo, einfo)