# HG changeset patch
# User vipints
# Date 1307481980 14400
# Node ID ed53dca1c6ff9f9f208f68e510f8f18378bee935
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/README Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,60 @@
+A collection of tools for converting genome annotation between GTF (Gene Transfer Format), BED (Browser Extensible Data) and GFF (Generic Feature Format)
+
+INTRODUCTION
+
+Several genome annotation centers provide their data in GTF, BED, GFF3 etc. This package contains a few programs
+that mainly deal with converting between GTF, BED and GFF3 formats. They are extensively tested
+with files from different centers like ENSEMBL, UCSC, JGI and NCBI AceView. Please follow the
+instructions below to clone these tools into your galaxy instance.
+
+CONTENTS
+
+galaxy: Contains tool configuration files in *.xml format.
+
+ Tool configuration files. Please set the correct path to the scripts in each file.
+
+ gtf_to_gff3.xml
+
+ gff3_to_gtf.xml
+
+ bed_to_gff3.xml
+
+ gff3_to_bed.xml
+
+t: Test data set. (move to your galaxy root folder/test-data/)
+
+ You may need to move the test files into your test-data directory so that Galaxy can find them.
+ You can then run the functional tests, e.g.:
+
+ sh run_functional_tests.sh -id fml_gtf2gff3
+
+scripts: Python based and Perl based scripts.
+
+ gtf_to_gff3_converter.py: This tool converts data from GTF format to valid GFF3 format.
+
+ gff3_to_gtf_converter.pl: This tool converts data from GFF3 format to GTF format.
+
+ bed_to_gff3_converter.py: This tool converts data from a 12 column UCSC wiggle BED format to GFF3 format.
+
+ gff3_to_bed_converter.py: This tool converts gene transcript annotation from GFF3 format to UCSC wiggle 12 column BED format.
+
+REQUIREMENTS
+
+ Python and Perl. If you want to use gff3_to_gtf_converter.pl, please also install the BioPerl modules (the script loads Bio::FeatureIO).
+
+COMMENTS/QUESTIONS
+
+I can be reached at vipin.ts@tuebingen.mpg.de
+
+LICENSE
+
+Copyright (C) 2010 Friedrich Miescher Laboratory of the Max Planck Society
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+COURTESY
+
+To the Galaxy Team.
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/galaxy/bed_to_gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/galaxy/bed_to_gff3.xml Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,84 @@
+
+ converter
+ bed_to_gff3_converter.py $inf_bed > $gff_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts data from a 12 column UCSC wiggle BED format to GFF3 format.
+
+--------
+
+**Example**
+
+- The following data in UCSC Wiggle BED format::
+
+ chr1 11873 14409 uc001aaa.3 0 + 11873 11873 0 3 354,109,1189, 0,739,1347,
+
+- Will be converted to GFF3 format::
+
+ ##gff-version 3
+ chr1 bed2gff gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
+ chr1 bed2gff transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
+ chr1 bed2gff exon 11874 12227 0 + . Parent=uc001aaa.3
+ chr1 bed2gff exon 12613 12721 0 + . Parent=uc001aaa.3
+ chr1 bed2gff exon 13221 14409 0 + . Parent=uc001aaa.3
+
+--------
+
+**About formats**
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+ 1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+ 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+ 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+ 4. name - The name of the BED line.
+ 5. score - A score between 0 and 1000.
+ 6. strand - Defines the strand - either '+' or '-'.
+ 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+ 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+ 9. reserved - This should always be set to zero.
+ 10. blockCount - The number of blocks (exons) in the BED line.
+ 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+ 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:
+
+ 1. seqid - Must be a chromosome or scaffold or contig.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, phase should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+This tool is a part of the **MLB Group at Friedrich Miescher Laboratory of the Max Planck Society**. Copyright (C) 2010 Vipin T. Sreedharan (vipin.ts@tuebingen.mpg.de)
+
+
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/galaxy/gff3_to_bed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/galaxy/gff3_to_bed.xml Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,84 @@
+
+ converter
+ gff3_to_bed_converter.py $inf_gff > $bed_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts gene transcript annotation from GFF3 format to UCSC wiggle 12 column BED format.
+
+--------
+
+**Example**
+
+- The following data in GFF3 format::
+
+ ##gff-version 3
+ chr1 protein_coding gene 11874 14409 0 + . ID=Gene:uc001aaa.3;Name=Gene:uc001aaa.3
+ chr1 protein_coding transcript 11874 14409 0 + . ID=uc001aaa.3;Name=uc001aaa.3;Parent=Gene:uc001aaa.3
+ chr1 protein_coding exon 11874 12227 0 + . Parent=uc001aaa.3
+ chr1 protein_coding exon 12613 12721 0 + . Parent=uc001aaa.3
+ chr1 protein_coding exon 13221 14409 0 + . Parent=uc001aaa.3
+
+- Will be converted to UCSC Wiggle BED format::
+
+ chr1 11874 14409 uc001aaa.3 0 + 11874 14409 0 3 354,109,1189, 0,739,1347,
+
+--------
+
+**About formats**
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:
+
+ 1. seqid - Must be a chromosome or scaffold or contig.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, phase should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+**BED format** Browser Extensible Data format was designed at UCSC for displaying data tracks in the Genome Browser. It has three required fields and several additional optional ones:
+
+The first three BED fields (required) are::
+
+ 1. chrom - The name of the chromosome (e.g. chr1, chrY_random).
+ 2. chromStart - The starting position in the chromosome. (The first base in a chromosome is numbered 0.)
+ 3. chromEnd - The ending position in the chromosome, plus 1 (i.e., a half-open interval).
+
+The additional BED fields (optional) are::
+
+ 4. name - The name of the BED line.
+ 5. score - A score between 0 and 1000.
+ 6. strand - Defines the strand - either '+' or '-'.
+ 7. thickStart - The starting position where the feature is drawn thickly at the Genome Browser.
+ 8. thickEnd - The ending position where the feature is drawn thickly at the Genome Browser.
+ 9. reserved - This should always be set to zero.
+ 10. blockCount - The number of blocks (exons) in the BED line.
+ 11. blockSizes - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
+ 12. blockStarts - A comma-separated list of block starts. All of the blockStart positions should be calculated relative to chromStart. The number of items in this list should correspond to blockCount.
+
+--------
+
+This tool is a part of the **MLB Group at Friedrich Miescher Laboratory of the Max Planck Society**. Copyright (C) 2010 Vipin T. Sreedharan (vipin.ts@tuebingen.mpg.de)
+
+
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/galaxy/gff3_to_gtf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/galaxy/gff3_to_gtf.xml Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,83 @@
+
+ converter
+ gff3_to_gtf_converter.pl $inf_gff3 $gtf_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts data from GFF3 format to GTF format.
+
+--------
+
+**Example**
+
+- The following data in GFF3 format::
+
+ ##gff-version 3
+ 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11
+ 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+ 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
+ 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751
+ 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+ 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751
+ 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751
+
+- Will be converted to GTF format::
+
+ 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
+ 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+
+--------
+
+**About formats**
+
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:
+
+ 1. seqid - Must be a chromosome or scaffold.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, phase should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+
+**GTF format** Gene Transfer Format borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields:
+
+ 1. seqname - The name of the sequence.
+ 2. source - Indicates where the annotation came from.
+ 3. feature - The name of the feature type. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. end - The ending position of the feature (inclusive).
+ 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
+ 7. strand - Valid entries include '+', '-', or '.'
+ 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
+ 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
+
+--------
+
+This tool is a part of the **MLB Group at Friedrich Miescher Laboratory of the Max Planck Society**. Copyright (C) 2010 Vipin T. Sreedharan (vipin.ts@tuebingen.mpg.de)
+
+
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/galaxy/gtf_to_gff3.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/galaxy/gtf_to_gff3.xml Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,89 @@
+
+ converter
+ gtf_to_gff3_converter.py $inf_gtf > $gff3_format
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool converts data from GTF format to valid GFF3 format.
+
+--------
+
+**Example**
+
+- The following data in GTF format::
+
+ 17 protein_coding exon 7255208 7258258 . + . gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding CDS 7256262 7256957 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001"; protein_id "ENSP00000328352";
+ 17 protein_coding start_codon 7256262 7256264 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+ 17 protein_coding stop_codon 7256958 7256960 . + 0 gene_id "ENSG00000213859"; transcript_id "ENST00000333751"; exon_number "1"; gene_name "KCTD11"; transcript_name "KCTD11-001";
+
+- Will be converted to GFF3 format::
+
+ ##gff-version 3
+ 17 protein_coding gene 7255208 7258258 . + . ID=ENSG00000213859;Name=KCTD11
+ 17 protein_coding mRNA 7255208 7258258 . + . ID=ENST00000333751;Name=KCTD11-001;Parent=ENSG00000213859
+ 17 protein_coding protein 7256262 7256960 . + . ID=ENSP00000328352;Name=KCTD11-001;Parent=ENST00000333751
+ 17 protein_coding five_prime_UTR 7255208 7256261 . + . Parent=ENST00000333751
+ 17 protein_coding CDS 7256262 7256960 . + 0 Name=CDS:KCTD11;Parent=ENST00000333751,ENSP00000328352
+ 17 protein_coding three_prime_UTR 7256961 7258258 . + . Parent=ENST00000333751
+ 17 protein_coding exon 7255208 7258258 . + . Parent=ENST00000333751
+
+--------
+
+**About formats**
+
+**GTF format** Gene Transfer Format borrows from GFF, but has additional structure that warrants a separate definition and format name. GTF lines have nine tab-separated fields:
+
+ 1. seqname - The name of the sequence.
+ 2. source - Indicates where the annotation came from.
+ 3. feature - The name of the feature type. The following feature types are required: 'CDS', 'start_codon' and 'stop_codon'
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. end - The ending position of the feature (inclusive).
+ 6. score - The score field indicates a degree of confidence in the feature's existence and coordinates.
+ 7. strand - Valid entries include '+', '-', or '.'
+ 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that represents the reading frame of the first base.
+ 9. attributes - These attributes are designed for handling multiple transcripts from the same genomic region.
+
+**GFF3 format** General Feature Format is a format for describing genes and other features associated with DNA, RNA and Protein sequences. GFF3 lines have nine tab-separated fields:
+
+ 1. seqid - Must be a chromosome or scaffold.
+ 2. source - The program that generated this feature.
+ 3. type - The name of this type of feature. Some examples of standard feature types are "gene", "CDS", "protein", "mRNA", and "exon".
+ 4. start - The starting position of the feature in the sequence. The first base is numbered 1.
+ 5. stop - The ending position of the feature (inclusive).
+ 6. score - A score between 0 and 1000. If there is no score value, enter ".".
+ 7. strand - Valid entries include '+', '-', or '.' (for don't know/care).
+ 8. phase - If the feature is a coding exon, phase should be a number between 0-2 that represents the reading frame of the first base. If the feature is not a coding exon, the value should be '.'.
+ 9. attributes - All lines with the same group are linked together into a single item.
+
+--------
+
+This tool is a part of the **MLB Group at Friedrich Miescher Laboratory of the Max Planck Society**. Copyright (C) 2010 Vipin T. Sreedharan (vipin.ts@tuebingen.mpg.de)
+
+
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/scripts/bed_to_gff3_converter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/scripts/bed_to_gff3_converter.py Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
+# Copyright (C) 2010 Max Planck Society
+#
+# Description : Convert a BED format file to GFF3 format
+
+import re, sys
+
+def __main__(): # read the BED file named in sys.argv[1] and print GFF3 gene/transcript/exon lines to stdout
+
+ try:
+ bed_fh = open(sys.argv[1], 'rU')
+ except: # bare except: also taken when sys.argv[1] is missing, not only when open() fails
+ sys.stderr.write('BED format file fail to open, Cannot continue...\n')
+ sys.stderr.write('USAGE: bed_to_gff3_converter.py > *.gff3\n')
+ sys.exit(-1)
+ print '##gff-version 3' # GFF3 version header, printed once before any feature line
+ for line in bed_fh:
+ line = line.strip( '\n\r' ).split( '\t' ) # drop the line ending, then split into tab-separated columns
+ if re.match('#', line[0]):continue # skip lines whose first column starts with '#'
+ if len(line) != 12: # only BED lines with exactly 12 fields are converted
+ line = '\t'.join(line)
+ sys.stdout.write('Warning: Invalid BED line found- ' + line + '\n') # NOTE(review): the warning goes to stdout, the same stream as the GFF3 output
+ continue
+ if len(line[-1].split(',')) != len(line[-2].split(',')):continue # skip when the last two columns (block starts, block sizes) hold a different number of comma-separated items
+ rstart = line[-1].split(',')
+ if rstart[-1] == '': rstart.pop() # drop the empty item left by a trailing comma
+ exon_len = line[-2].split(',')
+ if exon_len[-1] == '': exon_len.pop() # drop the empty item left by a trailing comma
+ if len(rstart) != int(line[-3]): continue # skip when the number of block starts differs from the block count column
+ if line[5] != '+' and line[5] != '-':line[5] = '.' # replace an unknown strand with '.'
+ # write the gene, transcript and exon feature lines to stdout
+ print line[0] + '\tbed2gff\tgene\t' + str(int(line[1]) + 1) + '\t' + line[2] + '\t' + line[4] + '\t' + line[5] + '\t.\t' + 'ID=Gene:' + line[3] + ';Name=Gene:' + line[3] # start is column 2 plus 1; the end column is copied unchanged
+ print line[0] + '\tbed2gff\ttranscript\t' + str(int(line[1]) + 1) + '\t' + line[2] + '\t' + line[4] + '\t' + line[5] + '\t.\t' + 'ID=' + line[3] + ';Name=' + line[3] + ';Parent=Gene:' + line[3] # same coordinates as the gene line, with the gene as Parent
+ st = int(line[1])
+ for ex_cnt in range(int(line[-3])): # one exon line per block
+ start = st + int(rstart[ex_cnt]) + 1 # block start is relative to column 2; plus 1 as for the gene start
+ stop = start + int(exon_len[ex_cnt]) - 1 # inclusive end of the block
+ print line[0] + '\tbed2gff\texon\t' + str(start) + '\t' + str(stop) + '\t' + line[4] + '\t' + line[5] + '\t.\t' + 'Parent=' + line[3]
+ bed_fh.close()
+
+if __name__ == "__main__": __main__() # run the conversion only when the file is executed as a script
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/scripts/gff3_to_bed_converter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/scripts/gff3_to_bed_converter.py Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
+# Copyright (C) 2010 Max Planck Society
+#
+# Description : Convert a genome annotation in GFF3 format to UCSC 12 column Wiggle BED format. BED format typically represents the transcript models.
+
+import re, sys
+
+def WriteBED(tinfo, einfo): # print one 12-column BED line for every transcript in tinfo that has exons recorded in einfo
+
+ for contig_id, features in tinfo.items():
+ for tid, tloc in features.items():
+ if tid in einfo: # only transcripts with exon entries are considered
+ exon_cnt, exon_len, exon_cod, fex, rstart = 0, '', '', 0, None # exon count, comma-joined lengths, comma-joined relative starts, first-exon flag, start of the first exon
+ if tloc[-1] == '-':
+ if einfo[tid][0][1] > einfo[tid][-1][1]:einfo[tid].sort() # last tloc item is '-': sort the exon list in place when its first entry starts after its last entry
+ for ex_ele in einfo[tid]:
+ if ex_ele[0] != contig_id:continue # ignore exons recorded on a different contig
+ exon_cnt += 1
+ exon_len += str(int(ex_ele[2])-int(ex_ele[1])+1) + ',' # inclusive length: end - start + 1
+ if fex == 0: # first exon handled: its relative start is 0 and its start becomes the reference
+ exon_cod += '0,'
+ fex = 1
+ rstart = int(ex_ele[1])
+ else:
+ exon_cod += str(int(ex_ele[1])-rstart) + ',' # start relative to the first exon handled
+ if exon_len: # print a BED line only when at least one exon was collected
+ print contig_id + '\t' + tloc[0] + '\t' + tloc[1] + '\t' + tid + '\t' + tloc[2] + '\t' + tloc[-1] + '\t' + tloc[0] + '\t' + tloc[1] + '\t0\t' + str(exon_cnt) + '\t' + exon_len + '\t' + exon_cod # NOTE(review): tloc[0] is printed unchanged as columns 2 and 7 - confirm the intended coordinate base
+
+def ParseAnno(gff_fh): # parse the lines of gff_fh and return (tinfo, einfo): transcript tuples per contig and exon tuples per parent ID
+
+ tinfo, einfo = dict(), dict()
+ for gff_line in gff_fh:
+ gff_line = gff_line.strip('\n\r').split('\t') # drop the line ending, then split into tab-separated columns
+ if re.match(r'#', gff_line[0]):continue # skip lines whose first column starts with '#'
+ if re.match(r'>', gff_line[0]):continue # skip lines whose first column starts with '>'
+ if len(gff_line) == 1:
+ if re.search(r'\w+', gff_line[0]):continue## single-column line containing word characters is skipped (GFF files with FASTA sequence together)
+ if len(gff_line) != 9:sys.stderr.write('Warning: Found invalid GFF line\n' + '\t'.join(gff_line) + '\n');continue # warn on stderr and skip lines without exactly 9 columns
+ if gff_line[3] == '' and gff_line[4] == '':sys.stderr.write('Warning: Found invalid coordinates in GFF line: ' + '\t'.join(gff_line) + '\n');continue # skipped only when both start and end are empty
+ if gff_line[2] == 'transcript' or gff_line[2] == 'scRNA' or gff_line[2] == "mRNA" or gff_line[2] == 'ncRNA' or gff_line[2] == 'miRNA' or gff_line[2] == 'rRNA' or gff_line[2] == 'snoRNA' or gff_line[2] == 'snRNA' or gff_line[2] == 'tRNA' or gff_line[2] == 'pseudogenic_transcript': # feature types stored as transcripts
+ col9 = gff_line[-1].split(';')
+ tid = None # stays None when no attribute contains 'ID='
+ for ele in col9:
+ if re.search(r'ID=', ele):tid = re.search(r'ID=(.+)', ele).group(1);break # first attribute containing 'ID=': keep everything after it
+ if gff_line[0] in tinfo:
+ tinfo[gff_line[0]][tid] = (gff_line[3], gff_line[4], gff_line[5], gff_line[6]) # columns 4-7 of the line, kept as strings
+ else:
+ tinfo[gff_line[0]] = {tid:(gff_line[3], gff_line[4], gff_line[5], gff_line[6])}
+ if gff_line[2] == 'exon':
+ col9 = gff_line[-1].split(';')
+ pid = None # stays None when no attribute contains 'Parent='
+ for ele in col9:
+ if re.search(r'Parent=', ele):pid = re.search(r'Parent=(.+)', ele).group(1);break # first attribute containing 'Parent='; a comma-separated value is kept as one string
+ if pid in einfo:
+ einfo[pid].append((gff_line[0], int(gff_line[3]), int(gff_line[4]))) # (column 1, int(column 4), int(column 5))
+ else:
+ einfo[pid] = [(gff_line[0], int(gff_line[3]), int(gff_line[4]))]
+ gff_fh.close() # the handle passed in by the caller is closed here
+ return tinfo, einfo
+
+if __name__ == "__main__": # script entry: sys.argv[1] names the GFF file to open
+
+ try:
+ gff_fh = open(sys.argv[1], 'rU')
+ except: # bare except: also taken when sys.argv[1] is missing, not only when open() fails
+ sys.stderr.write('GFF format file fail to open, Cannot continue...\n')
+ sys.stderr.write('USAGE: gff3_to_bed_converter.py > *.bed\n')
+ sys.exit(-1)
+ ## parse transcript and exon annotation from the opened file
+ tinfo, einfo = ParseAnno(gff_fh)
+ ## print the collected transcripts in BED format
+ WriteBED(tinfo, einfo)
diff -r 000000000000 -r ed53dca1c6ff fml_gff_converter_programs/scripts/gff3_to_gtf_converter.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/fml_gff_converter_programs/scripts/gff3_to_gtf_converter.pl Tue Jun 07 17:26:20 2011 -0400
@@ -0,0 +1,83 @@
+#!/usr/bin/env perl
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society
+# Copyright (C) 2010 Max Planck Society
+#
+# Description : Convert a GFF3 format file to GTF format.
+
+use strict;
+use warnings;
+
+use lib '/home/galaxy/perl5/share/perl/5.8.8/';
+use Bio::FeatureIO;
+
+my $usage = q(
+gff3_to_gtf_converter.pl - Program to convert a valid GFF3 format file to GTF format.
+USAGE: gff3_to_gtf_converter.pl