diff bedToGff3.py @ 0:804a93e87cc8 draft

planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f22711ea7a464bdaf4d5aaea07f2eacf967aa66e-dirty
author yating-l
date Wed, 12 Apr 2017 17:41:55 -0400
parents
children 7e471cdd9e71
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bedToGff3.py	Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+'''
+Convert BED format to gff3
+'''
+import os
+from collections import OrderedDict
+import utils
+
+class bedToGff3():
+    '''
+    Convert a tab-separated BED file to GFF3; the work is done on construction.
+
+    inputBedFile -- path of the BED file to read
+    chrom_sizes  -- passed unchanged to utils.sequence_region(); the value it
+                    returns is indexed by sequence id to get each region end
+    bed_type     -- "trfbig" writes tandem_repeat features, "regtools" writes
+                    junction features; any other value converts nothing
+    output       -- path of the GFF3 file to write (opened with mode 'w')
+    '''
+    def __init__(self, inputBedFile, chrom_sizes, bed_type, output):
+        self.input = inputBedFile
+        self.output = output
+        self.chrom_sizes = chrom_sizes
+        self.type = bed_type
+        if self.type == "trfbig":
+            self.trfbig_to_gff3()
+        elif self.type == "regtools":
+            self.splicejunctions_to_gff3()
+
+    def trfbig_to_gff3(self):
+        '''Write each row of self.input to self.output as a tandem_repeat.'''
+        self._convert(self._trfbig_feature, False)
+
+    def splicejunctions_to_gff3(self):
+        '''
+        Write each row of self.input to self.output as a junction, followed
+        by whatever utils.child_blocks() emits for that row.
+        '''
+        self._convert(self._junction_feature, True)
+
+    def _convert(self, fill_feature, with_children):
+        '''
+        Write the GFF3 header, then one feature per non-blank BED line.
+
+        fill_feature(li, field, attribute) adds the type-specific columns to
+        the two ordered dicts; with_children also calls utils.child_blocks().
+        Both files are opened with "with" so they are closed even when a short
+        row or a sequence id missing from the sizes raises part-way through.
+        '''
+        with open(self.output, 'w') as gff3:
+            gff3.write("##gff-version 3\n")
+            sizes_dict = utils.sequence_region(self.chrom_sizes)
+            seq_regions = set()
+            with open(self.input, 'r') as bed:
+                for line in bed:
+                    row = line.rstrip()
+                    if not row:
+                        # A blank line has no columns to convert.
+                        continue
+                    li = row.split("\t")
+                    field = OrderedDict()
+                    attribute = OrderedDict()
+                    field['seqid'] = li[0]
+                    if field['seqid'] not in seq_regions:
+                        # Declare each sequence once, before its first feature.
+                        end_region = sizes_dict[field['seqid']]
+                        gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                        seq_regions.add(field['seqid'])
+                    # seqid must already be the first key: the dicts are ordered,
+                    # presumably because utils.write_features emits them in order.
+                    fill_feature(li, field, attribute)
+                    utils.write_features(field, attribute, gff3)
+                    if with_children:
+                        utils.child_blocks(field, attribute, gff3)
+
+    def _trfbig_feature(self, li, field, attribute):
+        '''Fill field/attribute from one split TRF row (li[1] to li[15] are read).'''
+        field['source'] = li[3]
+        field['type'] = 'tandem_repeat'
+        # The first base in a chromosome is numbered 0 in BED format
+        field['start'] = str(int(li[1]) + 1)
+        field['end'] = li[2]
+        field['score'] = li[9]
+        field['strand'] = '+'
+        field['phase'] = '.'
+        attribute['length of repeat unit'] = li[4]
+        attribute['mean number of copies of repeat'] = li[5]
+        attribute['length of consensus sequence'] = li[6]
+        attribute['percentage match'] = li[7]
+        attribute['percentage indel'] = li[8]
+        attribute['percent of a\'s in repeat unit'] = li[10]
+        attribute['percent of c\'s in repeat unit'] = li[11]
+        attribute['percent of g\'s in repeat unit'] = li[12]
+        attribute['percent of t\'s in repeat unit'] = li[13]
+        attribute['entropy'] = li[14]
+        attribute['sequence of repeat unit element'] = li[15]
+
+    def _junction_feature(self, li, field, attribute):
+        '''Fill field/attribute from one split junction row (li[1] to li[12] are read).'''
+        field['source'] = li[3]
+        field['type'] = 'junction'
+        # The first base in a chromosome is numbered 0 in BED format
+        # Left as an int, unlike the TRF start: this dict also goes to
+        # utils.child_blocks() -- NOTE(review): confirm before normalising.
+        field['start'] = int(li[1]) + 1
+        field['end'] = li[2]
+        field['score'] = li[12]
+        field['strand'] = li[5]
+        field['phase'] = '.'
+        attribute['ID'] = li[3]
+        attribute['Name'] = li[3]
+        attribute['blockcount'] = li[9]
+        attribute['blocksizes'] = li[10]
+        attribute['chromstarts'] = li[11]
+        
\ No newline at end of file