Mercurial > repos > yating-l > jbrowsearchivecreator
view bedToGff3.py @ 5:e762f4b9e4bd draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit a3a8cdf5c9b100db56b2534e8718b5fc976435ff
author | yating-l |
---|---|
date | Fri, 07 Jul 2017 17:21:23 -0400 |
parents | 7e471cdd9e71 |
children |
line wrap: on
line source
#!/usr/bin/env python ''' Convert BED format to gff3 reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md ''' import os from collections import OrderedDict import utils class bedToGff3(): def __init__(self, inputBedFile, chrom_sizes, bed_type, output): self.input = inputBedFile #file_dir = os.path.basename(inputBedFile) #print file_dir + "\n\n" self.output = output self.chrom_sizes = chrom_sizes self.type = bed_type if self.type == "trfbig": self.trfbig_to_gff3() if self.type == "regtools": self.splicejunctions_to_gff3() if self.type == "blat": self.bigpsl_to_gff3() def trfbig_to_gff3(self): gff3 = open(self.output, 'w') gff3.write("##gff-version 3\n") sizes_dict = utils.sequence_region(self.chrom_sizes) seq_regions = dict() with open(self.input, 'r') as bed: for line in bed: field = OrderedDict() attribute = OrderedDict() li = line.rstrip().split("\t") field['seqid'] = li[0] if field['seqid'] not in seq_regions: end_region = sizes_dict[field['seqid']] gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') seq_regions[field['seqid']] = end_region field['source'] = li[3] field['type'] = 'tandem_repeat' # The first base in a chromosome is numbered 0 in BED format field['start'] = str(int(li[1]) + 1) field['end'] = li[2] field['score'] = li[9] field['strand'] = '+' field['phase'] = '.' attribute['length of repeat unit'] = li[4] attribute['mean number of copies of repeat'] = li[5] attribute['length of consensus sequence'] = li[6] attribute['percentage match'] = li[7] attribute['percentage indel'] = li[8] attribute['percent of a\'s in repeat unit'] = li[10] attribute['percent of c\'s in repeat unit'] = li[11] attribute['percent of g\'s in repeat unit'] = li[12] attribute['percent of t\'s in repeat unit'] = li[13] attribute['entropy'] = li[14] attribute['sequence of repeat unit element'] = li[15] utils.write_features(field, attribute, gff3) gff3.close() def splicejunctions_to_gff3(self): gff3 = open(self.output, 'w') gff3.write("##gff-version 3\n") sizes_dict = utils.sequence_region(self.chrom_sizes) seq_regions = dict() with open(self.input, 'r') as bed: for line in bed: field = OrderedDict() attribute = OrderedDict() li = line.rstrip().split("\t") field['seqid'] = li[0] if field['seqid'] not in seq_regions: end_region = sizes_dict[field['seqid']] gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') seq_regions[field['seqid']] = end_region field['source'] = li[3] field['type'] = 'junction' # The first base in a chromosome is numbered 0 in BED format field['start'] = int(li[1]) + 1 field['end'] = li[2] field['score'] = li[12] field['strand'] = li[5] field['phase'] = '.' attribute['ID'] = li[0] + '_' + li[3] attribute['Name'] = li[3] attribute['blockcount'] = li[9] attribute['blocksizes'] = li[10] attribute['chromstarts'] = li[11] utils.write_features(field, attribute, gff3) utils.child_blocks(field, attribute, gff3, 'exon_junction') gff3.close() def bigpsl_to_gff3(self): gff3 = open(self.output, 'w') gff3.write("##gff-version 3\n") sizes_dict = utils.sequence_region(self.chrom_sizes) seq_regions = dict() with open(self.input, 'r') as bed: for line in bed: field = OrderedDict() attribute = OrderedDict() li = line.rstrip().split("\t") field['seqid'] = li[0] if field['seqid'] not in seq_regions: end_region = sizes_dict[field['seqid']] gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') seq_regions[field['seqid']] = end_region field['source'] = 'UCSC BLAT alignment tool' field['type'] = 'match' # The first base in a chromosome is numbered 0 in BED format field['start'] = str(int(li[1]) + 1) field['end'] = li[2] field['score'] = li[4] field['strand'] = li[5] field['phase'] = '.' attribute['ID'] = li[0] + '_' + li[3] attribute['Name'] = li[3] attribute['blockcount'] = li[9] attribute['blocksizes'] = li[10] attribute['chromstarts'] = li[11] attribute['ochrom_start'] = li[12] attribute['ochrom_end'] = li[13] attribute['ochrom_strand'] = li[14] attribute['ochrom_size'] = li[15] attribute['ochrom_starts'] = li[16] attribute['sequence on other chromosome'] = li[17] attribute['cds in ncbi format'] = li[18] attribute['size of target chromosome'] = li[19] attribute['number of bases matched'] = li[20] attribute['number of bases that don\'t match'] = li[21] attribute['number of bases that match but are part of repeats'] = li[22] attribute['number of \'N\' bases'] = li[23] utils.write_features(field, attribute, gff3) utils.child_blocks(field, attribute, gff3, 'match_part') gff3.close()