Mercurial > repos > yating-l > jbrowsearchivecreator
diff bedToGff3.py @ 4:7e471cdd9e71 draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 8d93b27353190eb23490c9480e560d84cb60c973
author | yating-l |
---|---|
date | Fri, 07 Jul 2017 16:17:57 -0400 |
parents | 804a93e87cc8 |
children |
line wrap: on
line diff
--- a/bedToGff3.py	Wed May 31 15:45:47 2017 -0400
+++ b/bedToGff3.py	Fri Jul 07 16:17:57 2017 -0400
@@ -2,6 +2,7 @@
 
 '''
 Convert BED format to gff3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
 '''
 import os
 from collections import OrderedDict
@@ -19,6 +20,8 @@
             self.trfbig_to_gff3()
         if self.type == "regtools":
             self.splicejunctions_to_gff3()
+        if self.type == "blat":
+            self.bigpsl_to_gff3()
 
     def trfbig_to_gff3(self):
         gff3 = open(self.output, 'w')
@@ -81,12 +84,56 @@
                 field['score'] = li[12]
                 field['strand'] = li[5]
                 field['phase'] = '.'
-                attribute['ID'] = li[3]
+                attribute['ID'] = li[0] + '_' + li[3]
                 attribute['Name'] = li[3]
                 attribute['blockcount'] = li[9]
                 attribute['blocksizes'] = li[10]
                 attribute['chromstarts'] = li[11]
 
                 utils.write_features(field, attribute, gff3)
-                utils.child_blocks(field, attribute, gff3)
+                utils.child_blocks(field, attribute, gff3, 'exon_junction')
+        gff3.close()
+
+    def bigpsl_to_gff3(self):
+        gff3 = open(self.output, 'w')
+        gff3.write("##gff-version 3\n")
+        sizes_dict = utils.sequence_region(self.chrom_sizes)
+        seq_regions = dict()
+        with open(self.input, 'r') as bed:
+            for line in bed:
+                field = OrderedDict()
+                attribute = OrderedDict()
+                li = line.rstrip().split("\t")
+                field['seqid'] = li[0]
+                if field['seqid'] not in seq_regions:
+                    end_region = sizes_dict[field['seqid']]
+                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                    seq_regions[field['seqid']] = end_region
+                field['source'] = 'UCSC BLAT alignment tool'
+                field['type'] = 'match'
+                # The first base in a chromosome is numbered 0 in BED format
+                field['start'] = str(int(li[1]) + 1)
+                field['end'] = li[2]
+                field['score'] = li[4]
+                field['strand'] = li[5]
+                field['phase'] = '.'
+                attribute['ID'] = li[0] + '_' + li[3]
+                attribute['Name'] = li[3]
+                attribute['blockcount'] = li[9]
+                attribute['blocksizes'] = li[10]
+                attribute['chromstarts'] = li[11]
+                attribute['ochrom_start'] = li[12]
+                attribute['ochrom_end'] = li[13]
+                attribute['ochrom_strand'] = li[14]
+                attribute['ochrom_size'] = li[15]
+                attribute['ochrom_starts'] = li[16]
+                attribute['sequence on other chromosome'] = li[17]
+                attribute['cds in ncbi format'] = li[18]
+                attribute['size of target chromosome'] = li[19]
+                attribute['number of bases matched'] = li[20]
+                attribute['number of bases that don\'t match'] = li[21]
+                attribute['number of bases that match but are part of repeats'] = li[22]
+                attribute['number of \'N\' bases'] = li[23]
+                utils.write_features(field, attribute, gff3)
+                utils.child_blocks(field, attribute, gff3, 'match_part')
         gff3.close()
\ No newline at end of file