Mercurial > repos > rmarenco > hubarchivecreator
view Gtf.py @ 20:40469b265ddb draft
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 3af31e043f5b82636015c18e013d2f22ce6c9077-dirty
author | yating-l |
---|---|
date | Fri, 20 Jan 2017 17:12:03 -0500 |
parents | 0152500d9acd |
children | 2677f1899aa8 |
line wrap: on
line source
#!/usr/bin/python import os import tempfile # Internal dependencies from Datatype import Datatype from util import subtools class InfoModifiedGtf(): def __init__(self, is_modified=False, array_modified_lines=[]): self.is_modified = is_modified self.array_modified_lines = array_modified_lines def get_str_modified_lines(self): return ','.join(map(str, self.array_modified_lines)) class Gtf( Datatype ): def __init__( self, input_gtf_false_path, data_gtf): super(Gtf, self).__init__() self.track = None self.input_gtf_false_path = input_gtf_false_path self.name_gtf = data_gtf["name"] self.priority = data_gtf["order_index"] self.track_color = data_gtf["track_color"] # TODO: Think about how to avoid repetition of the group_name everywhere self.group_name = data_gtf["group_name"] #print "Creating TrackHub GTF from (falsePath: %s; name: %s)" % ( self.input_gtf_false_path, self.name_gtf) # TODO: See if we need these temporary files as part of the generated files genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred") unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred") sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred") # GtfToGenePred ## Checking the integrity of the inputs modified_gtf = self._checkAndFixGtf() ## Processing the gtf subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name) # TODO: From there, refactor because common use with Gff3.py # genePredToBigGenePred processing subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name) # Sort processing subtools.sort(unsorted_bigGenePred_file.name, sorted_bigGenePred_file.name) # bedToBigBed processing trackName = "".join( ( self.name_gtf, ".bb") ) auto_sql_option = os.path.join(self.tool_directory, 'bigGenePred.as') myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName) with open(myBigBedFilePath, 'w') as bigBedFile: subtools.bedToBigBed(sorted_bigGenePred_file.name, self.chromSizesFile.name, bigBedFile.name, autoSql=auto_sql_option, typeOption='bed12+8', tab=True) # Create the Track Object self.createTrack(file_path=trackName, track_name=trackName, long_label=self.name_gtf, track_type='bigGenePred', visibility='dense', priority=self.priority, track_file=myBigBedFilePath, track_color=self.track_color, group_name=self.group_name) # TODO: Use Logging instead of print if modified_gtf.is_modified: print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues." % self.name_gtf) print("Here are the lines removed: " + modified_gtf.get_str_modified_lines()) else: print("- Gtf %s created" % self.name_gtf) def _checkAndFixGtf(self): """ Call _checkAndFixGtf, check the integrity of gtf file, if coordinates exceed chromosome size, either removed the whole line(s) or truncated to the end of the scaffold depending on the user choice default: remove the whole line(s) """ # Set the boolean telling if we had to modify the file modified_gtf = InfoModifiedGtf() # Create a temp gtf just in case we have issues temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False) # TODO: Get the user choice and use it # TODO: Check if the start > 0 and the end <= chromosome size # Get the chrom.sizes into a dictionary to have a faster access # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary dict_chrom_sizes = {} with open(self.chromSizesFile.name, 'r') as chromSizes: lines = chromSizes.readlines() for line in lines: fields = line.split() # fields[1] should be the name of the scaffold # fields[2] should be the size of the scaffold # TODO: Ensure this is true for all lines dict_chrom_sizes[fields[0]] = fields[1] # Parse the GTF and check each line using the chrom sizes dictionary with open(temp_gtf.name, 'a+') as tmp: with open(self.input_gtf_false_path, 'r') as gtf: lines = gtf.readlines() for index, line in enumerate(lines): # If this is not a comment, we check the fields if not line.startswith('#'): fields = line.split() # We are interested in fields[0] => Seqname (scaffold) # We are interested in fields[3] => Start of the scaffold # We are interested in fields[4] => End of the scaffold scaffold_size = dict_chrom_sizes[fields[0]] start_position = fields[3] end_position = fields[4] if start_position > 0 and end_position <= scaffold_size: # We are good, so we copy this line tmp.write(line) tmp.write(os.linesep) # The sequence is not good, we are going to process it regarding the user choice # TODO: Process the user choice # By default, we are assuming the user choice is to remove the lines: We don't copy it # If we are here, it means the gtf has been modified else: # We save the line for the feedback to the user modified_gtf.array_modified_lines.append(index + 1) if modified_gtf.is_modified is False: modified_gtf.is_modified = True else: pass else: tmp.write(line) tmp.write(os.linesep) # Once the process it completed, we just replace the path of the gtf self.input_gtf_false_path = temp_gtf.name # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False return modified_gtf