# HG changeset patch # User rmarenco # Date 1476413341 14400 # Node ID 0152500d9acdeea9abb3562ceb748599a8715330 # Parent d786bca6a75dcf232557b2111cbb05885a749584 Uploaded diff -r d786bca6a75d -r 0152500d9acd Gtf.py --- a/Gtf.py Fri Sep 30 18:33:03 2016 -0400 +++ b/Gtf.py Thu Oct 13 22:49:01 2016 -0400 @@ -5,10 +5,15 @@ # Internal dependencies from Datatype import Datatype -from Track import Track -from TrackDb import TrackDb from util import subtools +class InfoModifiedGtf(): + def __init__(self, is_modified=False, array_modified_lines=[]): + self.is_modified = is_modified + self.array_modified_lines = array_modified_lines + + def get_str_modified_lines(self): + return ','.join(map(str, self.array_modified_lines)) class Gtf( Datatype ): def __init__( self, input_gtf_false_path, data_gtf): @@ -32,6 +37,10 @@ sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred") # GtfToGenePred + ## Checking the integrity of the inputs + modified_gtf = self._checkAndFixGtf() + + ## Processing the gtf subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name) # TODO: From there, refactor because common use with Gff3.py @@ -66,4 +75,82 @@ track_color=self.track_color, group_name=self.group_name) - print("- Gtf %s created" % self.name_gtf) + # TODO: Use Logging instead of print + if modified_gtf.is_modified: + print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues." + % self.name_gtf) + print("Here are the lines removed: " + modified_gtf.get_str_modified_lines()) + else: + print("- Gtf %s created" % self.name_gtf) + + def _checkAndFixGtf(self): + """ + Call _checkAndFixGtf, check the integrity of gtf file, + if coordinates exceed chromosome size, either removed the whole line(s) or truncated to the end of the scaffold + depending on the user choice + default: remove the whole line(s) + """ + # Set the boolean telling if we had to modify the file + modified_gtf = InfoModifiedGtf() + + # Create a temp gtf just in case we have issues + temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False) + + # TODO: Get the user choice and use it + # TODO: Check if the start > 0 and the end <= chromosome size + # Get the chrom.sizes into a dictionary to have a faster access + # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary + dict_chrom_sizes = {} + with open(self.chromSizesFile.name, 'r') as chromSizes: + lines = chromSizes.readlines() + for line in lines: + fields = line.split() + # fields[1] should be the name of the scaffold + # fields[2] should be the size of the scaffold + # TODO: Ensure this is true for all lines + dict_chrom_sizes[fields[0]] = fields[1] + + # Parse the GTF and check each line using the chrom sizes dictionary + with open(temp_gtf.name, 'a+') as tmp: + with open(self.input_gtf_false_path, 'r') as gtf: + lines = gtf.readlines() + for index, line in enumerate(lines): + # If this is not a comment, we check the fields + if not line.startswith('#'): + fields = line.split() + # We are interested in fields[0] => Seqname (scaffold) + # We are interested in fields[3] => Start of the scaffold + # We are interested in fields[4] => End of the scaffold + scaffold_size = dict_chrom_sizes[fields[0]] + start_position = fields[3] + end_position = fields[4] + + if start_position > 0 and end_position <= scaffold_size: + # We are good, so we copy this line + tmp.write(line) + tmp.write(os.linesep) + + + # The sequence is not good, we are going to process it regarding the user choice + # TODO: Process the user choice + # By default, we are assuming the user choice is to remove the lines: We don't copy it + + # If we are here, it means the gtf has been modified + else: + # We save the line for the feedback to the user + modified_gtf.array_modified_lines.append(index + 1) + + if modified_gtf.is_modified is False: + modified_gtf.is_modified = True + else: + pass + else: + tmp.write(line) + tmp.write(os.linesep) + + # Once the process it completed, we just replace the path of the gtf + self.input_gtf_false_path = temp_gtf.name + + # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False + + return modified_gtf