changeset 19:0152500d9acd draft

Uploaded
author rmarenco
date Thu, 13 Oct 2016 22:49:01 -0400
parents d786bca6a75d
children 40469b265ddb
files Gtf.py
diffstat 1 files changed, 90 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/Gtf.py	Fri Sep 30 18:33:03 2016 -0400
+++ b/Gtf.py	Thu Oct 13 22:49:01 2016 -0400
@@ -5,10 +5,15 @@
 
 # Internal dependencies
 from Datatype import Datatype
-from Track import Track
-from TrackDb import TrackDb
 from util import subtools
 
+class InfoModifiedGtf():
+    def __init__(self, is_modified=False, array_modified_lines=[]):
+        self.is_modified = is_modified
+        self.array_modified_lines = array_modified_lines
+
+    def get_str_modified_lines(self):
+        return ','.join(map(str, self.array_modified_lines))
 
 class Gtf( Datatype ):
     def __init__( self, input_gtf_false_path, data_gtf):
@@ -32,6 +37,10 @@
         sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred")
 
         # GtfToGenePred
+        ## Checking the integrity of the inputs
+        modified_gtf = self._checkAndFixGtf()
+
+        ## Processing the gtf
         subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name)
 
         # TODO: From there, refactor because common use with Gff3.py
@@ -66,4 +75,82 @@
                          track_color=self.track_color,
                          group_name=self.group_name)
 
-        print("- Gtf %s created" % self.name_gtf)
+        # TODO: Use Logging instead of print
+        if modified_gtf.is_modified:
+            print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues."
+                  % self.name_gtf)
+            print("Here are the lines removed: " + modified_gtf.get_str_modified_lines())
+        else:
+            print("- Gtf %s created" % self.name_gtf)
+
+    def _checkAndFixGtf(self):
+        """
+        Call _checkAndFixGtf, check the integrity of gtf file, 
+        if coordinates exceed chromosome size, either removed the whole line(s) or truncated to the end of the scaffold 
+        depending on the user choice
+        default: remove the whole line(s)
+        """
+        # Set the boolean telling if we had to modify the file
+        modified_gtf = InfoModifiedGtf()
+
+        # Create a temp gtf just in case we have issues
+        temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False)
+
+        # TODO: Get the user choice and use it
+        # TODO: Check if the start > 0 and the end <= chromosome size
+        # Get the chrom.sizes into a dictionary to have a faster access
+        # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary
+        dict_chrom_sizes = {}
+        with open(self.chromSizesFile.name, 'r') as chromSizes:
+            lines = chromSizes.readlines()
+            for line in lines:
+                fields = line.split()
+                # fields[1] should be the name of the scaffold
+                # fields[2] should be the size of the scaffold
+                # TODO: Ensure this is true for all lines
+                dict_chrom_sizes[fields[0]] = fields[1]
+
+        # Parse the GTF and check each line using the chrom sizes dictionary
+        with open(temp_gtf.name, 'a+') as tmp:
+            with open(self.input_gtf_false_path, 'r') as gtf:
+                lines = gtf.readlines()
+                for index, line in enumerate(lines):
+                    # If this is not a comment, we check the fields
+                    if not line.startswith('#'):
+                        fields = line.split()
+                        # We are interested in fields[0] => Seqname (scaffold)
+                        # We are interested in fields[3] => Start of the scaffold
+                        # We are interested in fields[4] => End of the scaffold
+                        scaffold_size = dict_chrom_sizes[fields[0]]
+                        start_position = fields[3]
+                        end_position = fields[4]
+
+                        if start_position > 0 and end_position <= scaffold_size:
+                            # We are good, so we copy this line
+                            tmp.write(line)
+                            tmp.write(os.linesep)
+
+
+                        # The sequence is not good, we are going to process it regarding the user choice
+                        # TODO: Process the user choice
+                        # By default, we are assuming the user choice is to remove the lines: We don't copy it
+
+                        # If we are here, it means the gtf has been modified
+                        else:
+                            # We save the line for the feedback to the user
+                            modified_gtf.array_modified_lines.append(index + 1)
+
+                            if modified_gtf.is_modified is False:
+                                modified_gtf.is_modified = True
+                            else:
+                                pass
+                    else:
+                        tmp.write(line)
+                        tmp.write(os.linesep)
+
+        # Once the process it completed, we just replace the path of the gtf
+        self.input_gtf_false_path = temp_gtf.name
+
+        # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False
+
+        return modified_gtf