comparison Gtf.py @ 0:f493979f1408 draft default tip

planemo upload for repository https://github.com/Yating-L/hubarchivecreator-test commit 48b59e91e2dcc2e97735ee35d587960cbfbce932-dirty
author yating-l
date Wed, 21 Dec 2016 12:13:04 -0500
parents
children
#!/usr/bin/python

import os
import tempfile

# Internal dependencies
from Datatype import Datatype
from util import subtools
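# Datatype presumably provides the shared track-hub plumbing used below
# (tool_directory, myTrackFolderPath, chromSizesFile, createTrack), and
# util.subtools wraps the external UCSC conversion utilities; both are
# assumptions based only on how they are called in this file.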

class InfoModifiedGtf():
    def __init__(self, is_modified=False, array_modified_lines=None):
        self.is_modified = is_modified
        # Use None as the default to avoid sharing one mutable list between instances
        self.array_modified_lines = array_modified_lines if array_modified_lines is not None else []

    def get_str_modified_lines(self):
        return ','.join(map(str, self.array_modified_lines))

class Gtf(Datatype):
    def __init__(self, input_gtf_false_path, data_gtf):

        super(Gtf, self).__init__()

        self.track = None

        self.input_gtf_false_path = input_gtf_false_path
        self.name_gtf = data_gtf["name"]
        self.priority = data_gtf["order_index"]
        self.track_color = data_gtf["track_color"]
        # TODO: Think about how to avoid repetition of the group_name everywhere
        self.group_name = data_gtf["group_name"]

        # print "Creating TrackHub GTF from (falsePath: %s; name: %s)" % (self.input_gtf_false_path, self.name_gtf)

        # TODO: See if we need these temporary files as part of the generated files
        genePredFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".genePred")
        unsorted_bigGenePred_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".unsorted.bigGenePred")
        sorted_bigGenePred_file = tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred")

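        # The calls below presumably wrap the UCSC command-line utilities; the rough
        # shell equivalent (an assumption about what util.subtools runs, not verified here) is:
        #   gtfToGenePred input.gtf tmp.genePred
        #   genePredToBigGenePred tmp.genePred tmp.unsorted.bigGenePred
        #   sort -k1,1 -k2,2n tmp.unsorted.bigGenePred > tmp.sorted.bigGenePred
        #   bedToBigBed -as=bigGenePred.as -type=bed12+8 -tab tmp.sorted.bigGenePred chrom.sizes track.bb
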
        # GtfToGenePred
        ## Checking the integrity of the inputs
        modified_gtf = self._checkAndFixGtf()

        ## Processing the gtf
        subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name)

        # TODO: From there, refactor because common use with Gff3.py
        # genePredToBigGenePred processing
        subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name)

        # Sort processing
        subtools.sort(unsorted_bigGenePred_file.name, sorted_bigGenePred_file.name)

        # bedToBigBed processing
        trackName = "".join((self.name_gtf, ".bb"))

        auto_sql_option = os.path.join(self.tool_directory, 'bigGenePred.as')

        myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName)

        with open(myBigBedFilePath, 'w') as bigBedFile:
            subtools.bedToBigBed(sorted_bigGenePred_file.name,
                                 self.chromSizesFile.name,
                                 bigBedFile.name,
                                 autoSql=auto_sql_option,
                                 typeOption='bed12+8',
                                 tab=True)

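        # Note: 'bed12+8' matches the bigGenePred model, i.e. the 12 standard BED
        # columns plus 8 extra gene-prediction columns described by bigGenePred.as
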
        # Create the Track Object
        self.createTrack(file_path=trackName,
                         track_name=trackName,
                         long_label=self.name_gtf, track_type='bigGenePred',
                         visibility='dense', priority=self.priority,
                         track_file=myBigBedFilePath,
                         track_color=self.track_color,
                         group_name=self.group_name)

        # TODO: Use Logging instead of print
        if modified_gtf.is_modified:
            print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues."
                  % self.name_gtf)
            print("Here are the lines removed: " + modified_gtf.get_str_modified_lines())
        else:
            print("- Gtf %s created" % self.name_gtf)

    def _checkAndFixGtf(self):
        """
        Check the integrity of the GTF file: if a feature's coordinates exceed the
        size of its chromosome/scaffold, either remove the whole line or truncate it
        to the end of the scaffold, depending on the user's choice.
        Default: remove the whole line.
        """
        # Set the boolean telling if we had to modify the file
        modified_gtf = InfoModifiedGtf()

        # Create a temp gtf just in case we have issues
        temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False)

        # TODO: Get the user choice and use it
        # TODO: Check if the start > 0 and the end <= chromosome size
        # Get the chrom.sizes into a dictionary to have a faster access
        # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary
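        # A chrom.sizes file is expected to contain one scaffold per line, as two
        # whitespace-separated columns (name then size), e.g.:
        #   chr1    248956422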
        dict_chrom_sizes = {}
        with open(self.chromSizesFile.name, 'r') as chromSizes:
            lines = chromSizes.readlines()
            for line in lines:
                fields = line.split()
                # fields[0] should be the name of the scaffold
                # fields[1] should be the size of the scaffold
                # TODO: Ensure this is true for all lines
                # Store the size as an integer so the coordinate checks below are numeric comparisons
                dict_chrom_sizes[fields[0]] = int(fields[1])

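        # Reminder of the GTF column layout used below (coordinates are 1-based and end-inclusive):
        #   seqname  source  feature  start  end  score  strand  frame  attributes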
        # Parse the GTF and check each line using the chrom sizes dictionary
        with open(temp_gtf.name, 'a+') as tmp:
            with open(self.input_gtf_false_path, 'r') as gtf:
                lines = gtf.readlines()
                for index, line in enumerate(lines):
                    # If this is not a comment, we check the fields
                    if not line.startswith('#'):
                        fields = line.split()
                        # We are interested in fields[0] => Seqname (scaffold)
                        # We are interested in fields[3] => Start of the feature on the scaffold
                        # We are interested in fields[4] => End of the feature on the scaffold
                        scaffold_size = dict_chrom_sizes[fields[0]]
                        # Compare the coordinates as integers, not as strings
                        start_position = int(fields[3])
                        end_position = int(fields[4])

                        if start_position > 0 and end_position <= scaffold_size:
                            # We are good, so we copy this line
                            # (readlines() keeps the trailing newline, so no extra separator is needed)
                            tmp.write(line)
                        else:
                            # The coordinates are not good, so we process the line regarding the user choice
                            # TODO: Process the user choice
                            # By default, we assume the user choice is to remove the line: we don't copy it
                            # We save the line number for the feedback to the user and flag the gtf as modified
                            modified_gtf.array_modified_lines.append(index + 1)
                            modified_gtf.is_modified = True
                    else:
                        tmp.write(line)

        # Once the process is completed, we just replace the path of the gtf
        self.input_gtf_false_path = temp_gtf.name

        # TODO: Manage the fact that the temporary file will stay on disk because of delete=False

        return modified_gtf
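
# Illustrative usage sketch (not part of the tool): the values below are made up, and it
# assumes the Datatype base class has already been set up with the attributes used above
# (tool_directory, myTrackFolderPath, chromSizesFile) before a Gtf object is created.
#
#   data_gtf = {"name": "my_annotation",
#               "order_index": 1,
#               "track_color": "#000000",
#               "group_name": "Default group"}
#   gtf_track = Gtf("/path/to/galaxy_dataset.dat", data_gtf)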