comparison util/subtools.py @ 7:5d5fdcb798da draft

planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 12fb52d5b285935b2353d93a5aa291838df7893e
author yating-l
date Fri, 20 Apr 2018 13:51:23 -0400
parents 237707a6b74d
children 43a700afd457
comparing 6:237707a6b74d with 7:5d5fdcb798da
@@ -7,11 +7,11 @@
 import json
 import subprocess
 import os
 import sys
 import tempfile
-import string
+import shutil
 import logging
 
 class PopenError(Exception):
     def __init__(self, cmd, error, return_code):
         self.cmd = cmd
@@ -226,10 +226,88 @@
     filename = bamfile + '.bai'
     if os.path.exists(filename):
         return filename
     else:
         raise ValueError('Did not find bai file')
+
+def createFastaIndex(fastaFile):
+    subprocess.call(['samtools', 'faidx', fastaFile])
+    filename = fastaFile + '.fai'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find fai file')
+
+def generate_indexed_refseq_track(fastaFile, referenceName, outputFolder):
+    faiFile = createFastaIndex(fastaFile)
+    refSeqFile = os.path.join(outputFolder, referenceName)
+    refSeqIndexFile = os.path.join(outputFolder, referenceName+'.fai')
+    shutil.copy(fastaFile, refSeqFile)
+    shutil.copy(faiFile, refSeqIndexFile)
+
+def remove_gene_lines(gff3_file, gff3_filtered):
+    with open(gff3_file, 'r') as f:
+        with open(gff3_filtered, 'w') as out:
+            for line in f:
+                if not line.startswith('#'):
+                    feature_type = line.split('\t')[2].rstrip()
+                    if feature_type == 'transcript' or feature_type == 'mRNA':
+                        arr = line.split('\t')
+                        # as we remove the gene features, we should also remove the Parent attribute (gene id) from the transcript
+                        arr[8] = ';'.join([item for item in arr[8].split(';') if 'Parent=' not in item]).rstrip()
+                        line = '\t'.join(arr) + '\n'
+                    if feature_type == 'gene':
+                        continue
+                out.write(line)
+
+def gff3sort(inputFile, outputFile, precise=False):
+    array_call = ['gff3sort.pl', inputFile]
+    if precise:
+        array_call.append('--precise')
+    p = _handleExceptionAndCheckCall(array_call, stdout=outputFile)
+    return p
+
+def bedSort(inputFile, outputFile):
+    array_call = ['sort', '-k1,1', '-k2,2n', '-k6,6', inputFile]
+    p = _handleExceptionAndCheckCall(array_call, stdout=outputFile)
+    return p
+
+def bgzip(inputFile):
+    subprocess.call(['bgzip', inputFile])
+    filename = inputFile + '.gz'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find gz file')
+
+def createTabix(inputFile, dataType):
+    subprocess.call(['tabix', '-p', dataType, inputFile])
+    filename = inputFile + '.tbi'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find tbi file')
+
+def generate_tabix_indexed_track(inputFile, dataType, trackName, outputFolder):
+    if "bed" in dataType:
+        fileType = 'bed'
+        sortedFile = tempfile.NamedTemporaryFile(bufsize=0)
+        bedSort(inputFile, sortedFile)
+    elif "gff" in dataType:
+        fileType = 'gff'
+        filteredFile = tempfile.NamedTemporaryFile(bufsize=0)
+        remove_gene_lines(inputFile, filteredFile.name)
+        sortedFile = tempfile.NamedTemporaryFile(bufsize=0)
+        gff3sort(filteredFile.name, sortedFile)
+        # add .gff3.gz extension to Tabix GFF3 files, in order to enable creating name index with generate-names.pl
+        trackName = trackName + '.gff3.gz'
+    compressedFile = bgzip(sortedFile.name)
+    tabixFile = createTabix(compressedFile, fileType)
+    trackPath = os.path.join(outputFolder, trackName)
+    trackIndexPath = os.path.join(outputFolder, trackName+'.tbi')
+    shutil.copy(compressedFile, trackPath)
+    shutil.copy(tabixFile, trackIndexPath)
 
 def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=True):
     if "bed" in dataType:
         fileType = "--bed"
     elif "gff" in dataType:
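Taken together, the helpers added in this hunk form a bgzip + Tabix pipeline for flat-file tracks: BED input is coordinate-sorted with GNU sort, GFF3 input has its gene lines stripped and is sorted with gff3sort.pl, and either result is bgzip-compressed, tabix-indexed, and copied into the output folder. A minimal usage sketch follows; it assumes util/subtools.py is importable as util.subtools and that samtools, sort, gff3sort.pl, bgzip and tabix are on PATH, and the file names and output folder are hypothetical.

# Illustrative sketch only; file names and the output folder are made up.
import os
from util import subtools   # assumes the repository's util/subtools.py is on the import path

outputFolder = 'myHub/data/tracks'
if not os.path.exists(outputFolder):
    os.makedirs(outputFolder)

# GFF3: gene features are removed, the file is sorted with gff3sort.pl,
# then bgzip-compressed and tabix-indexed; the forced .gff3.gz suffix lets
# generate-names.pl build a name index over the track later.
subtools.generate_tabix_indexed_track('annotations.gff3', 'gff3', 'genes', outputFolder)

# BED: sorted with GNU sort (-k1,1 -k2,2n -k6,6), then bgzipped and tabix-indexed.
subtools.generate_tabix_indexed_track('peaks.bed', 'bed', 'peaks', outputFolder)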
@@ -295,17 +373,19 @@
     track_json = json.dumps(track_json)
     new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE)
     p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout)
     return p
 
-def prepare_refseqs(fasta_file_name, outputFolder):
-    array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder]
+def prepare_refseqs(fastaFile, outputFolder):
+    #array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder]
+    createFastaIndex(fastaFile)
+    array_call = ['prepare-refseqs.pl', '--indexed_fasta', fastaFile, '--out', outputFolder]
     p = _handleExceptionAndCheckCall(array_call)
     return p
 
-def generate_names(outputFolder):
-    array_call = ['generate-names.pl', '-v', '--out', outputFolder]
+def generate_names(outputFolder, hashBits=4):
+    array_call = ['generate-names.pl', '--hashBits', '4', '-v', '--out', outputFolder]
     p = _handleExceptionAndCheckCall(array_call)
     return p
 
 def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None):
     """