comparison gstf_preparation.py @ 8:92f3966d5bc3 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
author earlhaminst
date Wed, 16 May 2018 20:03:57 -0400
parents 56bbdbfe3eaa
children f4acbfe8d6fe
comparison
equal deleted inserted replaced
7:9ef7661e8e9c 8:92f3966d5bc3
1 from __future__ import print_function 1 from __future__ import print_function
2 2
3 import collections
4 import json 3 import json
5 import optparse 4 import optparse
6 import sqlite3 5 import sqlite3
7 import sys 6 import sys
8 7
9 version = "0.4.0" 8 version = "0.4.0"
10 gene_count = 0 9 gene_count = 0
11 10
12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) 11
12 class Sequence(object):
13 def __init__(self, header, sequence_parts):
14 self.header = header
15 self.sequence_parts = sequence_parts
16 self._sequence = None
17
18 @property
19 def sequence(self):
20 if self._sequence is None:
21 self._sequence = ''.join(self.sequence_parts)
22 return self._sequence
23
24 def print(self, fh=sys.stdout):
25 print(self.header, file=fh)
26 for line in self.sequence_parts:
27 print(line, file=fh)
13 28
14 29
15 def FASTAReader_gen(fasta_filename): 30 def FASTAReader_gen(fasta_filename):
16 with open(fasta_filename) as fasta_file: 31 with open(fasta_filename) as fasta_file:
17 line = fasta_file.readline() 32 line = fasta_file.readline()
23 sequence_parts = [] 38 sequence_parts = []
24 line = fasta_file.readline() 39 line = fasta_file.readline()
25 while line and line[0] != '>': 40 while line and line[0] != '>':
26 sequence_parts.append(line.rstrip()) 41 sequence_parts.append(line.rstrip())
27 line = fasta_file.readline() 42 line = fasta_file.readline()
28 sequence = "\n".join(sequence_parts) 43 yield Sequence(header, sequence_parts)
29 yield Sequence(header, sequence)
30 44
31 45
32 def create_tables(conn): 46 def create_tables(conn):
33 cur = conn.cursor() 47 cur = conn.cursor()
34 48
358 for fasta_arg in options.fasta: 372 for fasta_arg in options.fasta:
359 for entry in FASTAReader_gen(fasta_arg): 373 for entry in FASTAReader_gen(fasta_arg):
360 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id 374 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
361 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) 375 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
362 376
377 if len(entry.sequence) % 3 != 0:
378 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
379 continue
380
363 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) 381 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
364 if not gene_id: 382 if not gene_id:
365 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 383 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
366 continue 384 continue
367 385
381 for entry in FASTAReader_gen(fasta_arg): 399 for entry in FASTAReader_gen(fasta_arg):
382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) 400 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
383 if options.longestCDS and transcript_id not in selected_transcript_ids: 401 if options.longestCDS and transcript_id not in selected_transcript_ids:
384 continue 402 continue
385 403
404 if len(entry.sequence) % 3 != 0:
405 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
406 continue
407
386 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) 408 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
387 if not species_for_transcript: 409 if not species_for_transcript:
388 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 410 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
389 continue 411 continue
390 412
391 if options.headers: 413 if options.headers:
392 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest 414 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
393 # Remove any underscore in the species 415 # Remove any underscore in the species
394 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) 416 entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
417
418 if seq_region_for_transcript.lower() in regions:
419 entry.print(filtered_fasta_file)
395 else: 420 else:
396 header = entry.header 421 entry.print(output_fasta_file)
397
398 if seq_region_for_transcript.lower() in regions:
399 filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
400 else:
401 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
402 422
403 conn.close() 423 conn.close()
404 424
405 425
406 if __name__ == '__main__': 426 if __name__ == '__main__':