comparison gstf_preparation.py @ 9:f4acbfe8d6fe draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
author earlhaminst
date Wed, 17 Oct 2018 07:31:29 -0400
parents 92f3966d5bc3
children e8e75a79de59
comparison
equal deleted inserted replaced
8:92f3966d5bc3 9:f4acbfe8d6fe
262 def fetch_species_and_seq_region_for_transcript(conn, transcript_id): 262 def fetch_species_and_seq_region_for_transcript(conn, transcript_id):
263 cur = conn.cursor() 263 cur = conn.cursor()
264 264
265 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?', 265 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?',
266 (transcript_id, )) 266 (transcript_id, ))
267 results = cur.fetchone() 267 row = cur.fetchone()
268 if not results: 268 if not row:
269 return None 269 return (None, None)
270 return results 270 return row
271 271
272 272
273 def fetch_gene_id_for_transcript(conn, transcript_id): 273 def fetch_gene_id_for_transcript(conn, transcript_id):
274 cur = conn.cursor() 274 cur = conn.cursor()
275 275
276 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', 276 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
277 (transcript_id, )) 277 (transcript_id, ))
278 results = cur.fetchone() 278 row = cur.fetchone()
279 if not results: 279 if not row:
280 return None 280 return None
281 return results[0] 281 return row[0]
282 282
283 283
284 def remove_id_version(s): 284 def remove_id_version(s, force=False):
285 """ 285 """
286 Remove the optional '.VERSION' from an Ensembl id. 286 Remove the optional '.VERSION' from an id if it's an Ensembl id or if
287 `force` is True.
287 """ 288 """
288 if s.startswith('ENS'): 289 if force or s.startswith('ENS'):
289 return s.split('.')[0] 290 return s.split('.')[0]
290 else: 291 else:
291 return s 292 return s
292 293
293 294
356 unimplemented_feature_nlines_dict[feature_type] = 0 357 unimplemented_feature_nlines_dict[feature_type] = 0
357 except Exception as e: 358 except Exception as e:
358 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) 359 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)
359 360
360 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): 361 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
361 print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) 362 print("Skipped %d lines in GFF3 file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
362 363
363 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) 364 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
364 write_gene_dict_to_db(conn, gene_dict) 365 write_gene_dict_to_db(conn, gene_dict)
365 366
366 for json_arg in options.json: 367 for json_arg in options.json:
367 with open(json_arg) as f: 368 with open(json_arg) as f:
368 write_gene_dict_to_db(conn, json.load(f)) 369 write_gene_dict_to_db(conn, json.load(f))
369 370
371 # Read the FASTA files a first time to:
372 # - determine for each file if we need to force the removal of the version
373 # from the transcript id
374 # - fill gene_transcripts_dict when keeping only the longest CDS per gene
375 force_remove_id_version_file_list = []
376 gene_transcripts_dict = dict()
377 for fasta_arg in options.fasta:
378 force_remove_id_version = False
379 found_gene_transcript = False
380 for entry in FASTAReader_gen(fasta_arg):
381 # Extract the transcript id by removing everything after the first space and then removing the version if needed
382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
383
384 if len(entry.sequence) % 3 != 0:
385 continue
386
387 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
388 if not gene_id and not found_gene_transcript:
389 # We have not found a proper gene transcript in this file yet,
390 # try to force the removal of the version from the transcript id
391 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True)
392 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
393 # Remember that we need to force the removal for this file
394 if gene_id:
395 force_remove_id_version = True
396 force_remove_id_version_file_list.append(fasta_arg)
397 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr)
398 if not gene_id:
399 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
400 continue
401 if options.longestCDS:
402 found_gene_transcript = True
403 else:
404 break
405
406 if gene_id in gene_transcripts_dict:
407 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
408 else:
409 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
410
370 if options.longestCDS: 411 if options.longestCDS:
371 gene_transcripts_dict = dict() 412 # For each gene, select the transcript with the longest sequence.
372 for fasta_arg in options.fasta: 413 # If more than one transcripts have the same longest sequence for a
373 for entry in FASTAReader_gen(fasta_arg): 414 # gene, the first one to appear in the FASTA file is selected
374 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
375 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
376
377 if len(entry.sequence) % 3 != 0:
378 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
379 continue
380
381 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
382 if not gene_id:
383 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
384 continue
385
386 if gene_id in gene_transcripts_dict:
387 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
388 else:
389 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
390
391 # For each gene, select the transcript with the longest sequence
392 # If more than one transcripts have the same longest sequence for a gene, the
393 # first one to appear in the FASTA file is selected
394 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] 415 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
395 416
396 regions = [_.strip().lower() for _ in options.regions.split(",")] 417 regions = [_.strip().lower() for _ in options.regions.split(",")]
397 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: 418 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
398 for fasta_arg in options.fasta: 419 for fasta_arg in options.fasta:
420 force_remove_id_version = fasta_arg in force_remove_id_version_file_list
399 for entry in FASTAReader_gen(fasta_arg): 421 for entry in FASTAReader_gen(fasta_arg):
400 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) 422 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
401 if options.longestCDS and transcript_id not in selected_transcript_ids: 423 if options.longestCDS and transcript_id not in selected_transcript_ids:
402 continue 424 continue
403 425
404 if len(entry.sequence) % 3 != 0: 426 if len(entry.sequence) % 3 != 0:
405 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) 427 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
406 continue 428 continue
407 429
408 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) 430 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
409 if not species_for_transcript: 431 if not species_for_transcript:
410 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 432 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
411 continue 433 continue
412 434
413 if options.headers: 435 if options.headers:
414 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest 436 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
415 # Remove any underscore in the species 437 # Remove any underscore in the species