Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 9:f4acbfe8d6fe draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
author | earlhaminst |
---|---|
date | Wed, 17 Oct 2018 07:31:29 -0400 |
parents | 92f3966d5bc3 |
children | e8e75a79de59 |
comparison
equal
deleted
inserted
replaced
8:92f3966d5bc3 | 9:f4acbfe8d6fe |
---|---|
262 def fetch_species_and_seq_region_for_transcript(conn, transcript_id): | 262 def fetch_species_and_seq_region_for_transcript(conn, transcript_id): |
263 cur = conn.cursor() | 263 cur = conn.cursor() |
264 | 264 |
265 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?', | 265 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?', |
266 (transcript_id, )) | 266 (transcript_id, )) |
267 results = cur.fetchone() | 267 row = cur.fetchone() |
268 if not results: | 268 if not row: |
269 return None | 269 return (None, None) |
270 return results | 270 return row |
271 | 271 |
272 | 272 |
273 def fetch_gene_id_for_transcript(conn, transcript_id): | 273 def fetch_gene_id_for_transcript(conn, transcript_id): |
274 cur = conn.cursor() | 274 cur = conn.cursor() |
275 | 275 |
276 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', | 276 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', |
277 (transcript_id, )) | 277 (transcript_id, )) |
278 results = cur.fetchone() | 278 row = cur.fetchone() |
279 if not results: | 279 if not row: |
280 return None | 280 return None |
281 return results[0] | 281 return row[0] |
282 | 282 |
283 | 283 |
284 def remove_id_version(s): | 284 def remove_id_version(s, force=False): |
285 """ | 285 """ |
286 Remove the optional '.VERSION' from an Ensembl id. | 286 Remove the optional '.VERSION' from an id if it's an Ensembl id or if |
287 `force` is True. | |
287 """ | 288 """ |
288 if s.startswith('ENS'): | 289 if force or s.startswith('ENS'): |
289 return s.split('.')[0] | 290 return s.split('.')[0] |
290 else: | 291 else: |
291 return s | 292 return s |
292 | 293 |
293 | 294 |
356 unimplemented_feature_nlines_dict[feature_type] = 0 | 357 unimplemented_feature_nlines_dict[feature_type] = 0 |
357 except Exception as e: | 358 except Exception as e: |
358 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) | 359 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) |
359 | 360 |
360 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): | 361 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): |
361 print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) | 362 print("Skipped %d lines in GFF3 file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) |
362 | 363 |
363 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) | 364 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) |
364 write_gene_dict_to_db(conn, gene_dict) | 365 write_gene_dict_to_db(conn, gene_dict) |
365 | 366 |
366 for json_arg in options.json: | 367 for json_arg in options.json: |
367 with open(json_arg) as f: | 368 with open(json_arg) as f: |
368 write_gene_dict_to_db(conn, json.load(f)) | 369 write_gene_dict_to_db(conn, json.load(f)) |
369 | 370 |
371 # Read the FASTA files a first time to: | |
372 # - determine for each file if we need to force the removal of the version | |
373 # from the transcript id | |
374 # - fill gene_transcripts_dict when keeping only the longest CDS per gene | |
375 force_remove_id_version_file_list = [] | |
376 gene_transcripts_dict = dict() | |
377 for fasta_arg in options.fasta: | |
378 force_remove_id_version = False | |
379 found_gene_transcript = False | |
380 for entry in FASTAReader_gen(fasta_arg): | |
381 # Extract the transcript id by removing everything after the first space and then removing the version if needed | |
382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) | |
383 | |
384 if len(entry.sequence) % 3 != 0: | |
385 continue | |
386 | |
387 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | |
388 if not gene_id and not found_gene_transcript: | |
389 # We have not found a proper gene transcript in this file yet, | |
390 # try to force the removal of the version from the transcript id | |
391 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True) | |
392 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | |
393 # Remember that we need to force the removal for this file | |
394 if gene_id: | |
395 force_remove_id_version = True | |
396 force_remove_id_version_file_list.append(fasta_arg) | |
397 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) | |
398 if not gene_id: | |
399 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | |
400 continue | |
401 if options.longestCDS: | |
402 found_gene_transcript = True | |
403 else: | |
404 break | |
405 | |
406 if gene_id in gene_transcripts_dict: | |
407 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) | |
408 else: | |
409 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] | |
410 | |
370 if options.longestCDS: | 411 if options.longestCDS: |
371 gene_transcripts_dict = dict() | 412 # For each gene, select the transcript with the longest sequence. |
372 for fasta_arg in options.fasta: | 413 # If more than one transcripts have the same longest sequence for a |
373 for entry in FASTAReader_gen(fasta_arg): | 414 # gene, the first one to appear in the FASTA file is selected |
374 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id | |
375 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | |
376 | |
377 if len(entry.sequence) % 3 != 0: | |
378 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) | |
379 continue | |
380 | |
381 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | |
382 if not gene_id: | |
383 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | |
384 continue | |
385 | |
386 if gene_id in gene_transcripts_dict: | |
387 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) | |
388 else: | |
389 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] | |
390 | |
391 # For each gene, select the transcript with the longest sequence | |
392 # If more than one transcripts have the same longest sequence for a gene, the | |
393 # first one to appear in the FASTA file is selected | |
394 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] | 415 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] |
395 | 416 |
396 regions = [_.strip().lower() for _ in options.regions.split(",")] | 417 regions = [_.strip().lower() for _ in options.regions.split(",")] |
397 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: | 418 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: |
398 for fasta_arg in options.fasta: | 419 for fasta_arg in options.fasta: |
420 force_remove_id_version = fasta_arg in force_remove_id_version_file_list | |
399 for entry in FASTAReader_gen(fasta_arg): | 421 for entry in FASTAReader_gen(fasta_arg): |
400 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | 422 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) |
401 if options.longestCDS and transcript_id not in selected_transcript_ids: | 423 if options.longestCDS and transcript_id not in selected_transcript_ids: |
402 continue | 424 continue |
403 | 425 |
404 if len(entry.sequence) % 3 != 0: | 426 if len(entry.sequence) % 3 != 0: |
405 print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) | 427 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr) |
406 continue | 428 continue |
407 | 429 |
408 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) | 430 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) |
409 if not species_for_transcript: | 431 if not species_for_transcript: |
410 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 432 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
411 continue | 433 continue |
412 | 434 |
413 if options.headers: | 435 if options.headers: |
414 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest | 436 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
415 # Remove any underscore in the species | 437 # Remove any underscore in the species |