gstf_preparation: gstf_preparation.py comparison

comparison gstf_preparation.py @ 9:f4acbfe8d6fe draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55

author	earlhaminst
date	Wed, 17 Oct 2018 07:31:29 -0400
parents	92f3966d5bc3
children	e8e75a79de59

comparison

equal deleted inserted replaced

-:92f3966d5bc3
+:f4acbfe8d6fe
 def fetch_species_and_seq_region_for_transcript(conn, transcript_id):
 cur = conn.cursor()
 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?',
 (transcript_id, ))
-results = cur.fetchone()
+row = cur.fetchone()
-if not results:
+if not row:
-return None
+return (None, None)
-return results
+return row
 def fetch_gene_id_for_transcript(conn, transcript_id):
 cur = conn.cursor()
 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
 (transcript_id, ))
-results = cur.fetchone()
+row = cur.fetchone()
-if not results:
+if not row:
 return None
-return results[0]
+return row[0]
-def remove_id_version(s):
+def remove_id_version(s, force=False):
 """
-Remove the optional '.VERSION' from an Ensembl id.
+Remove the optional '.VERSION' from an id if it's an Ensembl id or if
+`force` is True.
 """
-if s.startswith('ENS'):
+if force or s.startswith('ENS'):
 return s.split('.')[0]
 else:
 return s
 unimplemented_feature_nlines_dict[feature_type] = 0
 except Exception as e:
 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)
 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
-print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
+print("Skipped %d lines in GFF3 file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
 write_gene_dict_to_db(conn, gene_dict)
 for json_arg in options.json:
 with open(json_arg) as f:
 write_gene_dict_to_db(conn, json.load(f))
+# Read the FASTA files a first time to:
+# - determine for each file if we need to force the removal of the version
+#   from the transcript id
+# - fill gene_transcripts_dict when keeping only the longest CDS per gene
+force_remove_id_version_file_list = []
+gene_transcripts_dict = dict()
+for fasta_arg in options.fasta:
+force_remove_id_version = False
+found_gene_transcript = False
+for entry in FASTAReader_gen(fasta_arg):
+# Extract the transcript id by removing everything after the first space and then removing the version if needed
+transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
+if len(entry.sequence) % 3 != 0:
+continue
+gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
+if not gene_id and not found_gene_transcript:
+# We have not found a proper gene transcript in this file yet,
+# try to force the removal of the version from the transcript id
+transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True)
+gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
+# Remember that we need to force the removal for this file
+if gene_id:
+force_remove_id_version = True
+force_remove_id_version_file_list.append(fasta_arg)
+print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr)
+if not gene_id:
+print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
+continue
+if options.longestCDS:
+found_gene_transcript = True
+else:
+break
+if gene_id in gene_transcripts_dict:
+gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
+else:
+gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
 if options.longestCDS:
-gene_transcripts_dict = dict()
+# For each gene, select the transcript with the longest sequence.
-for fasta_arg in options.fasta:
+# If more than one transcripts have the same longest sequence for a
-for entry in FASTAReader_gen(fasta_arg):
+# gene, the first one to appear in the FASTA file is selected
-# Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
-transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
-if len(entry.sequence) % 3 != 0:
-print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
-continue
-gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
-if not gene_id:
-print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
-continue
-if gene_id in gene_transcripts_dict:
-gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
-else:
-gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
-# For each gene, select the transcript with the longest sequence
-# If more than one transcripts have the same longest sequence for a gene, the
-# first one to appear in the FASTA file is selected
 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
 regions = [_.strip().lower() for _ in options.regions.split(",")]
 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
 for fasta_arg in options.fasta:
+force_remove_id_version = fasta_arg in force_remove_id_version_file_list
 for entry in FASTAReader_gen(fasta_arg):
-transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
 if options.longestCDS and transcript_id not in selected_transcript_ids:
 continue
 if len(entry.sequence) % 3 != 0:
-print("Transcript '%s' in file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
+print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
 continue
 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
 if not species_for_transcript:
-print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
+print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
 continue
 if options.headers:
 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
 # Remove any underscore in the species

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.py @ 9:f4acbfe8d6fe draft