Mercurial > repos > earlhaminst > gstf_preparation
diff gstf_preparation.py @ 4:284f64ad9d43 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
author | earlhaminst |
---|---|
date | Fri, 08 Dec 2017 05:32:12 -0500 |
parents | 7e11a7f4bdba |
children | b3ba0c84667c |
line wrap: on
line diff
--- a/gstf_preparation.py Fri Nov 24 12:32:39 2017 -0500 +++ b/gstf_preparation.py Fri Dec 08 05:32:12 2017 -0500 @@ -251,6 +251,17 @@ return results[0] +def fetch_gene_id_for_transcript(conn, transcript_id): + cur = conn.cursor() + + cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', + (transcript_id, )) + results = cur.fetchone() + if not results: + return None + return results[0] + + def remove_id_version(s): """ Remove the optional '.VERSION' from an Ensembl id. @@ -266,6 +277,8 @@ parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') + parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') + parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') parser.add_option('-o', '--output', help='Path of the output SQLite file') parser.add_option('--of', help='Path of the output FASTA file') options, args = parser.parse_args() @@ -287,6 +300,7 @@ cds_parent_dict = dict() five_prime_utr_parent_dict = dict() three_prime_utr_parent_dict = dict() + with open(filename) as f: for i, line in enumerate(f, start=1): line = line.strip() @@ -325,19 +339,47 @@ with open(json_arg) as f: write_gene_dict_to_db(conn, json.load(f)) - with open(options.of, 'w') as output_fasta_file: + if options.longestCDS: + gene_transcripts_dict = dict() for fasta_arg in options.fasta: for entry in FASTAReader_gen(fasta_arg): # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + + gene_id = fetch_gene_id_for_transcript(conn, transcript_id) + if not gene_id: + continue + + if gene_id in gene_transcripts_dict: + gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) + else: + gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] + + # For each gene, select the transcript with the longest sequence + # If more than one transcripts have the same longest sequence for a gene, the + # first one to appear in the FASTA file is selected + selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] + + with open(options.of, 'w') as output_fasta_file: + for fasta_arg in options.fasta: + for entry in FASTAReader_gen(fasta_arg): + transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + if options.longestCDS and transcript_id not in selected_transcript_ids: + continue + species_for_transcript = fetch_species_for_transcript(conn, transcript_id) if not species_for_transcript: print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) continue - # Remove any underscore in the species - species_for_transcript = species_for_transcript.replace('_', '') - # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest - output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence)) + + if options.headers: + # Change the FASTA header to '>TranscriptId_species', as required by TreeBest + # Remove any underscore in the species + header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) + else: + header = entry.header + + output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) conn.close()