# HG changeset patch # User earlhaminst # Date 1512729132 18000 # Node ID 284f64ad9d43d883445ff291472f977acf7fffac # Parent 7e11a7f4bdbafcee9d0c52d08a1545944118076b planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty diff -r 7e11a7f4bdba -r 284f64ad9d43 gstf_preparation.py --- a/gstf_preparation.py Fri Nov 24 12:32:39 2017 -0500 +++ b/gstf_preparation.py Fri Dec 08 05:32:12 2017 -0500 @@ -251,6 +251,17 @@ return results[0] +def fetch_gene_id_for_transcript(conn, transcript_id): + cur = conn.cursor() + + cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', + (transcript_id, )) + results = cur.fetchone() + if not results: + return None + return results[0] + + def remove_id_version(s): """ Remove the optional '.VERSION' from an Ensembl id. @@ -266,6 +277,8 @@ parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') + parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') + parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') parser.add_option('-o', '--output', help='Path of the output SQLite file') parser.add_option('--of', help='Path of the output FASTA file') options, args = parser.parse_args() @@ -287,6 +300,7 @@ cds_parent_dict = dict() five_prime_utr_parent_dict = dict() three_prime_utr_parent_dict = dict() + with open(filename) as f: for i, line in enumerate(f, start=1): line = line.strip() @@ -325,19 +339,47 @@ with open(json_arg) as f: write_gene_dict_to_db(conn, json.load(f)) - with open(options.of, 'w') as output_fasta_file: + if options.longestCDS: + gene_transcripts_dict = dict() for fasta_arg in options.fasta: for entry in FASTAReader_gen(fasta_arg): # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + + gene_id = fetch_gene_id_for_transcript(conn, transcript_id) + if not gene_id: + continue + + if gene_id in gene_transcripts_dict: + gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) + else: + gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] + + # For each gene, select the transcript with the longest sequence + # If more than one transcripts have the same longest sequence for a gene, the + # first one to appear in the FASTA file is selected + selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] + + with open(options.of, 'w') as output_fasta_file: + for fasta_arg in options.fasta: + for entry in FASTAReader_gen(fasta_arg): + transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) + if options.longestCDS and transcript_id not in selected_transcript_ids: + continue + species_for_transcript = fetch_species_for_transcript(conn, transcript_id) if not species_for_transcript: print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) continue - # Remove any underscore in the species - species_for_transcript = species_for_transcript.replace('_', '') - # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest - output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence)) + + if options.headers: + # Change the FASTA header to '>TranscriptId_species', as required by TreeBest + # Remove any underscore in the species + header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) + else: + header = entry.header + + output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) conn.close() diff -r 7e11a7f4bdba -r 284f64ad9d43 gstf_preparation.xml --- a/gstf_preparation.xml Fri Nov 24 12:32:39 2017 -0500 +++ b/gstf_preparation.xml Fri Dec 08 05:32:12 2017 -0500 @@ -1,4 +1,4 @@ - + converts data for the workflow @@ -28,6 +34,8 @@ + + @@ -40,12 +48,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + @@ -55,7 +88,9 @@ TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow). Example GFF3 file:: diff -r 7e11a7f4bdba -r 284f64ad9d43 test-data/test1_longest.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1_longest.fasta Fri Dec 08 05:32:12 2017 -0500 @@ -0,0 +1,51 @@ +>Y74C9A.3_caenorhabditiselegans +ATGAGCTCTTCATCATCTTCTAGAATTCACAATGGTGAAGATGTTTATGAAAAGGCGGAG +GAATACTGGAGCCGCGCGAGCCAGGACGTCAACGGAATGCTCGGCGGATTCGAAGCGCTT +CACGCGCCCGACATATCGGCGTCGAAACGATTTATTGAAGGACTGAAGAAAAAGAATCTA +TTCGGCTACTTTGACTATGCACTGGACTGCGGAGCGGGTATCGGACGTGTTACAAAGCAT +CTCTTAATGCCATTCTTCTCGAAAGTTGATATGGAAGACGTCGTCGAGGAGTTGATCACG +AAAAGTGATCAATATATTGGAAAACATCCACGAATTGGAGATAAATTCGTCGAAGGACTG +CAGACGTTTGCACCGCCCGAACGACGTTATGATTTGATATGGATTCAATGGGTTTCAGGG +CATTTGGTTGATGAGGATTTGGTTGATTTCTTTAAAAGATGTGCGAAAGGACTGAAACCT +GGTGGATGTATTGTGCTCAAGGATAATGTGACAAATCACGAGAAACGGTTATTCGACGAT +GATGATCATAGTTGGACGAGAACAGAGCCCGAGCTTCTTAAAGCGTTCGCCGATTCTCAA +CTGGACATGGTCTCGAAAGCACTGCAAACCGGATTCCCAAAGGAGATTTATCCAGTAAAA +ATGTATGCATTGAAGCCTCAACACACCGGATTCACCAATAATTGA +>Y74C9A.2a.1_caenorhabditiselegans +ATGAAACTCGTAATTCTGCTATCTTTTGTTGCGACAGTTGCGGTTTTTGCGGCTCCATCG +GCTCCGGCAGGTCTCGAGGAGAAGCTGCGTGCTCTTCAGGAGCAACTGTACAGTCTGGAG +AAAGAGAACGGAGTTGATGTGAAGCAAAAGGAGCAACCAGCAGCAGCCGACACATTCCTT +GGATTTGTTCCACAGAAGAGAATGGTCGCGTGGCAGCCGATGAAGCGGTCGATGATCAAT +GAGGATTCTAGAGCTCCATTGCTCCACGCAATCGAAGCCCGCTTGGCCGAAGTGTTGAGA +GCCGGAGAACGCCTCGGAGTCAACCCGGAGGAAGTTTTGGCGGATCTTCGTGCTCGTAAT +CAATTCCAATAA +>Y74C9A.4b_caenorhabditiselegans +ATGGATTCGTACACGTCATCTGACGAAGACGCCTCTCGAAAAGAAAATATTTCAGACGAA +GGCTTGAATATGTTGAATGCATCGCCGGAGCCAATGGAGGAAGATGATCCAGAGGAGCAG +GCGGAACAAGAAGAAGAAACCAGCAGAATGGCTCGTCCTATAAGATCCATGAGAAAACGC +GAAACAACGTCTGGGGAATCAATGGGCGATGAGGATGAAGATTTGGAGGATGAAGAGGAC +GAAGATGAAGAAGCTGAAGCTCGTGAGCATCATGAAAGTGGTGCTCATGACACATCTTTC +TCAAATCCACTTTCCAACGTCGACAATCTAATCCACGTGGGAACCGAATATCAGGCGATT +ATACAGCCAACTGCAGAGCAATTGGAAAAAGAACCGTGCAGAGATCAACAAATTTGGGCG +TTTCCAGACGAAATGAACGAGAATCGGCTTACAGAATACATTTCAGAAGCTACTGGACGA +TATCAATTACCTATAGATAGGGCTCTGTTCATTCTGAACAAACAGTCAAATGATTTCGAC +GCTGCGATGGTTCAAGCGATGAGAAGAAAAGAAATTCATGATGATTGGACGGCAGAAGAA +ATTAGTCTTTTCTCCACTTGCTTCTTTCATTTCGGAAAACGGTTCAAGAAGATTCATGCG +GCTATGCCCCAACGCTCGCTTTCTTCCATTATCCAATACTATTACAACACGAAAAAAGTG +CAAAACTATAAAACAATGATTAATGTGCATTTGAATGAAACCGACACTTATGATGAACTA +TTCAAAGAGGTCAATCATTTGGAGAGGGTTCCGTCGGGATATTGTGAGAATTGCAATGCA +AAAAGTGATCTGTTGATTCTAAATCGTGTAATGTCGCGTCACGAATGTAAACCGTGTATC +CTTTATTTCCGTTTGATGCGTGTTCCACGTCCGGCAAGCCTCCGTGCACTGACAAAACGA +CGGCAACGAGTTTTATGTCCAGAATACATGAAAATTTATGTATACGGATATCTTGAGCTC +ATGGAGCCAGCCAACGGAAAAGCGATCAAACGGCTTGGAATTGGAAAAGAAAAAGAAGAA +GACGATGATATTATGGTGGTCGACGACTGCCTTCTCCGTAAACCATCAGGCCCCTACATT +GTGGAGCAATCGATTGAAGCTGATCCAATCGATGAGAATACGTGCAGAATGACACGGTGC +TTCGATACACCGGCTGCACTGGCATTAATTGATAATATCAAGAGAAAACATCATATGTGT +GTTCCACTTGTTTGGAGAGTTAAACAAACGAAATGTATGGAGGAGAACGAAATTCTGAAT +GAAGAAGCCCGTCAACAAATGTTCCGTGCAACAATGACATACAGCCGTGTACCAAAAGGA +GAAATTGCAAATTGGAAGAAAGATATGATGGCGTTGAAGGGAAGATTCGAGAGATTTACT +CCTGAACTAGATACTACTGCAACAAATGGCAATCGATCTGGAAAAGTCAGGATAAATTAT +GGTTGGAGTCCTGAGGAAAAGAAGAATGCTATTAGATGTTTCCACTGGTACAAGGACAAT +TTCGAGTTGATCGCCGAGTTGATGGCCACAAAAACTGTGGAACAAATCAAAAAGTTCTAT +ATGGACAATGAAAAGCTAATTTTGGAGTCAATCGACACGTATCGCGCCGAGCTCAAGTCA +AAACTCGGCAAATAA