diff gstf_preparation.py @ 4:284f64ad9d43 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
author earlhaminst
date Fri, 08 Dec 2017 05:32:12 -0500
parents 7e11a7f4bdba
children b3ba0c84667c
line wrap: on
line diff
--- a/gstf_preparation.py	Fri Nov 24 12:32:39 2017 -0500
+++ b/gstf_preparation.py	Fri Dec 08 05:32:12 2017 -0500
@@ -251,6 +251,17 @@
     return results[0]
 
 
+def fetch_gene_id_for_transcript(conn, transcript_id):
+    cur = conn.cursor()
+
+    cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
+                (transcript_id, ))
+    results = cur.fetchone()
+    if not results:
+        return None
+    return results[0]
+
+
 def remove_id_version(s):
     """
     Remove the optional '.VERSION' from an Ensembl id.
@@ -266,6 +277,8 @@
     parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
     parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
     parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
+    parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
+    parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
     parser.add_option('-o', '--output', help='Path of the output SQLite file')
     parser.add_option('--of', help='Path of the output FASTA file')
     options, args = parser.parse_args()
@@ -287,6 +300,7 @@
         cds_parent_dict = dict()
         five_prime_utr_parent_dict = dict()
         three_prime_utr_parent_dict = dict()
+
         with open(filename) as f:
             for i, line in enumerate(f, start=1):
                 line = line.strip()
@@ -325,19 +339,47 @@
         with open(json_arg) as f:
             write_gene_dict_to_db(conn, json.load(f))
 
-    with open(options.of, 'w') as output_fasta_file:
+    if options.longestCDS:
+        gene_transcripts_dict = dict()
         for fasta_arg in options.fasta:
             for entry in FASTAReader_gen(fasta_arg):
                 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
                 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+
+                gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
+                if not gene_id:
+                    continue
+
+                if gene_id in gene_transcripts_dict:
+                    gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
+                else:
+                    gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
+
+        # For each gene, select the transcript with the longest sequence
+        # If more than one transcripts have the same longest sequence for a gene, the
+        # first one to appear in the FASTA file is selected
+        selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
+
+    with open(options.of, 'w') as output_fasta_file:
+        for fasta_arg in options.fasta:
+            for entry in FASTAReader_gen(fasta_arg):
+                transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
+                if options.longestCDS and transcript_id not in selected_transcript_ids:
+                    continue
+
                 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
                 if not species_for_transcript:
                     print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr)
                     continue
-                # Remove any underscore in the species
-                species_for_transcript = species_for_transcript.replace('_', '')
-                # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest
-                output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence))
+
+                if options.headers:
+                    # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
+                    # Remove any underscore in the species
+                    header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
+                else:
+                    header = entry.header
+
+                output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
 
     conn.close()