diff gstf_preparation.py @ 6:56bbdbfe3eaa draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
author earlhaminst
date Wed, 25 Apr 2018 11:00:33 -0400
parents b3ba0c84667c
children 92f3966d5bc3
line wrap: on
line diff
--- a/gstf_preparation.py	Mon Apr 16 14:05:09 2018 -0400
+++ b/gstf_preparation.py	Wed Apr 25 11:00:33 2018 -0400
@@ -6,7 +6,7 @@
 import sqlite3
 import sys
 
-version = "0.3.0"
+version = "0.4.0"
 gene_count = 0
 
 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
@@ -41,6 +41,10 @@
     cur.execute('''CREATE TABLE gene (
         gene_id VARCHAR PRIMARY KEY NOT NULL,
         gene_symbol VARCHAR,
+        seq_region_name VARCHAR NOT NULL,
+        seq_region_start INTEGER NOT NULL,
+        seq_region_end INTEGER NOT NULL,
+        seq_region_strand INTEGER NOT NULL,
         species VARCHAR NOT NULL,
         gene_json VARCHAR NOT NULL)''')
     cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')
@@ -52,7 +56,7 @@
         gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')
 
     cur.execute('''CREATE VIEW transcript_species AS
-        SELECT transcript_id, species
+        SELECT transcript_id, species, seq_region_name
         FROM transcript JOIN gene
         ON transcript.gene_id = gene.gene_id''')
 
@@ -225,8 +229,8 @@
             # This can happen when loading a JSON file from Ensembl
             continue
         gene_id = gene['id']
-        cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)',
-                    (gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene)))
+        cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
+                    (gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene)))
 
         if "Transcript" in gene:
             for transcript in gene["Transcript"]:
@@ -241,15 +245,15 @@
     conn.commit()
 
 
-def fetch_species_for_transcript(conn, transcript_id):
+def fetch_species_and_seq_region_for_transcript(conn, transcript_id):
     cur = conn.cursor()
 
-    cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?',
+    cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?',
                 (transcript_id, ))
     results = cur.fetchone()
     if not results:
         return None
-    return results[0]
+    return results
 
 
 def fetch_gene_id_for_transcript(conn, transcript_id):
@@ -280,8 +284,11 @@
     parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
     parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
     parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
+    parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
     parser.add_option('-o', '--output', help='Path of the output SQLite file')
     parser.add_option('--of', help='Path of the output FASTA file')
+    parser.add_option('--ff', help='Path of the filtered sequences output FASTA file')
+
     options, args = parser.parse_args()
     if args:
         raise Exception('Use options to provide inputs')
@@ -368,14 +375,15 @@
         # first one to appear in the FASTA file is selected
         selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
 
-    with open(options.of, 'w') as output_fasta_file:
+    regions = [_.strip().lower() for _ in options.regions.split(",")]
+    with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
         for fasta_arg in options.fasta:
             for entry in FASTAReader_gen(fasta_arg):
                 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
                 if options.longestCDS and transcript_id not in selected_transcript_ids:
                     continue
 
-                species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
+                species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
                 if not species_for_transcript:
                     print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
                     continue
@@ -387,7 +395,10 @@
                 else:
                     header = entry.header
 
-                output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
+                if seq_region_for_transcript.lower() in regions:
+                    filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
+                else:
+                    output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
 
     conn.close()