comparison gstf_preparation.py @ 6:56bbdbfe3eaa draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
author earlhaminst
date Wed, 25 Apr 2018 11:00:33 -0400
parents b3ba0c84667c
children 92f3966d5bc3
comparison
equal deleted inserted replaced
5:b3ba0c84667c 6:56bbdbfe3eaa
4 import json 4 import json
5 import optparse 5 import optparse
6 import sqlite3 6 import sqlite3
7 import sys 7 import sys
8 8
9 version = "0.3.0" 9 version = "0.4.0"
10 gene_count = 0 10 gene_count = 0
11 11
12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) 12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
13 13
14 14
39 (version, )) 39 (version, ))
40 40
41 cur.execute('''CREATE TABLE gene ( 41 cur.execute('''CREATE TABLE gene (
42 gene_id VARCHAR PRIMARY KEY NOT NULL, 42 gene_id VARCHAR PRIMARY KEY NOT NULL,
43 gene_symbol VARCHAR, 43 gene_symbol VARCHAR,
44 seq_region_name VARCHAR NOT NULL,
45 seq_region_start INTEGER NOT NULL,
46 seq_region_end INTEGER NOT NULL,
47 seq_region_strand INTEGER NOT NULL,
44 species VARCHAR NOT NULL, 48 species VARCHAR NOT NULL,
45 gene_json VARCHAR NOT NULL)''') 49 gene_json VARCHAR NOT NULL)''')
46 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') 50 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')
47 51
48 cur.execute('''CREATE TABLE transcript ( 52 cur.execute('''CREATE TABLE transcript (
50 protein_id VARCHAR UNIQUE, 54 protein_id VARCHAR UNIQUE,
51 protein_sequence VARCHAR, 55 protein_sequence VARCHAR,
52 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') 56 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')
53 57
54 cur.execute('''CREATE VIEW transcript_species AS 58 cur.execute('''CREATE VIEW transcript_species AS
55 SELECT transcript_id, species 59 SELECT transcript_id, species, seq_region_name
56 FROM transcript JOIN gene 60 FROM transcript JOIN gene
57 ON transcript.gene_id = gene.gene_id''') 61 ON transcript.gene_id = gene.gene_id''')
58 62
59 conn.commit() 63 conn.commit()
60 64
223 for gene in gene_dict.values(): 227 for gene in gene_dict.values():
224 if gene is None: 228 if gene is None:
225 # This can happen when loading a JSON file from Ensembl 229 # This can happen when loading a JSON file from Ensembl
226 continue 230 continue
227 gene_id = gene['id'] 231 gene_id = gene['id']
228 cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)', 232 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
229 (gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene))) 233 (gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene)))
230 234
231 if "Transcript" in gene: 235 if "Transcript" in gene:
232 for transcript in gene["Transcript"]: 236 for transcript in gene["Transcript"]:
233 transcript_id = transcript['id'] 237 transcript_id = transcript['id']
234 protein_id = transcript.get('Translation', {}).get('id', None) 238 protein_id = transcript.get('Translation', {}).get('id', None)
239 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) 243 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e))
240 244
241 conn.commit() 245 conn.commit()
242 246
243 247
244 def fetch_species_for_transcript(conn, transcript_id): 248 def fetch_species_and_seq_region_for_transcript(conn, transcript_id):
245 cur = conn.cursor() 249 cur = conn.cursor()
246 250
247 cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?', 251 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?',
248 (transcript_id, )) 252 (transcript_id, ))
249 results = cur.fetchone() 253 results = cur.fetchone()
250 if not results: 254 if not results:
251 return None 255 return None
252 return results[0] 256 return results
253 257
254 258
255 def fetch_gene_id_for_transcript(conn, transcript_id): 259 def fetch_gene_id_for_transcript(conn, transcript_id):
256 cur = conn.cursor() 260 cur = conn.cursor()
257 261
278 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') 282 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
279 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') 283 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
280 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') 284 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
281 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') 285 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
282 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') 286 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
287 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
283 parser.add_option('-o', '--output', help='Path of the output SQLite file') 288 parser.add_option('-o', '--output', help='Path of the output SQLite file')
284 parser.add_option('--of', help='Path of the output FASTA file') 289 parser.add_option('--of', help='Path of the output FASTA file')
290 parser.add_option('--ff', help='Path of the filtered sequences output FASTA file')
291
285 options, args = parser.parse_args() 292 options, args = parser.parse_args()
286 if args: 293 if args:
287 raise Exception('Use options to provide inputs') 294 raise Exception('Use options to provide inputs')
288 295
289 conn = sqlite3.connect(options.output) 296 conn = sqlite3.connect(options.output)
366 # For each gene, select the transcript with the longest sequence 373 # For each gene, select the transcript with the longest sequence
367 # If more than one transcripts have the same longest sequence for a gene, the 374 # If more than one transcripts have the same longest sequence for a gene, the
368 # first one to appear in the FASTA file is selected 375 # first one to appear in the FASTA file is selected
369 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] 376 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
370 377
371 with open(options.of, 'w') as output_fasta_file: 378 regions = [_.strip().lower() for _ in options.regions.split(",")]
379 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
372 for fasta_arg in options.fasta: 380 for fasta_arg in options.fasta:
373 for entry in FASTAReader_gen(fasta_arg): 381 for entry in FASTAReader_gen(fasta_arg):
374 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) 382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
375 if options.longestCDS and transcript_id not in selected_transcript_ids: 383 if options.longestCDS and transcript_id not in selected_transcript_ids:
376 continue 384 continue
377 385
378 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) 386 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
379 if not species_for_transcript: 387 if not species_for_transcript:
380 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 388 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
381 continue 389 continue
382 390
383 if options.headers: 391 if options.headers:
385 # Remove any underscore in the species 393 # Remove any underscore in the species
386 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) 394 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
387 else: 395 else:
388 header = entry.header 396 header = entry.header
389 397
390 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) 398 if seq_region_for_transcript.lower() in regions:
399 filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
400 else:
401 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
391 402
392 conn.close() 403 conn.close()
393 404
394 405
395 if __name__ == '__main__': 406 if __name__ == '__main__':