Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 6:56bbdbfe3eaa draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
author | earlhaminst |
---|---|
date | Wed, 25 Apr 2018 11:00:33 -0400 |
parents | b3ba0c84667c |
children | 92f3966d5bc3 |
comparison
equal
deleted
inserted
replaced
5:b3ba0c84667c | 6:56bbdbfe3eaa |
---|---|
4 import json | 4 import json |
5 import optparse | 5 import optparse |
6 import sqlite3 | 6 import sqlite3 |
7 import sys | 7 import sys |
8 | 8 |
9 version = "0.3.0" | 9 version = "0.4.0" |
10 gene_count = 0 | 10 gene_count = 0 |
11 | 11 |
12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | 12 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) |
13 | 13 |
14 | 14 |
39 (version, )) | 39 (version, )) |
40 | 40 |
41 cur.execute('''CREATE TABLE gene ( | 41 cur.execute('''CREATE TABLE gene ( |
42 gene_id VARCHAR PRIMARY KEY NOT NULL, | 42 gene_id VARCHAR PRIMARY KEY NOT NULL, |
43 gene_symbol VARCHAR, | 43 gene_symbol VARCHAR, |
44 seq_region_name VARCHAR NOT NULL, | |
45 seq_region_start INTEGER NOT NULL, | |
46 seq_region_end INTEGER NOT NULL, | |
47 seq_region_strand INTEGER NOT NULL, | |
44 species VARCHAR NOT NULL, | 48 species VARCHAR NOT NULL, |
45 gene_json VARCHAR NOT NULL)''') | 49 gene_json VARCHAR NOT NULL)''') |
46 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') | 50 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') |
47 | 51 |
48 cur.execute('''CREATE TABLE transcript ( | 52 cur.execute('''CREATE TABLE transcript ( |
50 protein_id VARCHAR UNIQUE, | 54 protein_id VARCHAR UNIQUE, |
51 protein_sequence VARCHAR, | 55 protein_sequence VARCHAR, |
52 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') | 56 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') |
53 | 57 |
54 cur.execute('''CREATE VIEW transcript_species AS | 58 cur.execute('''CREATE VIEW transcript_species AS |
55 SELECT transcript_id, species | 59 SELECT transcript_id, species, seq_region_name |
56 FROM transcript JOIN gene | 60 FROM transcript JOIN gene |
57 ON transcript.gene_id = gene.gene_id''') | 61 ON transcript.gene_id = gene.gene_id''') |
58 | 62 |
59 conn.commit() | 63 conn.commit() |
60 | 64 |
223 for gene in gene_dict.values(): | 227 for gene in gene_dict.values(): |
224 if gene is None: | 228 if gene is None: |
225 # This can happen when loading a JSON file from Ensembl | 229 # This can happen when loading a JSON file from Ensembl |
226 continue | 230 continue |
227 gene_id = gene['id'] | 231 gene_id = gene['id'] |
228 cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)', | 232 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)', |
229 (gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene))) | 233 (gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene))) |
230 | 234 |
231 if "Transcript" in gene: | 235 if "Transcript" in gene: |
232 for transcript in gene["Transcript"]: | 236 for transcript in gene["Transcript"]: |
233 transcript_id = transcript['id'] | 237 transcript_id = transcript['id'] |
234 protein_id = transcript.get('Translation', {}).get('id', None) | 238 protein_id = transcript.get('Translation', {}).get('id', None) |
239 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) | 243 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) |
240 | 244 |
241 conn.commit() | 245 conn.commit() |
242 | 246 |
243 | 247 |
244 def fetch_species_for_transcript(conn, transcript_id): | 248 def fetch_species_and_seq_region_for_transcript(conn, transcript_id): |
245 cur = conn.cursor() | 249 cur = conn.cursor() |
246 | 250 |
247 cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?', | 251 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?', |
248 (transcript_id, )) | 252 (transcript_id, )) |
249 results = cur.fetchone() | 253 results = cur.fetchone() |
250 if not results: | 254 if not results: |
251 return None | 255 return None |
252 return results[0] | 256 return results |
253 | 257 |
254 | 258 |
255 def fetch_gene_id_for_transcript(conn, transcript_id): | 259 def fetch_gene_id_for_transcript(conn, transcript_id): |
256 cur = conn.cursor() | 260 cur = conn.cursor() |
257 | 261 |
278 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') | 282 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
279 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') | 283 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
280 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') | 284 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') |
281 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') | 285 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') |
282 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') | 286 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') |
287 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') | |
283 parser.add_option('-o', '--output', help='Path of the output SQLite file') | 288 parser.add_option('-o', '--output', help='Path of the output SQLite file') |
284 parser.add_option('--of', help='Path of the output FASTA file') | 289 parser.add_option('--of', help='Path of the output FASTA file') |
290 parser.add_option('--ff', help='Path of the filtered sequences output FASTA file') | |
291 | |
285 options, args = parser.parse_args() | 292 options, args = parser.parse_args() |
286 if args: | 293 if args: |
287 raise Exception('Use options to provide inputs') | 294 raise Exception('Use options to provide inputs') |
288 | 295 |
289 conn = sqlite3.connect(options.output) | 296 conn = sqlite3.connect(options.output) |
366 # For each gene, select the transcript with the longest sequence | 373 # For each gene, select the transcript with the longest sequence |
367 # If more than one transcripts have the same longest sequence for a gene, the | 374 # If more than one transcripts have the same longest sequence for a gene, the |
368 # first one to appear in the FASTA file is selected | 375 # first one to appear in the FASTA file is selected |
369 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] | 376 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] |
370 | 377 |
371 with open(options.of, 'w') as output_fasta_file: | 378 regions = [_.strip().lower() for _ in options.regions.split(",")] |
379 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: | |
372 for fasta_arg in options.fasta: | 380 for fasta_arg in options.fasta: |
373 for entry in FASTAReader_gen(fasta_arg): | 381 for entry in FASTAReader_gen(fasta_arg): |
374 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | 382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) |
375 if options.longestCDS and transcript_id not in selected_transcript_ids: | 383 if options.longestCDS and transcript_id not in selected_transcript_ids: |
376 continue | 384 continue |
377 | 385 |
378 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) | 386 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) |
379 if not species_for_transcript: | 387 if not species_for_transcript: |
380 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) | 388 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
381 continue | 389 continue |
382 | 390 |
383 if options.headers: | 391 if options.headers: |
385 # Remove any underscore in the species | 393 # Remove any underscore in the species |
386 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) | 394 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) |
387 else: | 395 else: |
388 header = entry.header | 396 header = entry.header |
389 | 397 |
390 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) | 398 if seq_region_for_transcript.lower() in regions: |
399 filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) | |
400 else: | |
401 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) | |
391 | 402 |
392 conn.close() | 403 conn.close() |
393 | 404 |
394 | 405 |
395 if __name__ == '__main__': | 406 if __name__ == '__main__': |