comparison gstf_preparation.py @ 11:dbe37a658cd2 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
author earlhaminst
date Sun, 27 Sep 2020 18:54:31 +0000
parents e8e75a79de59
children 99bae410128c
comparison
equal deleted inserted replaced
10:e8e75a79de59 11:dbe37a658cd2
4 import optparse 4 import optparse
5 import os 5 import os
6 import sqlite3 6 import sqlite3
7 import sys 7 import sys
8 8
9 version = "0.4.0" 9 version = "0.5.0"
10 gene_count = 0 10 gene_count = 0
11 11
12 12
13 class Sequence(object): 13 class Sequence(object):
14 def __init__(self, header, sequence_parts): 14 def __init__(self, header, sequence_parts):
59 seq_region_name VARCHAR NOT NULL, 59 seq_region_name VARCHAR NOT NULL,
60 seq_region_start INTEGER NOT NULL, 60 seq_region_start INTEGER NOT NULL,
61 seq_region_end INTEGER NOT NULL, 61 seq_region_end INTEGER NOT NULL,
62 seq_region_strand INTEGER NOT NULL, 62 seq_region_strand INTEGER NOT NULL,
63 species VARCHAR NOT NULL, 63 species VARCHAR NOT NULL,
64 biotype VARCHAR,
64 gene_json VARCHAR NOT NULL)''') 65 gene_json VARCHAR NOT NULL)''')
65 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') 66 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')
66 67
67 cur.execute('''CREATE TABLE transcript ( 68 cur.execute('''CREATE TABLE transcript (
68 transcript_id VARCHAR PRIMARY KEY NOT NULL, 69 transcript_id VARCHAR PRIMARY KEY NOT NULL,
70 transcript_symbol VARCHAR,
69 protein_id VARCHAR UNIQUE, 71 protein_id VARCHAR UNIQUE,
70 protein_sequence VARCHAR, 72 protein_sequence VARCHAR,
73 biotype VARCHAR,
74 is_canonical BOOLEAN NOT NULL DEFAULT FALSE,
71 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') 75 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')
72 76
73 cur.execute('''CREATE VIEW transcript_species AS 77 # The following temporary view is not used in GAFA, so schema changes to it
74 SELECT transcript_id, species, seq_region_name 78 # don't require a meta version upgrade.
79 cur.execute('''CREATE TEMPORARY VIEW transcript_join_gene AS
80 SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species
75 FROM transcript JOIN gene 81 FROM transcript JOIN gene
76 ON transcript.gene_id = gene.gene_id''') 82 USING (gene_id)''')
77 83
78 conn.commit() 84 conn.commit()
79 85
80 86
81 def remove_type_from_list_of_ids(l): 87 def fetch_transcript_and_gene(conn, transcript_id):
82 return ','.join(remove_type_from_id(_) for _ in l.split(',')) 88 cur = conn.cursor()
89
90 cur.execute('SELECT * FROM transcript_join_gene WHERE transcript_id=?',
91 (transcript_id, ))
92 return cur.fetchone()
93
94
95 def remove_type_from_list_of_ids(ids):
96 return ','.join(remove_type_from_id(id_) for id_ in ids.split(','))
83 97
84 98
85 def remove_type_from_id(id_): 99 def remove_type_from_id(id_):
86 colon_index = id_.find(':') 100 colon_index = id_.find(':')
87 if colon_index >= 0: 101 if colon_index >= 0:
101 if tag == 'ID': 115 if tag == 'ID':
102 tag = 'id' 116 tag = 'id'
103 value = remove_type_from_id(value) 117 value = remove_type_from_id(value)
104 elif tag == 'Parent': 118 elif tag == 'Parent':
105 value = remove_type_from_list_of_ids(value) 119 value = remove_type_from_list_of_ids(value)
120 elif tag == 'representative':
121 tag = 'is_canonical'
106 d[tag] = value 122 d[tag] = value
107 if cols[6] == '+': 123 if cols[6] == '+':
108 d['strand'] = 1 124 d['strand'] = 1
109 elif cols[6] == '-': 125 elif cols[6] == '-':
110 d['strand'] = -1 126 d['strand'] = -1
120 136
121 137
122 def add_gene_to_dict(cols, species, gene_dict): 138 def add_gene_to_dict(cols, species, gene_dict):
123 global gene_count 139 global gene_count
124 gene = feature_to_dict(cols) 140 gene = feature_to_dict(cols)
141 if not gene['id']:
142 raise Exception("Id not found among column 9 attribute tags: %s" % cols[8])
125 gene.update({ 143 gene.update({
126 'member_id': gene_count, 144 'member_id': gene_count,
127 'object_type': 'Gene', 145 'object_type': 'Gene',
128 'seq_region_name': cols[0], 146 'seq_region_name': cols[0],
129 'species': species, 147 'species': species,
130 'Transcript': [], 148 'Transcript': [],
131 'display_name': gene.get('Name', None) 149 'display_name': gene.get('Name'),
132 }) 150 })
133 if gene['id']: 151 gene_dict[gene['id']] = gene
134 gene_dict[gene['id']] = gene 152 gene_count = gene_count + 1
135 gene_count = gene_count + 1
136 153
137 154
138 def add_transcript_to_dict(cols, species, transcript_dict): 155 def add_transcript_to_dict(cols, species, transcript_dict):
139 transcript = feature_to_dict(cols) 156 transcript = feature_to_dict(cols)
140 if 'biotype' in transcript and transcript['biotype'] != 'protein_coding':
141 return
142 transcript.update({ 157 transcript.update({
143 'object_type': 'Transcript', 158 'object_type': 'Transcript',
144 'seq_region_name': cols[0], 159 'seq_region_name': cols[0],
145 'species': species, 160 'species': species,
161 'display_name': transcript.get('Name'),
146 }) 162 })
147 transcript_dict[transcript['id']] = transcript 163 transcript_dict[transcript['id']] = transcript
148 164
149 165
150 def add_exon_to_dict(cols, species, exon_parent_dict): 166 def add_exon_to_dict(cols, species, exon_parent_dict):
240 256
241 for gene in gene_dict.values(): 257 for gene in gene_dict.values():
242 if gene is None: 258 if gene is None:
243 # This can happen when loading a JSON file from Ensembl 259 # This can happen when loading a JSON file from Ensembl
244 continue 260 continue
261 if 'confidence' in gene and gene['confidence'] != 'high':
262 print("Gene %s has confidence %s (not high), discarding" % (gene['id'], gene['confidence']), file=sys.stderr)
263 continue
245 gene_id = gene['id'] 264 gene_id = gene['id']
246 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)', 265 cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
247 (gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene))) 266 (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene)))
248 267
249 if "Transcript" in gene: 268 if "Transcript" in gene:
250 for transcript in gene["Transcript"]: 269 for transcript in gene["Transcript"]:
251 transcript_id = transcript['id'] 270 transcript_id = transcript['id']
252 protein_id = transcript.get('Translation', {}).get('id', None) 271 transcript_symbol = transcript.get('display_name')
272 protein_id = transcript.get('Translation', {}).get('id')
273 biotype = transcript.get('biotype')
274 is_canonical = transcript.get('is_canonical', False)
275 to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id)
253 try: 276 try:
254 cur.execute('INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)', 277 cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)',
255 (transcript_id, protein_id, gene_id)) 278 to_insert)
256 except Exception as e: 279 except Exception as e:
257 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e)) 280 raise Exception("Error while inserting %s into transcript table: %s" % (str(to_insert), e))
258 281
259 conn.commit() 282 conn.commit()
260
261
262 def fetch_species_and_seq_region_for_transcript(conn, transcript_id):
263 cur = conn.cursor()
264
265 cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?',
266 (transcript_id, ))
267 row = cur.fetchone()
268 if not row:
269 return (None, None)
270 return row
271
272
273 def fetch_gene_id_for_transcript(conn, transcript_id):
274 cur = conn.cursor()
275
276 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?',
277 (transcript_id, ))
278 row = cur.fetchone()
279 if not row:
280 return None
281 return row[0]
282 283
283 284
284 def remove_id_version(s, force=False): 285 def remove_id_version(s, force=False):
285 """ 286 """
286 Remove the optional '.VERSION' from an id if it's an Ensembl id or if 287 Remove the optional '.VERSION' from an id if it's an Ensembl id or if
295 def __main__(): 296 def __main__():
296 parser = optparse.OptionParser() 297 parser = optparse.OptionParser()
297 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') 298 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
298 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') 299 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
299 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') 300 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
300 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') 301 parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep')
301 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') 302 parser.add_option('--headers', type='choice',
303 choices=['TranscriptId_species', 'GeneSymbol-TranscriptID_species', 'TranscriptSymbol-TranscriptID_species', ''],
304 default='', help='Change the header line of the FASTA sequences to this format')
302 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') 305 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
303 parser.add_option('-o', '--output', help='Path of the output SQLite file') 306 parser.add_option('-o', '--output', help='Path of the output SQLite file')
304 parser.add_option('--of', help='Path of the output FASTA file') 307 parser.add_option('--of', help='Path of the output FASTA file')
305 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') 308 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file')
306 309
307 options, args = parser.parse_args() 310 options, args = parser.parse_args()
308 if args: 311 if args:
309 raise Exception('Use options to provide inputs') 312 raise Exception('Use options to provide inputs')
310 313
311 conn = sqlite3.connect(options.output) 314 conn = sqlite3.connect(options.output)
315 conn.row_factory = sqlite3.Row
312 conn.execute('PRAGMA foreign_keys = ON') 316 conn.execute('PRAGMA foreign_keys = ON')
313 create_tables(conn) 317 create_tables(conn)
314 318
315 for gff3_arg in options.gff3: 319 for gff3_arg in options.gff3:
316 try: 320 try:
369 write_gene_dict_to_db(conn, json.load(f)) 373 write_gene_dict_to_db(conn, json.load(f))
370 374
371 # Read the FASTA files a first time to: 375 # Read the FASTA files a first time to:
372 # - determine for each file if we need to force the removal of the version 376 # - determine for each file if we need to force the removal of the version
373 # from the transcript id 377 # from the transcript id
374 # - fill gene_transcripts_dict when keeping only the longest CDS per gene 378 # - fill gene_transcripts_dict when keeping only the canonical transcripts
375 force_remove_id_version_file_list = [] 379 force_remove_id_version_file_list = []
376 gene_transcripts_dict = dict() 380 gene_transcripts_dict = dict()
377 for fasta_arg in options.fasta: 381 for fasta_arg in options.fasta:
378 force_remove_id_version = False 382 force_remove_id_version = False
379 found_gene_transcript = False 383 found_gene_transcript = False
380 for entry in FASTAReader_gen(fasta_arg): 384 for entry in FASTAReader_gen(fasta_arg):
381 # Extract the transcript id by removing everything after the first space and then removing the version if needed 385 # Extract the transcript id by removing everything after the first space and then removing the version if needed
382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) 386 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
383 387
384 if len(entry.sequence) % 3 != 0: 388 transcript = fetch_transcript_and_gene(conn, transcript_id)
385 continue 389 if not transcript and not found_gene_transcript:
386
387 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
388 if not gene_id and not found_gene_transcript:
389 # We have not found a proper gene transcript in this file yet, 390 # We have not found a proper gene transcript in this file yet,
390 # try to force the removal of the version from the transcript id 391 # try to force the removal of the version from the transcript id
391 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True) 392 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True)
392 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) 393 transcript = fetch_transcript_and_gene(conn, transcript_id)
393 # Remember that we need to force the removal for this file 394 # Remember that we need to force the removal for this file
394 if gene_id: 395 if transcript:
395 force_remove_id_version = True 396 force_remove_id_version = True
396 force_remove_id_version_file_list.append(fasta_arg) 397 force_remove_id_version_file_list.append(fasta_arg)
397 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr) 398 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr)
398 if not gene_id: 399 if not transcript:
399 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 400 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
400 continue 401 continue
401 if options.longestCDS: 402 if options.filter != 'canonical':
402 found_gene_transcript = True 403 break
404 found_gene_transcript = True
405
406 if len(entry.sequence) % 3 != 0:
407 continue
408 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene
409 if transcript_biotype and transcript_biotype != 'protein_coding':
410 continue
411 gene_transcripts_dict.setdefault(transcript['gene_id'], []).append((transcript_id, transcript['is_canonical'], len(entry.sequence)))
412
413 if options.filter == 'canonical':
414 selected_transcript_ids = []
415 for gene_id, transcript_tuples in gene_transcripts_dict.items():
416 canonical_transcript_ids = [id_ for (id_, is_canonical, _) in transcript_tuples if is_canonical]
417 if not canonical_transcript_ids:
418 # Select the transcript with the longest sequence. If more than
419 # one transcripts have the same longest sequence for a gene, the
420 # first one to appear in the FASTA file is selected.
421 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0]
422 elif len(canonical_transcript_ids) > 1:
423 raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id))
403 else: 424 else:
404 break 425 selected_transcript_id = canonical_transcript_ids[0]
405 426 selected_transcript_ids.append(selected_transcript_id)
406 gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence)))
407
408 if options.longestCDS:
409 # For each gene, select the transcript with the longest sequence.
410 # If more than one transcripts have the same longest sequence for a
411 # gene, the first one to appear in the FASTA file is selected
412 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
413 427
414 regions = [_.strip().lower() for _ in options.regions.split(",")] 428 regions = [_.strip().lower() for _ in options.regions.split(",")]
415 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: 429 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
416 for fasta_arg in options.fasta: 430 for fasta_arg in options.fasta:
417 force_remove_id_version = fasta_arg in force_remove_id_version_file_list 431 force_remove_id_version = fasta_arg in force_remove_id_version_file_list
418 for entry in FASTAReader_gen(fasta_arg): 432 for entry in FASTAReader_gen(fasta_arg):
419 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) 433 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
420 if options.longestCDS and transcript_id not in selected_transcript_ids: 434
421 continue 435 transcript = fetch_transcript_and_gene(conn, transcript_id)
422 436 if not transcript:
423 if len(entry.sequence) % 3 != 0:
424 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3" % (transcript_id, fasta_arg), file=sys.stderr)
425 continue
426
427 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
428 if not species_for_transcript:
429 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) 437 print("Transcript '%s' in FASTA file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
430 continue 438 continue
431 439
432 if options.headers: 440 if options.filter == 'canonical':
441 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict
442 if transcript_id not in selected_transcript_ids:
443 continue
444 elif options.filter == 'coding':
445 if len(entry.sequence) % 3 != 0:
446 print("Transcript '%s' in FASTA file '%s' has a coding sequence length which is not multiple of 3, removing from FASTA output" % (transcript_id, fasta_arg), file=sys.stderr)
447 continue
448 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene
449 if transcript_biotype and transcript_biotype != 'protein_coding':
450 print("Transcript %s has biotype %s (not protein-coding), removing from FASTA output" % (transcript_id, transcript_biotype), file=sys.stderr)
451 continue
452
453 if options.headers == "TranscriptId_species":
433 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest 454 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
434 # Remove any underscore in the species 455 # Remove any underscore in the species
435 entry.header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) 456 entry.header = ">%s_%s" % (transcript_id, transcript['species'].replace('_', ''))
436 457 elif options.headers == "GeneSymbol-TranscriptID_species":
437 if seq_region_for_transcript.lower() in regions: 458 # Remove any underscore in the species
459 entry.header = ">%s-%s_%s" % (transcript['gene_symbol'], transcript_id, transcript['species'].replace('_', ''))
460 elif options.headers == "TranscriptSymbol-TranscriptID_species":
461 # Remove any underscore in the species
462 entry.header = ">%s-%s_%s" % (transcript['transcript_symbol'], transcript_id, transcript['species'].replace('_', ''))
463
464 if transcript['seq_region_name'].lower() in regions:
438 entry.print(filtered_fasta_file) 465 entry.print(filtered_fasta_file)
439 else: 466 else:
440 entry.print(output_fasta_file) 467 entry.print(output_fasta_file)
441 468
442 conn.close() 469 conn.close()