Mercurial > repos > earlhaminst > gstf_preparation
comparison gstf_preparation.py @ 4:284f64ad9d43 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
author | earlhaminst |
---|---|
date | Fri, 08 Dec 2017 05:32:12 -0500 |
parents | 7e11a7f4bdba |
children | b3ba0c84667c |
comparison
equal
deleted
inserted
replaced
3:7e11a7f4bdba | 4:284f64ad9d43 |
---|---|
249 if not results: | 249 if not results: |
250 return None | 250 return None |
251 return results[0] | 251 return results[0] |
252 | 252 |
253 | 253 |
254 def fetch_gene_id_for_transcript(conn, transcript_id): | |
255 cur = conn.cursor() | |
256 | |
257 cur.execute('SELECT gene_id FROM transcript WHERE transcript_id=?', | |
258 (transcript_id, )) | |
259 results = cur.fetchone() | |
260 if not results: | |
261 return None | |
262 return results[0] | |
263 | |
264 | |
254 def remove_id_version(s): | 265 def remove_id_version(s): |
255 """ | 266 """ |
256 Remove the optional '.VERSION' from an Ensembl id. | 267 Remove the optional '.VERSION' from an Ensembl id. |
257 """ | 268 """ |
258 if s.startswith('ENS'): | 269 if s.startswith('ENS'): |
264 def __main__(): | 275 def __main__(): |
265 parser = optparse.OptionParser() | 276 parser = optparse.OptionParser() |
266 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') | 277 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
267 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') | 278 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
268 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') | 279 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') |
280 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') | |
281 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') | |
269 parser.add_option('-o', '--output', help='Path of the output SQLite file') | 282 parser.add_option('-o', '--output', help='Path of the output SQLite file') |
270 parser.add_option('--of', help='Path of the output FASTA file') | 283 parser.add_option('--of', help='Path of the output FASTA file') |
271 options, args = parser.parse_args() | 284 options, args = parser.parse_args() |
272 if args: | 285 if args: |
273 raise Exception('Use options to provide inputs') | 286 raise Exception('Use options to provide inputs') |
285 transcript_dict = dict() | 298 transcript_dict = dict() |
286 exon_parent_dict = dict() | 299 exon_parent_dict = dict() |
287 cds_parent_dict = dict() | 300 cds_parent_dict = dict() |
288 five_prime_utr_parent_dict = dict() | 301 five_prime_utr_parent_dict = dict() |
289 three_prime_utr_parent_dict = dict() | 302 three_prime_utr_parent_dict = dict() |
303 | |
290 with open(filename) as f: | 304 with open(filename) as f: |
291 for i, line in enumerate(f, start=1): | 305 for i, line in enumerate(f, start=1): |
292 line = line.strip() | 306 line = line.strip() |
293 if not line: | 307 if not line: |
294 # skip empty lines | 308 # skip empty lines |
323 | 337 |
324 for json_arg in options.json: | 338 for json_arg in options.json: |
325 with open(json_arg) as f: | 339 with open(json_arg) as f: |
326 write_gene_dict_to_db(conn, json.load(f)) | 340 write_gene_dict_to_db(conn, json.load(f)) |
327 | 341 |
328 with open(options.of, 'w') as output_fasta_file: | 342 if options.longestCDS: |
343 gene_transcripts_dict = dict() | |
329 for fasta_arg in options.fasta: | 344 for fasta_arg in options.fasta: |
330 for entry in FASTAReader_gen(fasta_arg): | 345 for entry in FASTAReader_gen(fasta_arg): |
331 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id | 346 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id |
332 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | 347 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) |
348 | |
349 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) | |
350 if not gene_id: | |
351 continue | |
352 | |
353 if gene_id in gene_transcripts_dict: | |
354 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) | |
355 else: | |
356 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] | |
357 | |
358 # For each gene, select the transcript with the longest sequence | |
359 # If more than one transcripts have the same longest sequence for a gene, the | |
360 # first one to appear in the FASTA file is selected | |
361 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] | |
362 | |
363 with open(options.of, 'w') as output_fasta_file: | |
364 for fasta_arg in options.fasta: | |
365 for entry in FASTAReader_gen(fasta_arg): | |
366 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) | |
367 if options.longestCDS and transcript_id not in selected_transcript_ids: | |
368 continue | |
369 | |
333 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) | 370 species_for_transcript = fetch_species_for_transcript(conn, transcript_id) |
334 if not species_for_transcript: | 371 if not species_for_transcript: |
335 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) | 372 print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr) |
336 continue | 373 continue |
337 # Remove any underscore in the species | 374 |
338 species_for_transcript = species_for_transcript.replace('_', '') | 375 if options.headers: |
339 # Write the FASTA sequence using '>TranscriptId_species' as the header, as required by TreeBest | 376 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
340 output_fasta_file.write(">%s_%s\n%s\n" % (transcript_id, species_for_transcript, entry.sequence)) | 377 # Remove any underscore in the species |
378 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) | |
379 else: | |
380 header = entry.header | |
381 | |
382 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) | |
341 | 383 |
342 conn.close() | 384 conn.close() |
343 | 385 |
344 | 386 |
345 if __name__ == '__main__': | 387 if __name__ == '__main__': |