Mercurial > repos > earlhaminst > treebest_best
comparison fasta_header_converter.py @ 3:dd268de3a107 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author | earlhaminst |
---|---|
date | Fri, 03 Mar 2017 07:22:53 -0500 |
parents | 4f9e5110914b |
children |
comparison
equal
deleted
inserted
replaced
2:7ea4df039a53 | 3:dd268de3a107 |
---|---|
1 from __future__ import print_function | 1 from __future__ import print_function |
2 | 2 |
3 import collections | |
3 import json | 4 import json |
4 import optparse | 5 import optparse |
6 import sys | |
7 | |
8 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | |
9 | |
10 | |
11 def FASTAReader_gen(fasta_filename): | |
12 with open(fasta_filename) as fasta_file: | |
13 line = fasta_file.readline() | |
14 while True: | |
15 if not line: | |
16 return | |
17 assert line.startswith('>'), "FASTA headers must start with >" | |
18 header = line.rstrip() | |
19 sequence_parts = [] | |
20 line = fasta_file.readline() | |
21 while line and line[0] != '>': | |
22 sequence_parts.append(line.rstrip()) | |
23 line = fasta_file.readline() | |
24 sequence = "\n".join(sequence_parts) | |
25 yield Sequence(header, sequence) | |
5 | 26 |
6 | 27 |
7 def read_gene_info(gene_info): | 28 def read_gene_info(gene_info): |
8 transcript_species_dict = dict() | 29 transcript_species_dict = dict() |
9 for gene_dict in gene_info.values(): | 30 for gene_dict in gene_info.values(): |
15 parser = optparse.OptionParser() | 36 parser = optparse.OptionParser() |
16 parser.add_option('-j', '--json', dest="input_gene_filename", | 37 parser.add_option('-j', '--json', dest="input_gene_filename", |
17 help='Gene feature information in JSON format') | 38 help='Gene feature information in JSON format') |
18 parser.add_option('-f', '--fasta', dest="input_fasta_filename", | 39 parser.add_option('-f', '--fasta', dest="input_fasta_filename", |
19 help='Sequences in FASTA format') | 40 help='Sequences in FASTA format') |
41 parser.add_option('-o', '--output', dest="output_fasta_filename", | |
42 help='Output FASTA file name') | |
20 options, args = parser.parse_args() | 43 options, args = parser.parse_args() |
21 | 44 |
22 if options.input_gene_filename is None: | 45 if options.input_gene_filename is None: |
23 raise Exception('-j option must be specified') | 46 raise Exception('-j option must be specified') |
24 | |
25 if options.input_fasta_filename is None: | 47 if options.input_fasta_filename is None: |
26 raise Exception('-f option must be specified') | 48 raise Exception('-f option must be specified') |
49 if options.output_fasta_filename is None: | |
50 raise Exception('-o option must be specified') | |
27 | 51 |
28 with open(options.input_gene_filename) as json_fh: | 52 with open(options.input_gene_filename) as json_fh: |
29 gene_info = json.load(json_fh) | 53 gene_info = json.load(json_fh) |
30 transcript_species_dict = read_gene_info(gene_info) | 54 transcript_species_dict = read_gene_info(gene_info) |
31 | 55 |
32 with open(options.input_fasta_filename) as fasta_fh: | 56 with open(options.output_fasta_filename, 'w') as output_fasta_file: |
33 for line in fasta_fh: | 57 for entry in FASTAReader_gen(options.input_fasta_filename): |
34 line = line.rstrip() | 58 name = entry.header[1:].lstrip() |
35 if line.startswith(">"): | 59 if name not in transcript_species_dict: |
36 name = line[1:].lstrip() | 60 print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr) |
37 print(">" + name + "_" + transcript_species_dict[name]) | 61 continue |
38 else: | 62 output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence)) |
39 print(line) |