Mercurial > repos > earlhaminst > treebest_best
comparison fasta_header_converter.py @ 3:dd268de3a107 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
| author | earlhaminst |
|---|---|
| date | Fri, 03 Mar 2017 07:22:53 -0500 |
| parents | 4f9e5110914b |
| children |
comparison
equal
deleted
inserted
replaced
| 2:7ea4df039a53 | 3:dd268de3a107 |
|---|---|
| 1 from __future__ import print_function | 1 from __future__ import print_function |
| 2 | 2 |
| 3 import collections | |
| 3 import json | 4 import json |
| 4 import optparse | 5 import optparse |
| 6 import sys | |
| 7 | |
| 8 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) | |
| 9 | |
| 10 | |
| 11 def FASTAReader_gen(fasta_filename): | |
| 12 with open(fasta_filename) as fasta_file: | |
| 13 line = fasta_file.readline() | |
| 14 while True: | |
| 15 if not line: | |
| 16 return | |
| 17 assert line.startswith('>'), "FASTA headers must start with >" | |
| 18 header = line.rstrip() | |
| 19 sequence_parts = [] | |
| 20 line = fasta_file.readline() | |
| 21 while line and line[0] != '>': | |
| 22 sequence_parts.append(line.rstrip()) | |
| 23 line = fasta_file.readline() | |
| 24 sequence = "\n".join(sequence_parts) | |
| 25 yield Sequence(header, sequence) | |
| 5 | 26 |
| 6 | 27 |
| 7 def read_gene_info(gene_info): | 28 def read_gene_info(gene_info): |
| 8 transcript_species_dict = dict() | 29 transcript_species_dict = dict() |
| 9 for gene_dict in gene_info.values(): | 30 for gene_dict in gene_info.values(): |
| 15 parser = optparse.OptionParser() | 36 parser = optparse.OptionParser() |
| 16 parser.add_option('-j', '--json', dest="input_gene_filename", | 37 parser.add_option('-j', '--json', dest="input_gene_filename", |
| 17 help='Gene feature information in JSON format') | 38 help='Gene feature information in JSON format') |
| 18 parser.add_option('-f', '--fasta', dest="input_fasta_filename", | 39 parser.add_option('-f', '--fasta', dest="input_fasta_filename", |
| 19 help='Sequences in FASTA format') | 40 help='Sequences in FASTA format') |
| 41 parser.add_option('-o', '--output', dest="output_fasta_filename", | |
| 42 help='Output FASTA file name') | |
| 20 options, args = parser.parse_args() | 43 options, args = parser.parse_args() |
| 21 | 44 |
| 22 if options.input_gene_filename is None: | 45 if options.input_gene_filename is None: |
| 23 raise Exception('-j option must be specified') | 46 raise Exception('-j option must be specified') |
| 24 | |
| 25 if options.input_fasta_filename is None: | 47 if options.input_fasta_filename is None: |
| 26 raise Exception('-f option must be specified') | 48 raise Exception('-f option must be specified') |
| 49 if options.output_fasta_filename is None: | |
| 50 raise Exception('-o option must be specified') | |
| 27 | 51 |
| 28 with open(options.input_gene_filename) as json_fh: | 52 with open(options.input_gene_filename) as json_fh: |
| 29 gene_info = json.load(json_fh) | 53 gene_info = json.load(json_fh) |
| 30 transcript_species_dict = read_gene_info(gene_info) | 54 transcript_species_dict = read_gene_info(gene_info) |
| 31 | 55 |
| 32 with open(options.input_fasta_filename) as fasta_fh: | 56 with open(options.output_fasta_filename, 'w') as output_fasta_file: |
| 33 for line in fasta_fh: | 57 for entry in FASTAReader_gen(options.input_fasta_filename): |
| 34 line = line.rstrip() | 58 name = entry.header[1:].lstrip() |
| 35 if line.startswith(">"): | 59 if name not in transcript_species_dict: |
| 36 name = line[1:].lstrip() | 60 print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr) |
| 37 print(">" + name + "_" + transcript_species_dict[name]) | 61 continue |
| 38 else: | 62 output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence)) |
| 39 print(line) |
