Mercurial > repos > earlhaminst > treebest_best
annotate fasta_header_converter.py @ 3:dd268de3a107 draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
author | earlhaminst |
---|---|
date | Fri, 03 Mar 2017 07:22:53 -0500 |
parents | 4f9e5110914b |
children |
rev | line source |
---|---|
0
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
1 from __future__ import print_function |
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
2 |
3
dd268de3a107
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents:
0
diff
changeset
|
3 import collections |
0
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
4 import json |
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
5 import optparse |
3
dd268de3a107
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents:
0
diff
changeset
|
6 import sys |
dd268de3a107
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents:
0
diff
changeset
|
7 |
dd268de3a107
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 988b1fc1cb8739e45648465adbf099f3fdaf87f8
earlhaminst
parents:
0
diff
changeset
|
# One FASTA record: ``header`` is the full header line (leading '>' kept,
# trailing whitespace stripped); ``sequence`` is the record's residue lines
# joined with "\n".
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])


def FASTAReader_gen(fasta_filename):
    """Yield one ``Sequence`` per record found in a FASTA file.

    The file is streamed line by line, so only one record is held in memory
    at a time.

    :param fasta_filename: path of the FASTA file to read.
    :returns: a generator of ``Sequence(header, sequence)`` tuples, in file
        order. ``header`` still starts with '>'; ``sequence`` contains the
        record's lines (trailing whitespace stripped) joined with "\\n".
        An empty file yields nothing.
    :raises AssertionError: if sequence data appears before the first
        header line.
    """
    header = None
    sequence_parts = []
    with open(fasta_filename) as fasta_file:
        for raw_line in fasta_file:
            line = raw_line.rstrip()
            if not line:
                # Blank lines carry no data; skipping them avoids stray
                # empty lines inside the emitted sequence and tolerates
                # blank lines before the first record.
                continue
            if line.startswith('>'):
                if header is not None:
                    # A new header closes the record collected so far.
                    yield Sequence(header, "\n".join(sequence_parts))
                header = line
                sequence_parts = []
            elif header is None:
                # Raised explicitly (not via ``assert``) so the check is
                # still enforced when Python runs with -O.
                raise AssertionError("FASTA headers must start with >")
            else:
                sequence_parts.append(line)
    if header is not None:
        # Flush the last record, which has no following header to close it.
        yield Sequence(header, "\n".join(sequence_parts))
0
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
26 |
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
27 |
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
def read_gene_info(gene_info):
    """Build a transcript-id -> species lookup from gene feature data.

    :param gene_info: mapping whose values are per-gene dicts. Each gene
        dict must have a 'Transcript' entry that iterates over transcript
        dicts, each with an 'id' and a 'species' string. The keys of
        ``gene_info`` itself are not used.
    :returns: dict mapping each transcript 'id' to its 'species' with every
        underscore removed. If the same id occurs more than once, the last
        occurrence wins.
    :raises KeyError: if a gene dict lacks 'Transcript', or a transcript
        dict lacks 'id' or 'species'.
    """
    transcript_species_dict = {}
    for gene_dict in gene_info.values():
        for transcript in gene_dict['Transcript']:
            # Underscores are dropped from the species name here; the
            # calling script uses '_' to separate name and species.
            species = transcript['species'].replace("_", "")
            transcript_species_dict[transcript['id']] = species
    return transcript_species_dict
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
34 |
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
35 |
4f9e5110914b
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/TreeBest commit 136cba2b8c8a2ac2465e7e9420314f2511b991f2-dirty
earlhaminst
parents:
diff
changeset
|
def main(argv=None):
    """Rewrite FASTA headers as ``>{transcript}_{species}``.

    Reads gene feature information (JSON) and a FASTA file, then writes a
    new FASTA file in which each header is the transcript name followed by
    '_' and the species looked up for that transcript. Records whose name
    is not present in the gene feature information are reported on stderr
    and left out of the output.

    :param argv: list of command-line arguments to parse; ``None`` makes
        ``optparse`` fall back to ``sys.argv[1:]``.
    :raises Exception: if any of the -j, -f or -o options is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('-j', '--json', dest="input_gene_filename",
                      help='Gene feature information in JSON format')
    parser.add_option('-f', '--fasta', dest="input_fasta_filename",
                      help='Sequences in FASTA format')
    parser.add_option('-o', '--output', dest="output_fasta_filename",
                      help='Output FASTA file name')
    options, _ = parser.parse_args(argv)

    if options.input_gene_filename is None:
        raise Exception('-j option must be specified')
    if options.input_fasta_filename is None:
        raise Exception('-f option must be specified')
    if options.output_fasta_filename is None:
        raise Exception('-o option must be specified')

    with open(options.input_gene_filename) as json_fh:
        gene_info = json.load(json_fh)
    transcript_species_dict = read_gene_info(gene_info)

    with open(options.output_fasta_filename, 'w') as output_fasta_file:
        for entry in FASTAReader_gen(options.input_fasta_filename):
            # Drop the leading '>' and any whitespace that follows it; the
            # whole remainder of the header is used as the lookup key.
            name = entry.header[1:].lstrip()
            if name not in transcript_species_dict:
                print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr)
                continue
            output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence))


# Guarded so that importing this module does not parse sys.argv or touch
# any files; running it as a script behaves as before.
if __name__ == '__main__':
    main()