Mercurial > repos > earlhaminst > gstf_preparation
annotate gstf_preparation.py @ 7:9ef7661e8e9c draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 79c2cb2362b64134df778cc484e426642eb6895e
author | earlhaminst |
---|---|
date | Wed, 25 Apr 2018 11:06:03 -0400 |
parents | 56bbdbfe3eaa |
children | 92f3966d5bc3 |
rev | line source |
---|---|
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
1 from __future__ import print_function |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
2 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
3 import collections |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
4 import json |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
5 import optparse |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
6 import sqlite3 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
7 import sys |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
8 |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
# Version string that create_tables() inserts into the "meta" table of the
# generated database.
version = "0.4.0"
# Running counter read by add_gene_to_dict() as the 'member_id' of each gene
# and incremented there every time a gene is stored.
gene_count = 0

# One FASTA record as yielded by FASTAReader_gen(): the header line and the
# sequence text.
Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
13 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
14 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def FASTAReader_gen(fasta_filename):
    """
    Iterate over the records of a FASTA file.

    Yields one ``Sequence(header, sequence)`` per record. ``header`` is the
    header line (including the leading '>') with trailing whitespace
    removed; ``sequence`` is the record's sequence lines, each stripped of
    trailing whitespace and joined with newlines. An empty file yields
    nothing.

    Raises AssertionError if the first line of the file does not start
    with '>'.
    """
    with open(fasta_filename) as fasta_file:
        header = None
        sequence_parts = []
        for line in fasta_file:
            if line.startswith('>'):
                # A new header closes the record collected so far
                if header is not None:
                    yield Sequence(header, "\n".join(sequence_parts))
                header = line.rstrip()
                sequence_parts = []
            elif header is None:
                # Raised explicitly instead of using an `assert` statement,
                # which is removed when Python runs with -O. The exception
                # type and message are kept for existing callers.
                raise AssertionError("FASTA headers must start with >")
            else:
                sequence_parts.append(line.rstrip())
        # Emit the last record, which is not followed by another header
        if header is not None:
            yield Sequence(header, "\n".join(sequence_parts))
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
30 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
31 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def create_tables(conn):
    """
    Create the database schema on the connection `conn` and commit it.

    Creates the `meta` table and stores the module-level `version` in it,
    then creates the `gene` table with an index on `gene_symbol`, the
    `transcript` table and the `transcript_species` view.
    """
    cursor = conn.cursor()

    meta_table = '''CREATE TABLE meta (
        version VARCHAR PRIMARY KEY NOT NULL)'''
    cursor.execute(meta_table)
    cursor.execute('INSERT INTO meta (version) VALUES (?)',
                   (version, ))

    gene_table = '''CREATE TABLE gene (
        gene_id VARCHAR PRIMARY KEY NOT NULL,
        gene_symbol VARCHAR,
        seq_region_name VARCHAR NOT NULL,
        seq_region_start INTEGER NOT NULL,
        seq_region_end INTEGER NOT NULL,
        seq_region_strand INTEGER NOT NULL,
        species VARCHAR NOT NULL,
        gene_json VARCHAR NOT NULL)'''
    gene_symbol_index = 'CREATE INDEX gene_symbol_index ON gene (gene_symbol)'
    transcript_table = '''CREATE TABLE transcript (
        transcript_id VARCHAR PRIMARY KEY NOT NULL,
        protein_id VARCHAR UNIQUE,
        protein_sequence VARCHAR,
        gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))'''
    transcript_species_view = '''CREATE VIEW transcript_species AS
        SELECT transcript_id, species, seq_region_name
        FROM transcript JOIN gene
        ON transcript.gene_id = gene.gene_id'''

    # Executed in this order: the index and the view refer to the tables
    # created before them
    for statement in (gene_table, gene_symbol_index, transcript_table,
                      transcript_species_view):
        cursor.execute(statement)

    conn.commit()
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
64 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
65 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def remove_type_from_list_of_ids(l):
    """
    Apply remove_type_from_id() to every element of the comma-separated
    string `l` and return the results joined by commas again.
    """
    stripped_ids = [remove_type_from_id(item) for item in l.split(',')]
    return ','.join(stripped_ids)
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
68 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
69 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def remove_type_from_id(id_):
    """
    Return the part of `id_` after its first ':' (e.g. 'gene:X' -> 'X').

    If `id_` contains no ':', it is returned unchanged.
    """
    _prefix, separator, remainder = id_.partition(':')
    # partition() returns an empty separator when no ':' was found
    return remainder if separator else id_
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
76 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
77 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def feature_to_dict(cols, parent_dict=None):
    """
    Convert the columns of one feature line into a dictionary.

    `cols` is the list of column strings of the line; the layout used here
    (start in cols[3], end in cols[4], strand in cols[6], ';'-separated
    tag=value attributes in cols[8]) matches GFF3.

    The returned dict contains:
    - 'start' and 'end' as integers;
    - one entry per tag=value attribute, where the 'ID' tag is stored under
      the key 'id' with its type prefix removed, and the ids listed in
      'Parent' also have their type prefixes removed;
    - 'strand' as 1 for '+' or -1 for '-'.

    If `parent_dict` is given and the feature has a 'Parent' attribute, the
    returned dict is also appended to `parent_dict[parent]` for each of its
    comma-separated parents.

    Raises Exception if the strand column is neither '+' nor '-'.
    """
    d = {
        'end': int(cols[4]),
        'start': int(cols[3]),
    }
    for attr in cols[8].split(';'):
        if '=' in attr:
            # Split on the first '=' only: a value may itself contain '=',
            # which would otherwise make the unpacking fail
            (tag, value) = attr.split('=', 1)
            if tag == 'ID':
                tag = 'id'
                value = remove_type_from_id(value)
            elif tag == 'Parent':
                value = remove_type_from_list_of_ids(value)
            d[tag] = value
    if cols[6] == '+':
        d['strand'] = 1
    elif cols[6] == '-':
        d['strand'] = -1
    else:
        raise Exception("Unrecognized strand '%s'" % cols[6])
    if parent_dict is not None and 'Parent' in d:
        # a 3' UTR can be split among multiple exons
        # a 5' UTR can be split among multiple exons
        # a CDS can be part of multiple transcripts
        for parent in d['Parent'].split(','):
            parent_dict.setdefault(parent, []).append(d)
    return d
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
108 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
109 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_gene_to_dict(cols, species, gene_dict):
    """
    Build a gene dictionary from the feature columns `cols` and, if it has a
    non-empty id, store it in `gene_dict` under that id.

    On top of what feature_to_dict() returns, the gene gets the current
    value of the global `gene_count` as 'member_id', 'object_type' 'Gene',
    `cols[0]` as 'seq_region_name', the given `species`, an empty
    'Transcript' list and its 'Name' attribute (or None) as 'display_name'.
    The global `gene_count` is incremented only when the gene is stored.

    A feature without an ID attribute has no 'id' key, so the lookup below
    raises KeyError.
    """
    global gene_count
    gene = feature_to_dict(cols)
    gene['member_id'] = gene_count
    gene['object_type'] = 'Gene'
    gene['seq_region_name'] = cols[0]
    gene['species'] = species
    gene['Transcript'] = []
    gene['display_name'] = gene.get('Name', None)

    gene_id = gene['id']
    if not gene_id:
        # Genes with an empty id are neither stored nor counted
        return
    gene_dict[gene_id] = gene
    gene_count += 1
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
124 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
125 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_transcript_to_dict(cols, species, transcript_dict):
    """
    Build a transcript dictionary from the feature columns `cols` and store
    it in `transcript_dict` under its id.

    On top of what feature_to_dict() returns, the transcript gets
    'object_type' 'Transcript', `cols[0]` as 'seq_region_name' and the given
    `species`.
    """
    transcript = feature_to_dict(cols)
    transcript['object_type'] = 'Transcript'
    transcript['seq_region_name'] = cols[0]
    transcript['species'] = species
    transcript_id = transcript['id']
    transcript_dict[transcript_id] = transcript
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
134 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
135 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_exon_to_dict(cols, species, exon_parent_dict):
    """
    Convert one exon feature row into a dict and complete its fields.

    The row is parsed by ``feature_to_dict()``, which also receives
    ``exon_parent_dict`` (presumably to register the exon under its
    parent(s) -- see that function). The exon is then given a length, an
    object type, its sequence region name and its species. When the
    feature has no 'id' but has a 'Name', the name is used as ID.

    :param cols: column values of the feature row; ``cols[0]`` is the
        sequence region name, ``cols[3]`` and ``cols[4]`` are the integer
        coordinates used to compute the length
    :param species: species name recorded on the exon
    :param exon_parent_dict: passed through to ``feature_to_dict()``
    """
    record = feature_to_dict(cols, exon_parent_dict)
    last = int(cols[4])
    first = int(cols[3])
    # Both coordinates are part of the exon, hence the +1
    record['length'] = last - first + 1
    record['object_type'] = 'Exon'
    record['seq_region_name'] = cols[0]
    record['species'] = species
    if 'id' in record or 'Name' not in record:
        return
    # No explicit ID: fall back to the feature name
    record['id'] = record['Name']
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
146 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
147 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_cds_to_dict(cols, cds_parent_dict):
    """
    Convert one CDS feature row into a dict and make sure it has an ID
    whenever one can be determined.

    The row is parsed by ``feature_to_dict()``, which also receives
    ``cds_parent_dict``. If the parsed feature has no 'id', the ID falls
    back to its 'Name', or else to its 'Parent' when that attribute names
    a single parent (i.e. contains no comma). Otherwise the CDS is left
    without an 'id'.

    :param cols: column values of the feature row
    :param cds_parent_dict: passed through to ``feature_to_dict()``
    """
    record = feature_to_dict(cols, cds_parent_dict)
    if 'id' in record:
        return
    if 'Name' in record:
        record['id'] = record['Name']
        return
    parent = record.get('Parent')
    # A comma-separated Parent lists several parents, so it cannot be
    # used as a unique ID
    if 'Parent' in record and ',' not in parent:
        record['id'] = parent
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
155 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
156 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def _derive_translation_bounds(transcript_id, transcript, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
    """
    Return the ``(start, end)`` of the translated region implied by the
    UTRs of a transcript; each value is None when the corresponding UTR
    is not available.

    The UTR lists found for the transcript are sorted in place by 'start'.
    On strand 1 the translation starts right after the last 5' UTR and
    ends right before the first 3' UTR; on any other strand the roles of
    the two UTR kinds are swapped.
    """
    derived_start = None
    derived_end = None
    five_prime_utr_list = five_prime_utr_parent_dict.get(transcript_id)
    # An empty list is treated like a missing entry
    if five_prime_utr_list:
        five_prime_utr_list.sort(key=lambda utr: utr['start'])
        if transcript['strand'] == 1:
            derived_start = five_prime_utr_list[-1]['end'] + 1
        else:
            derived_end = five_prime_utr_list[0]['start'] - 1
    three_prime_utr_list = three_prime_utr_parent_dict.get(transcript_id)
    if three_prime_utr_list:
        three_prime_utr_list.sort(key=lambda utr: utr['start'])
        if transcript['strand'] == 1:
            derived_end = three_prime_utr_list[0]['start'] - 1
        else:
            derived_start = three_prime_utr_list[-1]['end'] + 1
    return derived_start, derived_end


def _build_translation(transcript_id, transcript, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
    """
    Return the 'Translation' dict of a transcript, or None when the
    transcript has neither CDSs nor UTRs to derive one from.

    Raises ``Exception`` if the CDSs of the transcript carry more than one
    ID, or if the CDS range overlaps the range left free by the UTRs.
    """
    translation = {
        'CDS': [],
        'id': None,
        'end': transcript['end'],
        'object_type': 'Translation',
        'species': transcript['species'],
        'start': transcript['start'],
    }
    cds_list = cds_parent_dict.get(transcript_id)
    # An empty CDS list is treated like a missing entry
    found_cds = bool(cds_list)
    if found_cds:
        cds_ids = set(cds['id'] for cds in cds_list)
        if len(cds_ids) > 1:
            raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id)
        translation['id'] = cds_ids.pop()
        cds_list.sort(key=lambda cds: cds['start'])
        translation['CDS'] = cds_list
        translation['start'] = cds_list[0]['start']
        translation['end'] = cds_list[-1]['end']

    derived_start, derived_end = _derive_translation_bounds(
        transcript_id, transcript, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
    if derived_start is not None:
        if not found_cds:
            translation['start'] = derived_start
        elif derived_start > translation['start']:
            # The CDS coordinates win, but they must not run into the UTR
            raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, translation['id']))
    if derived_end is not None:
        if not found_cds:
            translation['end'] = derived_end
        elif derived_end < translation['end']:
            raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, translation['id']))

    if found_cds or derived_start is not None or derived_end is not None:
        return translation
    return None


def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
    """
    Assemble the separately collected features into nested gene dicts.

    All the work is done in place on the arguments:

    1. each exon list whose parent is a known transcript is sorted by
       'start' and stored as the 'Exon' entry of that transcript;
    2. each transcript having CDSs and/or UTRs gets a 'Translation' entry
       whose range comes from the CDSs or, without CDSs, from the UTRs;
    3. each transcript is appended to the 'Transcript' list of every
       parent (comma-separated 'Parent' entry) found in ``gene_dict``.

    :param gene_dict: gene ID -> gene dict (must hold a 'Transcript' list)
    :param transcript_dict: transcript ID -> transcript dict
    :param exon_parent_dict: transcript ID -> list of exon dicts
    :param cds_parent_dict: transcript ID -> list of CDS dicts
    :param five_prime_utr_parent_dict: transcript ID -> list of 5' UTR dicts
    :param three_prime_utr_parent_dict: transcript ID -> list of 3' UTR dicts

    Raises ``Exception`` if a transcript has CDSs with different IDs or
    has a CDS overlapping with one of its UTRs.
    """
    for parent, exon_list in exon_parent_dict.items():
        if parent in transcript_dict:
            exon_list.sort(key=lambda exon: exon['start'])
            transcript_dict[parent]['Exon'] = exon_list

    for transcript_id, transcript in transcript_dict.items():
        translation = _build_translation(
            transcript_id, transcript, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
        if translation is not None:
            transcript['Translation'] = translation

    # Done in a separate pass so that no gene is modified if building a
    # translation fails
    for transcript in transcript_dict.values():
        if 'Parent' in transcript:
            # A polycistronic transcript can have multiple parents
            for parent in transcript['Parent'].split(','):
                if parent in gene_dict:
                    gene_dict[parent]['Transcript'].append(transcript)
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
222 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
223 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def write_gene_dict_to_db(conn, gene_dict):
    """
    Store every gene of ``gene_dict``, with its transcripts, in the database.

    One row is inserted into the ``gene`` table per gene (the whole gene
    dict is saved as JSON in ``gene_json``) and one row into the
    ``transcript`` table per entry of the gene's 'Transcript' list, if
    any. The transaction is committed once, after all the inserts.

    :param conn: open database connection
    :param gene_dict: gene ID -> gene dict; None values are skipped

    Raises ``Exception`` if a transcript row cannot be inserted.
    """
    gene_sql = 'INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
    transcript_sql = 'INSERT INTO transcript (transcript_id, protein_id, gene_id) VALUES (?, ?, ?)'
    cursor = conn.cursor()

    for gene in gene_dict.values():
        if gene is None:
            # This can happen when loading a JSON file from Ensembl
            continue
        gene_id = gene['id']
        gene_row = (
            gene_id,
            gene.get('display_name'),
            gene['seq_region_name'],
            gene['start'],
            gene['end'],
            gene['strand'],
            gene['species'],
            json.dumps(gene),
        )
        cursor.execute(gene_sql, gene_row)

        for transcript in gene.get("Transcript", ()):
            transcript_id = transcript['id']
            # A transcript without translation has no protein ID
            protein_id = transcript.get('Translation', {}).get('id')
            transcript_row = (transcript_id, protein_id, gene_id)
            try:
                cursor.execute(transcript_sql, transcript_row)
            except Exception as e:
                raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e))

    conn.commit()
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
246 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
247 |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
def fetch_species_and_seq_region_for_transcript(conn, transcript_id):
    """
    Look up the species and the sequence region name of a transcript.

    :param conn: open database connection providing ``transcript_species``
    :param transcript_id: ID of the transcript to search for
    :return: the first matching ``(species, seq_region_name)`` row, or
        None if the transcript is not found
    """
    query = 'SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?'
    cursor = conn.cursor()
    cursor.execute(query, (transcript_id, ))
    row = cursor.fetchone()
    return row if row else None
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
257 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
258 |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
def fetch_gene_id_for_transcript(conn, transcript_id):
    """
    Look up the gene a transcript belongs to.

    :param conn: open database connection providing the ``transcript`` table
    :param transcript_id: ID of the transcript to search for
    :return: the ``gene_id`` of the first matching row, or None if the
        transcript is not found
    """
    query = 'SELECT gene_id FROM transcript WHERE transcript_id=?'
    cursor = conn.cursor()
    cursor.execute(query, (transcript_id, ))
    row = cursor.fetchone()
    return row[0] if row else None
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
268 |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
269 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def remove_id_version(s):
    """
    Remove the optional '.VERSION' from an Ensembl id.

    A string is treated as an Ensembl id only if it starts with 'ENS'; in
    that case everything from the first '.' onwards is dropped. Any other
    string is returned unchanged, even if it contains a '.'.
    """
    if s.startswith('ENS'):
        # Keep only the part before the first '.', i.e. the unversioned id
        return s.split('.')[0]
    else:
        return s
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
278 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
279 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
280 def __main__(): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
281 parser = optparse.OptionParser() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
282 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
283 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
284 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
285 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
286 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
287 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
288 parser.add_option('-o', '--output', help='Path of the output SQLite file') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
289 parser.add_option('--of', help='Path of the output FASTA file') |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
290 parser.add_option('--ff', help='Path of the filtered sequences output FASTA file') |
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
291 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
292 options, args = parser.parse_args() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
293 if args: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
294 raise Exception('Use options to provide inputs') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
295 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
296 conn = sqlite3.connect(options.output) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
297 conn.execute('PRAGMA foreign_keys = ON') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
298 create_tables(conn) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
299 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
300 for gff3_arg in options.gff3: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
301 try: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
302 (species, filename) = gff3_arg.split(':') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
303 except ValueError: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
304 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
305 gene_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
306 transcript_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
307 exon_parent_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
308 cds_parent_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
309 five_prime_utr_parent_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
310 three_prime_utr_parent_dict = dict() |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
311 unimplemented_feature_nlines_dict = dict() |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
312 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
313 with open(filename) as f: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
314 for i, line in enumerate(f, start=1): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
315 line = line.strip() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
316 if not line: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
317 # skip empty lines |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
318 continue |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
319 if line[0] == '#': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
320 # skip comment lines |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
321 continue |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
322 cols = line.split('\t') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
323 if len(cols) != 9: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
324 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line)) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
325 feature_type = cols[2] |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
326 try: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
327 if feature_type == 'gene': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
328 add_gene_to_dict(cols, species, gene_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
329 elif feature_type in ('mRNA', 'transcript'): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
330 add_transcript_to_dict(cols, species, transcript_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
331 elif feature_type == 'exon': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
332 add_exon_to_dict(cols, species, exon_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
333 elif feature_type == 'five_prime_UTR': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
334 feature_to_dict(cols, five_prime_utr_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
335 elif feature_type == 'three_prime_UTR': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
336 feature_to_dict(cols, three_prime_utr_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
337 elif feature_type == 'CDS': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
338 add_cds_to_dict(cols, cds_parent_dict) |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
339 elif feature_type in unimplemented_feature_nlines_dict: |
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
340 unimplemented_feature_nlines_dict[feature_type] += 1 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
341 else: |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
342 unimplemented_feature_nlines_dict[feature_type] = 0 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
343 except Exception as e: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
344 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
345 |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
346 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): |
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
347 print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr) |
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
348 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
349 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
350 write_gene_dict_to_db(conn, gene_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
351 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
352 for json_arg in options.json: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
353 with open(json_arg) as f: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
354 write_gene_dict_to_db(conn, json.load(f)) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
355 |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
356 if options.longestCDS: |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
357 gene_transcripts_dict = dict() |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
358 for fasta_arg in options.fasta: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
359 for entry in FASTAReader_gen(fasta_arg): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
360 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
361 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
362 |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
363 gene_id = fetch_gene_id_for_transcript(conn, transcript_id) |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
364 if not gene_id: |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
365 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
366 continue |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
367 |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
368 if gene_id in gene_transcripts_dict: |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
369 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence))) |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
370 else: |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
371 gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))] |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
372 |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
373 # For each gene, select the transcript with the longest sequence |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
374 # If more than one transcript has the same longest sequence for a gene, the |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
375 # first one to appear in the FASTA file is selected |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
376 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
377 |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
378 regions = [_.strip().lower() for _ in options.regions.split(",")] |
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
379 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
380 for fasta_arg in options.fasta: |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
381 for entry in FASTAReader_gen(fasta_arg): |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
382 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
383 if options.longestCDS and transcript_id not in selected_transcript_ids: |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
384 continue |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
385 |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
386 species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
387 if not species_for_transcript: |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
388 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
389 continue |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
390 |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
391 if options.headers: |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
392 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
393 # Remove any underscore in the species |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
394 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', '')) |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
395 else: |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
396 header = entry.header |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
397 |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
398 if seq_region_for_transcript.lower() in regions: |
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
399 filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) |
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
400 else: |
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
401 output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
402 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
403 conn.close() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
404 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
405 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
406 if __name__ == '__main__': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
407 __main__() |