annotate gstf_preparation.py @ 13:51a7a2a82902 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
author earlhaminst
date Tue, 06 Oct 2020 17:10:37 +0000
parents 99bae410128c
children 598e9172b8e7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
1 import json
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
2 import optparse
10
e8e75a79de59 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
earlhaminst
parents: 9
diff changeset
3 import os
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
4 import sqlite3
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
5 import sys
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
6
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
7 version = "0.5.0"
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
8 gene_count = 0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
9
8
92f3966d5bc3 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
earlhaminst
parents: 6
diff changeset
10
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
def asbool(val):
    """Convert ``val`` to a boolean.

    String values are compared case-insensitively after stripping
    surrounding whitespace: ``'true'`` and ``'1'`` map to True, ``'false'``
    and ``'0'`` map to False. Non-string values are passed through
    ``bool()``.

    Raises:
        ValueError: if ``val`` is a string that is none of the recognised
            spellings.
    """
    if not isinstance(val, str):
        return bool(val)

    normalized = val.strip().lower()
    if normalized in ('true', '1'):
        return True
    if normalized in ('false', '0'):
        return False
    # The message reports the value as it was passed in, not the normalized form.
    raise ValueError(f"Cannot convert {val} to bool")
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
22
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
23
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
class Sequence:
    """A sequence record made of a header line and a list of sequence lines."""

    def __init__(self, header, sequence_parts):
        """Store the header and the sequence lines.

        Args:
            header: the header line, written verbatim by ``print()``.
            sequence_parts: iterable of strings, one per sequence line.
        """
        self.header = header
        self.sequence_parts = sequence_parts
        # Cache for the joined sequence; filled on first access of `sequence`.
        self._sequence = None

    @property
    def sequence(self):
        """Return all sequence parts concatenated into a single string.

        The joined string is computed once and cached, so changes made to
        ``sequence_parts`` after the first access are not reflected.
        """
        if self._sequence is None:
            self._sequence = ''.join(self.sequence_parts)
        return self._sequence

    def print(self, fh=None):
        """Write the header followed by each sequence part, one per line.

        Args:
            fh: writable text stream. Defaults to the *current*
                ``sys.stdout``, looked up at call time so that a later
                redirection of standard output is honoured.
        """
        if fh is None:
            fh = sys.stdout
        print(self.header, file=fh)
        for line in self.sequence_parts:
            print(line, file=fh)
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
40
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
41
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def FASTAReader_gen(fasta_filename):
    """Yield one ``Sequence`` per record of a FASTA file.

    Each record consists of a header line starting with ``>`` followed by
    zero or more sequence lines. Trailing whitespace is stripped from the
    header and from every sequence line. The file stays open while the
    generator is being consumed.

    Args:
        fasta_filename: path of the FASTA file to read.

    Yields:
        Sequence: built from the header and the list of sequence lines.

    Raises:
        AssertionError: if the file does not start with a ``>`` header.
            Raised explicitly (rather than via ``assert``) so the check is
            still enforced when Python runs with ``-O``.
    """
    with open(fasta_filename) as fasta_file:
        header = None
        sequence_parts = []
        for line in fasta_file:
            if line.startswith('>'):
                # A new header terminates the previous record, if any.
                if header is not None:
                    yield Sequence(header, sequence_parts)
                header = line.rstrip()
                sequence_parts = []
            elif header is None:
                # Only the very first line can get here: content before
                # any header.
                raise AssertionError("FASTA headers must start with >")
            else:
                sequence_parts.append(line.rstrip())
        # Emit the last record; an empty file yields nothing.
        if header is not None:
            yield Sequence(header, sequence_parts)
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
56
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
57
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def create_tables(conn):
    """Create the database schema on ``conn`` and commit it.

    Creates the ``meta``, ``gene`` and ``transcript`` tables, an index on
    ``gene.gene_symbol`` and the temporary view ``transcript_join_gene``,
    and stores the module-level ``version`` string in ``meta``.

    Args:
        conn: an open database connection providing ``cursor()`` and
            ``commit()``.
    """
    cursor = conn.cursor()

    meta_ddl = '''CREATE TABLE meta (
        version VARCHAR PRIMARY KEY NOT NULL)'''

    gene_ddl = '''CREATE TABLE gene (
        gene_id VARCHAR PRIMARY KEY NOT NULL,
        gene_symbol VARCHAR,
        seq_region_name VARCHAR NOT NULL,
        seq_region_start INTEGER NOT NULL,
        seq_region_end INTEGER NOT NULL,
        seq_region_strand INTEGER NOT NULL,
        species VARCHAR NOT NULL,
        biotype VARCHAR,
        gene_json VARCHAR NOT NULL)'''

    gene_symbol_index_ddl = 'CREATE INDEX gene_symbol_index ON gene (gene_symbol)'

    transcript_ddl = '''CREATE TABLE transcript (
        transcript_id VARCHAR PRIMARY KEY NOT NULL,
        transcript_symbol VARCHAR,
        protein_id VARCHAR UNIQUE,
        protein_sequence VARCHAR,
        biotype VARCHAR,
        is_canonical BOOLEAN NOT NULL DEFAULT FALSE,
        gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))'''

    # The following temporary view is not used in GAFA, so schema changes to it
    # don't require a meta version upgrade.
    transcript_join_gene_ddl = '''CREATE TEMPORARY VIEW transcript_join_gene AS
        SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species
        FROM transcript JOIN gene
        USING (gene_id)'''

    # The meta table is created and populated first, then the remaining
    # schema objects in dependency order.
    cursor.execute(meta_ddl)
    cursor.execute('INSERT INTO meta (version) VALUES (?)',
                   (version, ))
    for statement in (gene_ddl, gene_symbol_index_ddl, transcript_ddl,
                      transcript_join_gene_ddl):
        cursor.execute(statement)

    conn.commit()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
96
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
97
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
def fetch_transcript_and_gene(conn, transcript_id):
    """Look up one row of the ``transcript_join_gene`` view by transcript id.

    Args:
        conn: open database connection on which ``transcript_join_gene``
            is queryable.
        transcript_id: value matched against the ``transcript_id`` column.

    Returns:
        The first matching row as returned by the cursor's ``fetchone()``,
        or None when no row matches.
    """
    query = 'SELECT * FROM transcript_join_gene WHERE transcript_id=?'
    params = (transcript_id, )

    cursor = conn.cursor()
    cursor.execute(query, params)
    return cursor.fetchone()
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
104
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
105
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
def remove_type_from_list_of_ids(ids):
    """Apply ``remove_type_from_id`` to every id of a comma-separated list.

    Args:
        ids: string of ids separated by ``,``.

    Returns:
        The processed ids joined back together with ``,``, in the same
        order.
    """
    stripped_ids = [remove_type_from_id(single_id)
                    for single_id in ids.split(',')]
    return ','.join(stripped_ids)
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
108
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
109
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def remove_type_from_id(id_):
    """Drop everything up to and including the first ``:`` of ``id_``.

    Args:
        id_: identifier string, optionally prefixed with ``<something>:``.

    Returns:
        The part of ``id_`` after its first colon, or ``id_`` unchanged
        when it contains no colon.
    """
    _prefix, separator, remainder = id_.partition(':')
    # An empty separator means no colon was found.
    return remainder if separator else id_
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
116
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
117
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
118 def feature_to_dict(cols, parent_dict=None):
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
119 d = {
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
120 'end': int(cols[4]),
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
121 'start': int(cols[3]),
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
122 }
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
123 for attr in cols[8].split(';'):
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
124 if '=' in attr:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
125 (tag, value) = attr.split('=')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
126 if tag == 'ID':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
127 tag = 'id'
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
128 value = remove_type_from_id(value)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
129 elif tag == 'Parent':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
130 value = remove_type_from_list_of_ids(value)
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
131 elif tag == 'representative':
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
132 tag = 'is_canonical'
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
133 d[tag] = value
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
134 if cols[6] == '+':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
135 d['strand'] = 1
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
136 elif cols[6] == '-':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
137 d['strand'] = -1
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
138 else:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
139 raise Exception("Unrecognized strand '%s'" % cols[6])
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
140 if parent_dict is not None and 'Parent' in d:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
141 # a 3' UTR can be split among multiple exons
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
142 # a 5' UTR can be split among multiple exons
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
143 # a CDS can be part of multiple transcripts
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
144 for parent in d['Parent'].split(','):
10
e8e75a79de59 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
earlhaminst
parents: 9
diff changeset
145 parent_dict.setdefault(parent, []).append(d)
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
146 return d
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
147
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
148
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def add_gene_to_dict(cols, species, gene_dict):
    """
    Convert a gene feature line into a dictionary and store it in gene_dict.

    The feature is parsed with feature_to_dict() and then extended with the
    gene-level fields: a progressive 'member_id' taken from the module-level
    counter gene_count, the object type, the sequence region name (cols[0]),
    the species, an empty list of transcripts and a display name (the value
    of the 'Name' attribute, or None when there is no such attribute).

    :param cols: the column values of the feature line; cols[0] is used as
        the sequence region name and cols[8] holds the attribute tags
    :param species: value stored under the 'species' key of the gene
    :param gene_dict: dictionary, keyed by gene id, which receives the gene
    :raises Exception: if the feature has no (or an empty) id
    """
    global gene_count
    gene = feature_to_dict(cols)
    # A feature without an ID attribute has no 'id' key at all, so use .get()
    # to report the problem with a meaningful message instead of a KeyError
    if not gene.get('id'):
        raise Exception("Id not found among column 9 attribute tags: %s" % cols[8])
    gene.update({
        'member_id': gene_count,
        'object_type': 'Gene',
        'seq_region_name': cols[0],
        'species': species,
        'Transcript': [],
        'display_name': gene.get('Name'),
    })
    gene_dict[gene['id']] = gene
    # Only count the genes which have actually been stored
    gene_count = gene_count + 1
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
164
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
165
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def add_transcript_to_dict(cols, species, transcript_dict):
    """
    Convert a transcript feature line into a dictionary and store it in
    transcript_dict under the transcript id.

    :param cols: the column values of the feature line; cols[0] is used as
        the sequence region name
    :param species: value stored under the 'species' key of the transcript
    :param transcript_dict: dictionary, keyed by transcript id, which
        receives the transcript
    """
    transcript = feature_to_dict(cols)
    # Transcript-level fields added on top of the parsed feature;
    # 'display_name' is None when the feature has no 'Name' attribute
    transcript.update(
        object_type='Transcript',
        seq_region_name=cols[0],
        species=species,
        display_name=transcript.get('Name'),
    )
    transcript_id = transcript['id']
    transcript_dict[transcript_id] = transcript
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
175
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
176
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def add_exon_to_dict(cols, species, exon_parent_dict):
    """
    Convert an exon feature line into a dictionary.

    exon_parent_dict is passed on to feature_to_dict(), which registers the
    exon under each of its parents; the fields added here are set on that
    same dictionary object.

    :param cols: the column values of the feature line; cols[3] and cols[4]
        are the start and end used to compute the length, cols[0] is used as
        the sequence region name
    :param species: value stored under the 'species' key of the exon
    :param exon_parent_dict: dictionary mapping a parent id to the list of
        its exons
    """
    exon = feature_to_dict(cols, exon_parent_dict)
    exon.update(
        # Both ends of the interval are included in the length
        length=int(cols[4]) - int(cols[3]) + 1,
        object_type='Exon',
        seq_region_name=cols[0],
        species=species,
    )
    # An exon without an id falls back on its name, when it has one
    if 'Name' in exon:
        exon.setdefault('id', exon['Name'])
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
187
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
188
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def add_cds_to_dict(cols, cds_parent_dict):
    """
    Convert a CDS feature line into a dictionary, making sure it gets an id
    whenever one can be worked out.

    cds_parent_dict is passed on to feature_to_dict(), which registers the
    CDS under each of its parents. A CDS without an id takes its name as id
    or, failing that, the id of its parent when there is exactly one.

    :param cols: the column values of the feature line
    :param cds_parent_dict: dictionary mapping a parent id to the list of
        its CDSs
    """
    cds = feature_to_dict(cols, cds_parent_dict)
    if 'id' in cds:
        return
    if 'Name' in cds:
        cds['id'] = cds['Name']
        return
    # With multiple comma-separated parents there is no single id to borrow
    has_single_parent = 'Parent' in cds and ',' not in cds['Parent']
    if has_single_parent:
        cds['id'] = cds['Parent']
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
196
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
197
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
    """
    Link exons, CDSs and UTRs to their transcripts, then the transcripts to
    their genes. All the changes are made in place, nothing is returned.

    For each transcript:
    - its exons, sorted by start, are stored under the 'Exon' key;
    - a 'Translation' dictionary is stored if the transcript has at least a
      CDS or a UTR. With CDSs, the translation takes their shared id and
      spans from the start of the first to the end of the last one, and the
      UTRs are only used to check that they do not overlap with it. Without
      CDSs, the boundaries are derived from the UTRs when available,
      otherwise they are the ones of the transcript.
    Finally each transcript is appended to the 'Transcript' list of every
    parent which is found in gene_dict.

    :param gene_dict: dictionary mapping a gene id to the gene
    :param transcript_dict: dictionary mapping a transcript id to the
        transcript
    :param exon_parent_dict: dictionary mapping a parent id to its exons
    :param cds_parent_dict: dictionary mapping a parent id to its CDSs
    :param five_prime_utr_parent_dict: dictionary mapping a parent id to its
        5' UTRs
    :param three_prime_utr_parent_dict: dictionary mapping a parent id to its
        3' UTRs
    :raises Exception: if a transcript has CDSs with different ids, or if a
        UTR overlaps with the CDSs of its transcript
    """
    def start_of(feature):
        return feature['start']

    # Attach the exons (sorted in place) to the transcripts we know about
    for owner_id, exons in exon_parent_dict.items():
        if owner_id not in transcript_dict:
            continue
        exons.sort(key=start_of)
        transcript_dict[owner_id]['Exon'] = exons

    for tid, tr in transcript_dict.items():
        # By default the translation spans the whole transcript
        translation = {
            'CDS': [],
            'id': None,
            'end': tr['end'],
            'object_type': 'Translation',
            'species': tr['species'],
            'start': tr['start'],
        }
        has_cds = tid in cds_parent_dict
        # Translation boundaries as implied by the UTRs, if any
        utr_based_start = None
        utr_based_end = None

        if has_cds:
            coding = cds_parent_dict[tid]
            distinct_ids = {segment['id'] for segment in coding}
            if len(distinct_ids) > 1:
                raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % tid)
            cds_id = distinct_ids.pop()
            translation['id'] = cds_id
            coding.sort(key=start_of)
            translation['CDS'] = coding
            translation['start'] = coding[0]['start']
            translation['end'] = coding[-1]['end']

        if tid in five_prime_utr_parent_dict:
            five_utrs = five_prime_utr_parent_dict[tid]
            five_utrs.sort(key=start_of)
            if tr['strand'] == 1:
                # The translation starts right after the last 5' UTR
                utr_based_start = five_utrs[-1]['end'] + 1
            else:
                # On the other strand the 5' UTR bounds the end instead
                utr_based_end = five_utrs[0]['start'] - 1

        if tid in three_prime_utr_parent_dict:
            three_utrs = three_prime_utr_parent_dict[tid]
            three_utrs.sort(key=start_of)
            if tr['strand'] == 1:
                # The translation ends right before the first 3' UTR
                utr_based_end = three_utrs[0]['start'] - 1
            else:
                utr_based_start = three_utrs[-1]['end'] + 1

        # The CDS coordinates win over the UTR-derived ones, which are then
        # only used as a sanity check
        if utr_based_start is not None:
            if not has_cds:
                translation['start'] = utr_based_start
            elif utr_based_start > translation['start']:
                raise Exception(f"Transcript {tid} has the start of CDS {cds_id} overlapping with the UTR end")
        if utr_based_end is not None:
            if not has_cds:
                translation['end'] = utr_based_end
            elif utr_based_end < translation['end']:
                raise Exception(f"Transcript {tid} has the end of CDS {cds_id} overlapping with the UTR start")

        has_utr_bounds = not (utr_based_start is None and utr_based_end is None)
        if has_cds or has_utr_bounds:
            tr['Translation'] = translation

    for tr in transcript_dict.values():
        if 'Parent' not in tr:
            continue
        # A polycistronic transcript can have multiple parents
        for gene_id in tr['Parent'].split(','):
            if gene_id in gene_dict:
                gene_dict[gene_id]['Transcript'].append(tr)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
263
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
264
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
def write_gene_dict_to_db(conn, gene_dict):
    """
    Insert the genes of `gene_dict`, and their transcripts, into the database.

    `conn` is an open SQLite connection and `gene_dict` maps gene ids to gene
    dictionaries. Entries whose value is None and genes whose 'confidence'
    is present but not 'high' (case-insensitive) are skipped. Each remaining
    gene is stored in the `gene` table together with its JSON dump, and each
    element of its optional 'Transcript' list is stored in the `transcript`
    table. An Exception naming the offending row is raised if a transcript
    cannot be inserted. The transaction is committed once at the end.
    """
    gene_sql = 'INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)'
    transcript_sql = 'INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)'
    cur = conn.cursor()

    for gene in gene_dict.values():
        # None entries can happen when loading a JSON file from Ensembl
        if gene is None:
            continue

        # Only genes without a confidence, or with high confidence, are kept
        if 'confidence' in gene:
            if gene['confidence'].lower() != 'high':
                print("Gene {} has confidence {} (not high), discarding".format(gene['id'], gene['confidence']), file=sys.stderr)
                continue

        gene_id = gene['id']
        gene_row = (
            gene_id,
            gene.get('display_name'),
            gene['seq_region_name'],
            gene['start'],
            gene['end'],
            gene['strand'],
            gene['species'],
            gene.get('biotype'),
            json.dumps(gene),
        )
        cur.execute(gene_sql, gene_row)

        # A gene without the "Transcript" key simply has nothing more to store
        for transcript in gene.get("Transcript", ()):
            row = (
                transcript['id'],
                transcript.get('display_name'),
                transcript.get('Translation', {}).get('id'),
                transcript.get('biotype'),
                asbool(transcript.get('is_canonical', False)),
                gene_id,
            )
            try:
                cur.execute(transcript_sql, row)
            except Exception as e:
                raise Exception("Error while inserting {} into transcript table: {}".format(str(row), e))

    conn.commit()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
294
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
295
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
def remove_id_version(s, force=False):
    """
    Return the id `s` without its optional '.VERSION' suffix.

    The suffix is stripped only when `s` starts with 'ENS' (an Ensembl id)
    or when `force` is True; any other id is returned unchanged. Stripping
    drops everything from the first '.' onwards.
    """
    if not force and not s.startswith('ENS'):
        return s
    return s.partition('.')[0]
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
305
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
306
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
307 def __main__():
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
308 parser = optparse.OptionParser()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
309 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
310 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
311 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
312 parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep')
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
313 parser.add_option('--headers', type='choice',
12
99bae410128c "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 4579d0c461c30183a3092d84013e30f53f072ca1-dirty"
earlhaminst
parents: 11
diff changeset
314 choices=['TranscriptId_species', 'TranscriptID-GeneSymbol_species', 'TranscriptID-TranscriptSymbol_species', ''],
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
315 default='', help='Change the header line of the FASTA sequences to this format')
6
56bbdbfe3eaa planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents: 5
diff changeset
316 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
317 parser.add_option('-o', '--output', help='Path of the output SQLite file')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
318 parser.add_option('--of', help='Path of the output FASTA file')
10
e8e75a79de59 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
earlhaminst
parents: 9
diff changeset
319 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file')
6
56bbdbfe3eaa planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents: 5
diff changeset
320
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
321 options, args = parser.parse_args()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
322 if args:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
323 raise Exception('Use options to provide inputs')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
324
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
325 conn = sqlite3.connect(options.output)
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
326 conn.row_factory = sqlite3.Row
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
327 conn.execute('PRAGMA foreign_keys = ON')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
328 create_tables(conn)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
329
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
330 for gff3_arg in options.gff3:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
331 try:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
332 (species, filename) = gff3_arg.split(':')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
333 except ValueError:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
334 raise Exception("Argument for --gff3 '%s' is not in the SPECIES:FILENAME format" % gff3_arg)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
335 gene_dict = dict()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
336 transcript_dict = dict()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
337 exon_parent_dict = dict()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
338 cds_parent_dict = dict()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
339 five_prime_utr_parent_dict = dict()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
340 three_prime_utr_parent_dict = dict()
5
b3ba0c84667c planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents: 4
diff changeset
341 unimplemented_feature_nlines_dict = dict()
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
342
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
343 with open(filename) as f:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
344 for i, line in enumerate(f, start=1):
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
345 line = line.strip()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
346 if not line:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
347 # skip empty lines
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
348 continue
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
349 if line[0] == '#':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
350 # skip comment lines
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
351 continue
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
352 cols = line.split('\t')
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
353 if len(cols) != 9:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
354 raise Exception("Line %i in file '%s': '%s' does not have 9 columns" % (i, filename, line))
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
355 feature_type = cols[2]
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
356 try:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
357 if feature_type == 'gene':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
358 add_gene_to_dict(cols, species, gene_dict)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
359 elif feature_type in ('mRNA', 'transcript'):
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
360 add_transcript_to_dict(cols, species, transcript_dict)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
361 elif feature_type == 'exon':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
362 add_exon_to_dict(cols, species, exon_parent_dict)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
363 elif feature_type == 'five_prime_UTR':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
364 feature_to_dict(cols, five_prime_utr_parent_dict)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
365 elif feature_type == 'three_prime_UTR':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
366 feature_to_dict(cols, three_prime_utr_parent_dict)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
367 elif feature_type == 'CDS':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
368 add_cds_to_dict(cols, cds_parent_dict)
5
b3ba0c84667c planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents: 4
diff changeset
369 elif feature_type in unimplemented_feature_nlines_dict:
b3ba0c84667c planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents: 4
diff changeset
370 unimplemented_feature_nlines_dict[feature_type] += 1
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
371 else:
5
b3ba0c84667c planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents: 4
diff changeset
372 unimplemented_feature_nlines_dict[feature_type] = 0
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
373 except Exception as e:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
374 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
375
5
b3ba0c84667c planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents: 4
diff changeset
376 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
377 print("Skipped %d lines in GFF3 file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
5
b3ba0c84667c planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents: 4
diff changeset
378
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
379 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
380 write_gene_dict_to_db(conn, gene_dict)
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
381
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
382 for json_arg in options.json:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
383 with open(json_arg) as f:
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
384 write_gene_dict_to_db(conn, json.load(f))
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
385
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
386 # Read the FASTA files a first time to:
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
387 # - determine for each file if we need to force the removal of the version
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
388 # from the transcript id
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
389 # - fill gene_transcripts_dict when keeping only the canonical transcripts
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
390 force_remove_id_version_file_list = []
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
391 gene_transcripts_dict = dict()
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
392 for fasta_arg in options.fasta:
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
393 force_remove_id_version = False
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
394 found_gene_transcript = False
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
395 for entry in FASTAReader_gen(fasta_arg):
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
396 # Extract the transcript id by removing everything after the first space and then removing the version if needed
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
397 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
398
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
399 transcript = fetch_transcript_and_gene(conn, transcript_id)
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
400 if not transcript and not found_gene_transcript:
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
401 # We have not found a proper gene transcript in this file yet,
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
402 # try to force the removal of the version from the transcript id
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
403 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True)
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
404 transcript = fetch_transcript_and_gene(conn, transcript_id)
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
405 # Remember that we need to force the removal for this file
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
406 if transcript:
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
407 force_remove_id_version = True
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
408 force_remove_id_version_file_list.append(fasta_arg)
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
409 print("Forcing removal of id version in FASTA file '%s'" % fasta_arg, file=sys.stderr)
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
410 if not transcript:
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
411 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
412 continue
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
413 if options.filter != 'canonical':
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
414 break
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
415 found_gene_transcript = True
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
416
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
417 if len(entry.sequence) % 3 != 0:
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
418 continue
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
419 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
420 if transcript_biotype and transcript_biotype != 'protein_coding':
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
421 continue
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
422 gene_transcripts_dict.setdefault(transcript['gene_id'], []).append((transcript_id, transcript['is_canonical'], len(entry.sequence)))
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
423
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
424 if options.filter == 'canonical':
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
425 selected_transcript_ids = []
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
426 for gene_id, transcript_tuples in gene_transcripts_dict.items():
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
427 canonical_transcript_ids = [id_ for (id_, is_canonical, _) in transcript_tuples if is_canonical]
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
428 if not canonical_transcript_ids:
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
429 # Select the transcript with the longest sequence. If more than
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
430 # one transcript has the same longest sequence for a gene, the
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
431 # first one to appear in the FASTA file is selected.
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
432 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0]
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
433 elif len(canonical_transcript_ids) > 1:
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
434 raise Exception("Gene %s has more than 1 canonical transcripts" % (gene_id))
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
435 else:
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
436 selected_transcript_id = canonical_transcript_ids[0]
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
437 selected_transcript_ids.append(selected_transcript_id)
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
438
6
56bbdbfe3eaa planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents: 5
diff changeset
439 regions = [_.strip().lower() for _ in options.regions.split(",")]
56bbdbfe3eaa planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents: 5
diff changeset
440 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
441 for fasta_arg in options.fasta:
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
442 force_remove_id_version = fasta_arg in force_remove_id_version_file_list
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
443 for entry in FASTAReader_gen(fasta_arg):
9
f4acbfe8d6fe planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents: 8
diff changeset
444 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version)
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
445
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
446 transcript = fetch_transcript_and_gene(conn, transcript_id)
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
447 if not transcript:
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
448 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr)
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
449 continue
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
450
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
451 if options.filter == 'canonical':
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
452 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
453 if transcript_id not in selected_transcript_ids:
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
454 continue
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
455 elif options.filter == 'coding':
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
456 if len(entry.sequence) % 3 != 0:
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
457 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr)
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
458 continue
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
459 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
460 if transcript_biotype and transcript_biotype != 'protein_coding':
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
461 print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr)
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
462 continue
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
463
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
464 if options.headers == "TranscriptId_species":
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
465 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
466 # Remove any underscore in the species
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
467 entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', ''))
12
99bae410128c "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 4579d0c461c30183a3092d84013e30f53f072ca1-dirty"
earlhaminst
parents: 11
diff changeset
468 elif options.headers == "TranscriptID-GeneSymbol_species":
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
469 # Remove any underscore in the species
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
470 entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', ''))
12
99bae410128c "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 4579d0c461c30183a3092d84013e30f53f072ca1-dirty"
earlhaminst
parents: 11
diff changeset
471 elif options.headers == "TranscriptID-TranscriptSymbol_species":
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
472 # Remove any underscore in the species
13
51a7a2a82902 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents: 12
diff changeset
473 entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', ''))
4
284f64ad9d43 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents: 3
diff changeset
474
11
dbe37a658cd2 "planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents: 10
diff changeset
475 if transcript['seq_region_name'].lower() in regions:
8
92f3966d5bc3 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
earlhaminst
parents: 6
diff changeset
476 entry.print(filtered_fasta_file)
6
56bbdbfe3eaa planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents: 5
diff changeset
477 else:
8
92f3966d5bc3 planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
earlhaminst
parents: 6
diff changeset
478 entry.print(output_fasta_file)
0
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
479
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
480 conn.close()
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
481
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
482
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
483 if __name__ == '__main__':
28879ca33b5f planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff changeset
484 __main__()