annotate cpt_gff_apollo_prep/gff3_prep_for_apollo.py @ 2:6901154b1003 draft default tip

Uploaded
author cpt
date Fri, 20 May 2022 08:59:37 +0000
parents eb0c42719156
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
eb0c42719156 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
eb0c42719156 Uploaded
cpt
parents:
diff changeset
2 import sys
eb0c42719156 Uploaded
cpt
parents:
diff changeset
3 import logging
eb0c42719156 Uploaded
cpt
parents:
diff changeset
4 import argparse
eb0c42719156 Uploaded
cpt
parents:
diff changeset
5 import copy
eb0c42719156 Uploaded
cpt
parents:
diff changeset
6 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
eb0c42719156 Uploaded
cpt
parents:
diff changeset
7 from gff3 import feature_lambda, feature_test_type
eb0c42719156 Uploaded
cpt
parents:
diff changeset
8 from Bio.SeqFeature import FeatureLocation
eb0c42719156 Uploaded
cpt
parents:
diff changeset
9
eb0c42719156 Uploaded
cpt
parents:
diff changeset
10 logging.basicConfig(level=logging.INFO)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
11 log = logging.getLogger(__name__)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
12
eb0c42719156 Uploaded
cpt
parents:
diff changeset
13 ALLOWED_FEATURES = [
eb0c42719156 Uploaded
cpt
parents:
diff changeset
14 "mRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
15 "exon",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
16 "transposable_element",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
17 "tRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
18 "transcript",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
19 "terminator",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
20 "Shine_Dalgarno_Sequence",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
21 "pseudogene",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
22 "stop_codon_read_through",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
23 "repeat_region",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
24 "CDS",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
25 "gene",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
26 "rRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
27 "ncRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
28 "snRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
29 "snoRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
30 "miRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
31 ]
eb0c42719156 Uploaded
cpt
parents:
diff changeset
32
eb0c42719156 Uploaded
cpt
parents:
diff changeset
33 SPECIAL_REMOVED_FEATURES = ["gene_component_region", "sequence_difference"]
eb0c42719156 Uploaded
cpt
parents:
diff changeset
34
eb0c42719156 Uploaded
cpt
parents:
diff changeset
35
eb0c42719156 Uploaded
cpt
parents:
diff changeset
36
eb0c42719156 Uploaded
cpt
parents:
diff changeset
37 def add_exons(features):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
38 for gene in feature_lambda(
eb0c42719156 Uploaded
cpt
parents:
diff changeset
39 features, feature_test_type, {"type": "gene"}, subfeatures=True
eb0c42719156 Uploaded
cpt
parents:
diff changeset
40 ):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
41 clean_gene = copy.deepcopy(gene)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
42 exon_start = None
eb0c42719156 Uploaded
cpt
parents:
diff changeset
43 exon_end = None
eb0c42719156 Uploaded
cpt
parents:
diff changeset
44 exon_strand = None
eb0c42719156 Uploaded
cpt
parents:
diff changeset
45 cds_list = []
eb0c42719156 Uploaded
cpt
parents:
diff changeset
46
eb0c42719156 Uploaded
cpt
parents:
diff changeset
47 #for mRNA in gene.sub_features:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
48 # for x in mRNA.sub_features:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
49 # x.qualifiers["Parent"] = [gene.id]
eb0c42719156 Uploaded
cpt
parents:
diff changeset
50 # gene.sub_features.append(x)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
51
eb0c42719156 Uploaded
cpt
parents:
diff changeset
52 for exon in feature_lambda(gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False,recurse=False):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
53 #if the gene contains an exon, skip.
eb0c42719156 Uploaded
cpt
parents:
diff changeset
54 continue
eb0c42719156 Uploaded
cpt
parents:
diff changeset
55 hasMRNA = False
eb0c42719156 Uploaded
cpt
parents:
diff changeset
56 for x in gene.sub_features:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
57 if x.type == "mRNA":
eb0c42719156 Uploaded
cpt
parents:
diff changeset
58 hasMRNA = True
eb0c42719156 Uploaded
cpt
parents:
diff changeset
59 mRNA = x
eb0c42719156 Uploaded
cpt
parents:
diff changeset
60 """
eb0c42719156 Uploaded
cpt
parents:
diff changeset
61 if not hasMRNA:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
62 mRNA = gffSeqFeature(
eb0c42719156 Uploaded
cpt
parents:
diff changeset
63 location=FeatureLocation(gene.location.start, gene.location.end, gene.location.strand),
eb0c42719156 Uploaded
cpt
parents:
diff changeset
64 type="mRNA",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
65 source = "cpt.prepApollo",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
66 qualifiers={
eb0c42719156 Uploaded
cpt
parents:
diff changeset
67 "ID": ["%s.mRNA" % clean_gene.qualifiers["ID"][0]],
eb0c42719156 Uploaded
cpt
parents:
diff changeset
68 "Parent": clean_gene.qualifiers["ID"],
eb0c42719156 Uploaded
cpt
parents:
diff changeset
69 },
eb0c42719156 Uploaded
cpt
parents:
diff changeset
70 sub_features=gene.sub_features,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
71 strand=exon_strand
eb0c42719156 Uploaded
cpt
parents:
diff changeset
72 )
eb0c42719156 Uploaded
cpt
parents:
diff changeset
73 for x in mRNA.sub_features:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
74 x.qualifiers["Parent"] = mRNA["ID"]
eb0c42719156 Uploaded
cpt
parents:
diff changeset
75 clean_gene.sub_features = [mRNA]
eb0c42719156 Uploaded
cpt
parents:
diff changeset
76 else:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
77 for x in clean_gene.sub_features:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
78 if x.type != "mRNA":
eb0c42719156 Uploaded
cpt
parents:
diff changeset
79 x.qualifiers["Parent"] = [mRNA.id] """
eb0c42719156 Uploaded
cpt
parents:
diff changeset
80
eb0c42719156 Uploaded
cpt
parents:
diff changeset
81 # check for CDS child features of the gene, do not go a further step (this should skip any CDS children of exon child features)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
82 for cds in feature_lambda(
eb0c42719156 Uploaded
cpt
parents:
diff changeset
83 gene.sub_features,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
84 feature_test_type,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
85 {"type": "CDS"},
eb0c42719156 Uploaded
cpt
parents:
diff changeset
86 subfeatures=False,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
87 recurse=False,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
88 ):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
89 # check all CDS features for min/max boundaries
eb0c42719156 Uploaded
cpt
parents:
diff changeset
90 if exon_start is None:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
91 exon_start = cds.location.start
eb0c42719156 Uploaded
cpt
parents:
diff changeset
92 exon_strand = cds.location.strand
eb0c42719156 Uploaded
cpt
parents:
diff changeset
93 if exon_end is None:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
94 exon_end = cds.location.end
eb0c42719156 Uploaded
cpt
parents:
diff changeset
95 exon_start = min(exon_start, cds.location.start)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
96 exon_end = max(exon_end, cds.location.end)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
97 cds_list.append(cds)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
98 if cds_list:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
99 # we found a CDS to adopt
eb0c42719156 Uploaded
cpt
parents:
diff changeset
100 new_exon = gffSeqFeature(
eb0c42719156 Uploaded
cpt
parents:
diff changeset
101 location=FeatureLocation(exon_start, exon_end),
eb0c42719156 Uploaded
cpt
parents:
diff changeset
102 type="exon",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
103 source = "cpt.prepApollo",
eb0c42719156 Uploaded
cpt
parents:
diff changeset
104 qualifiers={
eb0c42719156 Uploaded
cpt
parents:
diff changeset
105 "ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]],
eb0c42719156 Uploaded
cpt
parents:
diff changeset
106 "Parent": [clean_gene.id],
eb0c42719156 Uploaded
cpt
parents:
diff changeset
107 "ApolloExon": ["True"],
eb0c42719156 Uploaded
cpt
parents:
diff changeset
108 },
eb0c42719156 Uploaded
cpt
parents:
diff changeset
109 sub_features=[],
eb0c42719156 Uploaded
cpt
parents:
diff changeset
110 strand=exon_strand
eb0c42719156 Uploaded
cpt
parents:
diff changeset
111 )
eb0c42719156 Uploaded
cpt
parents:
diff changeset
112 for cds in cds_list:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
113 cds.qualifiers["Parent"] = new_exon.qualifiers["ID"]
eb0c42719156 Uploaded
cpt
parents:
diff changeset
114 new_exon.sub_features.append(cds)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
115 #gene.sub_features.append(new_exon)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
116 # get all the other children of gene that AREN'T a CDS including the new exon
eb0c42719156 Uploaded
cpt
parents:
diff changeset
117 clean_gene.sub_features.append(copy.deepcopy(new_exon))
eb0c42719156 Uploaded
cpt
parents:
diff changeset
118 #clean_gene.sub_features.append(gffSeqFeature(location=FeatureLocation(exon_start, exon_end, exon_strand), type="exon", source = "cpt.prepApollo", qualifiers={"ID": ["%s.exon" % clean_gene.qualifiers["ID"][0]], "Parent": clean_gene.qualifiers["ID"]}, sub_features=[], strand=exon_strand))
eb0c42719156 Uploaded
cpt
parents:
diff changeset
119 """
eb0c42719156 Uploaded
cpt
parents:
diff changeset
120 for sf in feature_lambda(
eb0c42719156 Uploaded
cpt
parents:
diff changeset
121 gene.sub_features,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
122 feature_test_type,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
123 {"type": "CDS"},
eb0c42719156 Uploaded
cpt
parents:
diff changeset
124 subfeatures=True,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
125 recurse=False,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
126 invert=True,
eb0c42719156 Uploaded
cpt
parents:
diff changeset
127 ):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
128 child = copy.deepcopy(sf)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
129 child.qualifiers["Parent"] = new_exon.qualifiers["ID"]
eb0c42719156 Uploaded
cpt
parents:
diff changeset
130 clean_gene.sub_features.append(child)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
131 """
eb0c42719156 Uploaded
cpt
parents:
diff changeset
132 # add them to the new Exon feature
eb0c42719156 Uploaded
cpt
parents:
diff changeset
133 # return the cleaned gene with new exon
eb0c42719156 Uploaded
cpt
parents:
diff changeset
134 yield clean_gene
eb0c42719156 Uploaded
cpt
parents:
diff changeset
135
eb0c42719156 Uploaded
cpt
parents:
diff changeset
136 def process_features(features):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
137 # change RBS to 'Shine_Dalgarno_sequence'
eb0c42719156 Uploaded
cpt
parents:
diff changeset
138 for rbs in feature_lambda(features, feature_test_type, {'type': "RBS"}):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
139 rbs.type = "Shine_Dalgarno_sequence"
eb0c42719156 Uploaded
cpt
parents:
diff changeset
140
eb0c42719156 Uploaded
cpt
parents:
diff changeset
141 # Filter top level features
eb0c42719156 Uploaded
cpt
parents:
diff changeset
142 for feature in feature_lambda(features, feature_test_type, {"types": ALLOWED_FEATURES}, subfeatures=True):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
143 cleaned_subfeatures = []
eb0c42719156 Uploaded
cpt
parents:
diff changeset
144 for sf in feature.sub_features:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
145 if sf.type in SPECIAL_REMOVED_FEATURES:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
146 # 'gene_component_region' is uncaught by feature_test_type as it contains `gene`
eb0c42719156 Uploaded
cpt
parents:
diff changeset
147 continue
eb0c42719156 Uploaded
cpt
parents:
diff changeset
148 else:
eb0c42719156 Uploaded
cpt
parents:
diff changeset
149 cleaned_subfeatures.append(sf)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
150 feature.sub_features = copy.deepcopy(cleaned_subfeatures)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
151 yield feature
eb0c42719156 Uploaded
cpt
parents:
diff changeset
152
eb0c42719156 Uploaded
cpt
parents:
diff changeset
153 def gff_filter(gff3):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
154 for rec in gffParse(gff3):
eb0c42719156 Uploaded
cpt
parents:
diff changeset
155 cleaned_features = sorted(list(process_features(rec.features)), key=lambda x: x.location.start)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
156 rec.features = sorted(list(add_exons(cleaned_features)), key=lambda x: x.location.start)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
157 rec.annotations = {}
eb0c42719156 Uploaded
cpt
parents:
diff changeset
158 gffWrite([rec], sys.stdout)
eb0c42719156 Uploaded
cpt
parents:
diff changeset
159
eb0c42719156 Uploaded
cpt
parents:
diff changeset
160
eb0c42719156 Uploaded
cpt
parents:
diff changeset
161 if __name__ == "__main__":
eb0c42719156 Uploaded
cpt
parents:
diff changeset
162 parser = argparse.ArgumentParser(
eb0c42719156 Uploaded
cpt
parents:
diff changeset
163 description="add parent exon features to CDSs for Apollo"
eb0c42719156 Uploaded
cpt
parents:
diff changeset
164 )
eb0c42719156 Uploaded
cpt
parents:
diff changeset
165 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
eb0c42719156 Uploaded
cpt
parents:
diff changeset
166 args = parser.parse_args()
eb0c42719156 Uploaded
cpt
parents:
diff changeset
167 gff_filter(**vars(args))