annotate cpt_lipop_conv/lipoP_to_gff3.py @ 2:f54cbb13f8cd draft default tip

Uploaded
author cpt
date Fri, 20 May 2022 08:57:11 +0000
parents adde21b6bdb3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
2 import sys
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
3 import copy
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
4 import argparse
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
5 from CPT_GFFParser import gffParse, gffWrite, gffSeqFeature
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
6 from Bio.Seq import Seq
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
7 from Bio.SeqRecord import SeqRecord
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
8 from Bio.SeqFeature import FeatureLocation
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
9 from gff3 import feature_lambda, feature_test_type, get_id
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
10
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
11
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
def _parse_lipop_sites(lipoIn, filterSP2):
    """Collect cleavage positions from LipoP tab-separated output.

    Returns a dict mapping the first column of each accepted row (the
    sequence ID) to a list of the integer values found in its fourth column.
    """
    # With filterSP2 only "CleavII" rows are kept; otherwise both cleavage
    # row types are accepted.  An explicit tuple is used rather than a
    # substring test such as `x in "CleavII"`, which would also accept
    # fragments like "", "I" or "Cleav".
    wanted = ("CleavII",) if filterSP2 else ("CleavI", "CleavII")

    sites = {}
    for row in lipoIn:
        if row.startswith("#"):
            continue

        fields = row.rstrip("\r\n").split("\t")
        if len(fields) < 4:
            # Blank or truncated row: nothing to record.
            continue

        if fields[2] in wanted:
            sites.setdefault(fields[0], []).append(int(fields[3]))
    return sites


def lipoP_gff(lipoIn, gff3In, jBrowseOut, filterSP2):
    """Add LipoP cleavage sites to a GFF3 as ``cleavage_site`` sub-features.

    Args:
        lipoIn: iterable of text lines of LipoP output (tab-separated; lines
            starting with ``#`` are ignored).
        gff3In: GFF3 input handed directly to ``gffParse``.
        jBrowseOut: when truthy, top-level features that have no CDS with a
            LipoP hit are dropped from the output; when falsy they are kept
            unchanged.
        filterSP2: when truthy, only ``CleavII`` rows are used; otherwise
            both ``CleavI`` and ``CleavII`` rows are used.

    Each parsed record is written to ``sys.stdout`` with ``gffWrite``.

    Raises:
        ValueError: if the fourth column of an accepted LipoP row is not an
            integer.
    """
    orgIDs = _parse_lipop_sites(lipoIn, filterSP2)

    # Rebase
    for gff in gffParse(gff3In):
        keepSeq = []
        for xRec in gff.features:
            cds_candidates = feature_lambda(
                xRec.sub_features,
                feature_test_type,
                {"type": "CDS"},
                subfeatures=False,
            )
            # First CDS under this feature whose ID has LipoP cleavage sites.
            hit = next((cds for cds in cds_candidates if cds.id in orgIDs), None)

            if hit is None:
                if not jBrowseOut:
                    keepSeq.append(xRec)
                continue

            cds_start = hit.location.start
            for i, cleaveBase in enumerate(orgIDs[hit.id], start=1):
                tempQuals = xRec.qualifiers.copy()
                tempQuals["ID"] = xRec.id + "_cleavage_" + str(i)

                # Presumably cleaveBase is a residue index and * 3 converts
                # it to a nucleotide offset -- TODO confirm.
                # NOTE(review): the offset is always measured from
                # location.start, regardless of strand; confirm this is the
                # intended behaviour for reverse-strand CDSs.
                anchor = cds_start + (cleaveBase * 3)
                xRec.sub_features.append(
                    gffSeqFeature(
                        FeatureLocation(anchor - 1, anchor + 1),
                        type="cleavage_site",
                        strand=xRec.location.strand,
                        qualifiers=tempQuals,
                    )
                )
            keepSeq.append(xRec)

        gff.features = keepSeq
        gffWrite([gff], sys.stdout)
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
88
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
89
adde21b6bdb3 Uploaded
cpt
parents:
diff changeset
if __name__ == "__main__":

    def _parse_bool(value):
        """Convert a command-line string to a bool.

        ``type=bool`` would treat every non-empty string -- including
        "False" -- as True, so the text is interpreted explicitly instead.
        """
        text = value.strip().lower()
        if text in ("1", "true", "t", "yes", "y", "on"):
            return True
        if text in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got %r" % value
        )

    parser = argparse.ArgumentParser(
        description="Rebase LipoP cleavage site results onto a GFF3 annotation"
    )
    parser.add_argument(
        "lipoIn", type=argparse.FileType("r"), help="LipoP tool's .txt output"
    )
    parser.add_argument(
        "gff3In", type=argparse.FileType("r"), help="GFF3 to rebase LipoP results"
    )
    # Still takes an explicit value (e.g. "--jBrowseOut True") so existing
    # callers keep working; only the string-to-bool conversion changed.
    parser.add_argument(
        "--jBrowseOut",
        type=_parse_bool,
        default=False,
        help="Prepare Output for jBrowse instance",
    )
    parser.add_argument(
        "--filterSP2",
        action="store_true",
        help="Filter for only SPII sites",
    )
    args = parser.parse_args()
    # Argument names match lipoP_gff's parameter names, so the parsed
    # namespace can be passed through as keyword arguments.
    lipoP_gff(**vars(args))