annotate cpt_req_sd/gff3_require_sd.py @ 3:a6dc48930318 draft default tip

Uploaded
author cpt
date Fri, 20 May 2022 09:03:04 +0000
parents ae2791f3108f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ae2791f3108f Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
ae2791f3108f Uploaded
cpt
parents:
diff changeset
2 import sys
ae2791f3108f Uploaded
cpt
parents:
diff changeset
3 import logging
ae2791f3108f Uploaded
cpt
parents:
diff changeset
4 import argparse
ae2791f3108f Uploaded
cpt
parents:
diff changeset
5 from Bio import SeqIO
ae2791f3108f Uploaded
cpt
parents:
diff changeset
6 from Bio.SeqFeature import FeatureLocation
ae2791f3108f Uploaded
cpt
parents:
diff changeset
7 from CPT_GFFParser import gffParse, gffWrite
ae2791f3108f Uploaded
cpt
parents:
diff changeset
8 from gff3 import feature_lambda, feature_test_type
ae2791f3108f Uploaded
cpt
parents:
diff changeset
9 from shinefind import NaiveSDCaller
ae2791f3108f Uploaded
cpt
parents:
diff changeset
10
ae2791f3108f Uploaded
cpt
parents:
diff changeset
11 logging.basicConfig(level=logging.INFO)
ae2791f3108f Uploaded
cpt
parents:
diff changeset
12 log = logging.getLogger(__name__)
ae2791f3108f Uploaded
cpt
parents:
diff changeset
13
ae2791f3108f Uploaded
cpt
parents:
diff changeset
14
ae2791f3108f Uploaded
cpt
parents:
diff changeset
15 def require_shinefind(gff3, fasta):
ae2791f3108f Uploaded
cpt
parents:
diff changeset
16 sd_finder = NaiveSDCaller()
ae2791f3108f Uploaded
cpt
parents:
diff changeset
17 # Load up sequence(s) for GFF3 data
ae2791f3108f Uploaded
cpt
parents:
diff changeset
18 seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
ae2791f3108f Uploaded
cpt
parents:
diff changeset
19 # Parse GFF3 records
ae2791f3108f Uploaded
cpt
parents:
diff changeset
20 for record in gffParse(gff3, base_dict=seq_dict):
ae2791f3108f Uploaded
cpt
parents:
diff changeset
21 # Reopen
ae2791f3108f Uploaded
cpt
parents:
diff changeset
22 genes = list(
ae2791f3108f Uploaded
cpt
parents:
diff changeset
23 feature_lambda(
ae2791f3108f Uploaded
cpt
parents:
diff changeset
24 record.features, feature_test_type, {"type": "gene"}, subfeatures=True
ae2791f3108f Uploaded
cpt
parents:
diff changeset
25 )
ae2791f3108f Uploaded
cpt
parents:
diff changeset
26 )
ae2791f3108f Uploaded
cpt
parents:
diff changeset
27 good_genes = []
ae2791f3108f Uploaded
cpt
parents:
diff changeset
28 for gene in genes:
ae2791f3108f Uploaded
cpt
parents:
diff changeset
29 cdss = sorted(
ae2791f3108f Uploaded
cpt
parents:
diff changeset
30 list(
ae2791f3108f Uploaded
cpt
parents:
diff changeset
31 feature_lambda(
ae2791f3108f Uploaded
cpt
parents:
diff changeset
32 gene.sub_features,
ae2791f3108f Uploaded
cpt
parents:
diff changeset
33 feature_test_type,
ae2791f3108f Uploaded
cpt
parents:
diff changeset
34 {"type": "CDS"},
ae2791f3108f Uploaded
cpt
parents:
diff changeset
35 subfeatures=False,
ae2791f3108f Uploaded
cpt
parents:
diff changeset
36 )
ae2791f3108f Uploaded
cpt
parents:
diff changeset
37 ),
ae2791f3108f Uploaded
cpt
parents:
diff changeset
38 key=lambda x: x.location.start,
ae2791f3108f Uploaded
cpt
parents:
diff changeset
39 )
ae2791f3108f Uploaded
cpt
parents:
diff changeset
40 if len(cdss) == 0:
ae2791f3108f Uploaded
cpt
parents:
diff changeset
41 continue
ae2791f3108f Uploaded
cpt
parents:
diff changeset
42
ae2791f3108f Uploaded
cpt
parents:
diff changeset
43 cds = cdss[0]
ae2791f3108f Uploaded
cpt
parents:
diff changeset
44
ae2791f3108f Uploaded
cpt
parents:
diff changeset
45 sds, start, end, seq = sd_finder.testFeatureUpstream(
ae2791f3108f Uploaded
cpt
parents:
diff changeset
46 cds, record, sd_min=5, sd_max=15
ae2791f3108f Uploaded
cpt
parents:
diff changeset
47 )
ae2791f3108f Uploaded
cpt
parents:
diff changeset
48 if len(sds) >= 1:
ae2791f3108f Uploaded
cpt
parents:
diff changeset
49 sd_features = sd_finder.to_features(
ae2791f3108f Uploaded
cpt
parents:
diff changeset
50 sds, gene.location.strand, start, end, feature_id=gene.id
ae2791f3108f Uploaded
cpt
parents:
diff changeset
51 )
ae2791f3108f Uploaded
cpt
parents:
diff changeset
52 gene.sub_features.append(sd_features[0])
ae2791f3108f Uploaded
cpt
parents:
diff changeset
53 if gene.location.start > sd_features[0].location.start:
ae2791f3108f Uploaded
cpt
parents:
diff changeset
54 gene.location = FeatureLocation(int(sd_features[0].location.start), int(gene.location.end), gene.location.strand)
ae2791f3108f Uploaded
cpt
parents:
diff changeset
55 if gene.location.end < sd_features[0].location.end:
ae2791f3108f Uploaded
cpt
parents:
diff changeset
56 gene.location = FeatureLocation(int(gene.location.start), int(sd_features[0].location.end), gene.location.strand)
ae2791f3108f Uploaded
cpt
parents:
diff changeset
57 good_genes.append(gene)
ae2791f3108f Uploaded
cpt
parents:
diff changeset
58
ae2791f3108f Uploaded
cpt
parents:
diff changeset
59 record.features = good_genes
ae2791f3108f Uploaded
cpt
parents:
diff changeset
60 yield record
ae2791f3108f Uploaded
cpt
parents:
diff changeset
61
ae2791f3108f Uploaded
cpt
parents:
diff changeset
62
ae2791f3108f Uploaded
cpt
parents:
diff changeset
63 if __name__ == "__main__":
ae2791f3108f Uploaded
cpt
parents:
diff changeset
64 parser = argparse.ArgumentParser(description="Identify shine-dalgarno sequences")
ae2791f3108f Uploaded
cpt
parents:
diff changeset
65 parser.add_argument("fasta", type=argparse.FileType("r"), help="Fasta Genome")
ae2791f3108f Uploaded
cpt
parents:
diff changeset
66 parser.add_argument("gff3", type=argparse.FileType("r"), help="GFF3 annotations")
ae2791f3108f Uploaded
cpt
parents:
diff changeset
67 args = parser.parse_args()
ae2791f3108f Uploaded
cpt
parents:
diff changeset
68
ae2791f3108f Uploaded
cpt
parents:
diff changeset
69 for rec in require_shinefind(**vars(args)):
ae2791f3108f Uploaded
cpt
parents:
diff changeset
70 rec.annotations = {}
ae2791f3108f Uploaded
cpt
parents:
diff changeset
71 gffWrite([rec], sys.stdout)