annotate dante_gff_to_dna.py @ 10:d0431a839606 draft

Uploaded
author petr-novak
date Wed, 14 Aug 2019 11:24:15 -0400
parents 77d9f2ecb28a
children 1a766f9f623d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
2
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
3 import argparse
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
4 import time
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
5 import os
10
d0431a839606 Uploaded
petr-novak
parents: 0
diff changeset
6 import textwrap
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
7 from collections import defaultdict
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
8 from Bio import SeqIO
10
d0431a839606 Uploaded
petr-novak
parents: 0
diff changeset
9 import configuration
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
10
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
11 t_nt_seqs_extraction = time.time()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
12
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
13
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
14 def str2bool(v):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
15 if v.lower() in ('yes', 'true', 't', 'y', '1'):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
16 return True
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
17 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
18 return False
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
19 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
20 raise argparse.ArgumentTypeError('Boolean value expected')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
21
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
22
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
23 def check_file_start(gff_file):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
24 count_comment = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
25 with open(gff_file, "r") as gff_all:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
26 line = gff_all.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
27 while line.startswith("#"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
28 line = gff_all.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
29 count_comment += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
30 return count_comment, line
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
31
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
32
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
33 def extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL, EXTENDED):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
34 ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
35 Sequences are saved in fasta files separately for each transposon lineage.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
36 Sequences extraction is based on position of Best_Hit alignment reported by LASTAL.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
37 The positions can be extended (optional) based on what part of database domain was aligned (Best_Hit_DB_Pos attribute).
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
38 The strand orientation needs to be considered in extending and extracting the sequence itself
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
39 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
40 [count_comment, first_line] = check_file_start(DOM_GFF)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
41 unique_classes = get_unique_classes(CLASS_TBL)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
42 files_dict = defaultdict(str)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
43 domains_counts_dict = defaultdict(int)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
44 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta'))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
45 with open(DOM_GFF, "r") as domains:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
46 for comment_idx in range(count_comment):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
47 next(domains)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
48 seq_id_stored = first_line.split("\t")[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
49 allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta'))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
50 seq_nt = allSeqs[seq_id_stored]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
51 for line in domains:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
52 seq_id = line.split("\t")[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
53 dom_type = line.split("\t")[8].split(";")[0].split("=")[1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
54 elem_type = line.split("\t")[8].split(";")[1].split("=")[1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
55 strand = line.split("\t")[6]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
56 align_nt_start = int(line.split("\t")[8].split(";")[3].split(":")[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
57 -1].split("-")[0])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
58 align_nt_end = int(line.split("\t")[8].split(";")[3].split(":")[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
59 -1].split("-")[1].split("[")[0])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
60 if seq_id != seq_id_stored:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
61 seq_id_stored = seq_id
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
62 seq_nt = allSeqs[seq_id_stored]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
63 if EXTENDED:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
64 ## which part of database sequence was aligned
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
65 db_part = line.split("\t")[8].split(";")[4].split("=")[1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
66 ## datatabse seq length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
67 dom_len = int(db_part.split("of")[1])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
68 ## start of alignment on database seq
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
69 db_start = int(db_part.split("of")[0].split(":")[0])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
70 ## end of alignment on database seq
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
71 db_end = int(db_part.split("of")[0].split(":")[1])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
72 ## number of nucleotides missing in the beginning
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
73 dom_nt_prefix = (db_start - 1) * 3
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
74 ## number of nucleotides missing in the end
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
75 dom_nt_suffix = (dom_len - db_end) * 3
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
76 if strand == "+":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
77 dom_nt_start = align_nt_start - dom_nt_prefix
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
78 dom_nt_end = align_nt_end + dom_nt_suffix
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
79 ## reverse extending for - strand
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
80 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
81 dom_nt_start = align_nt_start - dom_nt_suffix
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
82 dom_nt_end = align_nt_end + dom_nt_prefix
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
83 ## correction for domain after extending having negative starting positon
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
84 dom_nt_start = max(1, dom_nt_start)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
85 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
86 dom_nt_start = align_nt_start
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
87 dom_nt_end = align_nt_end
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
88 full_dom_nt = seq_nt.seq[dom_nt_start - 1:dom_nt_end]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
89 ## for - strand take reverse complement of the extracted sequence
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
90 if strand == "-":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
91 full_dom_nt = full_dom_nt.reverse_complement()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
92 full_dom_nt = str(full_dom_nt)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
93 ## report when domain classified to the last level and no Ns in extracted seq
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
94 if elem_type in unique_classes and "N" not in full_dom_nt:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
95 # lineages reported in separate fasta files
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
96 if not elem_type in files_dict:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
97 files_dict[elem_type] = os.path.join(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
98 OUT_DIR, "{}.fasta".format(elem_type.split("|")[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
99 -1].replace("/", "_")))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
100 with open(files_dict[elem_type], "a") as out_nt_seq:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
101 out_nt_seq.write(">{}:{}-{}|{}[{}]\n{}\n".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
102 seq_nt.id, dom_nt_start, dom_nt_end, dom_type,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
103 elem_type, textwrap.fill(full_dom_nt,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
104 configuration.FASTA_LINE)))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
105 domains_counts_dict[elem_type] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
106 return domains_counts_dict
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
107
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
108
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
109 def get_unique_classes(CLASS_TBL):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
110 ''' Get all the lineages of current domains classification table to check if domains are classified to the last level.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
111 Only the sequences of unambiguous and completely classified domains will be extracted.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
112 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
113 unique_classes = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
114 with open(CLASS_TBL, "r") as class_tbl:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
115 for line in class_tbl:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
116 line_class = "|".join(line.rstrip().split("\t")[1:])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
117 if line_class not in unique_classes:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
118 unique_classes.append(line_class)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
119 return unique_classes
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
120
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
121
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
122 def write_domains_stat(domains_counts_dict, OUT_DIR):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
123 ''' Report counts of domains for individual lineages
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
124 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
125 total = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
126 with open(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
127 os.path.join(OUT_DIR,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
128 configuration.EXTRACT_DOM_STAT), "w") as dom_stat:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
129 for domain, count in domains_counts_dict.items():
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
130 dom_stat.write(";{}:{}\n".format(domain, count))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
131 total += count
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
132 dom_stat.write(";TOTAL:{}\n".format(total))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
133
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
134
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
135 def main(args):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
136
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
137 DNA_SEQ = args.input_dna
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
138 DOM_GFF = args.domains_gff
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
139 OUT_DIR = args.out_dir
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
140 CLASS_TBL = args.classification
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
141 EXTENDED = args.extended
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
142
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
143 if not os.path.exists(OUT_DIR):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
144 os.makedirs(OUT_DIR)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
145
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
146 domains_counts_dict = extract_nt_seqs(DNA_SEQ, DOM_GFF, OUT_DIR, CLASS_TBL,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
147 EXTENDED)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
148 write_domains_stat(domains_counts_dict, OUT_DIR)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
149
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
150 print("ELAPSED_TIME_EXTRACTION = {} s\n".format(time.time() -
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
151 t_nt_seqs_extraction))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
152
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
153
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
154 if __name__ == "__main__":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
155
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
156 # Command line arguments
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
157 parser = argparse.ArgumentParser()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
158 parser.add_argument('-i',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
159 '--input_dna',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
160 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
161 required=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
162 help='path to input DNA sequence')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
163 parser.add_argument('-d',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
164 '--domains_gff',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
165 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
166 required=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
167 help='GFF file of protein domains')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
168 parser.add_argument('-cs',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
169 '--classification',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
170 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
171 required=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
172 help='protein domains classification file')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
173 parser.add_argument('-out',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
174 '--out_dir',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
175 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
176 default=configuration.EXTRACT_OUT_DIR,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
177 help='output directory')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
178 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
179 '-ex',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
180 '--extended',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
181 type=str2bool,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
182 default=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
183 help=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
184 'extend the domains edges if not the whole datatabase sequence was aligned')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
185 args = parser.parse_args()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
186 main(args)