annotate dante_gff_output_filtering.py @ 22:1eabd42e00ef draft

Uploaded
author petr-novak
date Fri, 03 Apr 2020 07:27:59 -0400
parents 1a766f9f623d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
22
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
2 import sys
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
3 import time
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
4 import configuration
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
5 import os
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
6 import textwrap
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
7 import subprocess
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
8 from tempfile import NamedTemporaryFile
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
9 from collections import defaultdict
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
10
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
11
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
12 class Range():
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
13 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
14 This class is used to check float range in argparse
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
15 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
16
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
17 def __init__(self, start, end):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
18 self.start = start
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
19 self.end = end
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
20
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
21 def __eq__(self, other):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
22 return self.start <= other <= self.end
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
23
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
24 def __str__(self):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
25 return "float range {}..{}".format(self.start, self.end)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
26
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
27 def __repr__(self):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
28 return "float range {}..{}".format(self.start, self.end)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
29
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
30
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
31 def check_file_start(gff_file):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
32 count_comment = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
33 with open(gff_file, "r") as gff_all:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
34 line = gff_all.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
35 while line.startswith("#"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
36 line = gff_all.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
37 count_comment += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
38 return count_comment
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
39
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
40
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
41 def write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict,
22
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
42 dom_dict, version_lines, TH_IDENTITY, TH_SIMILARITY,
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
43 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM):
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
44 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
45 Write domains statistics in beginning of filtered GFF
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
46 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
47 with open(FILT_DOM_GFF, "w") as filt_gff:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
48 for line in version_lines:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
49 filt_gff.write(line)
22
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
50 filt_gff.write(("##Filtering thresholdss: min identity: {}, min similarity: {},"
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
51 " min relative alingment length: {}, max interuptions(stop or "
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
52 "frameshift): {}, max relative alignment length: {}, selected"
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
53 " domains: {} \n").format(TH_IDENTITY,
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
54 TH_SIMILARITY,
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
55 TH_LENGTH,
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
56 TH_INTERRUPT,
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
57 TH_LEN_RATIO,
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
58 SELECTED_DOM))
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
59 filt_gff.write("##CLASSIFICATION\tORIGINAL_COUNTS\tFILTERED_COUNTS\n")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
60 if not orig_class_dict:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
61 filt_gff.write("##NO DOMAINS CLASSIFICATIONS\n")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
62 for classification in sorted(orig_class_dict.keys()):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
63 if classification in filt_class_dict.keys():
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
64 filt_gff.write("##{}\t{}\t{}\n".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
65 classification, orig_class_dict[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
66 classification], filt_class_dict[classification]))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
67 else:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
68 filt_gff.write("##{}\t{}\t{}\n".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
69 classification, orig_class_dict[classification], 0))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
70 filt_gff.write("##-----------------------------------------------\n"
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
71 "##SEQ\tDOMAIN\tCOUNTS\n")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
72 if not dom_dict:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
73 filt_gff.write("##NO DOMAINS\n")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
74 for seq in sorted(dom_dict.keys()):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
75 for dom, count in sorted(dom_dict[seq].items()):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
76 filt_gff.write("##{}\t{}\t{}\n".format(seq, dom, count))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
77 filt_gff.write("##-----------------------------------------------\n")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
78 with open(filt_dom_tmp.name, "r") as filt_tmp:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
79 for line in filt_tmp:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
80 filt_gff.write(line)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
81
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
82
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
83 def get_file_start(gff_file):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
84 count_comment = 0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
85 lines = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
86 with open(gff_file, "r") as gff_all:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
87 line = gff_all.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
88 while line.startswith("#"):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
89 lines.append(line)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
90 line = gff_all.readline()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
91 count_comment += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
92 return count_comment, lines
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
93
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
94
15
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
95 def parse_gff_line(line):
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
96 '''Return dictionary with gff fields and atributers
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
97 Note - type of fields is strings
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
98 '''
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
99 # order of first 9 column is fixed
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
100 gff_line = dict(
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
101 zip(
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
102 ['seqid', 'source', 'type', 'start', 'end',
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
103 'score', 'strand', 'phase', 'attributes'],
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
104 line.split("\t")
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
105 )
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
106 )
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
107 # split attributes and replace:
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
108 gff_line['attributes'] = dict([i.split("=") for i in gff_line['attributes'].split(";")])
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
109 return gff_line
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
110
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
111 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
112 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
113 ELEMENT):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
114 ''' Filter gff output based on domain and quality of alignment '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
115 [count_comment, version_lines] = get_file_start(DOM_GFF)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
116 filt_dom_tmp = NamedTemporaryFile(delete=False)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
117 with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
118 "w") as gff_filtered:
15
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
119 for _ in range(count_comment):
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
120 next(gff_all)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
121 dom_dict = defaultdict(lambda: defaultdict(int))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
122 orig_class_dict = defaultdict(int)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
123 filt_class_dict = defaultdict(int)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
124 seq_ids_all = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
125 xminimals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
126 xmaximals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
127 domains = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
128 xminimals_all = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
129 xmaximals_all = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
130 domains_all = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
131 start = True
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
132 for line in gff_all:
17
1a766f9f623d Uploaded
petr-novak
parents: 15
diff changeset
133 gff_line = parse_gff_line(line)
1a766f9f623d Uploaded
petr-novak
parents: 15
diff changeset
134 classification = gff_line['attributes']['Final_Classification']
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
135 orig_class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
136 ## ambiguous domains filtered out automatically
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
137 if classification != configuration.AMBIGUOUS_TAG:
15
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
138 al_identity = float(gff_line['attributes']['Identity'])
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
139 al_similarity = float(gff_line['attributes']['Similarity'])
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
140 al_length = float(gff_line['attributes']['Relat_Length'])
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
141 relat_interrupt = float(gff_line['attributes']['Relat_Interruptions'])
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
142 db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length'])
17
1a766f9f623d Uploaded
petr-novak
parents: 15
diff changeset
143 dom_type = gff_line['attributes']['Name']
15
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
144 seq_id = gff_line['seqid']
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
145 xminimal = int(gff_line['start'])
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
146 xmaximal = int(gff_line['end'])
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
147 c1 = al_identity >= TH_IDENTITY
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
148 c2 = al_similarity >= TH_SIMILARITY
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
149 if (c1 and c2 and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
150 db_len_proportion <= TH_LEN_RATIO and
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
151 (dom_type == SELECTED_DOM or SELECTED_DOM == "All") and
3151a72a6671 Uploaded
petr-novak
parents: 0
diff changeset
152 (ELEMENT in classification)):
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
153 gff_filtered.writelines(line)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
154 filt_class_dict[classification] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
155 dom_dict[seq_id][dom_type] += 1
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
156 if start:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
157 seq_ids_all.append(line.split("\t")[0])
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
158 start = False
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
159 if seq_id != seq_ids_all[-1]:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
160 seq_ids_all.append(seq_id)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
161 xminimals_all.append(xminimals)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
162 xmaximals_all.append(xmaximals)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
163 domains_all.append(domains)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
164 xminimals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
165 xmaximals = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
166 domains = []
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
167 xminimals.append(xminimal)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
168 xmaximals.append(xmaximal)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
169 domains.append(dom_type)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
170 path = os.path.dirname(os.path.realpath(__file__))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
171 write_info(filt_dom_tmp, FILT_DOM_GFF, orig_class_dict, filt_class_dict,
22
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
172 dom_dict, version_lines, TH_IDENTITY, TH_SIMILARITY,
1eabd42e00ef Uploaded
petr-novak
parents: 17
diff changeset
173 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM)
0
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
174 os.unlink(filt_dom_tmp.name)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
175 xminimals_all.append(xminimals)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
176 xmaximals_all.append(xmaximals)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
177 domains_all.append(domains)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
178 return xminimals_all, xmaximals_all, domains_all, seq_ids_all
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
179
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
180
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
181 def get_domains_protseq(FILT_DOM_GFF, DOMAIN_PROT_SEQ):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
182 ''' Get the translated protein sequence of original DNA seq for all the filtered domains regions
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
183 The translated sequences are taken from alignment reported by LASTAL (Query_Seq attribute in GFF)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
184 '''
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
185 count_comment = check_file_start(FILT_DOM_GFF)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
186 with open(FILT_DOM_GFF, "r") as filt_gff:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
187 for comment_idx in range(count_comment):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
188 next(filt_gff)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
189 with open(DOMAIN_PROT_SEQ, "w") as dom_prot_file:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
190 for line in filt_gff:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
191 attributes = line.rstrip().split("\t")[8]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
192 positions = attributes.split(";")[3].split("=")[1].split(":")[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
193 -1].split("[")[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
194 dom = attributes.split(";")[0].split("=")[1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
195 dom_class = attributes.split(";")[1].split("=")[1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
196 seq_id = line.rstrip().split("\t")[0]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
197 prot_seq_align = line.rstrip().split("\t")[8].split(";")[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
198 6].split("=")[1]
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
199 prot_seq = prot_seq_align.translate({ord(i): None
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
200 for i in '/\\-'})
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
201 header_prot_seq = ">{}:{} {} {}".format(seq_id, positions, dom,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
202 dom_class)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
203 dom_prot_file.write("{}\n{}\n".format(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
204 header_prot_seq, textwrap.fill(prot_seq,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
205 configuration.FASTA_LINE)))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
206
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
207
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
208 def main(args):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
209
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
210 t = time.time()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
211
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
212 DOM_GFF = args.dom_gff
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
213 DOMAIN_PROT_SEQ = args.domains_prot_seq
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
214 TH_IDENTITY = args.th_identity
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
215 TH_LENGTH = args.th_length
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
216 TH_INTERRUPT = args.interruptions
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
217 TH_SIMILARITY = args.th_similarity
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
218 TH_LEN_RATIO = args.max_len_proportion
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
219 FILT_DOM_GFF = args.domains_filtered
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
220 SELECTED_DOM = args.selected_dom
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
221 OUTPUT_DIR = args.output_dir
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
222 # DELETE : ELEMENT = args.element_type.replace("_pipe_", "|")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
223 ELEMENT = args.element_type
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
224
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
225 if DOMAIN_PROT_SEQ is None:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
226 DOMAIN_PROT_SEQ = configuration.DOM_PROT_SEQ
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
227 if FILT_DOM_GFF is None:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
228 FILT_DOM_GFF = configuration.FILT_DOM_GFF
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
229
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
230 if OUTPUT_DIR and not os.path.exists(OUTPUT_DIR):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
231 os.makedirs(OUTPUT_DIR)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
232
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
233 if not os.path.isabs(FILT_DOM_GFF):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
234 if OUTPUT_DIR is None:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
235 OUTPUT_DIR = os.path.dirname(os.path.abspath(DOM_GFF))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
236 FILT_DOM_GFF = os.path.join(OUTPUT_DIR, os.path.basename(FILT_DOM_GFF))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
237 DOMAIN_PROT_SEQ = os.path.join(OUTPUT_DIR,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
238 os.path.basename(DOMAIN_PROT_SEQ))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
239
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
240 [xminimals_all, xmaximals_all, domains_all, seq_ids_all] = filter_qual_dom(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
241 DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, TH_LENGTH,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
242 TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, ELEMENT)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
243 get_domains_protseq(FILT_DOM_GFF, DOMAIN_PROT_SEQ)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
244
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
245 print("ELAPSED_TIME_DOMAINS = {} s".format(time.time() - t))
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
246
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
247
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
248 if __name__ == "__main__":
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
249 import argparse
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
250 from argparse import RawDescriptionHelpFormatter
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
251
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
252 class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
253 argparse.RawDescriptionHelpFormatter):
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
254 pass
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
255
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
256 parser = argparse.ArgumentParser(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
257 description=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
258 '''The script performs DANTE's output filtering for quality and/or extracting specific type of protein domain or mobile elements of origin. For the filtered domains it reports their translated protein sequence of original DNA.
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
259 WHEN NO PARAMETERS GIVEN, IT PERFORMS QUALITY FILTERING USING THE DEFAULT PARAMETRES (optimized for Viridiplantae species)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
260
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
261 INPUTS:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
262 - GFF3 file produced by protein_domains.py OR already filtered GFF3
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
263
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
264 FILTERING OPTIONS:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
265 > QUALITY: - Min relative length of alignemnt to the protein domain from DB (without gaps)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
266 - Identity
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
267 - Similarity (scoring matrix: BLOSUM82)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
268 - Interruption in the reading frame (frameshifts + stop codons) per every starting 100 AA
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
269 - Max alignment proportion to the original length of database domain sequence
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
270 > DOMAIN TYPE: choose from choices ('Name' attribute in GFF)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
271 Records for ambiguous domain type (e.g. INT/RH) are filtered out automatically
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
272
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
273 > MOBILE ELEMENT TYPE:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
274 arbitrary substring of the element classification ('Final_Classification' attribute in GFF)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
275
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
276 OUTPUTS:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
277 - filtered GFF3 file
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
278 - fasta file of translated protein sequences (from original DNA) for the aligned domains that match the filtering criteria
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
279
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
280 DEPENDENCIES:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
281 - python 3.4 or higher
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
282 > ProfRep modules:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
283 - configuration.py
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
284
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
285 EXAMPLE OF USAGE:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
286 Getting quality filtered integrase(INT) domains of all gypsy transposable elements:
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
287 ./domains_filtering.py -dom_gff PATH_TO_INPUT_GFF -pdb PATH_TO_PROTEIN_DB -cs PATH_TO_CLASSIFICATION_FILE --selected_dom INT --element_type Ty3/gypsy
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
288
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
289 ''',
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
290 epilog="""""",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
291 formatter_class=CustomFormatter)
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
292 requiredNamed = parser.add_argument_group('required named arguments')
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
293 requiredNamed.add_argument("-dg",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
294 "--dom_gff",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
295 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
296 required=True,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
297 help="basic unfiltered gff file of all domains")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
298 parser.add_argument("-ouf",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
299 "--domains_filtered",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
300 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
301 help="output filtered domains gff file")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
302 parser.add_argument("-dps",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
303 "--domains_prot_seq",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
304 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
305 help="output file containg domains protein sequences")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
306 parser.add_argument("-thl",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
307 "--th_length",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
308 type=float,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
309 choices=[Range(0.0, 1.0)],
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
310 default=0.8,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
311 help="proportion of alignment length threshold")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
312 parser.add_argument("-thi",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
313 "--th_identity",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
314 type=float,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
315 choices=[Range(0.0, 1.0)],
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
316 default=0.35,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
317 help="proportion of alignment identity threshold")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
318 parser.add_argument("-ths",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
319 "--th_similarity",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
320 type=float,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
321 choices=[Range(0.0, 1.0)],
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
322 default=0.45,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
323 help="threshold for alignment proportional similarity")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
324 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
325 "-ir",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
326 "--interruptions",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
327 type=int,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
328 default=3,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
329 help=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
330 "interruptions (frameshifts + stop codons) tolerance threshold per 100 AA")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
331 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
332 "-mlen",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
333 "--max_len_proportion",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
334 type=float,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
335 default=1.2,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
336 help=
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
337 "maximal proportion of alignment length to the original length of protein domain from database")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
338 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
339 "-sd",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
340 "--selected_dom",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
341 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
342 default="All",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
343 choices=[
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
344 "All", "GAG", "INT", "PROT", "RH", "RT", "aRH", "CHDCR", "CHDII",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
345 "TPase", "YR", "HEL1", "HEL2", "ENDO"
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
346 ],
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
347 help="filter output domains based on the domain type")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
348 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
349 "-el",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
350 "--element_type",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
351 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
352 default="",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
353 help="filter output domains by typing substring from classification")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
354 parser.add_argument(
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
355 "-dir",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
356 "--output_dir",
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
357 type=str,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
358 default=None,
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
359 help="specify if you want to change the output directory")
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
360 args = parser.parse_args()
77d9f2ecb28a Uploaded
petr-novak
parents:
diff changeset
361 main(args)