Mercurial > repos > petr-novak > dante
comparison dante_gff_output_filtering.py @ 15:3151a72a6671 draft
Uploaded
author | petr-novak |
---|---|
date | Tue, 03 Sep 2019 05:20:02 -0400 |
parents | 77d9f2ecb28a |
children | 1a766f9f623d |
comparison
equal
deleted
inserted
replaced
14:a6c55d1bdb6c | 15:3151a72a6671 |
---|---|
80 line = gff_all.readline() | 80 line = gff_all.readline() |
81 count_comment += 1 | 81 count_comment += 1 |
82 return count_comment, lines | 82 return count_comment, lines |
83 | 83 |
84 | 84 |
85 def parse_gff_line(line): | |
86 '''Return dictionary with gff fields and atributers | |
87 Note - type of fields is strings | |
88 ''' | |
89 # order of first 9 column is fixed | |
90 gff_line = dict( | |
91 zip( | |
92 ['seqid', 'source', 'type', 'start', 'end', | |
93 'score', 'strand', 'phase', 'attributes'], | |
94 line.split("\t") | |
95 ) | |
96 ) | |
97 # split attributes and replace: | |
98 gff_line['attributes'] = dict([i.split("=") for i in gff_line['attributes'].split(";")]) | |
99 return gff_line | |
100 | |
85 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, | 101 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, |
86 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, | 102 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, |
87 ELEMENT): | 103 ELEMENT): |
88 ''' Filter gff output based on domain and quality of alignment ''' | 104 ''' Filter gff output based on domain and quality of alignment ''' |
89 [count_comment, version_lines] = get_file_start(DOM_GFF) | 105 [count_comment, version_lines] = get_file_start(DOM_GFF) |
90 filt_dom_tmp = NamedTemporaryFile(delete=False) | 106 filt_dom_tmp = NamedTemporaryFile(delete=False) |
91 with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name, | 107 with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name, |
92 "w") as gff_filtered: | 108 "w") as gff_filtered: |
93 for comment_idx in range(count_comment): | 109 for _ in range(count_comment): |
94 next(gff_all) | 110 next(gff_all) |
95 dom_dict = defaultdict(lambda: defaultdict(int)) | 111 dom_dict = defaultdict(lambda: defaultdict(int)) |
96 orig_class_dict = defaultdict(int) | 112 orig_class_dict = defaultdict(int) |
97 filt_class_dict = defaultdict(int) | 113 filt_class_dict = defaultdict(int) |
98 seq_ids_all = [] | 114 seq_ids_all = [] |
107 attributes = line.rstrip().split("\t")[-1] | 123 attributes = line.rstrip().split("\t")[-1] |
108 classification = attributes.split(";")[1].split("=")[1] | 124 classification = attributes.split(";")[1].split("=")[1] |
109 orig_class_dict[classification] += 1 | 125 orig_class_dict[classification] += 1 |
110 ## ambiguous domains filtered out automatically | 126 ## ambiguous domains filtered out automatically |
111 if classification != configuration.AMBIGUOUS_TAG: | 127 if classification != configuration.AMBIGUOUS_TAG: |
112 al_identity = float(attributes.split(";")[-5].split("=")[1]) | 128 gff_line = parse_gff_line(line) |
113 al_similarity = float(attributes.split(";")[-4].split("=")[1]) | 129 al_identity = float(gff_line['attributes']['Identity']) |
114 al_length = float(attributes.split(";")[-3].split("=")[1]) | 130 al_similarity = float(gff_line['attributes']['Similarity']) |
115 relat_interrupt = float(attributes.split(";")[-2].split("=")[ | 131 al_length = float(gff_line['attributes']['Relat_Length']) |
116 1]) | 132 relat_interrupt = float(gff_line['attributes']['Relat_Interruptions']) |
117 db_len_proportion = float(attributes.split(";")[-1].split("=")[ | 133 db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length']) |
118 1]) | 134 dom_type = gff_line['attributes']['Final_Classification'] |
119 dom_type = attributes.split(";")[0].split("=")[1] | 135 seq_id = gff_line['seqid'] |
120 seq_id = line.split("\t")[0] | 136 xminimal = int(gff_line['start']) |
121 xminimal = int(line.split("\t")[3]) | 137 xmaximal = int(gff_line['end']) |
122 xmaximal = int(line.split("\t")[4]) | 138 c1 = al_identity >= TH_IDENTITY |
123 if al_identity >= TH_IDENTITY and al_similarity >= TH_SIMILARITY and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and db_len_proportion <= TH_LEN_RATIO and ( | 139 c2 = al_similarity >= TH_SIMILARITY |
124 dom_type == SELECTED_DOM or | 140 if (c1 and c2 and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and |
125 SELECTED_DOM == "All") and (ELEMENT in classification): | 141 db_len_proportion <= TH_LEN_RATIO and |
142 (dom_type == SELECTED_DOM or SELECTED_DOM == "All") and | |
143 (ELEMENT in classification)): | |
126 gff_filtered.writelines(line) | 144 gff_filtered.writelines(line) |
127 filt_class_dict[classification] += 1 | 145 filt_class_dict[classification] += 1 |
128 dom_dict[seq_id][dom_type] += 1 | 146 dom_dict[seq_id][dom_type] += 1 |
129 if start: | 147 if start: |
130 seq_ids_all.append(line.split("\t")[0]) | 148 seq_ids_all.append(line.split("\t")[0]) |