comparison dante_gff_output_filtering.py @ 15:3151a72a6671 draft

Uploaded
author petr-novak
date Tue, 03 Sep 2019 05:20:02 -0400
parents 77d9f2ecb28a
children 1a766f9f623d
comparison
equal deleted inserted replaced
14:a6c55d1bdb6c 15:3151a72a6671
80 line = gff_all.readline() 80 line = gff_all.readline()
81 count_comment += 1 81 count_comment += 1
82 return count_comment, lines 82 return count_comment, lines
83 83
84 84
85 def parse_gff_line(line):
86 '''Return dictionary with gff fields and atributers
87 Note - type of fields is strings
88 '''
89 # order of first 9 column is fixed
90 gff_line = dict(
91 zip(
92 ['seqid', 'source', 'type', 'start', 'end',
93 'score', 'strand', 'phase', 'attributes'],
94 line.split("\t")
95 )
96 )
97 # split attributes and replace:
98 gff_line['attributes'] = dict([i.split("=") for i in gff_line['attributes'].split(";")])
99 return gff_line
100
85 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, 101 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY,
86 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, 102 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM,
87 ELEMENT): 103 ELEMENT):
88 ''' Filter gff output based on domain and quality of alignment ''' 104 ''' Filter gff output based on domain and quality of alignment '''
89 [count_comment, version_lines] = get_file_start(DOM_GFF) 105 [count_comment, version_lines] = get_file_start(DOM_GFF)
90 filt_dom_tmp = NamedTemporaryFile(delete=False) 106 filt_dom_tmp = NamedTemporaryFile(delete=False)
91 with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name, 107 with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name,
92 "w") as gff_filtered: 108 "w") as gff_filtered:
93 for comment_idx in range(count_comment): 109 for _ in range(count_comment):
94 next(gff_all) 110 next(gff_all)
95 dom_dict = defaultdict(lambda: defaultdict(int)) 111 dom_dict = defaultdict(lambda: defaultdict(int))
96 orig_class_dict = defaultdict(int) 112 orig_class_dict = defaultdict(int)
97 filt_class_dict = defaultdict(int) 113 filt_class_dict = defaultdict(int)
98 seq_ids_all = [] 114 seq_ids_all = []
107 attributes = line.rstrip().split("\t")[-1] 123 attributes = line.rstrip().split("\t")[-1]
108 classification = attributes.split(";")[1].split("=")[1] 124 classification = attributes.split(";")[1].split("=")[1]
109 orig_class_dict[classification] += 1 125 orig_class_dict[classification] += 1
110 ## ambiguous domains filtered out automatically 126 ## ambiguous domains filtered out automatically
111 if classification != configuration.AMBIGUOUS_TAG: 127 if classification != configuration.AMBIGUOUS_TAG:
112 al_identity = float(attributes.split(";")[-5].split("=")[1]) 128 gff_line = parse_gff_line(line)
113 al_similarity = float(attributes.split(";")[-4].split("=")[1]) 129 al_identity = float(gff_line['attributes']['Identity'])
114 al_length = float(attributes.split(";")[-3].split("=")[1]) 130 al_similarity = float(gff_line['attributes']['Similarity'])
115 relat_interrupt = float(attributes.split(";")[-2].split("=")[ 131 al_length = float(gff_line['attributes']['Relat_Length'])
116 1]) 132 relat_interrupt = float(gff_line['attributes']['Relat_Interruptions'])
117 db_len_proportion = float(attributes.split(";")[-1].split("=")[ 133 db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length'])
118 1]) 134 dom_type = gff_line['attributes']['Final_Classification']
119 dom_type = attributes.split(";")[0].split("=")[1] 135 seq_id = gff_line['seqid']
120 seq_id = line.split("\t")[0] 136 xminimal = int(gff_line['start'])
121 xminimal = int(line.split("\t")[3]) 137 xmaximal = int(gff_line['end'])
122 xmaximal = int(line.split("\t")[4]) 138 c1 = al_identity >= TH_IDENTITY
123 if al_identity >= TH_IDENTITY and al_similarity >= TH_SIMILARITY and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and db_len_proportion <= TH_LEN_RATIO and ( 139 c2 = al_similarity >= TH_SIMILARITY
124 dom_type == SELECTED_DOM or 140 if (c1 and c2 and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and
125 SELECTED_DOM == "All") and (ELEMENT in classification): 141 db_len_proportion <= TH_LEN_RATIO and
142 (dom_type == SELECTED_DOM or SELECTED_DOM == "All") and
143 (ELEMENT in classification)):
126 gff_filtered.writelines(line) 144 gff_filtered.writelines(line)
127 filt_class_dict[classification] += 1 145 filt_class_dict[classification] += 1
128 dom_dict[seq_id][dom_type] += 1 146 dom_dict[seq_id][dom_type] += 1
129 if start: 147 if start:
130 seq_ids_all.append(line.split("\t")[0]) 148 seq_ids_all.append(line.split("\t")[0])