dante: dante_gff_output_filtering.py comparison

comparison dante_gff_output_filtering.py @ 15:3151a72a6671 draft

Uploaded

author	petr-novak
date	Tue, 03 Sep 2019 05:20:02 -0400
parents	77d9f2ecb28a
children	1a766f9f623d

comparison

equal deleted inserted replaced

-:a6c55d1bdb6c
+:3151a72a6671
 line = gff_all.readline()
 count_comment += 1
 return count_comment, lines
def parse_gff_line(line):
    '''Return dictionary with gff fields and attributes

    The first nine tab-separated columns of ``line`` are mapped, in order,
    to the keys 'seqid', 'source', 'type', 'start', 'end', 'score',
    'strand', 'phase' and 'attributes'; any further columns are ignored.
    The 'attributes' column is then replaced by a dictionary built from
    its ";"-separated "key=value" entries (the last one wins when a key
    is repeated).

    Note - all field and attribute values are returned as strings.

    Raises KeyError if the line has fewer than nine columns and
    ValueError if a non-empty attribute entry contains no "=".
    '''
    # order of first 9 columns is fixed
    column_names = ('seqid', 'source', 'type', 'start', 'end',
                    'score', 'strand', 'phase', 'attributes')
    # drop only the line terminator, so it does not leak into the value of
    # the last attribute; tabs are kept because they separate the columns
    columns = line.rstrip("\r\n").split("\t")
    gff_line = dict(zip(column_names, columns))

    attributes = {}
    for entry in gff_line['attributes'].split(";"):
        if not entry:
            # empty entry produced by a trailing or doubled ";"
            continue
        # split on the first "=" only - the value itself may contain "="
        key, separator, value = entry.partition("=")
        if not separator:
            raise ValueError(
                "malformed gff attribute (missing '='): {!r}".format(entry))
        attributes[key] = value
    gff_line['attributes'] = attributes
    return gff_line
 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY,
 TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM,
 ELEMENT):
 ''' Filter gff output based on domain and quality of alignment '''
 [count_comment, version_lines] = get_file_start(DOM_GFF)
 filt_dom_tmp = NamedTemporaryFile(delete=False)
 with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name,
 "w") as gff_filtered:
-for comment_idx in range(count_comment):
+for _ in range(count_comment):
 next(gff_all)
 dom_dict = defaultdict(lambda: defaultdict(int))
 orig_class_dict = defaultdict(int)
 filt_class_dict = defaultdict(int)
 seq_ids_all = []
 attributes = line.rstrip().split("\t")[-1]
 classification = attributes.split(";")[1].split("=")[1]
 orig_class_dict[classification] += 1
 ## ambiguous domains filtered out automatically
 if classification != configuration.AMBIGUOUS_TAG:
-al_identity = float(attributes.split(";")[-5].split("=")[1])
+gff_line = parse_gff_line(line)
-al_similarity = float(attributes.split(";")[-4].split("=")[1])
+al_identity = float(gff_line['attributes']['Identity'])
-al_length = float(attributes.split(";")[-3].split("=")[1])
+al_similarity = float(gff_line['attributes']['Similarity'])
-relat_interrupt = float(attributes.split(";")[-2].split("=")[
+al_length = float(gff_line['attributes']['Relat_Length'])
-1])
+relat_interrupt = float(gff_line['attributes']['Relat_Interruptions'])
-db_len_proportion = float(attributes.split(";")[-1].split("=")[
+db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length'])
-1])
+dom_type = gff_line['attributes']['Final_Classification']
-dom_type = attributes.split(";")[0].split("=")[1]
+seq_id = gff_line['seqid']
-seq_id = line.split("\t")[0]
+xminimal = int(gff_line['start'])
-xminimal = int(line.split("\t")[3])
+xmaximal = int(gff_line['end'])
-xmaximal = int(line.split("\t")[4])
+c1 = al_identity >= TH_IDENTITY
-if al_identity >= TH_IDENTITY and al_similarity >= TH_SIMILARITY and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and db_len_proportion <= TH_LEN_RATIO and (
+c2 = al_similarity >= TH_SIMILARITY
-dom_type == SELECTED_DOM or
+if (c1 and c2 and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and
-SELECTED_DOM == "All") and (ELEMENT in classification):
+db_len_proportion <= TH_LEN_RATIO and
+(dom_type == SELECTED_DOM or SELECTED_DOM == "All") and
+(ELEMENT in classification)):
 gff_filtered.writelines(line)
 filt_class_dict[classification] += 1
 dom_dict[seq_id][dom_type] += 1
 if start:
 seq_ids_all.append(line.split("\t")[0])

Mercurial > repos > petr-novak > dante

comparison dante_gff_output_filtering.py @ 15:3151a72a6671 draft