diff dante_gff_output_filtering.py @ 15:3151a72a6671 draft

Uploaded
author petr-novak
date Tue, 03 Sep 2019 05:20:02 -0400
parents 77d9f2ecb28a
children 1a766f9f623d
line wrap: on
line diff
--- a/dante_gff_output_filtering.py	Wed Aug 28 08:08:47 2019 -0400
+++ b/dante_gff_output_filtering.py	Tue Sep 03 05:20:02 2019 -0400
@@ -82,6 +82,22 @@
     return count_comment, lines
 
 
+def parse_gff_line(line):
+    '''Return a dict of the nine GFF columns; attributes become a nested dict.
+
+    All values stay strings. The line terminator is stripped, attribute
+    values may contain "=", and empty attribute items are skipped.
+    '''
+    columns = ['seqid', 'source', 'type', 'start', 'end',
+               'score', 'strand', 'phase', 'attributes']
+    # zip() pairs names with values in the fixed GFF column order
+    gff_line = dict(zip(columns, line.rstrip("\r\n").split("\t")))
+    # split on the first "=" only, so values containing "=" survive intact
+    gff_line['attributes'] = dict(
+        item.split("=", 1)
+        for item in gff_line['attributes'].split(";") if item.strip())
+    return gff_line
+
 def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY,
                     TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM,
                     ELEMENT):
@@ -90,7 +106,7 @@
     filt_dom_tmp = NamedTemporaryFile(delete=False)
     with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name,
                                              "w") as gff_filtered:
-        for comment_idx in range(count_comment):
+        for _ in range(count_comment):
             next(gff_all)
         dom_dict = defaultdict(lambda: defaultdict(int))
         orig_class_dict = defaultdict(int)
@@ -109,20 +125,22 @@
             orig_class_dict[classification] += 1
             ## ambiguous domains filtered out automatically
             if classification != configuration.AMBIGUOUS_TAG:
-                al_identity = float(attributes.split(";")[-5].split("=")[1])
-                al_similarity = float(attributes.split(";")[-4].split("=")[1])
-                al_length = float(attributes.split(";")[-3].split("=")[1])
-                relat_interrupt = float(attributes.split(";")[-2].split("=")[
-                    1])
-                db_len_proportion = float(attributes.split(";")[-1].split("=")[
-                    1])
-                dom_type = attributes.split(";")[0].split("=")[1]
-                seq_id = line.split("\t")[0]
-                xminimal = int(line.split("\t")[3])
-                xmaximal = int(line.split("\t")[4])
-                if al_identity >= TH_IDENTITY and al_similarity >= TH_SIMILARITY and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and db_len_proportion <= TH_LEN_RATIO and (
-                        dom_type == SELECTED_DOM or
-                        SELECTED_DOM == "All") and (ELEMENT in classification):
+                gff_line = parse_gff_line(line)
+                al_identity = float(gff_line['attributes']['Identity'])
+                al_similarity = float(gff_line['attributes']['Similarity'])
+                al_length = float(gff_line['attributes']['Relat_Length'])
+                relat_interrupt = float(gff_line['attributes']['Relat_Interruptions'])
+                db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length'])
+                dom_type = gff_line['attributes']['Final_Classification']
+                seq_id = gff_line['seqid']
+                xminimal = int(gff_line['start'])
+                xmaximal = int(gff_line['end'])
+                c1 = al_identity >= TH_IDENTITY
+                c2 = al_similarity >= TH_SIMILARITY
+                if (c1 and c2 and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and
+                        db_len_proportion <= TH_LEN_RATIO and
+                        (dom_type == SELECTED_DOM or SELECTED_DOM == "All") and
+                        (ELEMENT in classification)):
                     gff_filtered.writelines(line)
                     filt_class_dict[classification] += 1
                     dom_dict[seq_id][dom_type] += 1