# HG changeset patch
# User petr-novak
# Date 1568620485 14400
# Node ID 1a766f9f623dca745dd512eab074014053be43ab
# Parent  0e820310d4dc234c5d4953853f59b1786431d833
Uploaded

diff -r 0e820310d4dc -r 1a766f9f623d dante_gff_output_filtering.py
--- a/dante_gff_output_filtering.py	Wed Sep 04 06:45:18 2019 -0400
+++ b/dante_gff_output_filtering.py	Mon Sep 16 03:54:45 2019 -0400
@@ -120,18 +120,17 @@
         domains_all = []
         start = True
         for line in gff_all:
-            attributes = line.rstrip().split("\t")[-1]
-            classification = attributes.split(";")[1].split("=")[1]
+            gff_line = parse_gff_line(line)
+            classification = gff_line['attributes']['Final_Classification']
             orig_class_dict[classification] += 1
             ## ambiguous domains filtered out automatically
             if classification != configuration.AMBIGUOUS_TAG:
-                gff_line = parse_gff_line(line)
                 al_identity = float(gff_line['attributes']['Identity'])
                 al_similarity = float(gff_line['attributes']['Similarity'])
                 al_length = float(gff_line['attributes']['Relat_Length'])
                 relat_interrupt = float(gff_line['attributes']['Relat_Interruptions'])
                 db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length'])
-                dom_type = gff_line['attributes']['Final_Classification']
+                dom_type = gff_line['attributes']['Name']
                 seq_id = gff_line['seqid']
                 xminimal = int(gff_line['start'])
                 xmaximal = int(gff_line['end'])
diff -r 0e820310d4dc -r 1a766f9f623d dante_gff_to_dna.py
--- a/dante_gff_to_dna.py	Wed Sep 04 06:45:18 2019 -0400
+++ b/dante_gff_to_dna.py	Mon Sep 16 03:54:45 2019 -0400
@@ -7,7 +7,7 @@
 from collections import defaultdict
 from Bio import SeqIO
 import configuration
-
+from dante_gff_output_filtering import parse_gff_line
 t_nt_seqs_extraction = time.time()
 
 
@@ -34,7 +34,8 @@
     ''' Extract nucleotide sequences of protein domains found by DANTE from input DNA seq.
 		Sequences are saved in fasta files separately for each transposon lineage.
 		Sequences extraction is based on position of Best_Hit alignment reported by LASTAL.
-		The positions can be extended (optional) based on what part of database domain was aligned (Best_Hit_DB_Pos attribute).
+		The positions can be extended (optional) based on what part of database domain was aligned
+        (Best_Hit_DB_Pos attribute).
 		The strand orientation needs to be considered in extending and extracting the sequence itself
 	'''
     [count_comment, first_line] = check_file_start(DOM_GFF)
@@ -49,20 +50,24 @@
         allSeqs = SeqIO.to_dict(SeqIO.parse(DNA_SEQ, 'fasta'))
         seq_nt = allSeqs[seq_id_stored]
         for line in domains:
-            seq_id = line.split("\t")[0]
-            dom_type = line.split("\t")[8].split(";")[0].split("=")[1]
-            elem_type = line.split("\t")[8].split(";")[1].split("=")[1]
-            strand = line.split("\t")[6]
-            align_nt_start = int(line.split("\t")[8].split(";")[3].split(":")[
+            gff_line = parse_gff_line(line)
+            elem_type = gff_line['attributes']['Final_Classification']
+            if elem_type == configuration.AMBIGUOUS_TAG:
+                continue  # skip ambiguous classification
+            seq_id = gff_line['seqid']
+            dom_type = gff_line['attributes']['Name']
+            strand = gff_line['strand']
+            align_nt_start = int(gff_line['attributes']['Best_Hit'].split(":")[
                 -1].split("-")[0])
-            align_nt_end = int(line.split("\t")[8].split(";")[3].split(":")[
+            align_nt_end = int(gff_line['attributes']['Best_Hit'].split(":")[
                 -1].split("-")[1].split("[")[0])
             if seq_id != seq_id_stored:
                 seq_id_stored = seq_id
                 seq_nt = allSeqs[seq_id_stored]
             if EXTENDED:
                 ## which part of database sequence was aligned
-                db_part = line.split("\t")[8].split(";")[4].split("=")[1]
+                db_part = gff_line['attributes']['Best_Hit_DB_Pos']
+                ## db_part = line.split("\t")[8].split(";")[4].split("=")[1]
                 ## datatabse seq length
                 dom_len = int(db_part.split("of")[1])
                 ## start of alignment on database seq
diff -r 0e820310d4dc -r 1a766f9f623d test-data/toolshed.g2.bx.psu.edu_repos_petr-novak_dante_dante_1.0.0.tgz
Binary file test-data/toolshed.g2.bx.psu.edu_repos_petr-novak_dante_dante_1.0.0.tgz has changed