Mercurial > repos > petr-novak > dante
changeset 15:3151a72a6671 draft
Uploaded
author | petr-novak |
---|---|
date | Tue, 03 Sep 2019 05:20:02 -0400 |
parents | a6c55d1bdb6c |
children | 0e820310d4dc |
files | coverage2gff.py dante.py dante.xml dante_gff_output_filtering.py dante_pyan_scheme.png dante_pyan_scheme.svg parse_aln.py test-data/GEPY_test_long_1_output_unfiltered.gff3 |
diffstat | 8 files changed, 385 insertions(+), 446 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
"""coverage2gff.py - append mean per-feature coverage to a GFF3 file.

Reads a coverage profile made of line pairs (a header line followed by a line
of whitespace-separated integer depths) and rewrites the given GFF3 file in
place, appending a ``Coverage=<mean depth>`` attribute to every feature line.
"""
import argparse
import os
import shutil
import sys
import tempfile


def parse_args():
    """Parse the command line.

    Returns:
        argparse.Namespace with the attributes ``gff_file`` and ``profile``
        (both required, both strings).
    """
    description = """
    append mean coverage from a coverage profile to features of a gff3 file
    """

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g',
        '--gff_file',
        default=None,
        required=True,
        help="input gff3 file for appending coverage information",
        type=str,
        action='store')
    parser.add_argument(
        '-p',
        '--profile',
        default=None,
        required=True,
        help="input file with coverage profile",
        type=str,
        action="store")
    return parser.parse_args()


def read_coverage(profile):
    """Load a coverage profile into a dictionary.

    The file is consumed two lines at a time: the first line of each pair is
    the sequence header (its first character is dropped and the rest is
    stripped), the second is a whitespace-separated list of integers.
    A trailing unpaired line is ignored.

    Args:
        profile: path to the coverage profile file.

    Returns:
        dict mapping sequence name -> list of int depths.

    Raises:
        ValueError: if a depth value is not an integer.
    """
    coverage = {}
    with open(profile) as handle:
        # zip(handle, handle) pulls consecutive lines as (header, depths).
        for name, prof in zip(handle, handle):
            coverage[name[1:].strip()] = [int(i) for i in prof.split()]
    return coverage


def main():
    """Rewrite the GFF3 file in place with a ``Coverage`` attribute added.

    Comment lines (starting with ``#``) and blank lines are copied unchanged.
    For every other line the mean depth over the 1-based inclusive interval
    given by columns 4 and 5 is computed, rounded to 3 decimals and appended
    as ``;Coverage=<value>``.

    Raises:
        KeyError: if a feature's sequence id is missing from the profile.
    """
    args = parse_args()
    coverage_hash = read_coverage(args.profile)
    # delete=False so the file can be closed and then copied by name on any
    # platform; it is removed explicitly in the ``finally`` below.
    gff_tmp = tempfile.NamedTemporaryFile('w', delete=False)
    try:
        with gff_tmp as out, open(args.gff_file) as f:
            for line in f:
                if line.startswith("#") or not line.strip():
                    out.write(line)
                    continue
                line_parts = line.split()
                start = int(line_parts[3])
                end = int(line_parts[4])
                depths = coverage_hash[line_parts[0]][(start - 1):end]
                coverage = round(sum(depths) / (end - start + 1), 3)
                out.write("{};Coverage={}\n".format(line.strip(), coverage))
        # Copy the content back (rather than renaming) so the original
        # output path is overwritten in place.
        shutil.copyfile(gff_tmp.name, args.gff_file)
    finally:
        os.remove(gff_tmp.name)


if __name__ == "__main__":
    main()
--- a/dante.py Wed Aug 28 08:08:47 2019 -0400 +++ b/dante.py Tue Sep 03 05:20:02 2019 -0400 @@ -586,10 +586,10 @@ if count_region == len(indices_plus): strand_gff = "-" if strand_gff == "+": - feature_start = min(start_hit[regions_above_threshold])-1 + feature_start = min(start_hit[regions_above_threshold]) + 1 feature_end = max(end_hit[regions_above_threshold]) else: - feature_end = seq_len[region][0] - min(start_hit[regions_above_threshold]) - 1 + feature_end = seq_len[region][0] - min(start_hit[regions_above_threshold]) feature_start = seq_len[region][0] - max(end_hit[regions_above_threshold]) + 1 create_gff3(domain_type, ann_substring, unique_annotations, ann_pos_counts, feature_start,feature_end,
--- a/dante.xml Wed Aug 28 08:08:47 2019 -0400 +++ b/dante.xml Tue Sep 03 05:20:02 2019 -0400 @@ -6,123 +6,185 @@ <requirement type="package" version="1.0">rexdb</requirement> <requirement type="set_environment">REXDB</requirement> </requirements> -<stdio> - <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" /> - <regex match="error" source="stderr" level="fatal" description="Unknown error" /> -</stdio> -<command> -python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff} - --protein_database \${REXDB}/${db_type}_pdb - --classification \${REXDB}/${db_type}_class - --scoring_matrix ${scoring_matrix} - && + <stdio> + <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" /> + <regex match="error" source="stderr" level="fatal" description="Unknown error" /> + </stdio> + <command> + #if str($input_type.input_type_selector) == "aln" + python3 ${__tool_directory__}/parse_aln.py -a $(input_sequences) -f sequences.fasta -p sequences.profile + && + INPUT_SEQUENCES="sequences.fasta" + #else + INPUT_SEQUENCES=$(input_sequences) + #end if + && + + + python3 ${__tool_directory__}/dante.py --query \${INPUT_SEQUENCES} --domain_gff ${DomGff} + --protein_database \${REXDB}/${db_type}_pdb + --classification \${REXDB}/${db_type}_class + --scoring_matrix ${scoring_matrix} + + + #if str($input_type.input_type_selector) == "aln" + && + python3 ${__tool_directory__}/coverage2gff.py -p sequences.profile -g ${DomGff} + #end if -python3 ${__tool_directory__}/dante_gff_output_filtering.py --dom_gff ${DomGff} ---domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff ---output_dir . 
---selected_dom All --th_identity 0.35 ---th_similarity 0.45 --th_length 0.9 ---interruptions 1 --max_len_proportion 1.1 ---element_type '' && + #if str($iterative) == "Yes" + && + python3 ${__tool_directory__}/dante_gff_output_filtering.py --dom_gff ${DomGff} + --domains_prot_seq domains_filtered.fasta --domains_filtered domains_filtered.gff + --output_dir . + --selected_dom All --th_identity 0.35 + --th_similarity 0.45 --th_length 0.9 + --interruptions 1 --max_len_proportion 1.1 + --element_type '' + && -python3 ${__tool_directory__}/fasta2database.py domains_filtered.fasta domains_filtered.db -domains_filtered.class && + + + python3 ${__tool_directory__}/fasta2database.py domains_filtered.fasta domains_filtered.db + domains_filtered.class + && -lastdb -p domains_filtered.db domains_filtered.db && + lastdb -p domains_filtered.db domains_filtered.db + && + + python3 ${__tool_directory__}/dante.py --query \${INPUT_SEQUENCES} --domain_gff ${DomGff2} + --protein_database domains_filtered.db + --classification domains_filtered.class + --scoring_matrix BL80 + -python3 ${__tool_directory__}/dante.py --query ${input} --domain_gff ${DomGff2} - --protein_database domains_filtered.db - --classification domains_filtered.class - --scoring_matrix BL80 + #if str($input_type.input_type_selector) == "aln" + && + python3 ${__tool_directory__}/coverage2gff.py -p sequences.profile -g ${DomGff2} + #end if + #end if -</command> -<inputs> - <param format="fasta" type="data" name="input" - label="Choose your input sequence" help="Input DNA must be in proper fasta format, multi-fasta containing more sequences is allowed" /> + </command> + <inputs> - <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help=""> - <options from_file="rexdb_versions.loc"> - <column name="name" index="0"/> - <column name="value" index="1"/> - </options> - </param> + <conditional name="input_type"> + <param name="input_type_selector" type="select" label="Choose 
the type of sequence data"> + <option value="fasta" selected="true">Fasta</option> + <option value="aln">Aln file</option> + </param> + <when value="fasta"> + <param name="input_sequences" type="data" format="fasta" label="Sequences in fasta format"/> + </when> + <when value="aln"> + <param name="input_sequences" type="data" format="txt" label="Sequences in ALN format (extracted from RepeatExplorer)"/> + </when> + </conditional> + <param name="db_type" type="select" label="Select taxon and protein domain database version (REXdb)" help=""> + <options from_file="rexdb_versions.loc"> + <column name="name" index="0"/> + <column name="value" index="1"/> + </options> + </param> - <param name="scoring_matrix" type="select" label="Select scoring matrix"> - <option value="BL80" selected="true" >BLOSUM80</option> - <option value="BL62">BLOSUM62</option> - <option value="MIQS">MIQS</option> -</param> -</inputs> + <param name="scoring_matrix" type="select" label="Select scoring matrix"> + <option value="BL80" selected="true" >BLOSUM80</option> + <option value="BL62">BLOSUM62</option> + <option value="MIQS">MIQS</option> + </param> -<outputs> - <data format="gff3" name="DomGff" label="protein domains detected in ${input.hid} - 1st pass (unfiltered)" /> - <data format="gff3" name="DomGff2" label="protein domains detected in ${input.hid} - 2nd pass (unfiltered)" /> -</outputs> -<help> + <param name="iterative" type="select" label="Run iterative search" truevalue="true" valsevalue="false" + help="Second iteration run search against database of proteins extracted from query. 
Second iteration can yield some extra hits in some cases."> + <option value="No" selected="true">No</option> + <option value="Yes">Yes</option> + </param> + </inputs> -THIS IS A PRIMARY OUTPUT THAT SHOULD UNDERGO FURTHER QUALITY FILTERING TO GET RID OFF POTENTIAL FALSE POSITIVE DOMAINS - -**WHAT IT DOES** + <outputs> + <data format="gff3" name="DomGff" label="DANTE on ${on_string}" /> + <data format="gff3" name="DomGff2" label="DANTE on ${on_string}: 2nd pass"> + <filter>iterative == "Yes" </filter> + </data> + </outputs> + <tests> + <test> + <param name="input_type" value="fasta"/> + <param name="input_sequences" value="GEPY_test_long_1.fa"/> + <param name="db_type" value="Viridiplantae_v3.0"/> + <param name="scoring_matrix" value="BL80"/> + <param name="iterative" value="No"/> + <output name="DomGff" value="GEPY_test_long_1_output_unfiltered.gff3"/> + </test> -This tool uses external aligning programme `LAST`_ and RepeatExplorer database of TE protein domains(REXdb) (Viridiplantae and Metazoa) + + </tests> -.. _LAST: http://last.cbrc.jp/ -*Lastal* runs similarity search to find hits between query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) forms a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters which means each cluster is built from forward or reverse stranded hits exclusively. The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These asigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by "|" character. 
Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For single position only the hits reaching certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classition strings assigned to one region (cluster) there are three situations that can occur: + <help> - 1. There is a single classification string assigned to each position as well as classifications along all the positions in the region are mutually uniform, in this case domain's final classification is equivalent to this unique classification. - 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings - 3. There is a conflict at the domain type level, domains are reported with slash (e.g. RT/INT) and the classification is in this case ambiguous - -**There are 2 outputs produced by this tool:** - -1. GFF3 file of all proteins domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using *Protein Domain Filter* tool + + THIS IS A PRIMARY OUTPUT THAT SHOULD UNDERGO FURTHER QUALITY FILTERING TO GET RID OFF POTENTIAL FALSE POSITIVE DOMAINS + + **WHAT IT DOES** + + This tool uses external aligning programme `LAST`_ and RepeatExplorer database of TE protein domains(REXdb) (Viridiplantae and Metazoa) + + .. 
_LAST: http://last.cbrc.jp/ + + *Lastal* runs similarity search to find hits between query DNA sequence and our database of protein domains from all Viridiplantae repetitive elements. Hits with overlapping positions in the sequence (even through other hits) form a cluster which represents one potential protein domain. Strand orientation is taken into consideration when forming the clusters which means each cluster is built from forward or reverse stranded hits exclusively. The clusters are subsequently processed separately; within one cluster positions are scanned base-by-base and classification strings are assigned for each of them based on the database sequences which were mapped on that place. These assigned classification strings consist of a domain type as well as class and lineage of the repetitive element where the database protein comes from. Different classification levels are separated by the "|" character. Every hit is scored according to the scoring matrix used for DNA-protein alignment (BLOSUM80). For a single position only the hits reaching a certain percentage (80% by default) of the overall best score within the whole cluster are reported. One cluster of overlapping hits represents one domain region and is recorded as one line in the resulting GFF3 file. Regarding the classification strings assigned to one region (cluster) there are three situations that can occur: -- Attributes reported always: + 1. There is a single classification string assigned to each position as well as classifications along all the positions in the region are mutually uniform, in this case domain's final classification is equivalent to this unique classification. + 2. There are multiple classification strings assigned to one cluster, i.e. one domain, which leads to classification to the common (less specific) level of all the strings + 3. There is a conflict at the domain type level, domains are reported with slash (e.g.
RT/INT) and the classification is in this case ambiguous + + **There are 2 outputs produced by this tool:** + + 1. GFF3 file of all proteins domains built from all hits found by LAST. Domains are reported per line as regions (start - end) on the original DNA sequence including the seq ID, alignment score and strand orientation. The last "Attributes" column contains several semicolon-separated information related to annotation, repetitive classification, alignment and its quality. This file can undergo further filtering using *Protein Domain Filter* tool - Name + - Attributes reported always: + + Name type of domain; if ambiguous reported with slash - Final_classification + Final_classification definite classification based on all partial classifications of Region_hits_classifications attribute or "Ambiguous_domain" when there is an ambiguous domain type - Region_Hits_Classifications + Region_Hits_Classifications all hits classifications (comma separated) from a certain domain region that reach the set score threshold; in case of multiple annotations the square brackets indicate the number of bases having this particular classification - -- Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region): - - Best_hit + + - Attributes only reported in case of unambiguous domain type (all the attributes including quality information are related to the Best_Hit of the region): + + Best_hit classification and position of the best alignment with the highest score within the cluster; in the square brackets is the percentage of the whole cluster range that this best hit covers - Best_Hit_DB_Pos + Best_Hit_DB_Pos showing which part of the original datatabase domain corresponding to the Best Hit was aligned on query DNA (e.g. 
**Best_Hit_DB_Pos=17:75of79** means the Best Hit reported in GFF represents the region from the 17th to the 75th of the total 79 amino acids in the original domain from the database) - DB_Seq + DB_Seq database protein sequence of the best hit mapped to the query DNA - Query_Seq + Query_Seq alignment sequence of the query DNA for the best hit - Identity + Identity ratio of identical amino acids in alignment sequence to the length of alignment - Similarity + Similarity ratio of alignment positions with positive score (according to the scoring matrix) to the length of alignment - Relat_Length + Relat_Length ratio of gapless length of the aligned protein sequence to the whole length of the database protein - Relat_Interruptions + Relat_Interruptions number of the interruptions (frameshifts + stop codons) in aligned translated query sequence per each starting 100 AA - Hit_to_DB_Length + Hit_to_DB_Length proportion of alignment length to the original length of the protein domain from database - - + + -!NOTE: Tool can on average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive elements occurrence) as well as computing resources. Maximum running time of the tool is 7 days. + !NOTE: Tool can on average process 0.5 Gbps of the DNA sequence per day. This is only a rough estimate and it is highly dependent on input data (repetitive elements occurrence) as well as computing resources. Maximum running time of the tool is 7 days. - </help> + </help> </tool>
--- a/dante_gff_output_filtering.py Wed Aug 28 08:08:47 2019 -0400 +++ b/dante_gff_output_filtering.py Tue Sep 03 05:20:02 2019 -0400 @@ -82,6 +82,22 @@ return count_comment, lines +def parse_gff_line(line): + '''Return dictionary with gff fields and atributers + Note - type of fields is strings + ''' + # order of first 9 column is fixed + gff_line = dict( + zip( + ['seqid', 'source', 'type', 'start', 'end', + 'score', 'strand', 'phase', 'attributes'], + line.split("\t") + ) + ) + # split attributes and replace: + gff_line['attributes'] = dict([i.split("=") for i in gff_line['attributes'].split(";")]) + return gff_line + def filter_qual_dom(DOM_GFF, FILT_DOM_GFF, TH_IDENTITY, TH_SIMILARITY, TH_LENGTH, TH_INTERRUPT, TH_LEN_RATIO, SELECTED_DOM, ELEMENT): @@ -90,7 +106,7 @@ filt_dom_tmp = NamedTemporaryFile(delete=False) with open(DOM_GFF, "r") as gff_all, open(filt_dom_tmp.name, "w") as gff_filtered: - for comment_idx in range(count_comment): + for _ in range(count_comment): next(gff_all) dom_dict = defaultdict(lambda: defaultdict(int)) orig_class_dict = defaultdict(int) @@ -109,20 +125,22 @@ orig_class_dict[classification] += 1 ## ambiguous domains filtered out automatically if classification != configuration.AMBIGUOUS_TAG: - al_identity = float(attributes.split(";")[-5].split("=")[1]) - al_similarity = float(attributes.split(";")[-4].split("=")[1]) - al_length = float(attributes.split(";")[-3].split("=")[1]) - relat_interrupt = float(attributes.split(";")[-2].split("=")[ - 1]) - db_len_proportion = float(attributes.split(";")[-1].split("=")[ - 1]) - dom_type = attributes.split(";")[0].split("=")[1] - seq_id = line.split("\t")[0] - xminimal = int(line.split("\t")[3]) - xmaximal = int(line.split("\t")[4]) - if al_identity >= TH_IDENTITY and al_similarity >= TH_SIMILARITY and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and db_len_proportion <= TH_LEN_RATIO and ( - dom_type == SELECTED_DOM or - SELECTED_DOM == "All") and (ELEMENT in classification): + 
gff_line = parse_gff_line(line) + al_identity = float(gff_line['attributes']['Identity']) + al_similarity = float(gff_line['attributes']['Similarity']) + al_length = float(gff_line['attributes']['Relat_Length']) + relat_interrupt = float(gff_line['attributes']['Relat_Interruptions']) + db_len_proportion = float(gff_line['attributes']['Hit_to_DB_Length']) + dom_type = gff_line['attributes']['Final_Classification'] + seq_id = gff_line['seqid'] + xminimal = int(gff_line['start']) + xmaximal = int(gff_line['end']) + c1 = al_identity >= TH_IDENTITY + c2 = al_similarity >= TH_SIMILARITY + if (c1 and c2 and al_length >= TH_LENGTH and relat_interrupt <= TH_INTERRUPT and + db_len_proportion <= TH_LEN_RATIO and + (dom_type == SELECTED_DOM or SELECTED_DOM == "All") and + (ELEMENT in classification)): gff_filtered.writelines(line) filt_class_dict[classification] += 1 dom_dict[seq_id][dom_type] += 1
--- a/dante_pyan_scheme.svg Wed Aug 28 08:08:47 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,326 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="no"?> -<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" - "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> -<!-- Generated by graphviz version 2.36.0 (20140111.2315) - --> -<!-- Title: G Pages: 1 --> -<svg width="2270pt" height="436pt" - viewBox="0.00 0.00 2270.00 436.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"> -<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 432)"> -<title>G</title> -<polygon fill="white" stroke="none" points="-4,4 -4,-432 2266,-432 2266,4 -4,4"/> -<g id="clust1" class="cluster"><title>cluster_G</title> -<path fill="#808080" fill-opacity="0.094118" stroke="black" d="M20,-8C20,-8 2242,-8 2242,-8 2248,-8 2254,-14 2254,-20 2254,-20 2254,-408 2254,-408 2254,-414 2248,-420 2242,-420 2242,-420 20,-420 20,-420 14,-420 8,-414 8,-408 8,-408 8,-20 8,-20 8,-14 14,-8 20,-8"/> -</g> -<!-- dante --> -<g id="node1" class="node"><title>dante</title> -<ellipse fill="#ffffff" fill-opacity="0.698039" stroke="black" cx="1004" cy="-394" rx="29.3479" ry="18"/> -<text text-anchor="middle" x="1004" y="-390.3" font-family="Times,serif" font-size="14.00" fill="#000000">dante</text> -</g> -<!-- dante__CustomFormatter --> -<g id="node2" class="node"><title>dante__CustomFormatter</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="89" cy="-322" rx="73.1654" ry="18"/> -<text text-anchor="middle" x="89" y="-318.3" font-family="Times,serif" font-size="14.00" fill="#000000">CustomFormatter</text> -</g> -<!-- dante->dante__CustomFormatter --> -<g id="edge4" class="edge"><title>dante->dante__CustomFormatter</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M974.471,-392.552C866.016,-390.536 480.929,-380.546 167,-340 160.79,-339.198 154.356,-338.221 147.963,-337.146"/> -<polygon fill="#838b8b" 
stroke="#838b8b" points="148.443,-333.677 137.987,-335.386 147.227,-340.57 148.443,-333.677"/> -</g> -<!-- dante->dante__CustomFormatter --> -<g id="edge21" class="edge"><title>dante->dante__CustomFormatter</title> -<path fill="none" stroke="#000000" d="M974.392,-392.41C867.401,-390.09 492.144,-379.67 185,-340 175.598,-338.786 165.683,-337.171 155.921,-335.42"/> -<polygon fill="#000000" stroke="#000000" points="156.301,-331.931 145.832,-333.556 155.029,-338.814 156.301,-331.931"/> -</g> -<!-- dante__adjust_gff --> -<g id="node3" class="node"><title>dante__adjust_gff</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="505" cy="-178" rx="46.2191" ry="18"/> -<text text-anchor="middle" x="505" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">adjust_gff</text> -</g> -<!-- dante->dante__adjust_gff --> -<g id="edge7" class="edge"><title>dante->dante__adjust_gff</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M977.391,-385.915C943.907,-376.703 884.868,-359.47 836,-340 821.049,-334.043 624.595,-237.732 541.603,-196.983"/> -<polygon fill="#838b8b" stroke="#838b8b" points="543.049,-193.794 532.53,-192.528 539.964,-200.077 543.049,-193.794"/> -</g> -<!-- dante__alignment_scoring --> -<g id="node4" class="node"><title>dante__alignment_scoring</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="219" cy="-34" rx="76.5394" ry="18"/> -<text text-anchor="middle" x="219" y="-30.3" font-family="Times,serif" font-size="14.00" fill="#000000">alignment_scoring</text> -</g> -<!-- dante->dante__alignment_scoring --> -<g id="edge20" class="edge"><title>dante->dante__alignment_scoring</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M974.566,-392.156C863.693,-388.738 471.497,-374.426 352,-340 269.671,-316.281 181,-336.678 181,-251 181,-251 181,-251 181,-177 181,-137.196 179.276,-126.039 191,-88 193.85,-78.7512 198.325,-69.2277 202.868,-60.8496"/> -<polygon fill="#838b8b" 
stroke="#838b8b" points="206.028,-62.3733 207.942,-51.9529 199.948,-58.905 206.028,-62.3733"/> -</g> -<!-- dante__annotations_dict --> -<g id="node5" class="node"><title>dante__annotations_dict</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1290" cy="-106" rx="68.3417" ry="18"/> -<text text-anchor="middle" x="1290" y="-102.3" font-family="Times,serif" font-size="14.00" fill="#000000">annotations_dict</text> -</g> -<!-- dante->dante__annotations_dict --> -<g id="edge5" class="edge"><title>dante->dante__annotations_dict</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1033.53,-393.289C1205.7,-394.283 2077.75,-389.63 2235,-196 2245.09,-183.58 2245.99,-171.626 2235,-160 2205.39,-128.688 1589.65,-113.102 1368.35,-108.496"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1368.18,-104.992 1358.11,-108.285 1368.04,-111.991 1368.18,-104.992"/> -</g> -<!-- dante__best_score --> -<g id="node6" class="node"><title>dante__best_score</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="617" cy="-178" rx="48.1437" ry="18"/> -<text text-anchor="middle" x="617" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">best_score</text> -</g> -<!-- dante->dante__best_score --> -<g id="edge12" class="edge"><title>dante->dante__best_score</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M981.034,-382.622C959.076,-372.467 925.226,-356.204 897,-340 821.311,-296.55 809.361,-274.259 733,-232 708.76,-218.585 680.504,-205.622 657.948,-195.875"/> -<polygon fill="#838b8b" stroke="#838b8b" points="659.229,-192.616 648.659,-191.902 656.477,-199.052 659.229,-192.616"/> -</g> -<!-- dante__characterize_fasta --> -<g id="node7" class="node"><title>dante__characterize_fasta</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="757" cy="-178" rx="74.1402" ry="18"/> -<text text-anchor="middle" x="757" y="-174.3" font-family="Times,serif" font-size="14.00" 
fill="#000000">characterize_fasta</text> -</g> -<!-- dante->dante__characterize_fasta --> -<g id="edge2" class="edge"><title>dante->dante__characterize_fasta</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M987.794,-378.959C946.126,-342.859 834.951,-246.536 783.635,-202.077"/> -<polygon fill="#838b8b" stroke="#838b8b" points="785.783,-199.307 775.933,-195.404 781.199,-204.597 785.783,-199.307"/> -</g> -<!-- dante__create_gff3 --> -<g id="node8" class="node"><title>dante__create_gff3</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="271" cy="-178" rx="50.5427" ry="18"/> -<text text-anchor="middle" x="271" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">create_gff3</text> -</g> -<!-- dante->dante__create_gff3 --> -<g id="edge15" class="edge"><title>dante->dante__create_gff3</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M974.704,-390.945C914.316,-386.183 772.011,-372.247 657,-340 523.788,-302.65 375.042,-231.71 307.68,-197.872"/> -<polygon fill="#838b8b" stroke="#838b8b" points="309.229,-194.733 298.725,-193.351 306.074,-200.982 309.229,-194.733"/> -</g> -<!-- dante__domain_annotation --> -<g id="node9" class="node"><title>dante__domain_annotation</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="928" cy="-178" rx="78.9404" ry="18"/> -<text text-anchor="middle" x="928" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">domain_annotation</text> -</g> -<!-- dante->dante__domain_annotation --> -<g id="edge19" class="edge"><title>dante->dante__domain_annotation</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M997.993,-376.297C994.246,-365.866 989.33,-352.154 985,-340 968.209,-292.868 948.789,-237.921 937.501,-205.938"/> -<polygon fill="#838b8b" stroke="#838b8b" points="940.763,-204.665 934.135,-196.4 934.162,-206.995 940.763,-204.665"/> -</g> -<!-- dante__domain_search --> -<g id="node10" 
class="node"><title>dante__domain_search</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1077" cy="-250" rx="64.0172" ry="18"/> -<text text-anchor="middle" x="1077" y="-246.3" font-family="Times,serif" font-size="14.00" fill="#000000">domain_search</text> -</g> -<!-- dante->dante__domain_search --> -<g id="edge9" class="edge"><title>dante->dante__domain_search</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1005.01,-375.935C1006.73,-357.247 1011.4,-326.891 1024,-304 1030.18,-292.779 1039.44,-282.35 1048.5,-273.765"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1050.91,-276.305 1056,-267.009 1046.23,-271.106 1050.91,-276.305"/> -</g> -<!-- dante__filter_params --> -<g id="node11" class="node"><title>dante__filter_params</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="258" cy="-106" rx="57.2688" ry="18"/> -<text text-anchor="middle" x="258" y="-102.3" font-family="Times,serif" font-size="14.00" fill="#000000">filter_params</text> -</g> -<!-- dante->dante__filter_params --> -<g id="edge13" class="edge"><title>dante->dante__filter_params</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M974.684,-392.285C901.967,-389.928 707.917,-380.066 552,-340 392.662,-299.055 300.959,-333.741 211,-196 197.122,-174.75 214.122,-148.963 231.429,-130.628"/> -<polygon fill="#838b8b" stroke="#838b8b" points="234.193,-132.813 238.776,-123.261 229.237,-127.87 234.193,-132.813"/> -</g> -<!-- dante__get_version --> -<g id="node12" class="node"><title>dante__get_version</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1077" cy="-178" rx="51.4931" ry="18"/> -<text text-anchor="middle" x="1077" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">get_version</text> -</g> -<!-- dante->dante__get_version --> -<g id="edge6" class="edge"><title>dante->dante__get_version</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" 
d="M999.191,-375.849C991.564,-345.083 979.882,-279.315 1004,-232 1011.29,-217.692 1024.46,-206.343 1037.62,-197.878"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1039.42,-200.874 1046.2,-192.73 1035.83,-194.871 1039.42,-200.874"/> -</g> -<!-- dante__group_annot_regs --> -<g id="node13" class="node"><title>dante__group_annot_regs</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1221" cy="-178" rx="74.1402" ry="18"/> -<text text-anchor="middle" x="1221" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">group_annot_regs</text> -</g> -<!-- dante->dante__group_annot_regs --> -<g id="edge11" class="edge"><title>dante->dante__group_annot_regs</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1027.57,-383.191C1047.91,-373.987 1077.34,-358.834 1099,-340 1145.45,-299.62 1185.55,-238.939 1206.18,-204.769"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1209.36,-206.257 1211.47,-195.873 1203.35,-202.679 1209.36,-206.257"/> -</g> -<!-- dante__hits_processing --> -<g id="node14" class="node"><title>dante__hits_processing</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1379" cy="-178" rx="65.4659" ry="18"/> -<text text-anchor="middle" x="1379" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">hits_processing</text> -</g> -<!-- dante->dante__hits_processing --> -<g id="edge3" class="edge"><title>dante->dante__hits_processing</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1027.98,-383.107C1051.74,-373.057 1088.98,-356.642 1120,-340 1204.55,-294.644 1299.18,-232.823 1347.26,-200.577"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1349.33,-203.397 1355.68,-194.911 1345.43,-197.589 1349.33,-203.397"/> -</g> -<!-- dante__line_generator --> -<g id="node15" class="node"><title>dante__line_generator</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1524" cy="-178" rx="61.6163" ry="18"/> -<text 
text-anchor="middle" x="1524" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">line_generator</text> -</g> -<!-- dante->dante__line_generator --> -<g id="edge18" class="edge"><title>dante->dante__line_generator</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1029.5,-384.775C1059.87,-374.825 1112.03,-357.253 1156,-340 1276.65,-292.653 1416.09,-229.193 1483.43,-197.973"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1485.24,-200.992 1492.84,-193.606 1482.29,-194.644 1485.24,-200.992"/> -</g> -<!-- dante__main --> -<g id="node16" class="node"><title>dante__main</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1062" cy="-322" rx="28.3955" ry="18"/> -<text text-anchor="middle" x="1062" y="-318.3" font-family="Times,serif" font-size="14.00" fill="#000000">main</text> -</g> -<!-- dante->dante__main --> -<g id="edge16" class="edge"><title>dante->dante__main</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1011.91,-376.411C1018.86,-366.746 1028.9,-354.462 1038.25,-344.112"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1040.95,-346.351 1045.21,-336.65 1035.83,-341.577 1040.95,-346.351"/> -</g> -<!-- dante->dante__main --> -<g id="edge22" class="edge"><title>dante->dante__main</title> -<path fill="none" stroke="#000000" d="M1020.96,-379.17C1029.63,-370.179 1039.88,-358.201 1048.08,-347.616"/> -<polygon fill="#000000" stroke="#000000" points="1050.96,-349.613 1054.15,-339.51 1045.36,-345.417 1050.96,-349.613"/> -</g> -<!-- dante__overlapping_regions --> -<g id="node17" class="node"><title>dante__overlapping_regions</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1705" cy="-178" rx="83.2892" ry="18"/> -<text text-anchor="middle" x="1705" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">overlapping_regions</text> -</g> -<!-- dante->dante__overlapping_regions --> -<g id="edge8" 
class="edge"><title>dante->dante__overlapping_regions</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1030.97,-386.325C1069.51,-376.682 1142.47,-358.027 1204,-340 1368.96,-291.674 1562.05,-227.419 1652.83,-196.752"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1653.99,-200.055 1662.34,-193.536 1651.75,-193.424 1653.99,-200.055"/> -</g> -<!-- dante__score_matrix_evaluation --> -<g id="node18" class="node"><title>dante__score_matrix_evaluation</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="1904" cy="-178" rx="98.212" ry="18"/> -<text text-anchor="middle" x="1904" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">score_matrix_evaluation</text> -</g> -<!-- dante->dante__score_matrix_evaluation --> -<g id="edge1" class="edge"><title>dante->dante__score_matrix_evaluation</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1031.84,-387.883C1080.09,-378.91 1182.29,-359.439 1268,-340 1479.34,-292.068 1727.83,-226.455 1841.52,-195.917"/> -<polygon fill="#838b8b" stroke="#838b8b" points="1842.7,-199.226 1851.45,-193.249 1840.88,-192.466 1842.7,-199.226"/> -</g> -<!-- dante__score_table --> -<g id="node19" class="node"><title>dante__score_table</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="390" cy="-178" rx="50.5427" ry="18"/> -<text text-anchor="middle" x="390" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">score_table</text> -</g> -<!-- dante->dante__score_table --> -<g id="edge17" class="edge"><title>dante->dante__score_table</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M975.478,-388.762C929.132,-381.453 834.914,-364.833 758,-340 632.445,-299.462 491.475,-230.907 426.451,-197.887"/> -<polygon fill="#838b8b" stroke="#838b8b" points="427.918,-194.707 417.419,-193.281 424.737,-200.942 427.918,-194.707"/> -</g> -<!-- dante__split_fasta --> -<g id="node20" class="node"><title>dante__split_fasta</title> 
-<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="2067" cy="-178" rx="46.1964" ry="18"/> -<text text-anchor="middle" x="2067" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">split_fasta</text> -</g> -<!-- dante->dante__split_fasta --> -<g id="edge10" class="edge"><title>dante->dante__split_fasta</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1032.64,-389.19C1091.77,-381.191 1231.55,-361.596 1348,-340 1644.48,-285.016 1719.16,-271.857 2011,-196 2014.76,-195.022 2018.66,-193.956 2022.55,-192.85"/> -<polygon fill="#838b8b" stroke="#838b8b" points="2023.55,-196.205 2032.17,-190.048 2021.59,-189.484 2023.55,-196.205"/> -</g> -<!-- dante__write_info --> -<g id="node21" class="node"><title>dante__write_info</title> -<ellipse fill="#fecccc" fill-opacity="0.698039" stroke="black" cx="2179" cy="-178" rx="47.169" ry="18"/> -<text text-anchor="middle" x="2179" y="-174.3" font-family="Times,serif" font-size="14.00" fill="#000000">write_info</text> -</g> -<!-- dante->dante__write_info --> -<g id="edge14" class="edge"><title>dante->dante__write_info</title> -<path fill="none" stroke="#838b8b" stroke-dasharray="5,2" d="M1033.01,-390.4C1104.17,-383.837 1292.9,-365.305 1449,-340 1605.77,-314.586 1643.71,-301.278 1799,-268 1942.81,-237.181 1979.66,-233.041 2122,-196 2125.83,-195.004 2129.78,-193.926 2133.74,-192.813"/> -<polygon fill="#838b8b" stroke="#838b8b" points="2134.89,-196.125 2143.53,-189.999 2132.96,-189.398 2134.89,-196.125"/> -</g> -<!-- dante__create_gff3->dante__filter_params --> -<g id="edge24" class="edge"><title>dante__create_gff3->dante__filter_params</title> -<path fill="none" stroke="#000000" d="M267.787,-159.697C266.354,-151.983 264.632,-142.712 263.035,-134.112"/> -<polygon fill="#000000" stroke="#000000" points="266.444,-133.297 261.177,-124.104 259.561,-134.575 266.444,-133.297"/> -</g> -<!-- dante__domain_search->dante__adjust_gff --> -<g id="edge28" 
class="edge"><title>dante__domain_search->dante__adjust_gff</title> -<path fill="none" stroke="#000000" d="M1013.92,-246.55C915.929,-241.845 721.572,-228.915 560,-196 556.144,-195.214 552.173,-194.268 548.214,-193.23"/> -<polygon fill="#000000" stroke="#000000" points="549.041,-189.827 538.468,-190.496 547.151,-196.567 549.041,-189.827"/> -</g> -<!-- dante__domain_search->dante__best_score --> -<g id="edge36" class="edge"><title>dante__domain_search->dante__best_score</title> -<path fill="none" stroke="#000000" d="M1016.41,-244.06C937.104,-236.906 794.083,-221.633 674,-196 670.079,-195.163 666.036,-194.189 661.998,-193.138"/> -<polygon fill="#000000" stroke="#000000" points="662.615,-189.678 652.045,-190.404 660.761,-196.428 662.615,-189.678"/> -</g> -<!-- dante__domain_search->dante__characterize_fasta --> -<g id="edge29" class="edge"><title>dante__domain_search->dante__characterize_fasta</title> -<path fill="none" stroke="#000000" d="M1028.05,-238.292C972.326,-226.102 881.138,-206.155 820.112,-192.806"/> -<polygon fill="#000000" stroke="#000000" points="820.729,-189.358 810.212,-190.64 819.233,-196.196 820.729,-189.358"/> -</g> -<!-- dante__domain_search->dante__create_gff3 --> -<g id="edge38" class="edge"><title>dante__domain_search->dante__create_gff3</title> -<path fill="none" stroke="#000000" d="M1012.49,-248.954C882.382,-247.869 579.624,-240.312 330,-196 325.631,-195.224 321.123,-194.252 316.637,-193.166"/> -<polygon fill="#000000" stroke="#000000" points="317.471,-189.767 306.912,-190.643 315.713,-196.543 317.471,-189.767"/> -</g> -<!-- dante__domain_search->dante__domain_annotation --> -<g id="edge34" class="edge"><title>dante__domain_search->dante__domain_annotation</title> -<path fill="none" stroke="#000000" d="M1045.4,-234.155C1023.46,-223.845 993.976,-209.995 970.048,-198.754"/> -<polygon fill="#000000" stroke="#000000" points="971.382,-195.514 960.843,-194.429 968.405,-201.849 971.382,-195.514"/> -</g> -<!-- dante__domain_search->dante__get_version 
--> -<g id="edge31" class="edge"><title>dante__domain_search->dante__get_version</title> -<path fill="none" stroke="#000000" d="M1077,-231.697C1077,-223.983 1077,-214.712 1077,-206.112"/> -<polygon fill="#000000" stroke="#000000" points="1080.5,-206.104 1077,-196.104 1073.5,-206.104 1080.5,-206.104"/> -</g> -<!-- dante__domain_search->dante__group_annot_regs --> -<g id="edge33" class="edge"><title>dante__domain_search->dante__group_annot_regs</title> -<path fill="none" stroke="#000000" d="M1107.54,-234.155C1128.73,-223.855 1157.18,-210.023 1180.29,-198.788"/> -<polygon fill="#000000" stroke="#000000" points="1182.05,-201.827 1189.51,-194.308 1178.99,-195.532 1182.05,-201.827"/> -</g> -<!-- dante__domain_search->dante__hits_processing --> -<g id="edge30" class="edge"><title>dante__domain_search->dante__hits_processing</title> -<path fill="none" stroke="#000000" d="M1124.85,-237.908C1177.96,-225.598 1263.68,-205.729 1320.68,-192.518"/> -<polygon fill="#000000" stroke="#000000" points="1321.67,-195.881 1330.62,-190.213 1320.09,-189.062 1321.67,-195.881"/> -</g> -<!-- dante__domain_search->dante__line_generator --> -<g id="edge40" class="edge"><title>dante__domain_search->dante__line_generator</title> -<path fill="none" stroke="#000000" d="M1135.58,-242.491C1209.78,-233.846 1341.44,-217.169 1453,-196 1458.29,-194.996 1463.78,-193.856 1469.26,-192.655"/> -<polygon fill="#000000" stroke="#000000" points="1470.08,-196.057 1479.07,-190.439 1468.54,-189.229 1470.08,-196.057"/> -</g> -<!-- dante__domain_search->dante__overlapping_regions --> -<g id="edge35" class="edge"><title>dante__domain_search->dante__overlapping_regions</title> -<path fill="none" stroke="#000000" d="M1138.58,-244.552C1238.31,-236.976 1441.24,-220.024 1612,-196 1619.66,-194.923 1627.64,-193.666 1635.58,-192.333"/> -<polygon fill="#000000" stroke="#000000" points="1636.32,-195.757 1645.58,-190.611 1635.13,-188.859 1636.32,-195.757"/> -</g> -<!-- dante__domain_search->dante__score_matrix_evaluation --> -<g 
id="edge27" class="edge"><title>dante__domain_search->dante__score_matrix_evaluation</title> -<path fill="none" stroke="#000000" d="M1139.77,-246.158C1265.04,-240.095 1554.84,-224.099 1797,-196 1806.06,-194.949 1815.52,-193.687 1824.91,-192.331"/> -<polygon fill="#000000" stroke="#000000" points="1825.67,-195.757 1835.05,-190.828 1824.64,-188.832 1825.67,-195.757"/> -</g> -<!-- dante__domain_search->dante__score_table --> -<g id="edge39" class="edge"><title>dante__domain_search->dante__score_table</title> -<path fill="none" stroke="#000000" d="M1013.28,-247.64C899.371,-244.318 653.662,-233.105 450,-196 445.566,-195.192 440.988,-194.198 436.43,-193.099"/> -<polygon fill="#000000" stroke="#000000" points="437.103,-189.658 426.546,-190.557 435.359,-196.437 437.103,-189.658"/> -</g> -<!-- dante__domain_search->dante__split_fasta --> -<g id="edge32" class="edge"><title>dante__domain_search->dante__split_fasta</title> -<path fill="none" stroke="#000000" d="M1140.69,-247.36C1322.78,-242.358 1843.19,-225.792 2011,-196 2015.16,-195.261 2019.45,-194.312 2023.72,-193.243"/> -<polygon fill="#000000" stroke="#000000" points="2024.83,-196.569 2033.56,-190.567 2022.99,-189.814 2024.83,-196.569"/> -</g> -<!-- dante__domain_search->dante__write_info --> -<g id="edge37" class="edge"><title>dante__domain_search->dante__write_info</title> -<path fill="none" stroke="#000000" d="M1140.79,-247.762C1337.32,-243.633 1931.78,-228.703 2122,-196 2126.24,-195.272 2130.6,-194.331 2134.94,-193.266"/> -<polygon fill="#000000" stroke="#000000" points="2136.19,-196.555 2144.95,-190.597 2134.39,-189.791 2136.19,-196.555"/> -</g> -<!-- dante__filter_params->dante__alignment_scoring --> -<g id="edge25" class="edge"><title>dante__filter_params->dante__alignment_scoring</title> -<path fill="none" stroke="#000000" d="M248.559,-88.055C244.075,-80.0067 238.614,-70.2046 233.612,-61.2259"/> -<polygon fill="#000000" stroke="#000000" points="236.549,-59.3075 228.625,-52.2753 230.434,-62.7145 
236.549,-59.3075"/> -</g> -<!-- dante__line_generator->dante__line_generator --> -<g id="edge26" class="edge"><title>dante__line_generator->dante__line_generator</title> -<path fill="none" stroke="#000000" d="M1565.86,-191.203C1585.63,-192.737 1603,-188.336 1603,-178 1603,-169.521 1591.31,-165.036 1576.2,-164.545"/> -<polygon fill="#000000" stroke="#000000" points="1575.77,-161.055 1565.86,-164.797 1575.94,-168.053 1575.77,-161.055"/> -</g> -<!-- dante__main->dante__domain_search --> -<g id="edge41" class="edge"><title>dante__main->dante__domain_search</title> -<path fill="none" stroke="#000000" d="M1065.63,-304.055C1067.3,-296.261 1069.32,-286.822 1071.2,-278.079"/> -<polygon fill="#000000" stroke="#000000" points="1074.63,-278.787 1073.3,-268.275 1067.78,-277.32 1074.63,-278.787"/> -</g> -<!-- dante__score_table->dante__annotations_dict --> -<g id="edge23" class="edge"><title>dante__score_table->dante__annotations_dict</title> -<path fill="none" stroke="#000000" d="M426.204,-165.403C433.984,-163.289 442.21,-161.334 450,-160 727.01,-112.576 1061.92,-106.495 1211.37,-106.404"/> -<polygon fill="#000000" stroke="#000000" points="1211.38,-109.904 1221.38,-106.407 1211.38,-102.904 1211.38,-109.904"/> -</g> -</g> -</svg>
# parse_aln.py -- content of the file after the changes shown in this diff.
'''
Parse cap3 assembly aln output.

Reads an aln file and writes two files: a fasta file with the gap-free
consensus of every contig and a profile file with the per-position coverage
of every contig (one ">name" line followed by one line of space separated
integers).
'''
import argparse
import re

# A contig header line starts with (at least) this many asterisks.
_CONTIG_LEAD = "*" * 18

# Ruler line printed above every alignment block (". : . : ...").
# NOTE(review): the listing this code was recovered from appears to have
# collapsed runs of spaces inside the ruler literals, so the ruler is matched
# with any amount of whitespace between the marks, and the sequence column is
# taken to be the column of the first "." -- confirm against real cap3 output.
_RULER_FULL = re.compile(r"\.\s+:(?:\s+\.\s+:){5}")
_RULER_PART = re.compile(r"\.\s+:\s+\.\s+:")


def parse_args():
    '''Define and parse the command line arguments.

    Returns the argparse namespace with the attributes ``aln_file``,
    ``fasta`` and ``profile`` (all required, all strings).
    '''
    description = """
    parsing cap3 assembly aln output
    """

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-a',
                        '--aln_file',
                        default=None,
                        required=True,
                        help="Aln file input",
                        type=str,
                        action='store')
    parser.add_argument('-f',
                        '--fasta',
                        default=None,
                        required=True,
                        help="fasta output file name",
                        type=str,
                        action='store')
    parser.add_argument('-p',
                        '--profile',
                        default=None,
                        required=True,
                        help="output file for coverage profile",
                        type=str,
                        action="store")
    return parser.parse_args()


def get_header(f):
    '''Advance ``f`` to the next contig that is followed by an alignment.

    A contig header is a line starting with 18 asterisks; it only counts
    when the line right after it is a ruler line. The file is left
    positioned just after that ruler line.

    Returns ``(contig_name, aln_start)`` where ``contig_name`` is the second
    and third whitespace separated token of the header line joined together
    and ``aln_start`` is the column at which the ruler starts. Returns
    ``(None, None)`` when the end of the file is reached.
    '''
    while True:
        line = f.readline()
        if not line:
            return None, None
        if not line.startswith(_CONTIG_LEAD):
            continue
        # the line after the header is consumed even if it is not a ruler
        ruler = _RULER_FULL.search(f.readline())
        if ruler is None:
            continue
        fields = line.split()
        return fields[1] + fields[2], ruler.start()


def segment_start(f):
    '''Tell whether another alignment segment starts at the current position.

    Peeks at the next line without consuming it. Returns False when the next
    line belongs to the next contig (contains "********") or when the end of
    the file is reached, True otherwise.
    '''
    pos = f.tell()
    line = f.readline()
    f.seek(pos)  # peek only, restore the position
    # detect next contig or end of file
    return bool(line) and "********" not in line


def get_segment(f, seq_start):
    '''Read one alignment segment (rows of reads plus the consensus line).

    ``seq_start`` is the column from which the sequence part of every line
    is taken. Ruler lines are skipped, the line of underscores marks the end
    of the rows; the line after it is the consensus and the line after that
    is discarded.

    Returns ``(aln, consensus)``: list of row strings and consensus string,
    or ``(None, None)`` when no segment starts here (next contig, end of
    file, or only blank lines left before the end of the file).

    Raises ValueError when the file ends in the middle of a segment
    (previously this case never terminated).
    '''
    if not segment_start(f):
        return None, None
    aln = []
    while True:
        line = f.readline()
        if not line:
            # end of file before the underscore line was seen
            if any(row.strip() for row in aln):
                raise ValueError(
                    "unexpected end of file inside an alignment segment")
            return None, None
        if _RULER_PART.search(line):
            continue
        if "__________" in line:
            consensus = f.readline().rstrip('\n')[seq_start:]
            f.readline()  # empty line
            return aln, consensus
        aln.append(line.rstrip('\n')[seq_start:])


def aln2coverage(aln, length=None):
    '''Count, for every column, the rows that have a base in that column.

    Spaces and "-" do not count as a base. ``length`` is the number of
    columns of the returned list; when it is not given the longest row
    determines it (rows may have different lengths). Columns beyond
    ``length`` are ignored.
    '''
    if length is None:
        length = max((len(row) for row in aln), default=0)
    coverage = [0] * length
    for row in aln:
        for i, c in enumerate(row[:length]):
            if c not in " -":
                coverage[i] += 1
    return coverage


def read_contig(f, seq_start):
    '''Read all segments of the current contig.

    Returns ``(contig, coverage)``: the concatenated consensus (gaps still
    included) and the concatenated coverage. The coverage of every segment
    is sized by its consensus so that both stay aligned position by position.
    '''
    consensus_parts = []
    coverage = []
    while True:
        aln, consensus = get_segment(f, seq_start)
        if not aln:
            break
        consensus_parts.append(consensus)
        coverage.extend(aln2coverage(aln, len(consensus)))
    return "".join(consensus_parts), coverage


def remove_gaps(consensus, coverage):
    '''Drop the "-" positions from the consensus and from the coverage.

    Returns the pair unchanged when the consensus contains no gap.
    '''
    if "-" not in consensus:
        return consensus, coverage
    new_coverage = [cov for cons, cov in zip(consensus, coverage)
                    if cons != "-"]
    new_consensus = consensus.replace("-", "")
    return new_consensus, new_coverage


def main():
    '''Convert the aln file into the fasta and the profile output files.'''
    args = parse_args()
    with open(args.aln_file, 'r') as f1, \
            open(args.fasta, 'w') as ffasta, \
            open(args.profile, 'w') as fprofile:
        while True:
            contig_name, seq_start = get_header(f1)
            if not contig_name:
                break
            consensus, coverage = remove_gaps(*read_contig(f1, seq_start))
            ffasta.write(">{}\n".format(contig_name))
            ffasta.write("{}\n".format(consensus))
            fprofile.write(">{}\n".format(contig_name))
            fprofile.write("{}\n".format(" ".join(str(i) for i in coverage)))


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/GEPY_test_long_1_output_unfiltered.gff3 Tue Sep 03 05:20:02 2019 -0400 @@ -0,0 +1,26 @@ +##gff-version 3 +##----------------------------------------------- +##PIPELINE VERSION : iter_search_optional-rv-3168(0b80fa0) +##PROTEIN DATABASE VERSION : Viridiplantae_v3.0_pdb +##----------------------------------------------- +scaffold146.1|size86774 dante protein_domain 976 1289 293 + . Name=RH;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=RH|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-RH__REXdb_ID2558|Class_I|LTR|Ty1/copia|Bianca:976-1289[100percent];Best_Hit_DB_Pos=26:134of134;DB_Seq=ISWRSVKQTITATSSNHAELLALHEASRECVWLRSMIQHIQKNCG-LSSGRMDATIIYEDNTACIAQLKEGYIKGDRTKHISPKFF-FTHDLQKDGDISIQQIRSCDNLAD;Region_Seq=ISWRSTKQTIVAISSNHVELLAIHDTSRECVWLRFMIESI\IMXXXXXXXXXXXXXXXXXXQLKE*YIKCDRTKHISPKFF\FTQDLQKNGDVIIQQIRSNDNVVD;Query_Seq=ISWRSTKQTIVAISSNHVELLAIHDTSRECVWLRFMIESI-----\IMXXXXXXXXXXXXXXXXXXQLKE*YIKCDRTKHISPKFF\FTQDLQKNGDVIIQQIRSNDNVVD;Identity=0.59;Similarity=0.66;Relat_Length=0.813;Relat_Interruptions=1.5;Hit_to_DB_Length=0.83 +scaffold146.1|size86774 dante protein_domain 6810 7049 153 + . Name=PROT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Region_Hits_Classifications=PROT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Best_Hit=Ty3-PROT__REXdb_ID9702|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:6810-7049[100percent];Best_Hit_DB_Pos=1:80of80;DB_Seq=LVDDGSKVNLLPYRVFQQMGIPEEQLVRDQAPVKGIGGVPVLVEGKVKLALTLGEAPRTRTHYAVFLVVKPPLSYNAILG;Region_Seq=LVDSGASCNLMSKRVMKQMGIPDEKLEFLDATLYAFDRRTIIPAGKIQLPVTLGEEERTRSEMVEFIIVDMDLAYNAILG;Query_Seq=LVDSGASCNLMSKRVMKQMGIPDEKLEFLDATLYAFDRRTIIPAGKIQLPVTLGEEERTRSEMVEFIIVDMDLAYNAILG;Identity=0.44;Similarity=0.62;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 7656 8296 . + . 
Name=RT/INT;Final_Classification=Ambiguous_domain;Region_Hits_Classifications_=RT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand[246bp],INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand[468bp] +scaffold146.1|size86774 dante protein_domain 8756 9241 538 + . Name=RT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat;Region_Hits_Classifications=RT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand[486bp],RT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Ogre[441bp];Best_Hit=Ty3-RT__REXdb_ID8210|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:8801-9241[90percent];Best_Hit_DB_Pos=27:173of173;DB_Seq=DFTDLNKACPKDSFPLPHIDRLVDSTAGNELLTFMDAFSGYNQIMMNPEDQEKTSFITDRGIYCYKVMPFGLKNAGATYQRLVNKMFHNHLGKTMEVYIDDMLVKSLKKEDHVKHLEECFDILNKYQMKLNPAKCTFGVPSGEFLGY;Region_Seq=TSIATASGGRTSDGADFKGVNKHCQPDPFPLPHIDRLVDAVAGSSLLSTMDAYSGYHQISLAREDQAKSSFLTEDGVFCYVVMPFGLRNAGATYQRLVNKIFADLLGKEMEIYVDDMIVKSLNDEDHIIYLSHCFEVCRTHRLKLNPAKCCFGVRSGKFLGY;Query_Seq=DFKGVNKHCQPDPFPLPHIDRLVDAVAGSSLLSTMDAYSGYHQISLAREDQAKSSFLTEDGVFCYVVMPFGLRNAGATYQRLVNKIFADLLGKEMEIYVDDMIVKSLNDEDHIIYLSHCFEVCRTHRLKLNPAKCCFGVRSGKFLGY;Identity=0.63;Similarity=0.8;Relat_Length=0.85;Relat_Interruptions=0.0;Hit_to_DB_Length=0.85 +scaffold146.1|size86774 dante protein_domain 9434 9781 343 + . 
Name=RH;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Region_Hits_Classifications=RH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Best_Hit=Ty3-RH__REXdb_ID9729|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:9434-9772[97percent];Best_Hit_DB_Pos=1:113of149;DB_Seq=WTEECEEAFQKLKEYLGSPHLLVKPIQGEPLFLYLAVSEHATSSVLVREDDGVQRPIYYTSRALVDAETRYLSLEKIVLALIVSARRLRPYFQAHTIIVLTDQPIRQVLAKPD;Region_Seq=WTDQCDRAFKELKTYLASPPLIVSPTPTETLGLYLAVSEHAVSSVLVAERDGVQHPVYYVSHTLLPAESRYSTVEKFVLALLKSVAKLRHYFESRKVIVYTDQPIKAVLGQSDHTS;Query_Seq=WTDQCDRAFKELKTYLASPPLIVSPTPTETLGLYLAVSEHAVSSVLVAERDGVQHPVYYVSHTLLPAESRYSTVEKFVLALLKSVAKLRHYFESRKVIVYTDQPIKAVLGQSD;Identity=0.58;Similarity=0.73;Relat_Length=0.758;Relat_Interruptions=0.0;Hit_to_DB_Length=0.76 +scaffold146.1|size86774 dante protein_domain 10810 11667 747 + . Name=INT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Region_Hits_Classifications=INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Best_Hit=Ty3-INT__REXdb_ID9633|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:10819-11667[98percent];Best_Hit_DB_Pos=30:310of310;DB_Seq=RDTHQYVQRCIQCQKFAPLIHKPGEEMTIMSAPCPFAQWGIDLVGPFPQTAGRKKFFIVAVDYFTKWVEAEALSKITEDEVMHFIWKYICCRFGLPRSLVSDNGTQFNGKKIRAWCEEMKITQKFVAVAHPQANGQVESTNRTIVNGLKKRIDELGGSWVDELPSVLWSYRTSAKAATGETPFRLTYGTEAVIPVEVAMDTLRIATF--DEEANDGALRTRLDEIFDLREAAYLHMERSKNLIKARYDQGVRSRSFQIGDLILRRADALKHTGKLEANWEGPY;Region_Seq=SVLRDAMDCVRRCQSCQYFAPINRKPGAEITLTELPCPFDRWGIDILGPFPQSVRQRRFCIVAVEYHSKWIEAEAVASITSEAVKKFVMNNIIVRFGCPRVLVSDNGPQFISDKFATFCEEYGIQQRTSSVYHPQTNGQAEASNKIILHGLRRNLDSLGGSWPDQLPHVLWAYRTTPKSSTGETPFSLVYGSEAVAPVESTIITPRIAAYMHTESANTEFRELDLDLLEERRNEVYGRVRKQQRALRKRYNQRVRPRQFEKGDLILRSVESQGHKGKLDRAWEGPY;Query_Seq=RDAMDCVRRCQSCQYFAPINRKPGAEITLTELPCPFDRWGIDILGPFPQSVRQRRFCIVAVEYHSKWIEAEAVASITSEAVKKFVMNNIIVRFGCPRVLVSDNGPQFISDKFATFCEEYGIQQRTSSVYHPQTNGQAEASNKIILHGLRRNLDSLGGSWPDQLPHVLWAYRTTPKSSTGETPFSLVYGSEAVAPVESTIITPRIAAYMHTESANTEFRELDLDLLEERRNEVYGRVRKQQRALRKRYNQRVRPRQFEKGDLILRSVE
SQGHKGKLDRAWEGPY;Identity=0.49;Similarity=0.66;Relat_Length=0.906;Relat_Interruptions=0.0;Hit_to_DB_Length=0.91 +scaffold146.1|size86774 dante protein_domain 14592 14828 289 + . Name=PROT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=PROT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-PROT__REXdb_ID6659|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:14592-14828[100percent];Best_Hit_DB_Pos=1:80of80;DB_Seq=MLDLGASINVMPYSIYNSLNLGPMEETCIIIQLADRSNAYPKGVMEDVLVQVNELVFPADFYILKMEDELSPNPTPILLG;Region_Seq=MVDLGASINLMPYSIYSALQLGPLQGTAIVIKLADRSNTHPEGVIEDVLVQVNNLVFPADFYVLKMGKAENNDCPLLLG;Query_Seq=MVDLGASINLMPYSIYSALQLGPLQGTAIVIKLADRSNTHPEGVIEDVLVQVNNLVFPADFYVLKM-GKAENNDCPLLLG;Identity=0.68;Similarity=0.84;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 15420 15995 871 + . Name=RT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=RT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-RT__REXdb_ID6635|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:15420-15995[100percent];Best_Hit_DB_Pos=1:192of192;DB_Seq=IYPITDSKWVAPIHVVPKKTGITLVKNKNDELIPTRISSGWRMCVDYRKLNLATRKDHFPLPFMDQMLERLAGKSFYCFLDGYSGYNQIVINPEDQEKTTFTCPFGTYAYRRMPFGLCNAPATFQRCMMSIFSDYVERIIEVFMDDFTVYGDSFDKCLENLSLILKRCIETNLVLNYEKCYFMVEQGIVLGH;Region_Seq=IYAISDSDWVSPVHVVPKKTGFTVERNKNGELVPKRVTNGWRVCIDYRKLNDATRKDHFPLPFIDQMLERLAGKKFYCFLDGYSGYNQVAIAPEDQEKTTFTCTYGTYAFRKMPFGLCNAPATFQRCMLSIFSEFTGKFIEVFMDDFTVYGDSFEGALENLEKVLQRCVEKKLVLNSEKCHFMVRQGIVLGH;Query_Seq=IYAISDSDWVSPVHVVPKKTGFTVERNKNGELVPKRVTNGWRVCIDYRKLNDATRKDHFPLPFIDQMLERLAGKKFYCFLDGYSGYNQVAIAPEDQEKTTFTCTYGTYAFRKMPFGLCNAPATFQRCMLSIFSEFTGKFIEVFMDDFTVYGDSFEGALENLEKVLQRCVEKKLVLNSEKCHFMVRQGIVLGH;Identity=0.76;Similarity=0.88;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 16188 16634 623 + . 
Name=RH;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=RH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-RH__REXdb_ID6648|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:16188-16634[100percent];Best_Hit_DB_Pos=1:149of149;DB_Seq=FNEACKVAFDKLKELLTSAPIIQPPDWSLPFEIMCDASNYVVGAVLGQRVGRAAHVIYYTSRTLDSAQCNYSTTEKELLAIVFALEKFRSYLLGTKVIIFSDHAALRYLLAKKEAKPRLIRWILLLQEFNLEIRDKKGTENLVADHLSR;Region_Seq=FNQECQEAFNKLKSLLTAAPIIQPPNWELPFELMCDASNYALGAVLGQKIEGKRHVIYYASKTLSEAQIHYTTTEKELLAIVYALEKFRSYLLGTKITVHSDHAALRHLLSKKESKPRLIRWILLLQEFDLEIKDRAGTENAVADNLSR;Query_Seq=FNQECQEAFNKLKSLLTAAPIIQPPNWELPFELMCDASNYALGAVLGQKIEGKRHVIYYASKTLSEAQIHYTTTEKELLAIVYALEKFRSYLLGTKITVHSDHAALRHLLSKKESKPRLIRWILLLQEFDLEIKDRAGTENAVADNLSR;Identity=0.74;Similarity=0.87;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 24522 24659 149 + . Name=PROT;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=PROT|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-PROT__REXdb_ID2599|Class_I|LTR|Ty1/copia|Bianca:24531-24659[93percent];Best_Hit_DB_Pos=29:71of71;DB_Seq=STISGTTNLVEGSGRANIMLPNGTRFHINDALYSSKSRRNLLS;Region_Seq=IKASTIVCEANIVEGSGRATVVLPSGTHIRIDDALYANKSRRNLLS;Query_Seq=STIVCEANIVEGSGRATVVLPSGTHIRIDDALYANKSRRNLLS;Identity=0.65;Similarity=0.77;Relat_Length=0.606;Relat_Interruptions=0.0;Hit_to_DB_Length=0.61 +scaffold146.1|size86774 dante protein_domain 24873 25481 913 + . 
Name=INT;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=INT|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-INT__REXdb_ID2558|Class_I|LTR|Ty1/copia|Bianca:24873-25481[100percent];Best_Hit_DB_Pos=1:203of203;DB_Seq=HERLGHPGSIMMRKIIEHSCGHQLKSREILQSNKFSCTSCSQGKLITRPSPTKIGSESLNFLERIHGDICGPIHPPCGPFRYFMVLIDASTRWSHVCLLSTRNQAFARLLAQLIRIRAHFPDYPVKKIRLDNAAEFSSQTFNDYCMSIGIDIEHPVAHVHTQNGLAESFIKRIQLIARPLLMRCKLPISTWGHAILHAATLIR;Region_Seq=HDRLGHPGMIMMRKIIRTTSGHSLKNREILHPREYICTACAQGKLITRPSPVKIMNERITFLERIQGDICGPIHPACGPFRYFIVLIDASSRWSHVSLLSTRNHAFARLLSQIIRLRAHFPDYPVKKIRLDNAAEFTSRTFNNYCLAMGIDVEHPVEYVHTQNGLAESLIKRLQLIARPLLMKSKLPVTCWGHAIIHASSLIR;Query_Seq=HDRLGHPGMIMMRKIIRTTSGHSLKNREILHPREYICTACAQGKLITRPSPVKIMNERITFLERIQGDICGPIHPACGPFRYFIVLIDASSRWSHVSLLSTRNHAFARLLSQIIRLRAHFPDYPVKKIRLDNAAEFTSRTFNNYCLAMGIDVEHPVEYVHTQNGLAESLIKRLQLIARPLLMKSKLPVTCWGHAIIHASSLIR;Identity=0.75;Similarity=0.9;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 26313 27071 1060 + . 
Name=RT;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=RT|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-RT__REXdb_ID2558|Class_I|LTR|Ty1/copia|Bianca:26322-27032[93percent];Best_Hit_DB_Pos=1:237of262;DB_Seq=WKDAIKAELYSLNKRKVFGPVVRTPKGVKPVGYKWVFVRKRNENGEIARYKARLVAQGFSQRPGIDFNETYSPVVDATTFRYLISLIAYEGLNLHMMDVVTAYLYGSLDSDIYMKIPEGFNLPDTNSSGSREDYSIKLNKSLYGLKQSGRMWYNRLSEYLLKEGYKNDSVCPCIFMKRSENEFAIIAVYVDDINIIGTPEELPKAIDCLKKEFEMKDLGKTKFCLGLQIEHLNNGIF;Region_Seq=WPKWKDAIESELKSLNKRDVFGPVVRTPEGVQPVGYKWVFVRKRNDKGEISRYKARLVAQGFSQRPGIDYDETYSPVMDATTFRFLISLAIEYGLDLQLMDVVTAYLYGSLDCEIYMKIPEGFHMPERYSSEPRTDYAIKLNKSLYGLKQSGRMWYNRLSEYLIKEGYKNNLVCPCVFMKKFENEFVIIAVYVDDINIVGTQKALLDAVNCLKREFEMKDLGRTKYCLGLQIEYLKNGIFRTDYAIKLNKSLY;Query_Seq=WKDAIESELKSLNKRDVFGPVVRTPEGVQPVGYKWVFVRKRNDKGEISRYKARLVAQGFSQRPGIDYDETYSPVMDATTFRFLISLAIEYGLDLQLMDVVTAYLYGSLDCEIYMKIPEGFHMPERYSSEPRTDYAIKLNKSLYGLKQSGRMWYNRLSEYLIKEGYKNNLVCPCVFMKKFENEFVIIAVYVDDINIVGTQKALLDAVNCLKREFEMKDLGRTKYCLGLQIEYLKNGIF;Identity=0.78;Similarity=0.91;Relat_Length=0.905;Relat_Interruptions=0.0;Hit_to_DB_Length=0.9 +scaffold146.1|size86774 dante protein_domain 27723 28124 581 + . Name=RH;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=RH|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-RH__REXdb_ID2558|Class_I|LTR|Ty1/copia|Bianca:27723-28124[100percent];Best_Hit_DB_Pos=1:134of134;DB_Seq=DAGYLSDPHHGRSQTGYLFTSGNTAISWRSVKQTITATSSNHAELLALHEASRECVWLRSMIQHIQKNCGLSSGRMDATIIYEDNTACIAQLKEGYIKGDRTKHISPKFFFTHDLQKDGDISIQQIRSCDNLAD;Region_Seq=DAGYRSDPHNGRSQTGYVFLNKGAAISWRSTKQTIAATSSNHAELLAIHETSRECVWLRSMIESIYNACGLFTDKMPPTVLYEDNSACIIQLKEGYIKGDRTKHISPKFFFTHDLQKNGEVIIQQIRSSDNVAD;Query_Seq=DAGYRSDPHNGRSQTGYVFLNKGAAISWRSTKQTIAATSSNHAELLAIHETSRECVWLRSMIESIYNACGLFTDKMPPTVLYEDNSACIIQLKEGYIKGDRTKHISPKFFFTHDLQKNGEVIIQQIRSSDNVAD;Identity=0.75;Similarity=0.84;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 9783 9956 178 - . 
Name=INT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Region_Hits_Classifications=INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand;Best_Hit=Ty3-INT__REXdb_ID9635|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:9783-9956[100percent];Best_Hit_DB_Pos=1:58of310;DB_Seq=HRGGCGEHGGARALIQKLHRAGYYWPGMKRDTHQYVQRCIQCQKFAPLIHKPGEEMTI;Region_Seq=HSGLCGNHPGARSLALRIQRAGYYWPTLLRDAMDCVRRCQSCQYFAPINRKPGAEITL;Query_Seq=HSGLCGNHPGARSLALRIQRAGYYWPTLLRDAMDCVRRCQSCQYFAPINRKPGAEITL;Identity=0.53;Similarity=0.69;Relat_Length=0.187;Relat_Interruptions=0.0;Hit_to_DB_Length=0.19 +scaffold146.1|size86774 dante protein_domain 10299 10658 303 - . Name=aRH;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat;Region_Hits_Classifications=aRH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|TatII[360bp],aRH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Ogre[360bp],aRH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand[360bp];Best_Hit=Ty3-aRH__REXdb_ID9546|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:10299-10658[100percent];Best_Hit_DB_Pos=1:121of121;DB_Seq=WILHVDGASSKQGSGIGIRLQSPYGEVIEQSFCLAFNASNNEAEYESLLAGLRLAVGIGVTKLRAFCNSQLVANQFSGDYEAKDSRMEAYLAQVQELSKKFLSFELARIPRSENSAADSLA;Region_Seq=WNMYIDGSTQSGAGVGVHYITPYGDWINLAVKLQFPATNNVAEYEALLAGMNFALSLGVTRLKTFSDSQLVVEQFSGHFQAKEPMLEAYKSRSQLLAAKFSEFSLEHIPRESNRAADSLA;Query_Seq=WNMYIDG-STQSGAGVGVHYITPYGDWINLAVKLQFPATNNVAEYEALLAGMNFALSLGVTRLKTFSDSQLVVEQFSGHFQAKEPMLEAYKSRSQLLAAKFSEFSLEHIPRESNRAADSLA;Identity=0.49;Similarity=0.7;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 10701 10817 136 - . 
Name=RH;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat;Region_Hits_Classifications=RH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand[117bp],RH|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Ogre[99bp];Best_Hit=Ty3-RH__REXdb_ID8372|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Tat|Retand:10701-10817[100percent];Best_Hit_DB_Pos=279:317of317;DB_Seq=NREGTGRVVKWAIELSEFDLHFEPRHAIKSQALADFVVE;Region_Seq=NTDHTSRLAKWAIKVSAMDIAFEPRKAIKGQALADFVVE;Query_Seq=NTDHTSRLAKWAIKVSAMDIAFEPRKAIKGQALADFVVE;Identity=0.64;Similarity=0.77;Relat_Length=0.123;Relat_Interruptions=0.0;Hit_to_DB_Length=0.12 +scaffold146.1|size86774 dante protein_domain 16797 17666 1057 - . Name=INT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-INT__REXdb_ID6633|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:16812-17666[98percent];Best_Hit_DB_Pos=1:285of313;DB_Seq=HSHSYGGHFGAKRTAHKVLESGFYWPSIFKDAYHFCKSCEKCQRTGNITHKNQMPLTNILVSEIFDVWGIDFMGPFPSSFGNLYILLVVDYVSKWIEAKATRTNDAKVVLDFVRTHIFNRFGIPKAIISDRGTHFCNRSMEALLRKYHVTHRTSTAYHPQTNGQAEISNREIKSILEKIVQPNRRDWSLRLGDALWAYRTAYKSPIGMSPYRMIYGKACHLPVELEHKAFWAIKQCNMDYDAAGIARKLQLQELEEIRNDAYENARIYKEKTKNLHDRMLTRKEF;Region_Seq=HASDYGGHFGPNRTARRILDVGFYWPSIFRDVYQFCRTCDACQRVGNITNRREMPQNYILANEIFDIWGLDFMGPFPQSQGNNYILVAVDYVSKWVEAIPTRTDDGKTVTEFLRKNIFTRYGVPKAIISDRGTHFCNSTMRAMMKKYNVIHKTTTAYHPQGNGQAEATNREIKSILEKVVNKKRSNWSQKLPDALWAYRTAYKTPIGTTPFRLIYGKHCNLPVGLEHKAYWAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKTYHDKKLLQQNFRERLS;Query_Seq=HASDYGGHFGPNRTARRILDVGFYWPSIFRDVYQFCRTCDACQRVGNITNRREMPQNYILANEIFDIWGLDFMGPFPQSQGNNYILVAVDYVSKWVEAIPTRTDDGKTVTEFLRKNIFTRYGVPKAIISDRGTHFCNSTMRAMMKKYNVIHKTTTAYHPQGNGQAEATNREIKSILEKVVNKKRSNWSQKLPDALWAYRTAYKTPIGTTPFRLIYGKHCNLPVGLEHKAYWAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKTYHDKKLLQQNF;Identity=0.61;Similarity=0.79;Relat_Length=0.911;Relat_Interruptions=0.0;Hit_to_DB_Length=0.91 +scaffold146.1|size86774 dante protein_domain 18554 
18811 306 - . Name=INT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-INT__REXdb_ID6693|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:18554-18802[96percent];Best_Hit_DB_Pos=231:313of313;DB_Seq=WALRLLNFDNNACGEKRKLQLQELEEMRLNAYESSRIYKERTKAYHDKKLQRREFQPGQQVLLFNSRLRLFPGKLKSKWSGPF;Region_Seq=QGNWAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKAYHDKKILQQNFREGQQVLLFNSKLRLFPGKLKSRWMGPF;Query_Seq=WAIREMNFEEGGDAELRQMQLQELDALRLEAYDNSRIYKERLKAYHDKKILQQNFREGQQVLLFNSKLRLFPGKLKSRWMGPF;Identity=0.65;Similarity=0.82;Relat_Length=0.265;Relat_Interruptions=0.0;Hit_to_DB_Length=0.27 +scaffold146.1|size86774 dante protein_domain 19158 19478 197 - . Name=INT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=INT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-INT__REXdb_ID6659|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:19182-19448[83percent];Best_Hit_DB_Pos=216:304of314;DB_Seq=YGKPCHLPVELEHKAWWAVKQCNMELDVAGQHRxLQLQELEEIRNDAYESSxIYKEKTKAFHDKQILRKNFEVGQKVLIFHSRLKLFPG;Region_Seq=PRGTISIGLNFGKQCKVLVGMEHENYWEIREMNYEEGADVEQKQMQLQKMDALKLEAYDNSRIDKEKLKAHHAKRILQQNCKKRQQVLIFDSKLKMFPGIPRWMEPF;Query_Seq=FGKQCKVLVGMEHENYWEIREMNYEEGADVEQKQMQLQKMDALKLEAYDNSRIDKEKLKAHHAKRILQQNCKKRQQVLIFDSKLKMFPG;Identity=0.42;Similarity=0.71;Relat_Length=0.283;Relat_Interruptions=0.0;Hit_to_DB_Length=0.28 +scaffold146.1|size86774 dante protein_domain 19976 20212 259 - . 
Name=PROT;Final_Classification=Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Region_Hits_Classifications=PROT|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila;Best_Hit=Ty3-PROT__REXdb_ID6659|Class_I|LTR|Ty3/gypsy|non-chromovirus|OTA|Athila:19976-20212[100percent];Best_Hit_DB_Pos=1:80of80;DB_Seq=MLDLGASINVMPYSIYNSLNLGPMEETCIIIQLADRSNAYPKGVMEDVLVQVNELVFPADFYILKMEDELSPNPTPILLG;Region_Seq=MVDLGASINLMPYYIYSALKLGSLQGTAIIIKLADRSETHPEGVVKDVLAQVNNLVFPADFYVLKMGEAENDDCPLLLG;Query_Seq=MVDLGASINLMPYYIYSALKLGSLQGTAIIIKLADRSETHPEGVVKDVLAQVNNLVFPADFYVLKM-GEAENDDCPLLLG;Identity=0.62;Similarity=0.79;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0 +scaffold146.1|size86774 dante protein_domain 28912 29124 216 - . Name=PROT;Final_Classification=Class_I|LTR|Ty1/copia|Bianca;Region_Hits_Classifications=PROT|Class_I|LTR|Ty1/copia|Bianca;Best_Hit=Ty1-PROT__REXdb_ID2599|Class_I|LTR|Ty1/copia|Bianca:28912-29124[100percent];Best_Hit_DB_Pos=1:71of71;DB_Seq=CLADCATTHTILRDKRYFLELTLIKANVSTISGTTNLVEGSGRANIMLPNGTRFHINDALYSSKSRRNLLS;Region_Seq=CLVDSATTHTILKNMRYFTSFEKRDVNIATIVCEANIVEGSGRAVIVLPSGTHIRIDDALYANKSRRNLLS;Query_Seq=CLVDSATTHTILKNMRYFTSFEKRDVNIATIVCEANIVEGSGRAVIVLPSGTHIRIDDALYANKSRRNLLS;Identity=0.59;Similarity=0.7;Relat_Length=1.0;Relat_Interruptions=0.0;Hit_to_DB_Length=1.0