Mercurial > repos > abims-sbr > cds_search

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CDS_search.xml	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,306 @@
+<tool name="CDS_search" id="cds_search" version="2.1.2">
+
+	<description>
+		ORF and CDS search
+	</description>
+
+	<macros>
+		<import>macros.xml</import>
+	</macros>
+
+	<requirements>
+		<expand macro="python_required" />
+	</requirements>
+
+  	<command><![CDATA[
+        #for $input in $inputs
+            ln -s '$input' '$input.element_identifier' &&
+            echo '$input.element_identifier' >> list_files &&
+        #end for
+        ## dico.py is imported by the scripts below
+        ln -s '$__tool_directory__/scripts/dico.py' . &&
+        ## Step 1 : S01 starts the log ; every step is chained with && so a failure stops the job
+        python '$__tool_directory__/scripts/S01_find_orf_on_multiple_alignment.py'
+        '$__tool_directory__/scripts/code_universel_modified.txt'
+        $length.min_length_seq
+        $nb_species_keep
+        list_files
+        > '$log' &&
+        ## Step 2 : S02 appends to the log
+        python '$__tool_directory__/scripts/S02_remove_too_short_bit_or_whole_sequence.py'
+        $nb_species_keep
+        $methionine
+        $length.min_length_seq
+        $length.min_length_subseq
+        >> '$log' &&
+        ## Step 3 : S03 appends to the log
+        python '$__tool_directory__/scripts/S03_remove_site_with_not_enough_species_represented.py'
+        $nb_species_keep
+        $length.min_length_nuc
+        >> '$log'
+    ]]></command>
+
+ 	<inputs>
+		<param name="inputs" type="data" format="fasta" multiple="true" label="Input files" help="Only a fasta file with nucleic align sequences" />
+		<!-- <param name="code_file" type="data" format="txt" label="Choose your file containing the universal code (codons and their amino acids)" /> -->
+		<!-- Passed as an argument to each of the three scripts of the command -->
+		<param name="nb_species_keep" type="integer" value="10" min="2" label="Minimal number of species in each locus" help="If you want to remove all the indels the maximum number of species is required" />
+		<!-- Substituted as "oui" / "non" on the S02 command line ; also read by the CDS output filters -->
+		<param name="methionine" type="boolean" checked="true" truevalue="oui" falsevalue="non" label="Do you want to consider the Methionine in the search of CDS? " />
+		<!-- Length thresholds, read by the command as $length.min_length_* -->
+		<section name="length" title="Do you want to choose the minimum length of the CDS?">
+			<param name="min_length_seq" type="integer" value="50" min="0" label="Minimal length of the CDS, in proteic" help="By default it's 50" />
+			<param name="min_length_subseq" type="integer" value="15" min="0" label="Minimal length of the subsequence, in proteic between two series of indels" help="By default it's 15" />
+			<param name="min_length_nuc" type="integer" value="50" min="0" label="Minimal length of the CDS, in nucleic without the indel" help="By default it's 50" />
+		</section>
+		<!-- The three selects below are not used by the command ; they only drive the output filters -->
+		<param name="out_BESTORF" type="select" label="Do you want the outputs (dataset collection list) containing files with the BEST ORF? ">
+			<option value="no">No</option>
+			<option value="aa">Yes, with the proteic format</option>
+			<option value="nuc">Yes, with the nucleic format</option>
+			<option value="both">Yes, with the proteic and nucleic format</option>
+		</param>
+
+		<param name="out_CDS" type="select" label="Do you want the outputs (dataset collection list) containing files with CDS? ">
+			<option value="no">No</option>
+			<option value="aa">Yes, with the proteic format</option>
+			<option value="nuc">Yes, with the nucleic format</option>
+			<option value="both">Yes, with the proteic and nucleic format</option>
+		</param>
+
+		<param name="out_CDS_filter" type="select" label="Do you want the outputs (dataset collection list) containing files with CDS without indel? ">
+			<option value="no">No</option>
+			<option value="aa">Yes, with the proteic format</option>
+			<option value="nuc">Yes, with the nucleic format</option>
+			<option value="both">Yes, with the proteic and nucleic format</option>
+		</param>
+	</inputs>
+
+	<outputs>
+		<data format="txt" name="log" label="ORF_Search" />
+		<collection name="output_BESTORF_aa" type="list" label="ORF_Search_Best_ORF_aa">
+			<filter>out_BESTORF in ["aa","both"]</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="04_BEST_ORF_aa" />
+		</collection>
+
+		<collection name="output_BESTORF_nuc" type="list" label="ORF_Search_Best_ORF_nuc">
+			<filter>out_BESTORF in ["nuc","both"]</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="04_BEST_ORF_nuc" />
+		</collection>
+		<!-- CDS regardless of the methionine (05_*) : only offered when "methionine" is unchecked -->
+		<collection name="output_CDS_aa" type="list" label="ORF_Search_CDS_aa">
+			<filter>out_CDS in ["aa","both"] and not methionine</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="05_CDS_aa" />
+		</collection>
+
+		<collection name="output_CDS_nuc" type="list" label="ORF_Search_CDS_nuc">
+			<filter>out_CDS in ["nuc","both"] and not methionine</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="05_CDS_nuc" />
+		</collection>
+		<!-- CDS with methionine (06_*) : only offered when "methionine" is checked -->
+		<collection name="output_CDS_M_aa" type="list" label="ORF_Search_CDS_with_M_aa">
+			<filter>out_CDS in ["aa","both"] and methionine</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="06_CDS_with_M_aa" />
+		</collection>
+
+		<collection name="output_CDS_M_nuc" type="list" label="ORF_Search_CDS_with_M_nuc">
+			<filter>out_CDS in ["nuc","both"] and methionine</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="06_CDS_with_M_nuc" />
+		</collection>
+
+		<collection name="output_filter_aa" type="list" label="ORF_Search_CDS_without_indel_aa">
+			<filter>out_CDS_filter in ["aa","both"]</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="08_CDS_aa_MINIMUM_MISSING_SEQUENCES" />
+		</collection>
+
+		<collection name="output_filter_nuc" type="list" label="ORF_Search_CDS_without_indel_nuc">
+			<filter>out_CDS_filter in ["nuc","both"]</filter>
+			<discover_datasets pattern="__name_and_ext__" directory="08_CDS_nuc_MINIMUM_MISSING_SEQUENCES" />
+		</collection>
+	</outputs>
+
+	<tests>
+		<!-- Test 1 : methionine = non, at least 3 species ; asserts the best ORF (04_*), CDS (05_*) and indel-free CDS (08_*) collections -->
+		<test>
+            <param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" />
+			<param name="nb_species_keep" value="3" />
+			<param name="methionine" value="non" />
+            <section name="length">
+                <param name="min_length_seq" value="50" />
+                <param name="min_length_subseq" value="15" />
+                <param name="min_length_nuc" value="50" />
+            </section>
+			<param name="out_BESTORF" value="both" />
+			<param name="out_CDS" value="both" />
+			<param name="out_CDS_filter" value="both" />
+            <output_collection name="output_BESTORF_aa" type="list" count="2">
+                <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_1_with_3_species.fasta" />
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_7_with_3_species.fasta" />
+            </output_collection>
+            <output_collection name="output_BESTORF_nuc" type="list" count="2">
+                <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_1_with_3_species.fasta" />
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_7_with_3_species.fasta" />
+            </output_collection>
+            <output_collection name="output_CDS_aa" type="list" count="2">
+                <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_1_with_3_species.fasta" />
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_aa/test1/orthogroup_7_with_3_species.fasta" />
+            </output_collection>
+            <output_collection name="output_CDS_nuc" type="list" count="2">
+                <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_1_with_3_species.fasta" />
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_7_with_3_species.fasta" />
+            </output_collection>
+            <output_collection name="output_filter_aa" type="list" count="1">
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta" />
+            </output_collection>
+            <output_collection name="output_filter_nuc" type="list" count="1">
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta" />
+            </output_collection>
+		</test>
+		<!-- Test 2 : methionine = oui, at least 2 species ; the output_CDS_M_aa / output_CDS_M_nuc collections are not asserted here -->
+        <test>
+            <param name="inputs" ftype="fasta" value="inputs/orthogroup_1_with_4_sequences.fasta,inputs/orthogroup_6_with_4_sequences.fasta,inputs/orthogroup_7_with_3_sequences.fasta,inputs/orthogroup_8_with_4_sequences.fasta,inputs/orthogroup_12_with_5_sequences.fasta,inputs/orthogroup_14_with_4_sequences.fasta" />
+            <param name="nb_species_keep" value="2" />
+            <param name="methionine" value="oui" />
+            <section name="length">
+                <param name="min_length_seq" value="50" />
+                <param name="min_length_subseq" value="15" />
+                <param name="min_length_nuc" value="50" />
+            </section>
+            <param name="out_BESTORF" value="both" />
+            <param name="out_CDS" value="both" />
+            <param name="out_CDS_filter" value="both" />
+            <output_collection name="output_BESTORF_aa" type="list" count="4">
+                <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_1_with_3_species.fasta" />
+                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_6_with_2_species.fasta" />
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_7_with_3_species.fasta" />
+                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_14_with_2_species.fasta" />
+            </output_collection>
+            <output_collection name="output_BESTORF_nuc" type="list" count="4">
+                <element name="orthogroup_1_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_1_with_3_species.fasta" />
+                <element name="orthogroup_6_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_6_with_2_species.fasta" />
+                <element name="orthogroup_7_with_3_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_7_with_3_species.fasta" />
+                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_14_with_2_species.fasta" />
+            </output_collection>
+            <output_collection name="output_filter_aa" type="list" count="1">
+                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_aa/test2/orthogroup_14_with_2_species.fasta" />
+            </output_collection>
+            <output_collection name="output_filter_nuc" type="list" count="1">
+                <element name="orthogroup_14_with_2_species" value="outputs_ORF_Search_08_CDS_without_indel_nuc/test2/orthogroup_14_with_2_species.fasta" />
+            </output_collection>
+		</test>
+
+	</tests>
+	<help>
+
+@HELP_AUTHORS@
+
+<![CDATA[
+
+**Description**
+
+This tool takes files containing aligned nucleic sequences and searches them for ORFs and CDS.
+
+--------
+
+**Inputs**
+
+Input files : (multiple) fasta files with nucleic aligned sequences.
+
+--------
+
+**Parameters**
+
+    - methionine : choose to consider the methionine in the search of CDS.
+        yes/no.
+
+    - 'Minimal number of species in each locus'
+        Default : 10 (integer).
+
+    - 'min_length_seq' :
+        minimal length of the sequence (in amino acids).
+        when the removal of the indel is done, the minimal length equals : previous length - 20.
+        for example if you choose 50 for the minimal length, the actual length equals 30.
+        Default : 50 (integer).
+
+    - 'min_length_subseq' :
+        minimal length of the subsequence (in amino acids).
+        subsequence means the part of the original sequence between 2 sets of indels.
+        an indel set is composed by more than 2 indels, if not the set is considered as unknown amino acid.
+        Default : 15 (integer).
+
+    - 'min_length_nuc' :
+        Minimal length of the sequence in the nucleic format, without indels.
+        Default : 50 (integer).
+
+    - other parameters let you choose which outputs you want :
+        - outputs with best ORFs.
+        - outputs with CDS, with or without indels.
+        - in proteic or nucleic format.
+
+--------
+
+**Outputs**
+
+    - ORF_Search
+        the log file (mainly statistics about the tool).
+
+    - ORF_Search_Best_ORF_aa
+        the output with the best ORF in the proteic format.
+
+    - ORF_Search_Best_ORF_nuc
+        the output with the best ORF in the nucleic format.
+
+    - ORF_Search_CDS_aa
+        the output with the CDS (regardless the Methionine) in the proteic format.
+
+    - ORF_Search_CDS_nuc
+        the output with the CDS (regardless the Methionine) in the nucleic format.
+
+    - ORF_Search_CDS_with_M_aa
+        the output with the CDS (considering the Methionine) in proteic format.
+        the rule : they must have a methionine before the minimal length of the sequence.
+        for example before the 30 last amino acid.
+
+    - ORF_Search_CDS_with_M_nuc
+        the output with the CDS (considering the Methionine) in nucleic format.
+        the rule : they must have a methionine before the minimal length of the sequence.
+        for example before the 30 last amino acid.
+
+    - ORF_Search_CDS_without_indel_aa
+        is the output with the CDS without indel in proteic format.
+        considering the Methionine or not : according to the option chosen.
+
+    - ORF_Search_CDS_without_indel_nuc
+        is the output with the CDS without indel in nucleic format.
+        considering the Methionine or not : according to the option chosen.
+
+---------
+
+**The AdaptSearch Pipeline**
+
+.. image:: adaptsearch_picture_helps.png
+
+---------
+
+Changelog
+---------
+
+**Version 2.0 - 05/07/2017**
+
+ - NEW: Replace the zip between tools by Dataset Collection
+
+**Version 1.0 - 13/04/2017**
+
+ - Added functional test with planemo
+ - planemo test with conda dependency for python
+ - Scripts renamed + symlinks to the directory 'scripts'
+
+    ]]>
+
+	</help>
+
+    <citations>
+
+    </citations>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,14 @@
+Changelog
+
+**Version 2.0 - 05/07/2017**
+
+ - NEW: Replace the zip between tools by Dataset Collection
+
+
+**Version 1.0 - 13/04/2017**
+
+ - Add functional test with planemo
+
+ - planemo test with conda dependency for python
+
+ - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,32 @@
+<macros>
+	<!-- Requirement macro expanded inside <requirements> of CDS_search.xml ; pins Python 2.7 (the scripts use Python 2 print statements) -->
+	<xml name="python_required">
+			<requirement type="package" version="2.7">python</requirement>
+	</xml>
+
+    <token name="@HELP_AUTHORS@">
+.. class:: infomark
+
+**Authors**  Eric Fontanillas created the version 1 of this pipeline. Victor Mataigne developed version 2.
+
+.. class:: infomark
+
+**Galaxy integration** Julie Baffard and ABiMS TEAM, Roscoff Marine Station
+
+ | Contact support.abims@sb-roscoff.fr for any questions or concerns about the Galaxy implementation of this tool.
+ | Credits : Gildas le Corguillé, Misharl Monsoor
+
+---------------------------------------------------
+
+    </token>
+	<!-- NOTE(review): CDS_search.xml leaves its <citations> element empty and does not expand this macro ; the entries below are typed "bibtex" but hold free text, not BibTeX records - confirm before expanding it -->
+	<xml name="citations">
+		<citations>
+			<citation type="bibtex">Credits : ABIMS team, Roscoff Marine Station</citation>
+			<citation type="bibtex">Contact support.abims@sb-roscoff.fr for any questions or concerns about the Galaxy implementation of this tool.</citation>
+			<citation type="bibtex">Version 1 : Scripts by Eric Fontanillas -- Galaxy integration by Julie Baffard</citation>
+			<citation type="bibtex">Version 2 : improvments by Victor Mataigne, Gildas le Corguillé, Misharl Monsoor</citation>
+		</citations>
+	</xml>
+
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/S01_find_orf_on_multiple_alignment.py	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,318 @@
+#!/usr/bin/env python
+# coding: utf8
+# Author: Eric Fontanillas
+# Modification: 03/09/14 by Julie BAFFARD
+# Last modification : 25/07/18 by Victor Mataigne
+
+# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria
+                # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF
+                # CRITERIA 2 - This longest part should be > 150nc or 50aa
+                # CRITERIA 3 - [OPTIONAL] A start codon "M" should be present in this longest part, before the last 50 aa
+                                 # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA
+                                 # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA
+
+import string, os, time, re, zipfile, sys, argparse
+from dico import dico
+
+def code_universel(F1):
+    """ Creates bash for genetic code (key : codon ; value : amino-acid) """
+    bash_codeUniversel = {}
+
+    with open(F1, "r") as file:
+        for line in file.readlines():
+            L1 = string.split(line, " ")
+            length1 = len(L1)
+            if length1 == 3:
+                key = L1[0]
+                value = L1[2][:-1]
+                bash_codeUniversel[key] = value
+            else:
+                key = L1[0]
+                value = L1[2]
+                bash_codeUniversel[key] = value
+
+    return(bash_codeUniversel)
+
+def multiple3(seq):
+    """ Tests if the sequence is a multiple of 3, and if not removes extra-bases
+        !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """
+
+    m = len(seq)%3
+    if m != 0 :
+        return seq[:-m], m
+    else :
+        return seq, m
+
+def detect_Methionine(seq_aa, Ortho, minimal_cds_length):
+    """ Detects if methionin in the aa sequence """
+
+    ln = len(seq_aa)
+    CUTOFF_Last_50aa = ln - minimal_cds_length
+
+    # Find all indices of occurances of "M" in a string of aa
+    list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"]
+
+    # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS
+    if list_indices != []:
+        first_M = list_indices[0]
+        if first_M < CUTOFF_Last_50aa:
+            Ortho = 1  # means orthologs found
+
+    return(Ortho)
+
+def ReverseComplement2(seq):
+    """ Reverse complement DNA sequence """
+    seq1 = 'ATCGN-TAGCN-atcgn-tagcn-'
+    seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 }
+
+    return "".join([seq_dict[base] for base in reversed(seq)])
+
+def simply_get_ORF(seq_dna, gen_code):
+    seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)]
+    seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons]
+
+    return ''.join(seq_by_aa)
+
+def find_good_ORF_criteria_3(bash_aligned_nc_seq, bash_codeUniversel, minimal_cds_length, min_spec):
+    """ Multiple sequence based ORF search (alignment of an orthogroup) : keeps the longest stop-free segment among 6 frames.
+        Returns 6 dicts : nuc best ORF, nuc coding segment, nuc CDS with M, then the same three in aa (all empty if no CDS). """
+
+    # 1 - Get the list of aligned aa seq for the 6 ORF (3 forward frames + 3 on the reverse complement):
+    bash_of_aligned_aa_seq_3ORF = {}
+    bash_of_aligned_nuc_seq_3ORF = {}
+    BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = []
+
+    for fasta_name in bash_aligned_nc_seq.keys():
+        # Get sequence, check if multiple of 3, then get 6 orfs
+        sequence_nc = bash_aligned_nc_seq[fasta_name]
+        new_sequence_nc, modulo = multiple3(sequence_nc)
+        new_sequence_rev = ReverseComplement2(new_sequence_nc)
+        # For each seq of the multialignment => give the 6 ORFs (in nuc)
+        bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]]
+
+        seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel)
+        seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel)
+        seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel)
+        seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel)
+        seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel)
+        seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel)
+
+        # For each seq of the multialignment => give the 6 ORFs (in aa)
+        bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6]
+
+    # 2 - Test for the best ORF (Get the longest segment in the alignment with no codon stop ... for each ORF ... the longest should give the ORF)
+    BEST_MAX = 0
+
+    for i in [0,1,2,3,4,5]: # Test the 6 ORFs
+        ORF_Aligned_aa = []
+        ORF_Aligned_nuc = []
+
+        # 2.1 - Get the alignment of sequence for a given ORF
+        # Compare the 1st ORF between all sequences => list them in ORF_Aligned_aa // then do the same for the following ORFs
+        for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
+            ORFsequence = bash_of_aligned_aa_seq_3ORF[fasta_name][i]
+            aa_length = len(ORFsequence)
+            ORF_Aligned_aa.append(ORFsequence)   ### List of all sequences in the ORF nb "i"
+
+        n = i+1
+
+        for fasta_name in bash_of_aligned_nuc_seq_3ORF.keys():
+            ORFsequence = bash_of_aligned_nuc_seq_3ORF[fasta_name][i]
+            nuc_length = len(ORFsequence)
+            ORF_Aligned_nuc.append(ORFsequence) # List of all sequences in the ORF nb "i"
+
+        # 2.2 - Get the list of sublist of positions without codon stop in the alignment
+        # For each ORF, now we have the list of sequences available (i.e. THE ALIGNMENT IN A GIVEN ORF)
+        # Next step is to get the longest subsequence without stop
+        # We will explore the presence of stop "*" in each column of the alignment, and get the positions of the segments between the positions with "*"
+        MAX_LENGTH = 0
+        LONGUEST_SEGMENT_UNSTOPPED = ""
+        j = 0 # Start from first position in alignment
+        List_of_List_subsequences = []
+        List_positions_subsequence = []
+        while j < aa_length:
+                column = []
+                for seq in ORF_Aligned_aa:
+                    column.append(seq[j])
+                j = j+1
+                if "*" in column:
+                    List_of_List_subsequences.append(List_positions_subsequence) # Add previous list of positions
+                    List_positions_subsequence = []                              # Re-initialise list of positions
+                else:
+                    List_positions_subsequence.append(j)                         # j was already incremented : stored positions start at 1
+        # NOTE(review): positions collected after the last stop column are never appended to List_of_List_subsequences, so a trailing stop-free segment is not a candidate - confirm this is intended
+        # 2.3 - Among all the sublists (separated by column with codon stop "*"), get the longest one (BETTER SEGMENT for a given ORF)
+        LONGUEST_SUBSEQUENCE_LIST_POSITION = []
+        MAX=0
+        for sublist in List_of_List_subsequences:
+            if len(sublist) > MAX and len(sublist) > minimal_cds_length:
+                MAX = len(sublist)
+                LONGUEST_SUBSEQUENCE_LIST_POSITION = sublist
+
+        # 2.4. - Test if the longest subsequence starts exactly at the beginning of the original sequence (i.e. means the ORF maybe truncated)
+        if LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
+            if LONGUEST_SUBSEQUENCE_LIST_POSITION[0] == 0:
+                CDS_maybe_truncated = 1
+            else:
+                CDS_maybe_truncated = 0
+        else:
+            CDS_maybe_truncated = 0
+        # NOTE(review): the positions stored in 2.2 start at 1, so the "== 0" test above looks like it can never be true - confirm
+
+        # 2.5 - Test if this BETTER SEGMENT for a given ORF, is the better than the one for the other ORF (GET THE BEST ORF)
+        # Test whether it is the better ORF
+        if MAX > BEST_MAX:
+            BEST_MAX = MAX
+            BEST_ORF = i+1
+            BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = LONGUEST_SUBSEQUENCE_LIST_POSITION
+
+
+    # 3 - ONCE we have this better segment (BEST CODING SEGMENT)
+    # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position)
+    # And get the INDEX of the best ORF [0 to 5]
+    if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []:
+        pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0]
+        pos_MIN_aa = pos_MIN_aa - 1
+        pos_MAX_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[-1]
+
+
+        BESTORF_bash_of_aligned_aa_seq = {}
+        BESTORF_bash_of_aligned_aa_seq_CODING = {}
+        for fasta_name in bash_of_aligned_aa_seq_3ORF.keys():
+            index_BEST_ORF = BEST_ORF-1  # the frame lists are indexed from 0 to 5, while BEST_ORF is numbered from 1 to 6
+            seq = bash_of_aligned_aa_seq_3ORF[fasta_name][index_BEST_ORF]
+            seq_coding = seq[pos_MIN_aa:pos_MAX_aa]
+            BESTORF_bash_of_aligned_aa_seq[fasta_name] = seq
+            BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding
+
+        # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment
+        pos_MIN_nuc = pos_MIN_aa * 3
+        pos_MAX_nuc = pos_MAX_aa * 3
+
+        BESTORF_bash_aligned_nc_seq = {}
+        BESTORF_bash_aligned_nc_seq_CODING = {}
+        for fasta_name in bash_aligned_nc_seq.keys():
+            seq = bash_of_aligned_nuc_seq_3ORF[fasta_name][index_BEST_ORF]
+            seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc]
+            BESTORF_bash_aligned_nc_seq[fasta_name] = seq
+            BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding
+
+    else: # no CDS found
+        BESTORF_bash_aligned_nc_seq = {}
+        BESTORF_bash_aligned_nc_seq_CODING = {}
+        BESTORF_bash_of_aligned_aa_seq = {}
+        BESTORF_bash_of_aligned_aa_seq_CODING ={}
+
+    # Check whether there is a "M" or not, and if at least 1 "M" is present, that it is not in the last minimal_cds_length aa
+
+    BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {}
+    BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {}
+
+    Ortho = 0
+    for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys():
+        seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name]
+        Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length)   # stays at 1 as soon as one sequence has a well-placed "M"
+
+    # CASE 1: A "M" is present and correctly localized (not in the last minimal_cds_length aa)
+    if Ortho == 1:
+        BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
+        BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
+    # NOTE(review): CDS_maybe_truncated holds the value computed for the last frame tested (i = 5), not necessarily for BEST_ORF - confirm
+    # CASE 2: in case the CDS is truncated, so the "M" is maybe missing:
+    if Ortho == 0 and CDS_maybe_truncated == 1:
+        BESTORF_bash_of_aligned_aa_seq_CDS_with_M = BESTORF_bash_of_aligned_aa_seq_CODING
+        BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = BESTORF_bash_aligned_nc_seq_CODING
+
+    # CASE 3: CDS not truncated AND no "M" found in good position (i.e. before the last minimal_cds_length aa):
+        ## => the 2 bash "CDS_with_M" are left empty ("{}")
+
+    return(BESTORF_bash_aligned_nc_seq,  BESTORF_bash_aligned_nc_seq_CODING, BESTORF_bash_of_aligned_nuc_seq_CDS_with_M, BESTORF_bash_of_aligned_aa_seq, BESTORF_bash_of_aligned_aa_seq_CODING, BESTORF_bash_of_aligned_aa_seq_CDS_with_M)
+
+def write_output_file(results_dict, name_elems, path_out):
+    if results_dict != {}:
+        name_elems[3] = str(len(results_dict.keys()))
+        new_name = "_".join(name_elems)
+
+        out1 = open("%s/%s" %(path_out,new_name), "w")
+        for fasta_name in results_dict.keys():
+            seq = results_dict[fasta_name]
+            out1.write("%s\n" %fasta_name)
+            out1.write("%s\n" %seq)
+        out1.close()
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt")
+    parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int)
+    parser.add_argument("min_spec", help="Minimal number of species per alignment")
+    parser.add_argument("list_files", help="File with all input files names")
+    args = parser.parse_args()
+
+    minimal_cds_length = int(args.min_cds_len)  # in aa number
+    bash_codeUniversel = code_universel(args.codeUniversel)
+    minimum_species = int(args.min_spec)
+
+    # Inputs from file containing list of species
+    list_files = []
+    with open(args.list_files, 'r') as f:
+        for line in f.readlines():
+            list_files.append(line.strip('\n'))
+
+    # Directories for results
+    dirs = ["04_BEST_ORF_nuc", "04_BEST_ORF_aa", "05_CDS_nuc", "05_CDS_aa", "06_CDS_with_M_nuc", "06_CDS_with_M_aa"]
+    for directory in dirs:
+        os.mkdir(directory)
+
+    count_file_processed, count_file_with_CDS, count_file_without_CDS, count_file_with_CDS_plus_M = 0, 0, 0, 0
+    count_file_with_cds_and_enought_species, count_file_with_cds_M_and_enought_species = 0, 0
+
+    # ! : Currently, files are named "Orthogroup_x_y_sequences.fasta, where x is the number of the orthogroup (not important, juste here to make a distinct name),
+    # and y is the number of sequences/species in the group. These files are outputs of blastalign, where species can be removed. y is then modified.
+    name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
+
+    # by fixing the counter here, there will be some "holes" in the outputs directories (missing numbers), but the groups between directories will correspond
+    #n0 = 0
+
+    for file in list_files:
+        #n0 += 1
+
+        count_file_processed = count_file_processed + 1
+        nb_gp = file.split('_')[1] # Keep trace of the orthogroup number
+        fasta_file_path = "./%s" %file
+        bash_fasta = dico(fasta_file_path)
+        BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M  = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species)
+
+        name_elems[1] = nb_gp
+
+        # Update counts and write group in corresponding output directory
+        if BESTORF_nuc != {}:
+            count_file_with_CDS += 1
+            if len(BESTORF_nuc.keys()) >= minimum_species :
+                count_file_with_cds_and_enought_species += 1
+                write_output_file(BESTORF_nuc, name_elems, dirs[0]) # OUTPUT BESTORF_nuc
+                write_output_file(BESTORF_aa, name_elems, dirs[1]) # The most interesting
+        else:
+            count_file_without_CDS += 1
+
+        if BESTORF_nuc_CODING != {} and len(BESTORF_nuc_CODING.keys()) >= minimum_species:
+            write_output_file(BESTORF_nuc_CODING, name_elems, dirs[2])
+            write_output_file(BESTORF_aa_CODING, name_elems, dirs[3])
+
+        if BESTORF_nuc_CDS_with_M != {}:
+            count_file_with_CDS_plus_M += 1
+            if len(BESTORF_nuc_CDS_with_M.keys()) >= minimum_species :
+                count_file_with_cds_M_and_enought_species += 1
+                write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4])
+                write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5])
+
+    print "*************** CDS detection ***************"
+    print "\nFiles processed: %d" %count_file_processed
+    print "\tFiles with CDS: %d" %count_file_with_CDS
+    print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)
+    print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M
+    print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species)
+    print "\tFiles without CDS: %d \n" %count_file_without_CDS
+    print ""
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+# coding: utf8
+## Author: Eric Fontanillas
+## Modification: 03/09/14 by Julie BAFFARD
+## Last modification : 05/03/18 by Victor Mataigne
+
+## Description : find and remove indels
+
+###################
+###### DEF 9 ######
+###################
+def detect_short_indel(seq,MAX_LENGTH_SMALL_INDEL):
+    ## 1 ## Built the list of sublist of consecutive gap position
+    LIST = []
+    sublist=[]
+    ln = len(seq)
+    i=0
+    while i < ln:
+        if seq[i] == "-":
+            sublist.append(i)  ## save gaps in sublist until a aa is found => else:
+        else:
+            LIST.append(sublist)  ## save the list of gap
+            sublist = []  ## create new list of gap
+        i = i+1
+            ## if gap at the end: add the last "sublist of gap" (not done in previous loop, at it add sublist (of gaps) only when in find aa, but if gap at the end, no aa after are present, so cannot add this last sublist to the LISt of gaps
+    if sublist != []:
+        LIST.append(sublist)
+
+    ## 2 ## keep only the records of the small indel (<MAX_LENGTH_SMALL_INDEL)
+    list_of_sublist_positions = []
+    for element in LIST:
+        if element != [] and len(element)<=MAX_LENGTH_SMALL_INDEL:
+            list_of_sublist_positions.append(element)
+
+    return(list_of_sublist_positions)
+####################################
+
+
+#######################
+##### RUN RUN RUN #####
+#######################
+import string, os, time, re, sys
+from dico import dico
+
+### 0 ### PARAMETERS
+MIN_LENGTH_ALL_aa = int(sys.argv[3])-20
+MIN_LENGTH_BIT_OF_SEQUENCE_aa = int(sys.argv[4])
+MAX_LENGTH_SMALL_INDEL = 2      ## in aa
+MAX_LENGTH_SMALL_INDEL_nuc = 6  ## in nuc
+MIN_SPECIES_NB = int(sys.argv[1])
+dicoco = {}
+dico_dico = {}
+list_new_file = []
+n0 = 0
+e=0
+j=0
+i=1
+name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
+
+### 1 ### IN
+if sys.argv[2] == "oui" :
+    path_IN1 = "./06_CDS_with_M_aa/"
+    L_IN1 = os.listdir(path_IN1)
+    path_IN2 = "./06_CDS_with_M_nuc/"
+    L_IN2 = os.listdir(path_IN2)
+elif sys.argv[2] == "non" :
+    path_IN1 = "./05_CDS_aa/"
+    L_IN1 = os.listdir(path_IN1)
+    path_IN2 = "./05_CDS_nuc/"
+    L_IN2 = os.listdir(path_IN2)
+
+## 2 ## OUT
+os.mkdir("07_CDS_aa")
+path_OUT1 = "07_CDS_aa"
+os.mkdir("07_CDS_nuc")
+path_OUT2 = "07_CDS_nuc"
+
+for file in L_IN1:
+    file_INaa = "%s/%s" %(path_IN1, file)
+    file_INnuc = "%s/%s" %(path_IN2, file)
+
+    dico_aa = dico(file_INaa)   ### DEF 0 ###
+    dico_nuc = dico(file_INnuc)   ### DEF 0 ###
+
+    new_bash_aa = {}
+    new_bash_nuc = {}
+    for fasta_name in dico_aa.keys():
+        seq = dico_aa[fasta_name]
+        seq_nuc = dico_nuc[fasta_name]
+
+        if "?" in seq:
+            seq = string.replace(seq, "?", "-")
+        if "?" in seq_nuc:
+            seq_nuc = string.replace(seq_nuc, "?", "-")
+
+        ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?"
+        ## aa
+        list_sublist_pos = detect_short_indel(seq, MAX_LENGTH_SMALL_INDEL)   ### DEF 9 ###
+        for pos_short_indels in list_sublist_pos:
+            for pos in pos_short_indels:
+                seq = seq[:pos] + "?" + seq[pos+1:]
+        ## nuc
+        list_sublist_pos = detect_short_indel(seq_nuc, MAX_LENGTH_SMALL_INDEL_nuc)   ### DEF 9 ###
+        for pos_short_indels in list_sublist_pos:
+            for pos in pos_short_indels:
+                seq_nuc = seq_nuc[:pos] + "?" + seq_nuc[pos+1:]
+
+        ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa")
+        LIST_sublist_aa=[]
+        S1 = string.split(seq, "-")
+        for element in S1:
+            if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa:
+                LIST_sublist_aa.append(element)
+
+        ## 4.3 ## [FILTER 3] : Remove all the sequence if the total length of all subsequences < "MIN_LENGTH_ALL_aa")
+        seq_all = ""
+        for bit_of_sequence in LIST_sublist_aa:
+            seq_all = seq_all + bit_of_sequence
+
+        if len(seq_all) < MIN_LENGTH_ALL_aa:
+            LIST_sublist_aa = []
+
+        ## 4.4 ## [FILTER 4] : Detect sublist position in the original sequence, and recreate the filtered sequence from these positions:
+        seq_gap = "-" * len(seq)    ## 4.4.1 ## generate a sequence with only gaps inside
+        seq_gap_nuc = "-" * len(seq_nuc)
+
+        for subsequence in LIST_sublist_aa:
+            ## aa
+            START = string.find(seq, subsequence)
+            END = START + len(subsequence)
+            seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:]  ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence
+            ## nuc
+            START_nuc = START*3
+            END_nuc = END*3
+            seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:]
+
+        ## 4.5 ## Save new sequence in bash if not empty
+        seq_empty_test = string.replace(seq_gap, "-", "")
+        if seq_empty_test != "":
+            new_bash_aa[fasta_name] = seq_gap
+
+        seq_empty_test = string.replace(seq_gap_nuc, "-", "")
+        if seq_empty_test != "":
+            new_bash_nuc[fasta_name] = seq_gap_nuc
+
+    # 4.6 ## Correct the nb of sequence in the output name, if necessary
+    n0 += 1
+    name_elems[1] = file.split('_')[1]
+    #name_elems[1] = str(n0)
+    name_elems[3] = str(len(new_bash_nuc.keys()))
+    new_name = "_".join(name_elems)
+    dico_dico[new_name] = [new_bash_aa, new_bash_nuc]
+    list_new_file.append(new_name)
+
+## [FILTER 6]: print output only if at least "MIN_SPECIES_NB" species remaining in the alignment
+for name in list_new_file :
+    dicoo = dico_dico[name]
+    dico_aa = dicoo[0]
+    dico_nuc = dicoo[1]
+    sp_nbre = len(dico_aa.keys())
+
+    if sp_nbre >= MIN_SPECIES_NB :
+        file_OUTaa = open("%s/%s" %(path_OUT1, name), "w")
+        file_OUTnuc = open("%s/%s" %(path_OUT2, name), "w")
+
+        for fasta_name in dico_aa.keys() :
+            seq_aa = dico_aa[fasta_name]
+            file_OUTaa.write("%s\n" %fasta_name)
+            file_OUTaa.write("%s\n" %seq_aa)
+        for fasta_name in dico_nuc.keys() :
+            seq_nuc = dico_nuc[fasta_name]
+            file_OUTnuc.write("%s\n" %fasta_name)
+            file_OUTnuc.write("%s\n" %seq_nuc)
+
+        file_OUTaa.close()
+        file_OUTnuc.close()
+
+    else:
+        e+=1
+
+###Print
+if sys.argv[2] == "oui" :
+    print "\nIn locus with CDS considering Methionine : \n"
+else :
+    print "\nIn locus with CDS regardless of the Methionine : \n"
+
+print "\nTotal number of locus recorded  = %d" %n0
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/S03_remove_site_with_not_enough_species_represented.py	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# coding: utf8
+## Author: Eric Fontanillas
+## Modification: 03/09/14 by Julie BAFFARD
+## Last modification : 05/03/18 by Victor Mataigne
+
+## Description : find and remove indels
+
+####################
+###### DEF 2 #######
+####################
+def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB):
+
+    ## 1 ## Get alignment length
+    fasta_name0 = bash_aa.keys()[0]
+    ln_aa = len(bash_aa[fasta_name0])
+
+    ln_nuc = len(bash_nuc[fasta_name0])
+
+
+    ## 2 ## Get positions keeped in aa alignment
+    LIST_POSITION_KEEPED_aa = []
+    i=0
+    while i < ln_aa:
+        site = []
+        for fasta_name in bash_aa.keys():
+            pos = bash_aa[fasta_name][i]
+
+            if pos != "-" and pos != "?" and pos != "X":
+                site.append(pos)
+        if len(site) >= MIN_SPECIES_NB:
+            LIST_POSITION_KEEPED_aa.append(i)
+        i = i+1
+
+    ## 3 ## Get positions keeped in nuc alignment
+    LIST_POSITION_KEEPED_nuc = []
+    for position in LIST_POSITION_KEEPED_aa:
+        position1 = position*3
+        position2 = position*3 + 1
+        position3 = position*3 + 2
+        LIST_POSITION_KEEPED_nuc.append(position1)
+        LIST_POSITION_KEEPED_nuc.append(position2)
+        LIST_POSITION_KEEPED_nuc.append(position3)
+
+    ## 4 ## Create entries for "filtered_bash" for aa & nuc
+    filtered_bash_aa = {}
+    filtered_bash_nuc = {}
+    for fasta_name in bash_aa.keys():
+        filtered_bash_aa[fasta_name] = ""
+    for fasta_name in bash_nuc.keys():
+        filtered_bash_nuc[fasta_name] = ""
+
+    ## 5 ## Write "filtered_bash" for aa
+    j=0
+    while j < ln_aa:
+        for fasta_name in bash_aa.keys():
+            seq=filtered_bash_aa[fasta_name]
+            pos=bash_aa[fasta_name][j]
+
+            if j in LIST_POSITION_KEEPED_aa:
+                seq = seq + pos
+                filtered_bash_aa[fasta_name] = seq
+        j = j + 1
+
+    ## 6 ## Remove empty sequence
+    for name in filtered_bash_aa.keys():
+        seq = filtered_bash_aa[name]
+        if seq == '':
+            del filtered_bash_aa[name]
+
+
+    ## 7 ## Write "filtered_bash" for nuc
+    j=0
+    while j < ln_nuc:
+        for fasta_name in bash_nuc.keys():
+            seq=filtered_bash_nuc[fasta_name]
+            #print seq
+            pos=bash_nuc[fasta_name][j]
+
+            if j in LIST_POSITION_KEEPED_nuc:
+                seq = seq + pos
+                filtered_bash_nuc[fasta_name] = seq
+        j = j + 1
+
+    ## 8 ## Remove empty sequence
+    for name in filtered_bash_nuc.keys():
+        seq = filtered_bash_nuc[name]
+        if seq == '':
+            del filtered_bash_nuc[name]
+
+    return(filtered_bash_aa, filtered_bash_nuc)
+####################################
+
+
+#######################
+##### RUN RUN RUN #####
+#######################
+import string, os, time, re, sys
+from dico import dico
+
+### 0 ### PARAMETERS
+MIN_SPECIES_NB = int(sys.argv[1])
+MIN_LENGTH_FINAL_ALIGNMENT_NUC = int(sys.argv[2])
+n0 = 0
+bad = 0
+good = 0
+list_new_file = []
+dicoco = {}
+list_file = []
+name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
+
+### 1 ### IN
+path_IN1 = "./07_CDS_aa/"
+L_IN1 = os.listdir(path_IN1)
+lenght = len(L_IN1)
+path_IN2 = "./07_CDS_nuc/"
+L_IN2 = os.listdir(path_IN2)
+
+## 2 ## OUT
+os.mkdir("08_CDS_aa_MINIMUM_MISSING_SEQUENCES")
+path_OUT1 = "08_CDS_aa_MINIMUM_MISSING_SEQUENCES"
+os.mkdir("08_CDS_nuc_MINIMUM_MISSING_SEQUENCES")
+path_OUT2 = "08_CDS_nuc_MINIMUM_MISSING_SEQUENCES"
+
+
+for file in L_IN1:
+    file_INaa = "%s/%s" %(path_IN1, file)
+    file_INnuc = "%s/%s" %(path_IN2, file)
+
+    dico_aa = dico(file_INaa)   ### DEF 1 ###
+    dico_nuc = dico(file_INnuc)   ### DEF 1 ###
+
+    if len(dico_aa) < MIN_SPECIES_NB :
+        list_file.append(file)
+
+if list_file == lenght :
+    MIN_SPECIES_NB == MIN_SPECIES_NB - 1
+
+
+for file in L_IN1 :
+    file_INaa = "%s/%s" %(path_IN1, file)
+    file_INnuc = "%s/%s" %(path_IN2, file)
+
+    dico_aa = dico(file_INaa)   ### DEF 1 ###
+    dico_nuc = dico(file_INnuc)   ### DEF 1 ###
+
+    ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment)
+    filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB)   ### DEF 2 ###
+
+    k = filtered_bash_nuc.keys()
+    new_leng_nuc = 0
+    if k != []:
+        k0 = k[0]
+        seq0 = filtered_bash_nuc[k0]
+        new_leng_nuc = len(seq0)
+
+    ## 4.3 ## Change file name for output, depending the number of species remaining in the alignment
+    n0+=1
+    #name_elems[1] = str(n0)
+    name_elems[1] = file.split('_')[1]
+    name_elems[3] =  str(len(filtered_bash_aa.keys()))
+    new_name = "_".join(name_elems)
+
+    ## 4.5 ## Write filtered alignment in OUTPUTs
+    ## aa
+    if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
+        OUTaa=open("%s/%s" %(path_OUT1, new_name), "w")
+        for fasta_name in filtered_bash_aa.keys():
+            seq_aa = filtered_bash_aa[fasta_name]
+            OUTaa.write("%s\n" %fasta_name)
+            OUTaa.write("%s\n" %seq_aa)
+        OUTaa.close()
+    # nuc
+    if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC:
+        good+=1
+        OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w")
+        for fasta_name in filtered_bash_nuc.keys():
+            seq_nuc = filtered_bash_nuc[fasta_name]
+            OUTnuc.write("%s\n" %fasta_name)
+            OUTnuc.write("%s\n" %seq_nuc)
+        OUTnuc.close()
+    else:
+        bad+=1
+
+
+## 5 ## Print
+print "*************** 2nd Filter : removal of the indel ***************"
+print "\nTotal number of locus recorded  = %d" %n0
+print "\tTotal number of locus with no indels (SAVED) = %d" %good
+print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad
+print ""
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/code_universel_modified.txt	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,64 @@
+UUU Phe F
+UCU Ser S
+UAU Tyr Y
+UGU Cys C
+UUC Phe F
+UCC Ser S
+UAC Tyr Y
+UGC Cys C
+UUA Leu L
+UCA Ser S
+UAA Stop * <Stop>
+UGA Stop * <Stop>
+UUG Leu L
+UCG Ser S
+UAG Stop * <Stop>
+UGG Trp W
+CUU Leu L
+CCU Pro P
+CAU His H
+CGU Arg R
+CUC Leu L
+CCC Pro P
+CAC His H
+CGC Arg R
+CUA Leu L
+CCA Pro P
+CAA Gln Q
+CGA Arg R
+CUG Leu L
+CCG Pro P
+CAG Gln Q
+CGG Arg R
+AUU Ile I
+ACU Thr T
+AAU Asn N
+AGU Ser S
+AUC Ile I
+ACC Thr T
+AAC Asn N
+AGC Ser S
+AUA Ile I
+ACA Thr T
+AAA Lys K
+AGA Arg R
+AUG Met M <initiation a la traduction>
+ACG Thr T
+AAG Lys K
+AGG Arg R
+GUU Val V
+GCU Ala A
+GAU Asp D
+GGU Gly G
+GUC Val V
+GCC Ala A
+GAC Asp D
+GGC Gly G
+GUA Val V
+GCA Ala A
+GAA Glu E
+GGA Gly G
+GUG Val V
+GCG Ala A
+GAG Glu E
+GGG Gly G
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/dico.py	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,13 @@
+import string, itertools
+
+def dico(F1):
+    dicoco = {}
+    with open(F1, "r") as file:
+        for name, query in itertools.izip_longest(*[file]*2):
+            if name[0] == ">":
+                fasta_name_query = name[:-1]
+                Sn = string.split(fasta_name_query, "||")
+                fasta_name_query = Sn[0]
+                fasta_seq_query = query[:-1]
+                dicoco[fasta_name_query] = fasta_seq_query
+    return(dicoco)
Binary file static/images/adaptsearch_picture_helps.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cds_search.log	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,24 @@
+*************** CDS detection ***************
+
+Files processed: 4
+	Files with CDS: 2
+		Files with CDS plus M (codon start): 1
+	Files without CDS: 2
+
+
+
+In locus with CDS regardless of the Methionine :
+
+*************** 1st filter : selection of the locus ***************
+
+Total number of locus recorded  = 2
+	Number of locus with 1 species : 1
+	Number of locus with 2 species : 0
+Number of locus excluded (exclude if not at least 2 species in the alignment)= 1
+
+*************** 2nd Filter : removal of the indel ***************
+
+Total number of locus recorded  = 1
+	Total number of locus with no indels (SAVED) = 1
+	Total number of locus, when removing indel, wich are empty (EXCLUDED) = 0
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/cds_search_methionine.log	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,24 @@
+*************** CDS detection ***************
+
+Files processed: 1
+	Files with CDS: 1
+		Files with CDS plus M (codon start): 1
+	Files without CDS: 0
+
+
+
+In locus with CDS considering Methionine :
+
+*************** 1st filter : selection of the locus ***************
+
+Total number of locus recorded  = 1
+	Number of locus with 1 species : 0
+	Number of locus with 2 species : 0
+Number of locus excluded (exclude if not at least 1 species in the alignment)= 0
+
+*************** 2nd Filter : removal of the indel ***************
+
+Total number of locus recorded  = 1
+	Total number of locus with no indels (SAVED) = 1
+	Total number of locus, when removing indel, wich are empty (EXCLUDED) = 0
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/inputs/orthogroup_12_with_5_sequences.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Ac689_1/1_1.000_714
+ccgtccaaacgtgacgaatacgcggccgagctggccaaatacatcgacgtcgacgtctacggaaagtgcggcacgctgacgtgtccgaaggatgagaaggtcgactgcgaacagatgtgggccgaaacgtacaagtttcacttgtcctttgagaacacgatttgtcaagattacatcacg
+>Ap6163_1/1_1.000_569
+-------------------------------tggccaagtacatcgacgtagacgtctatggcaagtgcggca-----------------------------------------------------------------------------------------------------------
+>Pu6544_1/1_1.000_249
+------------------------------------------------------------------------------------------------------------------------------acgtacaagtttca----------------------------------------
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/inputs/orthogroup_14_with_4_sequences.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ap1491_1/1_1.000_963
+cgaagaaacatgacggagcaaatgacgcttcgcggtaccctccaagggcacggaggatgggtaacccaaattgctacaacgccacaatttcctgatatcattttgtcggcttctagagacaaatcgctcattctgtggcagctgactcgcgaggaatcgcgttacggcttccctcgcaaggccttgcgcggacatggacacttcgtgtctgacgtcgtcatgtcatcagatggacagttcgccctgtctggatcctgggatggaacccttcgtttgtgggatcttggcactggtcagacaactcgtaggtttgttggacacacgaaggacgtgctaagtgtggctttctcagctgataaccgtcagattgtgtcaggttcacgtgacaagaccatcaagttgtggaacactcttggggtgtgcaagtataccattcaggaagatgggcacacagagtgggtatcatgtgttcgattctcaccaaacacccagaatcccatcattgtgtcctgtggctgggacaaactggttaaggtgtggaatctgacaaactgcaagctaaaaacaaaccacttcggacactcaggttatctgaactgtgtcactgtgtcccctgatggatctttgtgcgcttctggtggaaaagatggccaggcaatgttatgggatttgaatgaaggcaagcatctgtacacattggatggtggtgatgtcatcaactcactgtgcttcagccccaacagatactggctttgtgctgcttctggaccaagcataaagatctgggatctggaaggcaaggttgttgtggatgagctgcgtccagaagtgatcagcaccagtgccagtgccgagccacctcagtgtatatccctggcttggtcagctgatggccagacactgtttgctggatacacagacaacctgattcgtgtgtggcaggtatctatggcagctacccga
+>Ac6688_1/1_1.000_963
+cgaagaaacatgacggagcaaatgacgcttcgcggtaccctccaagggcacggaggatgggtaactcaaattgctacaacgccacaatttcccgatattattttgtcggcttcaagagacaaatcgctcatcctgtggcagctgactcgtgaggaatcgcgctacggtttccctcgcaaggccttgcgtggacatggacatttcgtgtctgacgttgttatgtcatcagatggacagttcgctctgtctggatcctgggatggaacccttcgtttgtgggatcttggcactggtcagacaactcgtaggtttgtcggacacacaaaagatgtgctaagcgtggccttctcagctgataaccgccagattgtgtcaggttcacgtgacaagaccatcaagttgtggaacactctcggtgtatgcaagtacaccattcaggaagatggacacacagagtgggtatcatgtgttcgcttctcaccaaacactcagaatcccatcattgtgtcttgtggctgggacaaactggttaaggtttggaatctgacaaactgcaaactaaaaacaaaccactttggacactcaggttacctgaactgtgtcaccgtgtcccctgatggatctttgtgtgcttctggtggtaaggatggccaggcaatgttgtgggatttgaatgaaggcaagcatctgtacacattggatggtggtgatgtcatcaactcactgtgcttcagccccaacagatattggctttgtgctgcctctggaccaagcataaagatctgggatctggaaggcaaggttgttgtggatgagttgcgtccagaagtgatcagtaccagtgccagcgctgaaccaccccagtgtatatccctggcatggtcagctgatggccagacgctgtttgcaggatacacagacaacctgatccgtgtctggcaggtgtccatggcagctacccga
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/inputs/orthogroup_1_with_4_sequences.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Ac3644_1/1_1.000_1626
+attggcacagcatcatttgaaagccttgttgagttagttaaatattacaagaagaacccactttacagaaaaatgaaactcagatatgctgttaatgaggaagttgttcaacaacaaggaatggatccagatgaacaggcaatttacagtggagaaatgtacacaaatccaaatgattttgtatctaagattaaagtgagggctttgtatgactacaagaaacaacgtgaagatgaactg
+>Ap2303_1/1_1.000_424
+attggcacagcatcctttgagagccttgttgagttagttaagtattacaagaagaacccactttacagaaaaatgaaactcagatatgcggttaatgaggaagttgtccagcaacaaggaatggatccagatgaacaggcaatatacagtggagaaatgtacacaaatccaaatgattttgtatctaagattaaag--------------------------------------------
+>Am7472_1/1_1.000_254
+attggtaccgcatcatttgagagtctggtagagctagtggaatactacaagaaaaaccc-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/inputs/orthogroup_6_with_4_sequences.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ac1013_1/1_1.000_525
+ttgaccttcaaggagctgaaaaaagccctcaaggccaaaggctacaaggtcaagggcaaacaactcaaggctcaattcaaacagtttgataaagatggcgataagaaaataacccttcaagaatacttgatcgcgatgggccaagtcccggatgcctaccacaaagaggcagccatgcggcaggctttcgagcgggcggacaaaaacaaagacggaagcttggacatcggcgaggttaacgccattttccaagagatgaacaccttccttgatccagacgagctcttcaagatcgtccacgccatcgacaaggaccacagcggacggatcaactacgacgaattcttgaccttcttcatgaagcagcaaaatgtcaactttgagagcagcgacagcgactgggac
+>Ap5072_1/1_1.000_437
+ttgaccttcaaggagctgaaaaaggccctgaaggccaagggctacaaggtcaagggcaaacagctcaaggcccaattcaaacagtttgataaagacggcgacaagaaaatatcccttcaagaatacctgatcgcgatgggtcaagtcccggatgtctaccacaaagaggccgccatgcggcaggctttcgagcgggcggacaaaaacaaagacggaagcttggacatcggcgagatcaacgccatcttccgggagatgaacaccttcctcaatccagacgagctctttcagatcgtccacgaaatcgacaaggaccacagcggacggatcaactacaacgaattcctgaccttcttcatgaagcagcaaaatgtcaacttcgagagcagtgacagcgattggga-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/inputs/orthogroup_7_with_3_sequences.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Ac2173_1/1_1.000_330
+gcttggagaaggtcagaagcgttgaaaatgttgcagagagctcttcgtcttaaccagcttactcgtcgattttccacaagtgtggttcggcgaagtgaagaatggcaacaaaggggtcttcccggatctaacatgcctttcgatatgaacaaccgatacaagttgatggcttggttcatcctcttttttggttctggcttgggagtgccatatctcttagtccgccaccagcttctgaaggag
+>Ap5050_1/1_1.000_243
+gcttggggaaggtcagaagctgtgaaaatgttgcagagagctcttcgtcttaatcaacttactcgtcgattttccacaagtgtggttcgacggagtgaagaatggcaacagaggggtcttcccggatctaacatgcctttcgacatgaacaaccgatacaaattgatggcgtggttcatcctcttttttggttctggcttgggagtgccatatctcttagtccgccaccagcttctgaaggag
+>Am3527_1/1_1.000_270
+-------------------------------------------------------------ctcgtcgtttttccacaagtgtggtcagacaaagccaagaatggcaacagcttggagtacctggatcgaacatgccatttgacatcaacaacagatacaa----------------------------------------------------------------------------------
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/inputs/orthogroup_8_with_4_sequences.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Am6963_1/1_1.000_854
+gataagtcgtcaggagtacattatggcatcataacctgtgagggctgcaagggatttttc
+>Pg7693_1/1_1.000_511
+---------------------------------acctgtgagggctg-------------
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+?????????????????????????????????????????????????????????????VFLVVFH*LYQTLK*CGTN
+>Ap2303_1/1_1.000_424
+???????????????FNLRYKIIWICVHFSTVYCLFIWIHSLLLDNFLINRISEFHFSVKWVLLVILN*LNKALKGCCAN
+>Ac3644_1/1_1.000_1626
+QFIFTLFLVVIQSPHFNLRYKIIWICVHFSTVNCLFIWIHSLLLNNFLINSISEFHFSVKWVLLVIFN*LNKAFK*CCAN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_aa/test1/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+????????????????????????????VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT?????????????????????
+>Ap5050_1/1_1.000_243
+LLQKLVAD*EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF*PSPS
+>Ac2173_1/1_1.000_330
+LLQKLVAD*EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF*PSPS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_14_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ac6688_1/1_1.000_963
+RVAAMDTCQTRIRLSVYPANSVWPSADHARDIHWGGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR*PECPKWFVFSLQFVRFQTLTSLSQPQDTMMGF*VFGEKRTHDTHSVCPSS*MVYLHTPRVFHNLMVLSREPDTIWRLSAEKATLSTSFVCPTNLRVV*PVPRSHKRRVPSQDPDRANCPSDDITTSDTKCPCPRKALRGKP*RDSSRVSCHRMSDLSLEADKIISGNCGVVAI*VTHPPCPWRVPRSVICSVMFL
+>Ap1491_1/1_1.000_963
+RVAAIDTCHTRIRLSVYPANSVWPSADQARDIH*GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR*PECPKWFVFSLQFVRFHTLTSLSQPQDTMMGFWVFGENRTHDTHSVCPSS*MVYLHTPRVFHNLMVLSREPDTI*RLSAEKATLSTSFVCPTNLRVV*PVPRSHKRRVPSQDPDRANCPSDDMTTSDTKCPCPRKALRGKP*RDSSRVSCHRMSDLSLEADKMISGNCGVVAIWVTHPPCPWRVPRSVICSVMFL
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+?????????????????????????????????????????????????????????????VFLVVFH*LYQTLK*CGTN
+>Ap2303_1/1_1.000_424
+???????????????FNLRYKIIWICVHFSTVYCLFIWIHSLLLDNFLINRISEFHFSVKWVLLVILN*LNKALKGCCAN
+>Ac3644_1/1_1.000_1626
+QFIFTLFLVVIQSPHFNLRYKIIWICVHFSTVNCLFIWIHSLLLNNFLINSISEFHFSVKWVLLVIFN*LNKAFK*CCAN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_6_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ap5072_1/1_1.000_437
+PNRCHCSRS*HFAAS*RRSGIRCS*SVRCGPCRFRGRSERARLD*GRCSSPGRWR*SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV*IGP*AVCP*PCSPWPSGPFSAP*RS
+>Ac1013_1/1_1.000_525
+PSRCRCSQS*HFAAS*RRSRIRRS*SVRCGPCRWRGRS*RARLDQGRCSSLGKWR*PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV*IEP*VVCP*PCSLWP*GLFSAP*RS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_aa/test2/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+????????????????????????????VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT?????????????????????
+>Ap5050_1/1_1.000_243
+LLQKLVAD*EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF*PSPS
+>Ac2173_1/1_1.000_330
+LLQKLVAD*EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF*PSPS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------gggtttttcttgtagtattccactagctctaccagactctcaaatgatgcggtaccaat
+>Ap2303_1/1_1.000_424
+--------------------------------------------ctttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtatattgcctgttcatctggatccattccttgttgctggacaacttcctcattaaccgcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatacttaactaactcaacaaggctctcaaaggatgctgtgccaat
+>Ac3644_1/1_1.000_1626
+cagttcatcttcacgttgtttcttgtagtcatacaaagccctcactttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtaaattgcctgttcatctggatccattccttgttgttgaacaacttcctcattaacagcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatatttaactaactcaacaaggctttcaaatgatgctgtgccaat
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test1/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+----------------------------------------------------------------------------------ttgtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacgag-------------------------------------------------------------
+>Ap5050_1/1_1.000_243
+ctccttcagaagctggtggcggactaagagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttctgaccttccccaagc
+>Ac2173_1/1_1.000_330
+ctccttcagaagctggtggcggactaagagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttctgaccttctccaagc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_14_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ac6688_1/1_1.000_963
+cgggtagctgccatggacacctgccagacacggatcaggttgtctgtgtatcctgcaaacagcgtctggccatcagctgaccatgccagggatatacactggggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcaggtaacctgagtgtccaaagtggtttgtttttagtttgcagtttgtcagattccaaaccttaaccagtttgtcccagccacaagacacaatgatgggattctgagtgtttggtgagaagcgaacacatgatacccactctgtgtgtccatcttcctgaatggtgtacttgcatacaccgagagtgttccacaacttgatggtcttgtcacgtgaacctgacacaatctggcggttatcagctgagaaggccacgcttagcacatcttttgtgtgtccgacaaacctacgagttgtctgaccagtgccaagatcccacaaacgaagggttccatcccaggatccagacagagcgaactgtccatctgatgacataacaacgtcagacacgaaatgtccatgtccacgcaaggccttgcgagggaaaccgtagcgcgattcctcacgagtcagctgccacaggatgagcgatttgtctcttgaagccgacaaaataatatcgggaaattgtggcgttgtagcaatttgagttacccatcctccgtgcccttggagggtaccgcgaagcgtcatttgctccgtcatgtttctt
+>Ap1491_1/1_1.000_963
+cgggtagctgccatagatacctgccacacacgaatcaggttgtctgtgtatccagcaaacagtgtctggccatcagctgaccaagccagggatatacactgaggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcagataacctgagtgtccgaagtggtttgtttttagcttgcagtttgtcagattccacaccttaaccagtttgtcccagccacaggacacaatgatgggattctgggtgtttggtgagaatcgaacacatgatacccactctgtgtgcccatcttcctgaatggtatacttgcacaccccaagagtgttccacaacttgatggtcttgtcacgtgaacctgacacaatctgacggttatcagctgagaaagccacacttagcacgtccttcgtgtgtccaacaaacctacgagttgtctgaccagtgccaagatcccacaaacgaagggttccatcccaggatccagacagggcgaactgtccatctgatgacatgacgacgtcagacacgaagtgtccatgtccgcgcaaggccttgcgagggaagccgtaacgcgattcctcgcgagtcagctgccacagaatgagcgatttgtctctagaagccgacaaaatgatatcaggaaattgtggcgttgtagcaatttgggttacccatcctccgtgcccttggagggtaccgcgaagcgtcatttgctccgtcatgtttctt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------gggtttttcttgtagtattccactagctctaccagactctcaaatgatgcggtaccaat
+>Ap2303_1/1_1.000_424
+--------------------------------------------ctttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtatattgcctgttcatctggatccattccttgttgctggacaacttcctcattaaccgcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatacttaactaactcaacaaggctctcaaaggatgctgtgccaat
+>Ac3644_1/1_1.000_1626
+cagttcatcttcacgttgtttcttgtagtcatacaaagccctcactttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtaaattgcctgttcatctggatccattccttgttgttgaacaacttcctcattaacagcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatatttaactaactcaacaaggctttcaaatgatgctgtgccaat
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_6_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ap5072_1/1_1.000_437
+cccaatcgctgtcactgctctcgaagttgacattttgctgcttcatgaagaaggtcaggaattcgttgtagttgatccgtccgctgtggtccttgtcgatttcgtggacgatctgaaagagctcgtctggattgaggaaggtgttcatctcccggaagatggcgttgatctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtttgaattgggccttgagctgtttgcccttgaccttgtagcccttggccttcagggcctttttcagctccttgaaggtca
+>Ac1013_1/1_1.000_525
+cccagtcgctgtcgctgctctcaaagttgacattttgctgcttcatgaagaaggtcaagaattcgtcgtagttgatccgtccgctgtggtccttgtcgatggcgtggacgatcttgaagagctcgtctggatcaaggaaggtgttcatctcttggaaaatggcgttaacctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtttgaattgagccttgagttgtttgcccttgaccttgtagcctttggccttgagggcttttttcagctccttgaaggtca
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_04_Best_ORF_nuc/test2/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+----------------------------------------------------------------------------------ttgtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacgag-------------------------------------------------------------
+>Ap5050_1/1_1.000_243
+ctccttcagaagctggtggcggactaagagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttctgaccttccccaagc
+>Ac2173_1/1_1.000_330
+ctccttcagaagctggtggcggactaagagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttctgaccttctccaagc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+?????????????????????????????????????????????????????????????VFLVVFH
+>Ap2303_1/1_1.000_424
+???????????????FNLRYKIIWICVHFSTVYCLFIWIHSLLLDNFLINRISEFHFSVKWVLLVILN
+>Ac3644_1/1_1.000_1626
+QFIFTLFLVVIQSPHFNLRYKIIWICVHFSTVNCLFIWIHSLLLNNFLINSISEFHFSVKWVLLVIFN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_aa/test1/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+???????????????????VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT????????????????
+>Ap5050_1/1_1.000_243
+EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF
+>Ac2173_1/1_1.000_330
+EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_aa/test2/orthogroup_14_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ac6688_1/1_1.000_963
+GGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR
+>Ap1491_1/1_1.000_963
+GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_aa/test2/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+?????????????????????????????????????????????????????????????VFLVVFH
+>Ap2303_1/1_1.000_424
+???????????????FNLRYKIIWICVHFSTVYCLFIWIHSLLLDNFLINRISEFHFSVKWVLLVILN
+>Ac3644_1/1_1.000_1626
+QFIFTLFLVVIQSPHFNLRYKIIWICVHFSTVNCLFIWIHSLLLNNFLINSISEFHFSVKWVLLVIFN
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_aa/test2/orthogroup_6_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ap5072_1/1_1.000_437
+SRRCPSFRLCFCPPARKPAAWRPLCGRHPGLDPSRSGILEGIFSCRRLYQTV
+>Ac1013_1/1_1.000_525
+PRRCPSFRLCFCPPARKPAAWLPLCGRHPGLGPSRSSILEGLFSYRHLYQTV
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_aa/test2/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+???????????????????VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT????????????????
+>Ap5050_1/1_1.000_243
+EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF
+>Ac2173_1/1_1.000_330
+EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------gggtttttcttgtagtattccac
+>Ap2303_1/1_1.000_424
+--------------------------------------------ctttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtatattgcctgttcatctggatccattccttgttgctggacaacttcctcattaaccgcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatacttaac
+>Ac3644_1/1_1.000_1626
+cagttcatcttcacgttgtttcttgtagtcatacaaagccctcactttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtaaattgcctgttcatctggatccattccttgttgttgaacaacttcctcattaacagcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatatttaac
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_nuc/test1/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+-------------------------------------------------------ttgtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacgag----------------------------------------------
+>Ap5050_1/1_1.000_243
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttc
+>Ac2173_1/1_1.000_330
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_nuc/test2/orthogroup_14_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ac6688_1/1_1.000_963
+ggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcagg
+>Ap1491_1/1_1.000_963
+ggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcaga
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_nuc/test2/orthogroup_1_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am7472_1/1_1.000_254
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------gggtttttcttgtagtattccac
+>Ap2303_1/1_1.000_424
+--------------------------------------------ctttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtatattgcctgttcatctggatccattccttgttgctggacaacttcctcattaaccgcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatacttaac
+>Ac3644_1/1_1.000_1626
+cagttcatcttcacgttgtttcttgtagtcatacaaagccctcactttaatcttagatacaaaatcatttggatttgtgtacatttctccactgtaaattgcctgttcatctggatccattccttgttgttgaacaacttcctcattaacagcatatctgagtttcatttttctgtaaagtgggttcttcttgtaatatttaac
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_nuc/test2/orthogroup_6_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ap5072_1/1_1.000_437
+tctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggcggcctctttgtggtagacatccgggacttgacccatcgcgatcaggtattcttgaagggatattttcttgtcgccgtctttatcaaactgtt
+>Ac1013_1/1_1.000_525
+cctcgccgatgtccaagcttccgtctttgtttttgtccgcccgctcgaaagcctgccgcatggctgcctctttgtggtaggcatccgggacttggcccatcgcgatcaagtattcttgaagggttattttcttatcgccatctttatcaaactgtt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_05_CDS_nuc/test2/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+-------------------------------------------------------ttgtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacgag----------------------------------------------
+>Ap5050_1/1_1.000_243
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttc
+>Ac2173_1/1_1.000_330
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/old_orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+-------------------VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT----------------
+>Ap5050_1/1_1.000_243
+EIWHSQARTKKEDEPRHQFVSVVHVERHVRSGKTPLLPFFTPSNHTCGKSTSKLIKTKSSLQHFHSF
+>Ac2173_1/1_1.000_330
+EIWHSQARTKKEDEPSHQLVSVVHIERHVRSGKTPLLPFFTSPNHTCGKSTSKLVKTKSSLQHFQRF
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test1/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+VSVVDVKWHVRSRYSKLLPFLALSDHTCGKTT
+>Ap5050_1/1_1.000_243
+VSVVHVERHVRSGKTPLLPFFTPSNHTCGKST
+>Ac2173_1/1_1.000_330
+VSVVHIERHVRSGKTPLLPFFTSPNHTCGKST
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_aa/test2/orthogroup_14_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ac6688_1/1_1.000_963
+GGSALALVLITSGRNSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSLPPEAHKDPSGDTVTQFR
+>Ap1491_1/1_1.000_963
+GGSALALVLITSGRSSSTTTLPSRSQIFMLGPEAAQSQYLLGLKHSELMTSPPSNVYRCLPSFKSHNIAWPSFPPEAHKDPSGDTVTQFR
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/old_orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+---------------------------------------------------------gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg------------------------------------------------
+>Ap5050_1/1_1.000_243
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccacgccatcaatttgtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacgagtaagttgattaagacgaagagctctctgcaacattttcacagcttc
+>Ac2173_1/1_1.000_330
+gagatatggcactcccaagccagaaccaaaaaagaggatgaaccaagccatcaacttgtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacgagtaagctggttaagacgaagagctctctgcaacattttcaacgcttc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test1/orthogroup_7_with_3_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,6 @@
+>Am3527_1/1_1.000_270
+gtatctgttgttgatgtcaaatggcatgttcgatccaggtactccaagctgttgccattcttggctttgtctgaccacacttgtggaaaaacgacg
+>Ap5050_1/1_1.000_243
+gtatcggttgttcatgtcgaaaggcatgttagatccgggaagacccctctgttgccattcttcactccgtcgaaccacacttgtggaaaatcgacg
+>Ac2173_1/1_1.000_330
+gtatcggttgttcatatcgaaaggcatgttagatccgggaagacccctttgttgccattcttcacttcgccgaaccacacttgtggaaaatcgacg
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outputs_ORF_Search_08_CDS_without_indel_nuc/test2/orthogroup_14_with_2_species.fasta	Fri Feb 01 10:26:37 2019 -0500
@@ -0,0 +1,4 @@
+>Ac6688_1/1_1.000_963
+ggtggttcagcgctggcactggtactgatcacttctggacgcaactcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaggcagcacaaagccaatatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccacaacattgcctggccatccttaccaccagaagcacacaaagatccatcaggggacacggtgacacagttcagg
+>Ap1491_1/1_1.000_963
+ggtggctcggcactggcactggtgctgatcacttctggacgcagctcatccacaacaaccttgccttccagatcccagatctttatgcttggtccagaagcagcacaaagccagtatctgttggggctgaagcacagtgagttgatgacatcaccaccatccaatgtgtacagatgcttgccttcattcaaatcccataacattgcctggccatcttttccaccagaagcgcacaaagatccatcaggggacacagtgacacagttcaga