Mercurial > repos > itaxotools > mold
annotate MolD_v1.4.py @ 0:4e8e2f836d0f draft default tip
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
| author | itaxotools | 
|---|---|
| date | Sun, 29 Jan 2023 16:25:48 +0000 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 0 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 1 """ | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 2 This script compiles rDNC-based DNA diagnoses for a pre-defined taxa in a dataset. This is the MAIN WORKING VERSION v1.4 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 3 This version already implements the new functionalities: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 4 - automatic trimming of the alignment to match the median sequence length (should seq len distribution and provided NumberN settings require that) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 5 - expanded qCLADEs setting | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 6 - diagnoses in pairwise comparisons of taxa. | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 7 - selection of a reference sequence for indexing DNCs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 8 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 9 THIS version LACKS SPART compatibility | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 10 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 11 """ | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 12 import os, sys | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 13 import random | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 14 import argparse | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 15 from io import StringIO | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 16 ######################################################################## FUNCTIONS | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 17 #***STEP 1 - SORTING ENTRIES BY CLADE AND IDENTIFYING NUCLEOTIDE POSITIONS SHARED WITHIN CLADE | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 18 def Step1(raw_records): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 19 Clades=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 20 for i in range(len(raw_records)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 21 Clade=raw_records[i][1] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 22 if Clade not in Clades: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 23 Clades.append(Clade) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 24 clade_sorted_seqs = {} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 25 for letter in Clades: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 26 clade_sorted_seqs[letter]=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 27 for i in range(len(raw_records)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 28 if raw_records[i][1]==letter: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 29 clade_sorted_seqs[letter].append(raw_records[i][2]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 30 shared_positions={} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 31 for key in clade_sorted_seqs: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 32 sh_pos=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 33 for i in range(len(clade_sorted_seqs[key][0])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 34 shared_nucleotide = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 35 csm = clade_sorted_seqs[key][0][i] #candidate shared nucleotide | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 36 for j in range(1, len(clade_sorted_seqs[key])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 37 if clade_sorted_seqs[key][j][i] != csm: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 38 shared_nucleotide = False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 39 break | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 40 if shared_nucleotide == True and csm != 'N': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 41 sh_pos.append(i) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 42 shared_positions[key]=sh_pos | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 43 return Clades, clade_sorted_seqs, shared_positions | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 44 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 45 #***STEP 2 COMPILING COMPARISON LISTS FOR CLADES AND IDENTIFYING VARIABLE POSITIONS AND N PRIORITY POSITIONS WITH LARGEST CUTOFFS | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 46 def C_VP_PP(clade_sorted_seqs, clade, shared_positions, CUTOFF):# complist_variable_positions_priority_positions; Arguments: dictionary, string, dictionary | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 47 CShN={}#a dictionary keys - clade shared positions, values - nucleotides at those positions | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 48 for pos in shared_positions[clade]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 49 CShN[pos] = clade_sorted_seqs[clade][0][pos]#creates a dictionary shared position : nucleotide | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 50 complist=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 51 for key in clade_sorted_seqs: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 52 if key != clade: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 53 complist = complist + clade_sorted_seqs[key]#creates a list of all other sequences for comparison | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 54 cutoffs = {} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 55 pures = []####! newline | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 56 for key in CShN: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 57 newcomplist = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 58 for k in complist: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 59 if k[key] == CShN[key]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 60 newcomplist.append(k) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 61 else: continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 62 cutoffs[key] = len(complist) - len(newcomplist) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 63 if len(newcomplist) == 0:####! newline | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 64 pures.append(key)####! newline | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 65 CPP = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 66 for key in sorted(cutoffs, key = cutoffs.get, reverse = True): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 67 CPP.append(key) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 68 if CUTOFF[0] == '>':#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 69 Clade_priority_positions = {pos:CShN[pos] for pos in CPP if cutoffs[pos] > int(CUTOFF[1:])}#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 70 else:#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 71 Clade_priority_positions = {} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 72 for position in CPP[:int(CUTOFF)]:#Here you define how many of the clade shared combinations are used in subsequent search | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 73 Clade_priority_positions[position] = CShN[position] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 74 return complist, Clade_priority_positions, cutoffs, pures####! pures added | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 75 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 76 #***STEPS 3 RANDOM SEARCH ACROSS PRIORITY POSITIONS TO FIND RAW DIAGNOSTIC COMBINATIONS AND TO SUBSEQUENTLY REFINE THEM | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 77 def random_position(somelist, checklist):#gives a random index (integer) of the specified range, and returns indexed somelist element if it is not present in the checklist | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 78 while True: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 79 i = random.randint(0, len(somelist) - 1) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 80 if somelist[i] not in checklist: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 81 return somelist[i] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 82 break | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 83 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 84 continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 85 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 86 def step_reduction_complist(clade, complist, CPP, checked_ind):#checks randomly selected positions of CladeSharedNucleotides with sequences of other clades, until a diagnostic combination of nucleotides for a selected clade is found. | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 87 if len(complist) == 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 88 return checked_ind | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 89 elif len(checked_ind) == len(CPP): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 90 return checked_ind | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 91 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 92 newcomplist = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 93 pos = random_position(list(CPP.keys()), checked_ind) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 94 for j in complist: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 95 if j[pos] == CPP[pos] or j[pos] == 'N':#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 96 newcomplist.append(j) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 97 else: continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 98 new_checked_ind = checked_ind + [pos] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 99 return step_reduction_complist(clade, newcomplist, CPP, new_checked_ind) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 100 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 101 def ConditionD(newcomb, complist, CPP):#The function checks the 'Condition D' - i.e. whither any given combination of nucleotide positions is diagnostic for the selected clade | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 102 ContD = False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 103 for i in newcomb: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 104 newcomplist = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 105 for m in complist: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 106 if m[i] == CPP[i]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 107 newcomplist.append(m) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 108 else: continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 109 complist = newcomplist | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 110 if len(complist) == 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 111 ContD = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 112 return ContD | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 113 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 114 def RemoveRedundantPositions(raw_comb, complist, CPP):# The function removes positions from the raw combinations one by one, and then checks whether new combination fulfills the condition D, thus recursively reducing the diagnostic combination. | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 115 red_possible = False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 116 for j in raw_comb: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 117 newcomb = [k for k in raw_comb if k != j] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 118 if ConditionD(newcomb, complist, CPP) == True: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 119 red_possible = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 120 return RemoveRedundantPositions(newcomb, complist, CPP) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 121 else: pass | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 122 if red_possible == False: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 123 return raw_comb | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 124 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 125 #PUTS EVERYTHING TOGETHER - 20000 ROUNDS OF RANDOM SEARCH FOLLOWED BY REFINING OF 500 SHORTEST COMBINATIONS | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 126 def Diagnostic_combinations(qCLADE, complist, CPP, n1, maxlen1, maxlen2): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 127 Achecked_ind = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 128 bestlists = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 129 n = n1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 130 while n>0:#STEP3 proposes raw diagnostic combinations | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 131 raw_comb = step_reduction_complist(qCLADE, complist, CPP, Achecked_ind) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 132 if len(raw_comb) <= maxlen1: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 133 refined_comb = RemoveRedundantPositions(raw_comb, complist, CPP) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 134 if len(refined_comb) <= maxlen2 and sorted(refined_comb) not in bestlists: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 135 bestlists.append(sorted(refined_comb)) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 136 n=n-1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 137 bestlists.sort(key=len) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 138 return bestlists | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 139 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 140 #***STEP 4 ANALYSIS OF OUTPUT rDNCs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 141 def IndependentKey(diagnostic_combinations):#PRESENTLY NOT INVOLVED - returns independent diagnostic nucleotide combinations, and identifies key nucleotide positions | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 142 independent_combinations = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 143 selected_positions = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 144 for i in range(len(diagnostic_combinations)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 145 if len(selected_positions) == 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 146 for j in range(0, i): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 147 if len(set(diagnostic_combinations[i]) & set(diagnostic_combinations[j])) == 0 and len(set(diagnostic_combinations[i]) & set(selected_positions)) == 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 148 independent_combinations.append(diagnostic_combinations[i]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 149 independent_combinations.append(diagnostic_combinations[j]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 150 for k in range(len(diagnostic_combinations[i])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 151 selected_positions.append(diagnostic_combinations[i][k]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 152 for l in range(len(diagnostic_combinations[j])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 153 selected_positions.append(diagnostic_combinations[j][l]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 154 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 155 if len(set(diagnostic_combinations[i]) & set(selected_positions)) == 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 156 independent_combinations.append(diagnostic_combinations[i]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 157 for k in range(len(diagnostic_combinations[i])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 158 selected_positions.append(diagnostic_combinations[i][k]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 159 independent_combinations.sort(key=len) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 160 key_positions = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 161 for pos in diagnostic_combinations[0]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 162 KP = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 163 for combination in diagnostic_combinations[1:]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 164 if pos not in combination: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 165 KP = False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 166 break | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 167 else: continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 168 if KP == True: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 169 key_positions.append(pos) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 170 return independent_combinations, key_positions | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 171 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 172 #SPECIFIC FUNCTIONS FOR THE rDNCs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 173 def PositionArrays(Motifs):#VERYNEW ALL FUNCTION NEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 174 PositionArrays = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 175 VarPosList = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 176 for i in range(len(Motifs[0])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 177 Const = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 178 array = [Motifs[0][i]] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 179 for j in range(len(Motifs[1:])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 180 if Motifs[j][i] != 'N': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 181 if Motifs[j][i] != array[-1]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 182 Const = False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 183 array.append(Motifs[j][i]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 184 PositionArrays.append(array) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 185 if Const == False: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 186 VarPosList.append(i) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 187 return PositionArrays, VarPosList | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 188 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 189 def random_sequence_new(SEQ, PositionArrays, VarPosList, Pdiff):#VERYNEW FUNCTION REVISED | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 190 #print(["ROUND", len(SEQ)*Pdiff/100, round(len(SEQ)*Pdiff/100)]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 191 n = round(len(SEQ)*Pdiff/100) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 192 N = random.sample(list(range(1, n)), 1)[0] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 193 PosToChange = random.sample([p for p in VarPosList if SEQ[p] != 'D'], N)#this is a new definition to keep alignment gaps unchanged | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 194 NEWSEQ = '' | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 195 for i in range(len(SEQ)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 196 if i not in PosToChange: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 197 NEWSEQ = NEWSEQ + SEQ[i] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 198 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 199 newarray = [j for j in PositionArrays[i] if j != SEQ[i]] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 200 newbase = random.sample(newarray, 1)[0] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 201 NEWSEQ = NEWSEQ + newbase | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 202 return NEWSEQ | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 203 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 204 def GenerateBarcode_new(Diagnostic_combinations, length):#VERYNEW FUNCTION REVISED - This function calculates diagnostic combinations and assembles a barcode of desired length for a query taxon. First all single position DNCs are added, then based on the frequency of a nucleotide position in the DNCs of the 2 positions, and then based on the frequency of a position in longer DNCs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 205 len1 = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 206 len2 = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 207 lenmore = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 208 for comb in Diagnostic_combinations: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 209 if len(comb) == len(Diagnostic_combinations[0]): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 210 for i in comb: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 211 len1.append(i) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 212 elif len(comb) == len(Diagnostic_combinations[0])+1: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 213 for j in comb: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 214 len2.append(j) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 215 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 216 for k in comb: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 217 lenmore.append(k) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 218 if len(Diagnostic_combinations[0]) == 1: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 219 Setin = len1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 220 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 221 Setin = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 222 for pos in sorted(len1, key=len1.count, reverse = True): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 223 if not pos in Setin: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 224 Setin.append(pos) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 225 for pos1 in sorted(len2, key=len2.count, reverse = True): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 226 if not pos1 in Setin: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 227 Setin.append(pos1) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 228 for pos2 in sorted(lenmore, key=lenmore.count, reverse = True): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 229 if not pos2 in Setin: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 230 Setin.append(pos2) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 231 return Setin[:length] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 232 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 233 def Screwed_dataset_new(raw_records, nseq_per_clade_to_screw, PositionArrays, VarPosList, Percent_difference, Taxon, Cutoff):#VERYNEW FUNCTION REVISED | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 234 clades=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 235 for i in range(len(raw_records)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 236 Clade=raw_records[i][1] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 237 if Clade not in clades: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 238 clades.append(Clade) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 239 clade_sorted_seqs = {} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 240 for letter in clades: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 241 clade_sorted_seqs[letter]=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 242 for i in range(len(raw_records)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 243 if raw_records[i][1]==letter: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 244 clade_sorted_seqs[letter].append(raw_records[i][2]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 245 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 246 for clade in clades: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 247 seqlist = clade_sorted_seqs[clade] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 248 newseqs = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 249 if len(seqlist) > nseq_per_clade_to_screw: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 250 iSTS = random.sample(list(range(len(seqlist))), nseq_per_clade_to_screw) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 251 for k in range(len(seqlist)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 252 if k in iSTS: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 253 newseq = random_sequence_new(seqlist[k], PositionArrays, VarPosList, Percent_difference) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 254 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 255 newseq = seqlist[k] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 256 newseqs.append(newseq) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 257 elif len(clade_sorted_seqs[clade]) == nseq_per_clade_to_screw: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 258 for k in range(len(seqlist)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 259 newseq = random_sequence_new(seqlist[k], PositionArrays, VarPosList, Percent_difference) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 260 newseqs.append(newseq) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 261 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 262 for i in range(nseq_per_clade_to_screw): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 263 seq = random.sample(seqlist, 1)[0] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 264 newseq = random_sequence_new(seq, PositionArrays, VarPosList, Percent_difference) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 265 newseqs.append(newseq) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 266 clade_sorted_seqs[clade] = newseqs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 267 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 268 shared_positions={} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 269 for key in clade_sorted_seqs: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 270 sh_pos=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 271 for i in range(len(clade_sorted_seqs[key][0])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 272 shared_nucleotide = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 273 csm = clade_sorted_seqs[key][0][i] #candidate shared nucleotide | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 274 for j in range(1, len(clade_sorted_seqs[key])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 275 if clade_sorted_seqs[key][j][i] != csm: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 276 shared_nucleotide = False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 277 break | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 278 if shared_nucleotide == True and csm != 'N': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 279 sh_pos.append(i) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 280 shared_positions[key]=sh_pos | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 281 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 282 x,y,z,pures = C_VP_PP(clade_sorted_seqs, Taxon, shared_positions, Cutoff)#STEP2#### | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 283 return x, y | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 284 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 285 #NEWFUNCYIONS | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 286 def medianSeqLen(listofseqs):#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 287 seqlens = [i.count('A')+i.count('C')+i.count('G')+i.count('T') for i in listofseqs] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 288 medlen = sorted(seqlens)[int(len(seqlens)/2)] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 289 medseq = listofseqs[seqlens.index(medlen)] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 290 start = min([medseq.find('A'),medseq.find('C'),medseq.find('G'),medseq.count('T')]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 291 if not 'N' in medseq[start:]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 292 end = len(medseq) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 293 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 294 for i in range(start, len(medseq), 1): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 295 if medseq[i:].count('N') == len(medseq[i:]): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 296 end = i | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 297 break | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 298 return medlen, start, end | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 299 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 300 def getAllPairs(taxalist): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 301 uniquetaxapairs = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 302 stl = sorted(list(set(taxalist))) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 303 for i in range(len(stl)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 304 for j in range(i+1, len(stl)): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 305 uniquetaxapairs.append([stl[i],stl[j]]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 306 return uniquetaxapairs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 307 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 308 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 309 ################################################READ IN PARAMETER FILE AND DATA FILE | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 310 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 311 def get_args(): #arguments needed to give to this script | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 312 parser = argparse.ArgumentParser(description="run MolD") | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 313 required = parser.add_argument_group("required arguments") | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 314 required.add_argument("-i", help="textfile with parameters of the analysis", required=True) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 315 return parser.parse_args() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 316 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 317 def mainprocessing(gapsaschars=None, taxalist=None, taxonrank=None, cutoff=None, numnucl=None, numiter=None, maxlenraw=None, maxlenrefined=None, iref=None, pdiff=None, nmax=None, thresh=None, tmpfname=None, origfname=None): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 318 ParDict = {} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 319 if not(all([gapsaschars, taxalist, taxonrank, cutoff, numnucl, numiter, maxlenraw, maxlenrefined, iref, pdiff, nmax, thresh, tmpfname])): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 320 args = get_args() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 321 with open(args.i) as params: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 322 for line in params: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 323 line = line.strip() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 324 if line.startswith('#'): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 325 pass | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 326 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 327 if len(line.split('=')) == 2 and len(line.split('=')[1]) != 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 328 ParDict[line.split('=')[0]] = line.split('=')[1].replace(' ', '')#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 329 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 330 ParDict['Gaps_as_chars'] = gapsaschars | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 331 ParDict['qTAXA'] = taxalist | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 332 ParDict['Taxon_rank'] = taxonrank | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 333 ParDict['INPUT_FILE'] = tmpfname | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 334 ParDict['ORIG_FNAME'] = origfname# I do not understand this ORIG_FNAME, it throws an error with the command line | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 335 ParDict['Cutoff'] = cutoff | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 336 ParDict['NumberN'] = numnucl | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 337 ParDict['Number_of_iterations'] = numiter | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 338 ParDict['MaxLen1'] = maxlenraw | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 339 ParDict['MaxLen2'] = maxlenrefined | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 340 ParDict['Iref'] = iref | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 341 ParDict['Pdiff'] = pdiff | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 342 #ParDict['PrSeq'] = prseq | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 343 ParDict['NMaxSeq'] = nmax | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 344 ParDict['Scoring'] = thresh | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 345 ParDict['OUTPUT_FILE'] = "str" | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 346 print(ParDict) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 347 ############################################# #VERYNEW HOW GAPS ARE TREATED | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 348 #REQUIRES A NEW FIELD IN THE GUI | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 349 if ParDict['Gaps_as_chars'] == 'yes': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 350 gaps2D = True#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 351 else:#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 352 gaps2D = False#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 353 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 354 ############################################ DATA FILE | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 355 checkread = open(ParDict['INPUT_FILE'], 'r') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 356 firstline = checkread.readline() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 357 checkread.close() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 358 imported=[]#set up a new list with species and identifiers | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 359 if '>' in firstline: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 360 f = open(ParDict['INPUT_FILE'], 'r') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 361 for line in f:#VERYNEW - THE DATA READING FROM ALIGNMENT FILE IS ALL REVISED UNTIL f.close() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 362 line=line.rstrip() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 363 if line.startswith('>'): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 364 data = line[1:].split('|') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 365 if len(data) != 2: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 366 print('Check number of entries in', data[0]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 367 #break | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 368 data.append('') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 369 imported.append(data) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 370 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 371 if gaps2D == True:# | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 372 DNA = line.upper().replace('-', 'D').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 373 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 374 DNA = line.upper().replace('-', 'N').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 375 imported[-1][-1] = imported[-1][-1]+DNA | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 376 f.close() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 377 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 378 f = open(ParDict['INPUT_FILE'], 'r') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 379 for line in f:#VERYNEW - THE DATA READING FROM ALIGNMENT FILE IS ALL REVISED UNTIL f.close() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 380 line=line.rstrip() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 381 data = line.split('\t') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 382 if len(data) != 3: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 383 print('Check number of entries in', data[0]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 384 ID = data[0] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 385 thetax = data[1] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 386 if gaps2D == True:# | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 387 DNA = data[2].upper().replace('-', 'D').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 388 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 389 DNA = data[2].upper().replace('-', 'N').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 390 imported.append([ID, thetax, DNA]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 391 f.close() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 392 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 393 if 'NumberN' in list(ParDict.keys()):#How many ambiguously called nucleotides are allowed | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 394 NumberN = int(ParDict['NumberN']) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 395 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 396 NumberN = 5 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 397 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 398 if len(set([len(i[2]) for i in imported])) != 1: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 399 print('Alignment contains sequences of different lengths:', set([len(i[2]) for i in imported])) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 400 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 401 mlen, sstart, send = medianSeqLen([i[2] for i in imported])#OCT2022 - start | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 402 if mlen + NumberN < len(imported[0][2]): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 403 Slice = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 404 FragmentLen = mlen | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 405 corr = sstart | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 406 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 407 Slice = False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 408 FragmentLen = len(imported[0][2]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 409 sstart = 0 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 410 send = FragmentLen+1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 411 corr = 0 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 412 raw_records=[] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 413 for i in imported: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 414 if ParDict['Iref'] != 'NO' and ParDict['Iref'].split(',')[0] == i[0] and ParDict['Iref'].split(',')[1] in ['ex', 'excl', 'out']: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 415 continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 416 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 417 if i[2][sstart:send].count('N') < NumberN and len(i[2][sstart:send]) == FragmentLen: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 418 newi = [i[0], i[1], i[2][sstart:send]] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 419 raw_records.append(newi)#OCT2022 - end | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 420 print('\n########################## PARAMETERS ######################\n')#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 421 #print('input file:', ParDict['ORIG_FNAME']) #Outcommented ORIG_FNAME | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 422 print('input file:', ParDict['INPUT_FILE']) #Replacement of the line above | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 423 print('Coding gaps as characters:', gaps2D) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 424 print('Maximum undetermined nucleotides allowed:', NumberN) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 425 print('Length of the alignment:', len(imported[0][2]),'->', FragmentLen) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 426 print('Indexing reference:', ParDict['Iref'].replace('NO', 'Not set').replace('in', 'included').replace('ex', 'excluded')) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 427 print('Read in', len(raw_records), 'sequences') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 428 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 429 PosArrays, VarPosList = PositionArrays([i[2] for i in raw_records])#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 430 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 431 #############################################READ IN OTHER ANALYSIS PARAMETERS | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 432 ##OCT2022 - start | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 433 withplus = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 434 P2 = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 435 shift = True | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 436 if ParDict['qTAXA'][0] == '>':#THIS OPTION DIAGNOSES ALL TAXA WITH MORE THAN USER-DEFINED NUMBER OF SEQUENCES AVAILABLE | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 437 NumSeq = int(ParDict['qTAXA'][1:]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 438 Taxarecords = [i[1] for i in raw_records] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 439 qCLADEs = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 440 for j in set(Taxarecords): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 441 if Taxarecords.count(j) >= NumSeq: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 442 qCLADEs.append(j) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 443 elif ParDict['qTAXA'].startswith('P:'):#THIS OPTION DIAGNOSES ALL TAXA CONTAINING A USER-DEFINED PATTERN IN THE NAME | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 444 pattern = ParDict['qTAXA'].split(':')[1] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 445 Taxarecords = [i[1] for i in raw_records] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 446 qCLADEs = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 447 for j in set(Taxarecords): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 448 if pattern in j: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 449 qCLADEs.append(j) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 450 elif ParDict['qTAXA'].startswith('P+:'):#THIS OPTION POOLS ALL TAXA CONTAINING A USER-DEFINED PATTERN IN THE NAME IN ONE TAXON AND DIAGNOSES IT | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 451 pattern = ParDict['qTAXA'].split(':')[1] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 452 Taxarecords = set([i[1] for i in raw_records if pattern in i[1]]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 453 spp = '+'.join(sorted(list(Taxarecords))) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 454 qCLADEs = [spp] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 455 nrecords = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 456 for rec in raw_records: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 457 if rec[1] in Taxarecords: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 458 nrecords.append([rec[0], spp, rec[2]]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 459 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 460 nrecords.append(rec) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 461 raw_records = nrecords | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 462 else:#THIS OPTION DIAGNOSES ALL TAXA FROM A USER-DEFINED LIST; TAXA MAY BE COMBINED BY USING '+' | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 463 qCLADEs = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 464 allrecs = ParDict['qTAXA'].split(',') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 465 for item in allrecs: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 466 if item in ['ALL', 'All', 'all']:#THIS OPTION DIAGNOSES ALL TAXA IN THE DATASET | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 467 qCLADEs = list(set([i[1] for i in raw_records])) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 468 elif item in [i[1] for i in raw_records]: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 469 qCLADEs.append(item) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 470 elif '+' in item: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 471 withplus.append(item) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 472 elif 'VS' in item: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 473 P2.append(item.split('VS')) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 474 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 475 print('UNRECOGNIZED TAXON', item) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 476 #OCT2022 - end | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 477 print('query taxa:', len(qCLADEs+withplus), '-', str(sorted(qCLADEs)+sorted(withplus)).replace('[','').replace(']','').replace("'", ''))#1.3 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 478 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 479 if 'Cutoff' in list(ParDict.keys()):#CUTOFF Number of the informative positions to be considered, default 100 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 480 Cutoff = ParDict['Cutoff']#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 481 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 482 Cutoff = 100 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 483 print('Cutoff set as:', Cutoff) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 484 if 'Number_of_iterations' in list(ParDict.keys()):#Number iterations of MolD | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 485 N1 = int(ParDict['Number_of_iterations']) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 486 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 487 N1 = 10000 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 488 print('Number iterations of MolD set as:', N1) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 489 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 490 if 'MaxLen1' in list(ParDict.keys()):#Maximum length for the raw mDNCs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 491 MaxLen1 = int(ParDict['MaxLen1']) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 492 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 493 MaxLen1 = 12 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 494 print('Maximum length of raw mDNCs set as:', MaxLen1) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 495 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 496 if 'MaxLen2' in list(ParDict.keys()):#Maximum length for the refined mDNCs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 497 MaxLen2 = int(ParDict['MaxLen2']) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 498 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 499 MaxLen2 = 7 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 500 print('Maximum length of refined mDNCs set as:', MaxLen2) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 501 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 502 if 'Pdiff' in list(ParDict.keys()):#Percent difference | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 503 Percent_difference = float(ParDict['Pdiff']) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 504 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 505 if int(ParDict['Taxon_rank']) == 1:#read in taxon rank to configure Pdiff parameter of artificial dataset | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 506 Percent_difference = 1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 507 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 508 Percent_difference = 5 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 509 print('simulated sequences up to', Percent_difference, 'percent divergent from original ones') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 510 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 511 if 'NMaxSeq' in list(ParDict.keys()):#Maximum number of sequences per taxon to be modified | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 512 Seq_per_clade_to_screw = int(ParDict['NMaxSeq']) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 513 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 514 Seq_per_clade_to_screw = 10####!changed value | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 515 print('Maximum number of sequences modified per clade', Seq_per_clade_to_screw) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 516 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 517 if 'Scoring' in list(ParDict.keys()): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 518 if ParDict['Scoring'] == 'lousy': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 519 threshold = 66####!changed value | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 520 elif ParDict['Scoring'] == 'moderate': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 521 threshold = 75####!changed value | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 522 elif ParDict['Scoring'] == 'stringent': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 523 threshold = 90####!changed value | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 524 elif ParDict['Scoring'] == 'very_stringent': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 525 threshold = 95####!changed value | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 526 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 527 threshold = 75####!changed value | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 528 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 529 threshold = 75####!changed value | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 530 #print(ParDict['Scoring'], 'scoring of the rDNCs; threshold in two consequtive runs:', threshold) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 531 print('scoring of the rDNCs; threshold in two consequtive runs:', threshold) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 532 #OCT2022 - start | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 533 if corr > 1 and len(ParDict['Iref'].split(',')) == 2: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 534 print('\nNOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the DNCs as in the provided reference.') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 535 if corr > 1 and ParDict['Iref'] == 'NO': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 536 print('\nNOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the rDNC as in the untrimmed alignment.') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 537 #OCT2022 - end | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 538 thephrase = 'The DNA diagnosis for the taxon' | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 539 ###################################################IMPLEMENTATION | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 540 #Setting up a new class just for the convenient output formatting | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 541 class SortedDisplayDict(dict):#this is only to get a likable formatting of the barcode | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 542 def __str__(self): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 543 return "[" + ", ".join("%r: %r" % (key, self[key]) for key in sorted(self)) + "]" | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 544 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 545 class SortedDisplayDictVerbose(dict):#this is only to get a likable formatting of the barcode | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 546 def __str__(self): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 547 return ", ".join("%r %r" % (self[key],'in the site '+str(key)) for key in sorted(self)).replace("'", '').replace("A", "'A'").replace("C", "'C'").replace("G", "'G'").replace("T", "'T'")+'.' | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 548 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 549 #Calling functions and outputing results | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 550 if ParDict['OUTPUT_FILE'] == "str": | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 551 g = StringIO() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 552 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 553 g = open(ParDict['OUTPUT_FILE'], "w")#Initiating output file | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 554 #VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 555 print('<h4>########################## PARAMETERS ######################</h4>', file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 556 #print("<p>", 'input file:', ParDict['ORIG_FNAME'], "</p>", file=g) #outcommented ORIG_FNAME | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 557 print("<p>", 'input file:', ParDict['INPUT_FILE'], "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 558 print("<p>", 'Coding gaps as characters:', gaps2D, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 559 print("<p>", 'Maximum undetermined nucleotides allowed:', NumberN, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 560 print("<p>", 'Length of the alignment:', len(imported[0][2]),'->', FragmentLen, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 561 print("<p>", 'Indexing reference:', ParDict['Iref'].replace('NO', 'Not set').replace('in', 'included').replace('ex', 'excluded'), "</p>", file=g)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 562 print("<p>", 'Read in', len(raw_records), 'sequences', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 563 print("<p>", 'query taxa:', len(qCLADEs+withplus), '-', str(sorted(qCLADEs)+sorted(withplus)).replace('[','').replace(']','').replace("'", ''), "</p>", file=g)#1.3 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 564 print("<p>", 'Cutoff set as:', Cutoff, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 565 print("<p>", 'Number iterations of MolD set as:', N1, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 566 print("<p>", 'Maximum length of raw mDNCs set as:', MaxLen1, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 567 print("<p>", 'Maximum length of refined mDNCs set as:', MaxLen2, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 568 print("<p>", 'simulated sequences up to', Percent_difference, 'percent divergent from original ones', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 569 print("<p>", 'Maximum number of sequences modified per clade', Seq_per_clade_to_screw, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 570 #print("<p>", ParDict['Scoring'], 'scoring of the rDNCs; threshold in two consequtive runs:', threshold, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 571 print("<p>", 'scoring of the rDNCs; threshold in two consequtive runs:', threshold, "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 572 if corr > 1 and len(ParDict['Iref'].split(',')) == 2: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 573 print('<h4>NOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the DNCs as in the provided reference.</h4>', file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 574 if corr > 1 and ParDict['Iref'] == 'NO': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 575 print('<h4>NOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the rDNC as in the untrimmed alignment.</h4>', file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 576 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 577 print('<h4>########################### RESULTS ##########################</h4>', file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 578 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 579 for qCLADE in sorted(qCLADEs) + sorted(withplus):#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 580 if '+' in qCLADE: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 581 if shift == True: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 582 old_records = raw_records | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 583 shift == False | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 584 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 585 raw_records = old_records | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 586 spp = qCLADE.split('+') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 587 nrecords = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 588 for rec in raw_records: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 589 if rec[1] in spp: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 590 nrecords.append([rec[0], qCLADE, rec[2]]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 591 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 592 nrecords.append(rec) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 593 raw_records = nrecords | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 594 print('\n**************', qCLADE, '**************') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 595 print('<h4>**************', qCLADE, '**************</h4>', file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 596 Clades, clade_sorted_seqs, shared_positions = Step1(raw_records)#STEP1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 597 x,y,z,pures = C_VP_PP(clade_sorted_seqs, qCLADE, shared_positions, Cutoff)#STEP2 ####! added pures | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 598 newy = {key:y[key] for key in y if not key in pures} ####! newline | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 599 print('Sequences analyzed:', len(clade_sorted_seqs[qCLADE])) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 600 print("<p>",'Sequences analyzed:', len(clade_sorted_seqs[qCLADE]), "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 601 ND_combinations = [[item] for item in pures] ####! before ND_combinations were initiated as an empty list | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 602 print('single nucleotide mDNCs:', len(pures), '-', str(SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in pures]}))[1:-1])#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 603 print("<p>",'single nucleotide mDNCs*:',len(pures), '-', str(SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in pures]}))[1:-1], "</p>", file=g)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 604 N = 1 ####! | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 605 while N > 0:#STEP3 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 606 try: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 607 q = Diagnostic_combinations(qCLADE, x, newy, N1, MaxLen1, MaxLen2) ####! newy instead of y | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 608 except IndexError: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 609 print(N, 'IndexError') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 610 continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 611 for comb in q: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 612 if not comb in ND_combinations: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 613 ND_combinations.append(comb) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 614 N-=1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 615 ND_combinations.sort(key=len) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 616 #################################### mDNC output | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 617 try: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 618 Nind, KeyPos = IndependentKey(ND_combinations)#STEP4 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 619 except IndexError: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 620 print('no mDNCs recovered for', qCLADE)#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 621 print("<p>", 'no mDNCs recovered for', "</p>", qCLADE, file=g)#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 622 continue | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 623 Allpos = []#Create list of all positions involved in mDNCs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 624 for comb in ND_combinations: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 625 for pos in comb: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 626 if not pos in Allpos: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 627 Allpos.append(pos) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 628 print('\nmDNCs retrieved:', str(len(ND_combinations)) + '; Sites involved:', str(len(Allpos)) + '; Independent mDNCs:', len(Nind))#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 629 print("<p>", 'mDNCs* retrieved:', str(len(ND_combinations)) + '; Sites involved:', str(len(Allpos)) + '; Independent mDNCs**:', len(Nind), "</p>", file=g)#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 630 print('Shortest retrieved mDNC:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in ND_combinations[0]]}), '\n')#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 631 print("<p>",'Shortest retrieved mDNC*:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in ND_combinations[0]]}), "</p>", file=g)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 632 ######################################################## rDNC output | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 633 Barcode_scores = []#Initiate a list for rDNC scores | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 634 npos = len(ND_combinations[0]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 635 BestBarcode = 'none'####! newline | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 636 while npos <= min([10, len(Allpos)]):#in this loop the positions are added one-by-one to a rDNC and the rDNC is then rated on the artificially generated datasets | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 637 Barcode = GenerateBarcode_new(ND_combinations, npos)#Initiate a rDNC | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 638 Barcode_score = 0#Initiate a score to rate a rDNC | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 639 N = 100 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 640 while N > 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 641 NComplist, NCPP = Screwed_dataset_new(raw_records, Seq_per_clade_to_screw, PosArrays, VarPosList, Percent_difference, qCLADE, Cutoff)#Create an artificial dataset VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 642 NBarcode = [i for i in Barcode if i in list(NCPP.keys())] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 643 if len(Barcode) - len(NBarcode) <= 1 and ConditionD(NBarcode, NComplist, NCPP) == True:####! new condition (first) added | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 644 Barcode_score +=1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 645 N -=1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 646 print(npos, 'rDNC_score (100):', [k+corr+1 for k in Barcode], '-', Barcode_score)#VERYNEW | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 647 print("<p>", npos, 'rDNC_score (100):', [k+corr+1 for k in Barcode], '-', Barcode_score, "</p>", file=g)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 648 if Barcode_score >= threshold and len(Barcode_scores) == 1: ###1.3 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 649 BestBarcode = Barcode###1.3 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 650 if Barcode_score >= threshold and len(Barcode_scores) > 1 and Barcode_score >= max(Barcode_scores): ###1.3 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 651 BestBarcode = Barcode####!newline | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 652 Barcode_scores.append(Barcode_score) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 653 if len(Barcode_scores) >= 2 and Barcode_scores[-1] >= threshold and Barcode_scores[-2] >= threshold:#Check whether the rDNC fulfills robustnes criteria 85:85:85 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 654 print('Final rDNC:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 655 print("<p>",'Final rDNC***:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 656 print('\n',thephrase, qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 657 print("<p>", thephrase, qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 658 break | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 659 else:# VERY NEW FROM HERE ONWARDS | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 660 npos += 1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 661 if npos > min([10, len(Allpos)]): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 662 if BestBarcode != 'none': | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 663 print('The highest scoring rDNC for taxon', qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 664 print("<p>", 'The highest scoring rDNC*** for taxon', qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 665 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 666 print('No sufficiently robust DNA diagnosis for taxon', qCLADE, 'was retrieved') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 667 print("<p>", 'No sufficiently robust DNA diagnosis for taxon', qCLADE, 'was retrieved', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 668 #OCT2022 - start | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 669 print("<h4>", '################################# EXPLANATIONS ####################################', "</h4>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 670 print("<p>", ' * mDNC -(=minimal Diagnostic nucleotide combination) is a combination of nucleotides at specified sites of the alignment,', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 671 print("<p>", ' unique for a query taxon. Therefore it is sufficient to differentiate a query taxon from all reference taxa in a dataset.', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 672 print("<p>", ' Because it comprises minimal necessary number of nucleotide sites to differentiate a query, any mutation in the mDNC in' "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 673 print("<p>", ' single specimen of a query taxon will automatically disqualify it as a diagnostic combination.', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 674 print("<p>", "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 675 print("<p>", ' ** two or more mDNCs are INDEPENDENT if they constitute non-overlapping sets of nucleotide sites.', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 676 print("<p>", "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 677 print("<p>", '*** rDNC -(=robust/redundant Diagnostic nucleotide combination) is a combination of nucleotides at specified sites of the alignment,', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 678 print("<p>", ' unique for a query taxon and (likewise mDNC) sufficient to differentiate a query taxon from all reference taxa in a dataset.', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 679 print("<p>", ' However, rDNC comprises more than a minimal necessary number of diagnostic sites, and therefore is robust to single nucleotide', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 680 print("<p>", ' replacements. Even if a mutation arises in one of the rDNC sites, the remaining ones will (with high probability) remain sufficient ', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 681 print("<p>", ' to diagnose the query taxon', "</p>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 682 print("<h4>", ' Final diagnosis corresponds to rDNC', "</h4>", file=g) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 683 #OCT2022 - end | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 684 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 685 if ParDict['OUTPUT_FILE'] == "str": | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 686 contents = g.getvalue() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 687 os.unlink(ParDict['INPUT_FILE']) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 688 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 689 contents = None | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 690 g.close() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 691 #OCT2022 - start | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 692 if len(P2) != 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 693 ext = '.'+ParDict['OUTPUT_FILE'].split('.')[-1] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 694 h = open(ParDict['OUTPUT_FILE'].replace(ext, '_pairwise'+ext), "w")#Initiating output file | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 695 if len(withplus) != 0: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 696 raw_records = old_records | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 697 taxain = [i[1] for i in raw_records] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 698 tpairs = [] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 699 for alist in P2: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 700 if alist.count('all')+alist.count('All')+alist.count('ALL') == 2: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 701 tpairs = getAllPairs(taxain) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 702 elif alist.count('all')+alist.count('All')+alist.count('ALL') == 1: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 703 thetax = [i for i in taxain if i in alist][0] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 704 print(thetax) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 705 for atax in sorted(list(set(taxain))): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 706 if atax != thetax: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 707 tpairs.append([thetax, atax]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 708 else: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 709 for apair in getAllPairs(alist): | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 710 tpairs.append(apair) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 711 for apair in tpairs: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 712 t1 = apair[0] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 713 t2 = apair[1] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 714 p2records = [i for i in raw_records if i[1] in [t1, t2]] | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 715 print('\n**************', t1, 'VS', t2,'**************') | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 716 print('<h4>**************', t1, 'VS', t2, '**************</h4>', file=h) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 717 C2, css2, sp2 = Step1(p2records)#STEP1 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 718 x2,y2,z2,pures2 = C_VP_PP(css2, t1, sp2, '>0')#STEP2 ####! added pures | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 719 Pairphrase = 'Each of the following '+ str(len(pures2))+' sites is invariant across sequences of '+ t1+ ' and differentiates it from '+ t2+': ' | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 720 counterPures = {} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 721 for site in pures2: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 722 counterPures[site] = "'or'".join(list(set([thing[site] for thing in css2[t2] if thing[site] != 'N']))) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 723 Pairphrase = Pairphrase + str(site+corr)+" ('"+str(y2[site])+"' vs '"+str(counterPures[site])+"'), " | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 724 print(Pairphrase[:-2]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 725 print("<p>",Pairphrase[:-2],'</h4>', file=h)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 726 x2r,y2r,z2r,pures2r = C_VP_PP(css2, t2, sp2, '>0')#STEP2 ####! added pures | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 727 Pairphraser = 'Each of the following '+ str(len(pures2r))+' sites is invariant across sequences of '+ t2+ ' and differentiates it from '+ t1+': ' | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 728 counterPuresr = {} | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 729 for site in pures2r: | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 730 counterPuresr[site] = "'or'".join(list(set([thing[site] for thing in css2[t1] if thing[site] != 'N']))) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 731 Pairphraser = Pairphraser + str(site+corr)+" ('"+str(y2r[site])+"' vs '"+str(counterPuresr[site])+"'), " | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 732 print(Pairphraser[:-2]) | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 733 print("<p>",Pairphraser[:-2],'</h4>', file=h)#OCT2022 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 734 h.close() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 735 | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 736 #OCT2022 - end | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 737 return contents, qCLADEs | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 738 if __name__ == "__main__": | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 739 c, q = mainprocessing() | 
| 
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
 itaxotools parents: diff
changeset | 740 | 
