Mercurial > repos > itaxotools > mold
annotate MolD_v1.4.py @ 0:4e8e2f836d0f draft default tip
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
author | itaxotools |
---|---|
date | Sun, 29 Jan 2023 16:25:48 +0000 |
parents | |
children |
rev | line source |
---|---|
0
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
1 """ |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
2 This script compiles rDNC-based DNA diagnoses for pre-defined taxa in a dataset. This is the MAIN WORKING VERSION v1.4 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
3 This version already implements the new functionalities: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
4 - automatic trimming of the alignment to match the median sequence length (should seq len distribution and provided NumberN settings require that) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
5 - expanded qCLADEs setting |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
6 - diagnoses in pairwise comparisons of taxa. |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
7 - selection of a reference sequence for indexing DNCs |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
8 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
9 THIS version LACKS SPART compatibility |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
10 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
11 """ |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
12 import os, sys |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
13 import random |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
14 import argparse |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
15 from io import StringIO |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
16 ######################################################################## FUNCTIONS |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
17 #***STEP 1 - SORTING ENTRIES BY CLADE AND IDENTIFYING NUCLEOTIDE POSITIONS SHARED WITHIN CLADE |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def Step1(raw_records):
    """Sort sequences by clade and find the positions shared within each clade.

    Args:
        raw_records: sequence of records; element [1] of a record is the
            clade label and element [2] is its sequence.
            (assumes all sequences of a clade have the same length, i.e. are
            aligned - TODO confirm against the caller)

    Returns:
        A tuple ``(Clades, clade_sorted_seqs, shared_positions)``:
        * Clades - clade labels in order of first appearance;
        * clade_sorted_seqs - dict clade -> list of its sequences, in
          record order;
        * shared_positions - dict clade -> list of 0-based positions where
          every sequence of the clade carries the same character and that
          character is not 'N'.
    """
    # Single pass over the records; dict insertion order keeps the clades in
    # order of first appearance, so no per-clade rescan of raw_records is needed.
    clade_sorted_seqs = {}
    for record in raw_records:
        clade_sorted_seqs.setdefault(record[1], []).append(record[2])
    Clades = list(clade_sorted_seqs)

    shared_positions = {}
    for clade, seqs in clade_sorted_seqs.items():
        reference = seqs[0]
        others = seqs[1:]
        shared_positions[clade] = [
            i for i, base in enumerate(reference)
            # candidate nucleotide must be identical in all members and known
            if all(seq[i] == base for seq in others) and base != 'N'
        ]
    return Clades, clade_sorted_seqs, shared_positions
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
44 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
45 #***STEP 2 COMPILING COMPARISON LISTS FOR CLADES AND IDENTIFYING VARIABLE POSITIONS AND N PRIORITY POSITIONS WITH LARGEST CUTOFFS |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def C_VP_PP(clade_sorted_seqs, clade, shared_positions, CUTOFF):
    """Build the comparison list and the priority positions for one clade.

    (complist / variable positions / priority positions)

    Args:
        clade_sorted_seqs: dict clade -> list of sequences.
        clade: label of the query clade.
        shared_positions: dict clade -> list of positions shared within it.
        CUTOFF: either a number N (int or numeric string) - keep the N
            positions with the largest cutoffs - or a string '>N' - keep
            every position whose cutoff is greater than N.

    Returns:
        A tuple ``(complist, Clade_priority_positions, cutoffs, pures)``:
        * complist - all sequences of the other clades;
        * Clade_priority_positions - dict position -> nucleotide of the
          query clade, inserted in order of decreasing cutoff;
        * cutoffs - dict position -> number of comparison sequences that
          differ from the query clade at that position;
        * pures - positions at which no comparison sequence matches.
    """
    # shared position -> nucleotide of the query clade at that position
    CShN = {pos: clade_sorted_seqs[clade][0][pos] for pos in shared_positions[clade]}

    # all sequences of every other clade, used for comparison
    complist = []
    for key, seqs in clade_sorted_seqs.items():
        if key != clade:
            complist.extend(seqs)

    cutoffs = {}
    pures = []
    total = len(complist)
    for pos, base in CShN.items():
        matching = sum(1 for seq in complist if seq[pos] == base)
        cutoffs[pos] = total - matching
        if matching == 0:
            pures.append(pos)

    # positions ranked by how many comparison sequences they exclude
    ranked = sorted(cutoffs, key=cutoffs.get, reverse=True)

    # Accept an int as well as a string; the original indexed CUTOFF[0] and
    # therefore failed with TypeError on a plain integer.
    cutoff = str(CUTOFF).strip()
    if cutoff.startswith('>'):
        threshold = int(cutoff[1:])
        Clade_priority_positions = {pos: CShN[pos] for pos in ranked if cutoffs[pos] > threshold}
    else:
        Clade_priority_positions = {pos: CShN[pos] for pos in ranked[:int(cutoff)]}
    return complist, Clade_priority_positions, cutoffs, pures
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
75 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
76 #***STEPS 3 RANDOM SEARCH ACROSS PRIORITY POSITIONS TO FIND RAW DIAGNOSTIC COMBINATIONS AND TO SUBSEQUENTLY REFINE THEM |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def random_position(somelist, checklist):
    """Return a random element of ``somelist`` that is not in ``checklist``.

    Every element of ``somelist`` that is absent from ``checklist`` is
    equally likely to be returned (duplicates in ``somelist`` count once per
    occurrence).

    Raises:
        ValueError: if no element of ``somelist`` is absent from
            ``checklist`` (this includes an empty ``somelist``). The previous
            retry loop never terminated in the non-empty case.
    """
    candidates = [item for item in somelist if item not in checklist]
    if not candidates:
        raise ValueError("random_position: no element of somelist is absent from checklist")
    return random.choice(candidates)
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
85 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def step_reduction_complist(clade, complist, CPP, checked_ind):
    """Randomly pick priority positions until no comparison sequence remains.

    At each step a not-yet-checked position of ``CPP`` is drawn with
    ``random_position``; comparison sequences that carry the clade's
    nucleotide (or 'N') at that position are kept, all others are discarded.
    The search stops when no comparison sequence is left or every position
    of ``CPP`` has been checked.

    Args:
        clade: label of the query clade (not used by the search itself).
        complist: sequences of the other clades.
        CPP: dict position -> nucleotide of the query clade.
        checked_ind: positions already checked; it is not modified.

    Returns:
        List of checked positions (``checked_ind`` followed by the positions
        drawn here). If the search ran out of positions, the remaining
        comparison sequences were NOT all excluded.
    """
    # Iterative on purpose: the recursive form used one stack frame per
    # checked position and could exceed the recursion limit for large CPP.
    positions = list(CPP.keys())
    remaining = complist
    checked = checked_ind
    while len(remaining) != 0 and len(checked) != len(CPP):
        pos = random_position(positions, checked)
        target = CPP[pos]
        # a sequence with 'N' here cannot be told apart, so it stays in
        remaining = [seq for seq in remaining if seq[pos] == target or seq[pos] == 'N']
        checked = checked + [pos]  # new list: never mutate the caller's list
    return checked
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
100 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def ConditionD(newcomb, complist, CPP):
    """Check 'Condition D': whether a combination of positions is diagnostic.

    A combination is diagnostic when no sequence of ``complist`` carries the
    clade's nucleotide (``CPP[pos]``) at every position of ``newcomb``.

    Args:
        newcomb: iterable of positions to test.
        complist: sequences of the other clades.
        CPP: dict position -> nucleotide of the query clade.

    Returns:
        True if the combination excludes every comparison sequence,
        False otherwise. An empty ``newcomb`` is diagnostic only when
        ``complist`` is empty.
    """
    # NOTE(review): unlike step_reduction_complist, an 'N' in a comparison
    # sequence counts as a mismatch here - confirm this asymmetry is intended.
    remaining = complist
    for pos in newcomb:
        if len(remaining) == 0:
            break  # already diagnostic; further positions cannot change that
        target = CPP[pos]
        remaining = [seq for seq in remaining if seq[pos] == target]
    return len(remaining) == 0
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
113 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def RemoveRedundantPositions(raw_comb, complist, CPP):
    """Drop positions from a combination while it stays diagnostic.

    Positions are tried in order; the first one whose removal still
    satisfies ``ConditionD`` is dropped and the scan restarts on the
    shorter combination. The process ends when no single position can be
    removed.

    Args:
        raw_comb: list of positions to refine; it is not modified.
        complist: sequences of the other clades.
        CPP: dict position -> nucleotide of the query clade.

    Returns:
        The reduced combination, or ``raw_comb`` itself when nothing could
        be removed.
    """
    comb = raw_comb
    reduced = True
    while reduced:
        reduced = False
        for pos in comb:
            candidate = [k for k in comb if k != pos]
            if ConditionD(candidate, complist, CPP):
                comb = candidate
                reduced = True
                break  # restart the scan on the shorter combination
    return comb
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
124 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
125 #PUTS EVERYTHING TOGETHER - 20000 ROUNDS OF RANDOM SEARCH FOLLOWED BY REFINING OF 500 SHORTEST COMBINATIONS |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def Diagnostic_combinations(qCLADE, complist, CPP, n1, maxlen1, maxlen2):
    """Run the random search and collect the refined, unique combinations.

    Each of the ``n1`` rounds draws a raw combination with
    ``step_reduction_complist``; raw combinations no longer than ``maxlen1``
    are refined with ``RemoveRedundantPositions``, and refined combinations
    no longer than ``maxlen2`` are stored once, in sorted form.

    Args:
        qCLADE: label of the query clade.
        complist: sequences of the other clades.
        CPP: dict position -> nucleotide of the query clade.
        n1: number of search rounds.
        maxlen1: maximum length of a raw combination to be refined.
        maxlen2: maximum length of a refined combination to be kept.

    Returns:
        List of sorted position lists, ordered by length (shortest first;
        ties keep discovery order).
    """
    bestlists = []
    seen = set()  # tuples of already stored combinations: O(1) duplicate check
    rounds_left = n1
    while rounds_left > 0:
        rounds_left -= 1
        raw_comb = step_reduction_complist(qCLADE, complist, CPP, [])
        if len(raw_comb) > maxlen1:
            continue
        refined_comb = RemoveRedundantPositions(raw_comb, complist, CPP)
        if len(refined_comb) > maxlen2:
            continue
        ordered = sorted(refined_comb)
        signature = tuple(ordered)
        if signature not in seen:
            seen.add(signature)
            bestlists.append(ordered)
    bestlists.sort(key=len)
    return bestlists
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
139 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
140 #***STEP 4 ANALYSIS OF OUTPUT rDNCs |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def IndependentKey(diagnostic_combinations):
    """Return independent combinations and key positions. PRESENTLY NOT INVOLVED.

    Independent combinations are selected greedily in input order: the first
    pair of combinations that share no position is taken, and afterwards
    every combination sharing no position with those already selected is
    added. Key positions are the positions of the first combination that
    occur in every other combination.

    Args:
        diagnostic_combinations: list of position lists.

    Returns:
        A tuple ``(independent_combinations, key_positions)``;
        ``independent_combinations`` is sorted by length. An empty input
        yields ``([], [])`` instead of raising IndexError.
    """
    if len(diagnostic_combinations) == 0:
        return [], []

    independent_combinations = []
    selected_positions = set()
    for i, combination in enumerate(diagnostic_combinations):
        positions = set(combination)
        if len(selected_positions) == 0:
            # nothing selected yet: look for an earlier combination that
            # shares no position with this one
            for earlier in diagnostic_combinations[:i]:
                if positions.isdisjoint(earlier) and positions.isdisjoint(selected_positions):
                    independent_combinations.append(combination)
                    independent_combinations.append(earlier)
                    selected_positions.update(combination)
                    selected_positions.update(earlier)
        elif positions.isdisjoint(selected_positions):
            independent_combinations.append(combination)
            selected_positions.update(combination)
    independent_combinations.sort(key=len)

    others = diagnostic_combinations[1:]
    key_positions = [
        pos for pos in diagnostic_combinations[0]
        if all(pos in combination for combination in others)
    ]
    return independent_combinations, key_positions
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
171 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
172 #SPECIFIC FUNCTIONS FOR THE rDNCs |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def PositionArrays(Motifs):
    """Collect the characters observed at each position of a set of motifs.

    Args:
        Motifs: list of equally long sequences.

    Returns:
        A tuple ``(PositionArrays, VarPosList)``:
        * PositionArrays - one list per position, holding the character of
          the first motif followed by the non-'N' characters of all other
          motifs, in order;
        * VarPosList - positions at which a non-'N' character differs from
          the previously recorded one, i.e. variable positions.
        An empty ``Motifs`` yields ``([], [])``.
    """
    if len(Motifs) == 0:
        return [], []

    position_arrays = []
    VarPosList = []
    first = Motifs[0]
    # Iterate over the motifs AFTER the first one. The original looped over
    # range(len(Motifs[1:])) but indexed Motifs[j], which re-read motif 0
    # and never looked at the last motif.
    rest = Motifs[1:]
    for i, seed in enumerate(first):
        column = [seed]
        constant = True
        for motif in rest:
            base = motif[i]
            if base != 'N':
                if base != column[-1]:
                    constant = False
                column.append(base)
        position_arrays.append(column)
        if not constant:
            VarPosList.append(i)
    return position_arrays, VarPosList
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
188 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def random_sequence_new(SEQ, PositionArrays, VarPosList, Pdiff):
    """Return a copy of ``SEQ`` with a random number of variable positions changed.

    The number of changed positions is drawn uniformly from
    1 .. round(len(SEQ) * Pdiff / 100) - 1. Positions are drawn from
    ``VarPosList``, skipping those where ``SEQ`` has 'D' so that alignment
    gaps stay unchanged. Each chosen position receives a random character
    from ``PositionArrays[position]`` that differs from the current one.

    Args:
        SEQ: sequence string to modify.
        PositionArrays: per-position lists of observed characters.
        VarPosList: positions allowed to change.
        Pdiff: maximum divergence, in percent of the sequence length.

    Returns:
        The modified sequence, same length as ``SEQ``.

    Raises:
        ValueError: from ``random.sample`` when round(len(SEQ)*Pdiff/100)
            is below 2, when more positions are requested than are
            available, or when a chosen position has no alternative
            character.
    """
    n = round(len(SEQ)*Pdiff/100)
    N = random.sample(range(1, n), 1)[0]
    changeable = [p for p in VarPosList if SEQ[p] != 'D']  # keep alignment gaps unchanged
    PosToChange = set(random.sample(changeable, N))  # set: O(1) membership below

    pieces = []
    for i, base in enumerate(SEQ):
        if i in PosToChange:
            alternatives = [b for b in PositionArrays[i] if b != base]
            pieces.append(random.sample(alternatives, 1)[0])
        else:
            pieces.append(base)
    return ''.join(pieces)
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
203 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def GenerateBarcode_new(Diagnostic_combinations, length):#VERYNEW FUNCTION REVISED
    """Assemble a list of at most `length` positions from the diagnostic combinations.

    Combinations are split into three tiers by size relative to the first combination:
    same size, one position longer, and everything else. Positions of each tier are
    added in order of decreasing frequency within that tier (ties keep their order of
    appearance), skipping positions already selected. When the first combination has
    a single position, the first tier is taken exactly as collected.

    Positions must be hashable (they are counted with collections.Counter).
    Returns an empty list when no combinations are supplied.
    """
    from collections import Counter

    if len(Diagnostic_combinations) == 0:
        return []
    baselen = len(Diagnostic_combinations[0])
    len1, len2, lenmore = [], [], []
    for comb in Diagnostic_combinations:
        if len(comb) == baselen:
            len1.extend(comb)
        elif len(comb) == baselen + 1:
            len2.extend(comb)
        else:
            lenmore.extend(comb)

    def by_frequency(positions):
        # same ordering as sorted(positions, key=positions.count, reverse=True),
        # but each position is counted once instead of once per element
        counts = Counter(positions)
        return sorted(positions, key=counts.__getitem__, reverse=True)

    if baselen == 1:
        Setin = len1
        tiers = (len2, lenmore)
    else:
        Setin = []
        tiers = (len1, len2, lenmore)
    seen = set(Setin)
    for tier in tiers:
        for pos in by_frequency(tier):
            if pos not in seen:
                seen.add(pos)
                Setin.append(pos)
    return Setin[:length]
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
232 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def Screwed_dataset_new(raw_records, nseq_per_clade_to_screw, PositionArrays, VarPosList, Percent_difference, Taxon, Cutoff):#VERYNEW FUNCTION REVISED
    """Perturb sequences clade by clade and run C_VP_PP on the perturbed dataset.

    raw_records holds entries whose item [1] is the clade label and item [2] the sequence.
    For every clade, nseq_per_clade_to_screw sequences are passed through
    random_sequence_new:
      - more sequences than requested: a random subset is altered, the rest kept as is;
      - exactly as many as requested: every sequence is altered;
      - fewer than requested: the requested number of altered copies is generated from
        randomly picked sequences of the clade.
    Positions at which all sequences of a clade agree on a character other than 'N'
    are then collected per clade and handed to C_VP_PP together with Taxon and Cutoff.

    Returns the first two values produced by C_VP_PP.
    """
    # group sequences by clade, remembering clades in order of first appearance
    clade_sorted_seqs = {}
    clade_order = []
    for record in raw_records:
        label = record[1]
        if label not in clade_sorted_seqs:
            clade_sorted_seqs[label] = []
            clade_order.append(label)
        clade_sorted_seqs[label].append(record[2])

    for clade in clade_order:
        originals = clade_sorted_seqs[clade]
        total = len(originals)
        mutated = []
        if total > nseq_per_clade_to_screw:
            picked = random.sample(list(range(total)), nseq_per_clade_to_screw)
            for idx, seq in enumerate(originals):
                if idx in picked:
                    seq = random_sequence_new(seq, PositionArrays, VarPosList, Percent_difference)
                mutated.append(seq)
        elif total == nseq_per_clade_to_screw:
            for seq in originals:
                mutated.append(random_sequence_new(seq, PositionArrays, VarPosList, Percent_difference))
        else:
            for _ in range(nseq_per_clade_to_screw):
                template = random.sample(originals, 1)[0]
                mutated.append(random_sequence_new(template, PositionArrays, VarPosList, Percent_difference))
        clade_sorted_seqs[clade] = mutated

    shared_positions = {}
    for key, seqs in clade_sorted_seqs.items():
        reference = seqs[0]
        others = seqs[1:]
        shared_positions[key] = [
            i for i, csm in enumerate(reference)  # csm: candidate shared nucleotide
            if all(other[i] == csm for other in others) and csm != 'N'
        ]

    outcome = C_VP_PP(clade_sorted_seqs, Taxon, shared_positions, Cutoff)#STEP2####
    x, y, _z, _pures = outcome
    return x, y
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
284 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
285 #NEW FUNCTIONS |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def medianSeqLen(listofseqs):#OCT2022
    """Find the median sequence length and the span covered by the median sequence.

    The length of a sequence is its number of A, C, G and T characters.

    Returns (medlen, start, end):
      medlen -- the element at index len//2 of the sorted lengths (upper median);
      start  -- index of the first A/C/G/T in the first sequence having that length
                (0 when that sequence contains none of them);
      end    -- index at which the trailing run of 'N' of that sequence begins, or
                len(sequence) when it does not end in 'N'.

    Raises IndexError when listofseqs is empty.
    """
    seqlens = [sum(seq.count(base) for base in 'ACGT') for seq in listofseqs]
    medlen = sorted(seqlens)[len(seqlens) // 2]
    medseq = listofseqs[seqlens.index(medlen)]
    # str.find gives -1 for an absent base; such values must not take part in min()
    hits = [idx for idx in (medseq.find(base) for base in 'ACGT') if idx != -1]
    start = min(hits) if hits else 0
    # interior N's do not end the sequence: only the trailing run of N is cut off
    end = max(start, len(medseq.rstrip('N')))
    return medlen, start, end
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
299 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def getAllPairs(taxalist):
    """Return every unordered pair of distinct taxa as a two-element list.

    Duplicates in taxalist are dropped and the names sorted first, so each pair
    is listed once, smaller name first, in sorted order.
    """
    ordered = sorted(set(taxalist))
    return [
        [first, second]
        for idx, first in enumerate(ordered)
        for second in ordered[idx + 1:]
    ]
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
307 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
308 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
309 ################################################READ IN PARAMETER FILE AND DATA FILE |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
310 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
def get_args(): #arguments needed to give to this script
    """Parse the command line; the mandatory -i option names the parameter text file."""
    cli = argparse.ArgumentParser(description="run MolD")
    mandatory = cli.add_argument_group("required arguments")
    mandatory.add_argument(
        "-i",
        required=True,
        help="textfile with parameters of the analysis",
    )
    parsed = cli.parse_args()
    return parsed
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
316 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
317 def mainprocessing(gapsaschars=None, taxalist=None, taxonrank=None, cutoff=None, numnucl=None, numiter=None, maxlenraw=None, maxlenrefined=None, iref=None, pdiff=None, nmax=None, thresh=None, tmpfname=None, origfname=None): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
318 ParDict = {} |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
319 if not(all([gapsaschars, taxalist, taxonrank, cutoff, numnucl, numiter, maxlenraw, maxlenrefined, iref, pdiff, nmax, thresh, tmpfname])): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
320 args = get_args() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
321 with open(args.i) as params: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
322 for line in params: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
323 line = line.strip() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
324 if line.startswith('#'): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
325 pass |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
326 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
327 if len(line.split('=')) == 2 and len(line.split('=')[1]) != 0: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
328 ParDict[line.split('=')[0]] = line.split('=')[1].replace(' ', '')#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
329 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
330 ParDict['Gaps_as_chars'] = gapsaschars |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
331 ParDict['qTAXA'] = taxalist |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
332 ParDict['Taxon_rank'] = taxonrank |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
333 ParDict['INPUT_FILE'] = tmpfname |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
334 ParDict['ORIG_FNAME'] = origfname# I do not understand this ORIG_FNAME, it throws an error with the command line |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
335 ParDict['Cutoff'] = cutoff |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
336 ParDict['NumberN'] = numnucl |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
337 ParDict['Number_of_iterations'] = numiter |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
338 ParDict['MaxLen1'] = maxlenraw |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
339 ParDict['MaxLen2'] = maxlenrefined |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
340 ParDict['Iref'] = iref |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
341 ParDict['Pdiff'] = pdiff |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
342 #ParDict['PrSeq'] = prseq |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
343 ParDict['NMaxSeq'] = nmax |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
344 ParDict['Scoring'] = thresh |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
345 ParDict['OUTPUT_FILE'] = "str" |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
346 print(ParDict) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
347 ############################################# #VERYNEW HOW GAPS ARE TREATED |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
348 #REQUIRES A NEW FIELD IN THE GUI |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
349 if ParDict['Gaps_as_chars'] == 'yes': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
350 gaps2D = True#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
351 else:#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
352 gaps2D = False#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
353 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
354 ############################################ DATA FILE |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
355 checkread = open(ParDict['INPUT_FILE'], 'r') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
356 firstline = checkread.readline() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
357 checkread.close() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
358 imported=[]#set up a new list with species and identifiers |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
359 if '>' in firstline: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
360 f = open(ParDict['INPUT_FILE'], 'r') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
361 for line in f:#VERYNEW - THE DATA READING FROM ALIGNMENT FILE IS ALL REVISED UNTIL f.close() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
362 line=line.rstrip() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
363 if line.startswith('>'): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
364 data = line[1:].split('|') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
365 if len(data) != 2: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
366 print('Check number of entries in', data[0]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
367 #break |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
368 data.append('') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
369 imported.append(data) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
370 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
371 if gaps2D == True:# |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
372 DNA = line.upper().replace('-', 'D').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
373 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
374 DNA = line.upper().replace('-', 'N').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
375 imported[-1][-1] = imported[-1][-1]+DNA |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
376 f.close() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
377 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
378 f = open(ParDict['INPUT_FILE'], 'r') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
379 for line in f:#VERYNEW - THE DATA READING FROM ALIGNMENT FILE IS ALL REVISED UNTIL f.close() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
380 line=line.rstrip() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
381 data = line.split('\t') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
382 if len(data) != 3: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
383 print('Check number of entries in', data[0]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
384 ID = data[0] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
385 thetax = data[1] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
386 if gaps2D == True:# |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
387 DNA = data[2].upper().replace('-', 'D').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
388 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
389 DNA = data[2].upper().replace('-', 'N').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
390 imported.append([ID, thetax, DNA]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
391 f.close() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
392 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
393 if 'NumberN' in list(ParDict.keys()):#How many ambiguously called nucleotides are allowed |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
394 NumberN = int(ParDict['NumberN']) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
395 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
396 NumberN = 5 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
397 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
398 if len(set([len(i[2]) for i in imported])) != 1: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
399 print('Alignment contains sequences of different lengths:', set([len(i[2]) for i in imported])) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
400 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
401 mlen, sstart, send = medianSeqLen([i[2] for i in imported])#OCT2022 - start |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
402 if mlen + NumberN < len(imported[0][2]): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
403 Slice = True |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
404 FragmentLen = mlen |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
405 corr = sstart |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
406 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
407 Slice = False |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
408 FragmentLen = len(imported[0][2]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
409 sstart = 0 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
410 send = FragmentLen+1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
411 corr = 0 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
412 raw_records=[] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
413 for i in imported: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
414 if ParDict['Iref'] != 'NO' and ParDict['Iref'].split(',')[0] == i[0] and ParDict['Iref'].split(',')[1] in ['ex', 'excl', 'out']: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
415 continue |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
416 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
417 if i[2][sstart:send].count('N') < NumberN and len(i[2][sstart:send]) == FragmentLen: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
418 newi = [i[0], i[1], i[2][sstart:send]] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
419 raw_records.append(newi)#OCT2022 - end |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
420 print('\n########################## PARAMETERS ######################\n')#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
421 #print('input file:', ParDict['ORIG_FNAME']) #Outcommented ORIG_FNAME |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
422 print('input file:', ParDict['INPUT_FILE']) #Replacement of the line above |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
423 print('Coding gaps as characters:', gaps2D) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
424 print('Maximum undetermined nucleotides allowed:', NumberN) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
425 print('Length of the alignment:', len(imported[0][2]),'->', FragmentLen) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
426 print('Indexing reference:', ParDict['Iref'].replace('NO', 'Not set').replace('in', 'included').replace('ex', 'excluded')) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
427 print('Read in', len(raw_records), 'sequences') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
428 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
429 PosArrays, VarPosList = PositionArrays([i[2] for i in raw_records])#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
430 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
431 #############################################READ IN OTHER ANALYSIS PARAMETERS |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
432 ##OCT2022 - start |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
433 withplus = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
434 P2 = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
435 shift = True |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
436 if ParDict['qTAXA'][0] == '>':#THIS OPTION DIAGNOSES ALL TAXA WITH MORE THAN USER-DEFINED NUMBER OF SEQUENCES AVAILABLE |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
437 NumSeq = int(ParDict['qTAXA'][1:]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
438 Taxarecords = [i[1] for i in raw_records] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
439 qCLADEs = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
440 for j in set(Taxarecords): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
441 if Taxarecords.count(j) >= NumSeq: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
442 qCLADEs.append(j) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
443 elif ParDict['qTAXA'].startswith('P:'):#THIS OPTION DIAGNOSES ALL TAXA CONTAINING A USER-DEFINED PATTERN IN THE NAME |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
444 pattern = ParDict['qTAXA'].split(':')[1] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
445 Taxarecords = [i[1] for i in raw_records] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
446 qCLADEs = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
447 for j in set(Taxarecords): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
448 if pattern in j: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
449 qCLADEs.append(j) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
450 elif ParDict['qTAXA'].startswith('P+:'):#THIS OPTION POOLS ALL TAXA CONTAINING A USER-DEFINED PATTERN IN THE NAME IN ONE TAXON AND DIAGNOSES IT |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
451 pattern = ParDict['qTAXA'].split(':')[1] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
452 Taxarecords = set([i[1] for i in raw_records if pattern in i[1]]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
453 spp = '+'.join(sorted(list(Taxarecords))) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
454 qCLADEs = [spp] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
455 nrecords = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
456 for rec in raw_records: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
457 if rec[1] in Taxarecords: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
458 nrecords.append([rec[0], spp, rec[2]]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
459 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
460 nrecords.append(rec) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
461 raw_records = nrecords |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
462 else:#THIS OPTION DIAGNOSES ALL TAXA FROM A USER-DEFINED LIST; TAXA MAY BE COMBINED BY USING '+' |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
463 qCLADEs = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
464 allrecs = ParDict['qTAXA'].split(',') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
465 for item in allrecs: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
466 if item in ['ALL', 'All', 'all']:#THIS OPTION DIAGNOSES ALL TAXA IN THE DATASET |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
467 qCLADEs = list(set([i[1] for i in raw_records])) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
468 elif item in [i[1] for i in raw_records]: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
469 qCLADEs.append(item) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
470 elif '+' in item: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
471 withplus.append(item) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
472 elif 'VS' in item: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
473 P2.append(item.split('VS')) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
474 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
475 print('UNRECOGNIZED TAXON', item) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
476 #OCT2022 - end |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
477 print('query taxa:', len(qCLADEs+withplus), '-', str(sorted(qCLADEs)+sorted(withplus)).replace('[','').replace(']','').replace("'", ''))#1.3 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
478 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
479 if 'Cutoff' in list(ParDict.keys()):#CUTOFF Number of the informative positions to be considered, default 100 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
480 Cutoff = ParDict['Cutoff']#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
481 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
482 Cutoff = 100 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
483 print('Cutoff set as:', Cutoff) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
484 if 'Number_of_iterations' in list(ParDict.keys()):#Number of iterations of MolD |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
485 N1 = int(ParDict['Number_of_iterations']) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
486 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
487 N1 = 10000 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
488 print('Number iterations of MolD set as:', N1) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
489 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
490 if 'MaxLen1' in list(ParDict.keys()):#Maximum length for the raw mDNCs |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
491 MaxLen1 = int(ParDict['MaxLen1']) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
492 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
493 MaxLen1 = 12 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
494 print('Maximum length of raw mDNCs set as:', MaxLen1) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
495 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
496 if 'MaxLen2' in list(ParDict.keys()):#Maximum length for the refined mDNCs |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
497 MaxLen2 = int(ParDict['MaxLen2']) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
498 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
499 MaxLen2 = 7 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
500 print('Maximum length of refined mDNCs set as:', MaxLen2) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
501 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
502 if 'Pdiff' in list(ParDict.keys()):#Percent difference |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
503 Percent_difference = float(ParDict['Pdiff']) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
504 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
505 if int(ParDict['Taxon_rank']) == 1:#read in taxon rank to configure Pdiff parameter of artificial dataset |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
506 Percent_difference = 1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
507 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
508 Percent_difference = 5 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
509 print('simulated sequences up to', Percent_difference, 'percent divergent from original ones') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
510 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
511 if 'NMaxSeq' in list(ParDict.keys()):#Maximum number of sequences per taxon to be modified |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
512 Seq_per_clade_to_screw = int(ParDict['NMaxSeq']) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
513 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
514 Seq_per_clade_to_screw = 10####!changed value |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
515 print('Maximum number of sequences modified per clade', Seq_per_clade_to_screw) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
516 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
517 if 'Scoring' in list(ParDict.keys()): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
518 if ParDict['Scoring'] == 'lousy': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
519 threshold = 66####!changed value |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
520 elif ParDict['Scoring'] == 'moderate': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
521 threshold = 75####!changed value |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
522 elif ParDict['Scoring'] == 'stringent': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
523 threshold = 90####!changed value |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
524 elif ParDict['Scoring'] == 'very_stringent': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
525 threshold = 95####!changed value |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
526 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
527 threshold = 75####!changed value |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
528 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
529 threshold = 75####!changed value |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
530 #print(ParDict['Scoring'], 'scoring of the rDNCs; threshold in two consequtive runs:', threshold) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
531 print('scoring of the rDNCs; threshold in two consequtive runs:', threshold) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
532 #OCT2022 - start |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
533 if corr > 1 and len(ParDict['Iref'].split(',')) == 2: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
534 print('\nNOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the DNCs as in the provided reference.') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
535 if corr > 1 and ParDict['Iref'] == 'NO': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
536 print('\nNOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the rDNC as in the untrimmed alignment.') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
537 #OCT2022 - end |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
538 thephrase = 'The DNA diagnosis for the taxon' |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
539 ###################################################IMPLEMENTATION |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
540 #Setting up a new class just for the convenient output formatting |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
541 class SortedDisplayDict(dict):#dict whose str() lists its items as key: value pairs sorted by key inside square brackets; used only for output formatting |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
542 def __str__(self): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
543 return "[" + ", ".join("%r: %r" % (key, self[key]) for key in sorted(self)) + "]" |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
544 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
545 class SortedDisplayDictVerbose(dict):#dict whose str() lists, sorted by key, each value followed by 'in the site <key>'; used only for output formatting |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
546 def __str__(self): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
547 return ", ".join("%r %r" % (self[key],'in the site '+str(key)) for key in sorted(self)).replace("'", '').replace("A", "'A'").replace("C", "'C'").replace("G", "'G'").replace("T", "'T'")+'.' |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
548 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
549 #Calling functions and outputting results |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
550 if ParDict['OUTPUT_FILE'] == "str": |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
551 g = StringIO() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
552 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
553 g = open(ParDict['OUTPUT_FILE'], "w")#Initiating output file |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
554 #VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
555 print('<h4>########################## PARAMETERS ######################</h4>', file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
556 #print("<p>", 'input file:', ParDict['ORIG_FNAME'], "</p>", file=g) #outcommented ORIG_FNAME |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
557 print("<p>", 'input file:', ParDict['INPUT_FILE'], "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
558 print("<p>", 'Coding gaps as characters:', gaps2D, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
559 print("<p>", 'Maximum undetermined nucleotides allowed:', NumberN, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
560 print("<p>", 'Length of the alignment:', len(imported[0][2]),'->', FragmentLen, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
561 print("<p>", 'Indexing reference:', ParDict['Iref'].replace('NO', 'Not set').replace('in', 'included').replace('ex', 'excluded'), "</p>", file=g)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
562 print("<p>", 'Read in', len(raw_records), 'sequences', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
563 print("<p>", 'query taxa:', len(qCLADEs+withplus), '-', str(sorted(qCLADEs)+sorted(withplus)).replace('[','').replace(']','').replace("'", ''), "</p>", file=g)#1.3 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
564 print("<p>", 'Cutoff set as:', Cutoff, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
565 print("<p>", 'Number iterations of MolD set as:', N1, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
566 print("<p>", 'Maximum length of raw mDNCs set as:', MaxLen1, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
567 print("<p>", 'Maximum length of refined mDNCs set as:', MaxLen2, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
568 print("<p>", 'simulated sequences up to', Percent_difference, 'percent divergent from original ones', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
569 print("<p>", 'Maximum number of sequences modified per clade', Seq_per_clade_to_screw, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
570 #print("<p>", ParDict['Scoring'], 'scoring of the rDNCs; threshold in two consequtive runs:', threshold, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
571 print("<p>", 'scoring of the rDNCs; threshold in two consequtive runs:', threshold, "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
572 if corr > 1 and len(ParDict['Iref'].split(',')) == 2: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
573 print('<h4>NOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the DNCs as in the provided reference.</h4>', file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
574 if corr > 1 and ParDict['Iref'] == 'NO': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
575 print('<h4>NOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the rDNC as in the untrimmed alignment.</h4>', file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
576 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
577 print('<h4>########################### RESULTS ##########################</h4>', file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
578 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
579 for qCLADE in sorted(qCLADEs) + sorted(withplus):#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
580 if '+' in qCLADE: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
581 if shift == True: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
582 old_records = raw_records |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
583 shift == False |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
584 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
585 raw_records = old_records |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
586 spp = qCLADE.split('+') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
587 nrecords = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
588 for rec in raw_records: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
589 if rec[1] in spp: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
590 nrecords.append([rec[0], qCLADE, rec[2]]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
591 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
592 nrecords.append(rec) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
593 raw_records = nrecords |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
594 print('\n**************', qCLADE, '**************') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
595 print('<h4>**************', qCLADE, '**************</h4>', file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
596 Clades, clade_sorted_seqs, shared_positions = Step1(raw_records)#STEP1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
597 x,y,z,pures = C_VP_PP(clade_sorted_seqs, qCLADE, shared_positions, Cutoff)#STEP2 ####! added pures |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
598 newy = {key:y[key] for key in y if not key in pures} ####! newline |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
599 print('Sequences analyzed:', len(clade_sorted_seqs[qCLADE])) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
600 print("<p>",'Sequences analyzed:', len(clade_sorted_seqs[qCLADE]), "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
601 ND_combinations = [[item] for item in pures] ####! before ND_combinations were initiated as an empty list |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
602 print('single nucleotide mDNCs:', len(pures), '-', str(SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in pures]}))[1:-1])#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
603 print("<p>",'single nucleotide mDNCs*:',len(pures), '-', str(SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in pures]}))[1:-1], "</p>", file=g)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
604 N = 1 ####! |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
605 while N > 0:#STEP3 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
606 try: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
607 q = Diagnostic_combinations(qCLADE, x, newy, N1, MaxLen1, MaxLen2) ####! newy instead of y |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
608 except IndexError: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
609 print(N, 'IndexError') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
610 continue |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
611 for comb in q: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
612 if not comb in ND_combinations: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
613 ND_combinations.append(comb) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
614 N-=1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
615 ND_combinations.sort(key=len) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
616 #################################### mDNC output |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
617 try: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
618 Nind, KeyPos = IndependentKey(ND_combinations)#STEP4 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
619 except IndexError: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
620 print('no mDNCs recovered for', qCLADE)#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
621 print("<p>", 'no mDNCs recovered for', "</p>", qCLADE, file=g)#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
622 continue |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
623 Allpos = []#Create list of all positions involved in mDNCs |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
624 for comb in ND_combinations: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
625 for pos in comb: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
626 if not pos in Allpos: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
627 Allpos.append(pos) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
628 print('\nmDNCs retrieved:', str(len(ND_combinations)) + '; Sites involved:', str(len(Allpos)) + '; Independent mDNCs:', len(Nind))#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
629 print("<p>", 'mDNCs* retrieved:', str(len(ND_combinations)) + '; Sites involved:', str(len(Allpos)) + '; Independent mDNCs**:', len(Nind), "</p>", file=g)#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
630 print('Shortest retrieved mDNC:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in ND_combinations[0]]}), '\n')#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
631 print("<p>",'Shortest retrieved mDNC*:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in ND_combinations[0]]}), "</p>", file=g)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
632 ######################################################## rDNC output |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
633 Barcode_scores = []#Initiate a list for rDNC scores |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
634 npos = len(ND_combinations[0]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
635 BestBarcode = 'none'####! newline |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
636 while npos <= min([10, len(Allpos)]):#in this loop the positions are added one-by-one to a rDNC and the rDNC is then rated on the artificially generated datasets |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
637 Barcode = GenerateBarcode_new(ND_combinations, npos)#Initiate a rDNC |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
638 Barcode_score = 0#Initiate a score to rate a rDNC |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
639 N = 100 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
640 while N > 0: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
641 NComplist, NCPP = Screwed_dataset_new(raw_records, Seq_per_clade_to_screw, PosArrays, VarPosList, Percent_difference, qCLADE, Cutoff)#Create an artificial dataset VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
642 NBarcode = [i for i in Barcode if i in list(NCPP.keys())] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
643 if len(Barcode) - len(NBarcode) <= 1 and ConditionD(NBarcode, NComplist, NCPP) == True:####! new condition (first) added |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
644 Barcode_score +=1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
645 N -=1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
646 print(npos, 'rDNC_score (100):', [k+corr+1 for k in Barcode], '-', Barcode_score)#VERYNEW |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
647 print("<p>", npos, 'rDNC_score (100):', [k+corr+1 for k in Barcode], '-', Barcode_score, "</p>", file=g)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
648 if Barcode_score >= threshold and len(Barcode_scores) == 1: ###1.3 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
649 BestBarcode = Barcode###1.3 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
650 if Barcode_score >= threshold and len(Barcode_scores) > 1 and Barcode_score >= max(Barcode_scores): ###1.3 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
651 BestBarcode = Barcode####!newline |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
652 Barcode_scores.append(Barcode_score) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
653 if len(Barcode_scores) >= 2 and Barcode_scores[-1] >= threshold and Barcode_scores[-2] >= threshold:#Check whether the rDNC fulfills robustnes criteria 85:85:85 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
654 print('Final rDNC:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
655 print("<p>",'Final rDNC***:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
656 print('\n',thephrase, qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
657 print("<p>", thephrase, qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
658 break |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
659 else:# VERY NEW FROM HERE ONWARDS |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
660 npos += 1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
661 if npos > min([10, len(Allpos)]): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
662 if BestBarcode != 'none': |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
663 print('The highest scoring rDNC for taxon', qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
664 print("<p>", 'The highest scoring rDNC*** for taxon', qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
665 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
666 print('No sufficiently robust DNA diagnosis for taxon', qCLADE, 'was retrieved') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
667 print("<p>", 'No sufficiently robust DNA diagnosis for taxon', qCLADE, 'was retrieved', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
668 #OCT2022 - start |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
669 print("<h4>", '################################# EXPLANATIONS ####################################', "</h4>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
670 print("<p>", ' * mDNC -(=minimal Diagnostic nucleotide combination) is a combination of nucleotides at specified sites of the alignment,', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
671 print("<p>", ' unique for a query taxon. Therefore it is sufficient to differentiate a query taxon from all reference taxa in a dataset.', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
672 print("<p>", ' Because it comprises minimal necessary number of nucleotide sites to differentiate a query, any mutation in the mDNC in' "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
673 print("<p>", ' single specimen of a query taxon will automatically disqualify it as a diagnostic combination.', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
674 print("<p>", "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
675 print("<p>", ' ** two or more mDNCs are INDEPENDENT if they constitute non-overlapping sets of nucleotide sites.', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
676 print("<p>", "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
677 print("<p>", '*** rDNC -(=robust/redundant Diagnostic nucleotide combination) is a combination of nucleotides at specified sites of the alignment,', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
678 print("<p>", ' unique for a query taxon and (likewise mDNC) sufficient to differentiate a query taxon from all reference taxa in a dataset.', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
679 print("<p>", ' However, rDNC comprises more than a minimal necessary number of diagnostic sites, and therefore is robust to single nucleotide', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
680 print("<p>", ' replacements. Even if a mutation arises in one of the rDNC sites, the remaining ones will (with high probability) remain sufficient ', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
681 print("<p>", ' to diagnose the query taxon', "</p>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
682 print("<h4>", ' Final diagnosis corresponds to rDNC', "</h4>", file=g) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
683 #OCT2022 - end |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
684 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
685 if ParDict['OUTPUT_FILE'] == "str": |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
686 contents = g.getvalue() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
687 os.unlink(ParDict['INPUT_FILE']) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
688 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
689 contents = None |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
690 g.close() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
691 #OCT2022 - start |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
692 if len(P2) != 0: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
693 ext = '.'+ParDict['OUTPUT_FILE'].split('.')[-1] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
694 h = open(ParDict['OUTPUT_FILE'].replace(ext, '_pairwise'+ext), "w")#Initiating output file |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
695 if len(withplus) != 0: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
696 raw_records = old_records |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
697 taxain = [i[1] for i in raw_records] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
698 tpairs = [] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
699 for alist in P2: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
700 if alist.count('all')+alist.count('All')+alist.count('ALL') == 2: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
701 tpairs = getAllPairs(taxain) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
702 elif alist.count('all')+alist.count('All')+alist.count('ALL') == 1: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
703 thetax = [i for i in taxain if i in alist][0] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
704 print(thetax) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
705 for atax in sorted(list(set(taxain))): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
706 if atax != thetax: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
707 tpairs.append([thetax, atax]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
708 else: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
709 for apair in getAllPairs(alist): |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
710 tpairs.append(apair) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
711 for apair in tpairs: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
712 t1 = apair[0] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
713 t2 = apair[1] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
714 p2records = [i for i in raw_records if i[1] in [t1, t2]] |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
715 print('\n**************', t1, 'VS', t2,'**************') |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
716 print('<h4>**************', t1, 'VS', t2, '**************</h4>', file=h) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
717 C2, css2, sp2 = Step1(p2records)#STEP1 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
718 x2,y2,z2,pures2 = C_VP_PP(css2, t1, sp2, '>0')#STEP2 ####! added pures |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
719 Pairphrase = 'Each of the following '+ str(len(pures2))+' sites is invariant across sequences of '+ t1+ ' and differentiates it from '+ t2+': ' |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
720 counterPures = {} |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
721 for site in pures2: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
722 counterPures[site] = "'or'".join(list(set([thing[site] for thing in css2[t2] if thing[site] != 'N']))) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
723 Pairphrase = Pairphrase + str(site+corr)+" ('"+str(y2[site])+"' vs '"+str(counterPures[site])+"'), " |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
724 print(Pairphrase[:-2]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
725 print("<p>",Pairphrase[:-2],'</h4>', file=h)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
726 x2r,y2r,z2r,pures2r = C_VP_PP(css2, t2, sp2, '>0')#STEP2 ####! added pures |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
727 Pairphraser = 'Each of the following '+ str(len(pures2r))+' sites is invariant across sequences of '+ t2+ ' and differentiates it from '+ t1+': ' |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
728 counterPuresr = {} |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
729 for site in pures2r: |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
730 counterPuresr[site] = "'or'".join(list(set([thing[site] for thing in css2[t1] if thing[site] != 'N']))) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
731 Pairphraser = Pairphraser + str(site+corr)+" ('"+str(y2r[site])+"' vs '"+str(counterPuresr[site])+"'), " |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
732 print(Pairphraser[:-2]) |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
733 print("<p>",Pairphraser[:-2],'</h4>', file=h)#OCT2022 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
734 h.close() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
735 |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
736 #OCT2022 - end |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
737 return contents, qCLADEs |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
# Script entry point: when this file is executed directly (not imported), call mainprocessing() and unpack its two return values into c and q.
738 if __name__ == "__main__": |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
739 c, q = mainprocessing() |
4e8e2f836d0f
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
itaxotools
parents:
diff
changeset
|
740 |