Mercurial > repos > itaxotools > mold

diff MolD_v1.4.py @ 0:4e8e2f836d0f draft default tip
planemo upload commit 232ce39054ce38be27c436a4cabec2800e14f988-dirty
author: itaxotools
date: Sun, 29 Jan 2023 16:25:48 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MolD_v1.4.py	Sun Jan 29 16:25:48 2023 +0000
@@ -0,0 +1,740 @@
+"""
+This script compiles rDNC-based DNA diagnoses for a pre-defined taxa in a dataset. This is the MAIN WORKING VERSION v1.4
+This version already implements the new functionalities:
+- automatic trimming of the alignment to match the median sequence length (should seq len distribution and provided NumberN settings require that)
+- expanded qCLADEs setting
+- diagnoses in pairwise comparisons of taxa.
+- selection of a reference sequence for indexing DNCs
+
+THIS version LACKS SPART compatibility
+
+"""
+import os, sys
+import random
+import argparse
+from io import StringIO
+######################################################################## FUNCTIONS
+#***STEP 1 - SORTING ENTRIES BY CLADE AND IDENTIFYING NUCLEOTIDE POSITIONS SHARED WITHIN CLADE
+def Step1(raw_records):
+    Clades=[]
+    for i in range(len(raw_records)):
+        Clade=raw_records[i][1]
+        if Clade not in Clades:
+            Clades.append(Clade)
+    clade_sorted_seqs = {}
+    for letter in Clades:
+        clade_sorted_seqs[letter]=[]
+        for i in range(len(raw_records)):
+            if raw_records[i][1]==letter:
+                clade_sorted_seqs[letter].append(raw_records[i][2])
+    shared_positions={}
+    for key in clade_sorted_seqs:
+        sh_pos=[]
+        for i in range(len(clade_sorted_seqs[key][0])):
+            shared_nucleotide = True
+            csm = clade_sorted_seqs[key][0][i] #candidate shared nucleotide
+            for j in range(1, len(clade_sorted_seqs[key])):
+                if clade_sorted_seqs[key][j][i] != csm:
+                    shared_nucleotide = False
+                    break
+            if shared_nucleotide == True and csm != 'N':
+                sh_pos.append(i)
+        shared_positions[key]=sh_pos
+    return Clades, clade_sorted_seqs, shared_positions
+
+#***STEP 2 COMPILING COMPARISON LISTS FOR CLADES AND IDENTIFYING VARIABLE POSITIONS AND N PRIORITY POSITIONS WITH LARGEST CUTOFFS
+def C_VP_PP(clade_sorted_seqs, clade, shared_positions, CUTOFF):# complist_variable_positions_priority_positions; Arguments: dictionary, string, dictionary
+    CShN={}#a dictionary keys - clade shared positions, values - nucleotides at those positions
+    for pos in shared_positions[clade]:
+        CShN[pos] = clade_sorted_seqs[clade][0][pos]#creates a dictionary shared position : nucleotide
+    complist=[]
+    for key in clade_sorted_seqs:
+        if key != clade:
+            complist = complist + clade_sorted_seqs[key]#creates a list of all other sequences for comparison
+    cutoffs = {}
+    pures = []####! newline
+    for key in CShN:
+        newcomplist = []
+        for k in complist:
+            if k[key] == CShN[key]:
+                newcomplist.append(k)
+            else: continue
+        cutoffs[key] = len(complist) - len(newcomplist)
+        if len(newcomplist) == 0:####! newline
+            pures.append(key)####! newline
+    CPP = []
+    for key in sorted(cutoffs, key = cutoffs.get, reverse = True):
+        CPP.append(key)
+    if CUTOFF[0] == '>':#VERYNEW
+        Clade_priority_positions = {pos:CShN[pos] for pos in CPP if cutoffs[pos] > int(CUTOFF[1:])}#VERYNEW
+    else:#VERYNEW
+        Clade_priority_positions = {}
+        for position in CPP[:int(CUTOFF)]:#Here you define how many of the clade shared combinations are used in subsequent search
+            Clade_priority_positions[position] = CShN[position]
+    return complist, Clade_priority_positions, cutoffs, pures####! pures added
+
+#***STEPS 3 RANDOM SEARCH ACROSS PRIORITY POSITIONS TO FIND RAW DIAGNOSTIC COMBINATIONS AND TO SUBSEQUENTLY REFINE THEM
+def random_position(somelist, checklist):#gives a random index (integer) of the specified range, and returns indexed somelist element if it is not present in the checklist 
+    while True:
+        i = random.randint(0, len(somelist) - 1)
+        if somelist[i] not in checklist:
+            return somelist[i]
+            break
+        else:
+            continue
+
+def step_reduction_complist(clade, complist, CPP, checked_ind):#checks randomly selected positions of CladeSharedNucleotides with sequences of other clades, until a diagnostic combination of nucleotides for a selected clade is found.
+    if len(complist) == 0:
+        return checked_ind
+    elif len(checked_ind) == len(CPP):
+        return checked_ind
+    else:
+        newcomplist = []
+        pos = random_position(list(CPP.keys()), checked_ind)
+        for j in complist:
+            if j[pos] == CPP[pos] or j[pos] == 'N':#VERYNEW
+                newcomplist.append(j)
+            else: continue
+        new_checked_ind = checked_ind + [pos]
+        return step_reduction_complist(clade, newcomplist, CPP, new_checked_ind)
+
+def ConditionD(newcomb, complist, CPP):#The function checks the 'Condition D' - i.e. whither any given combination of nucleotide positions is diagnostic for the selected clade
+    ContD = False
+    for i in newcomb:
+        newcomplist = []
+        for m in complist:
+            if m[i] == CPP[i]:
+                newcomplist.append(m)
+            else: continue
+        complist = newcomplist
+    if len(complist) == 0:
+        ContD = True
+    return ContD
+
+def RemoveRedundantPositions(raw_comb, complist, CPP):# The function removes positions from the raw combinations one by one, and then checks whether new combination fulfills the condition D, thus recursively reducing the diagnostic combination.
+    red_possible = False
+    for j in raw_comb:
+        newcomb = [k for k in raw_comb if k != j]
+        if ConditionD(newcomb, complist, CPP) == True:
+            red_possible = True
+            return RemoveRedundantPositions(newcomb, complist, CPP)
+        else: pass
+    if red_possible == False:
+        return raw_comb
+
+#PUTS EVERYTHING TOGETHER - 20000 ROUNDS OF RANDOM SEARCH FOLLOWED BY REFINING OF 500 SHORTEST COMBINATIONS
+def Diagnostic_combinations(qCLADE, complist, CPP, n1, maxlen1, maxlen2):
+    Achecked_ind = []
+    bestlists = []
+    n = n1
+    while n>0:#STEP3 proposes raw diagnostic combinations
+        raw_comb = step_reduction_complist(qCLADE, complist, CPP, Achecked_ind)
+        if len(raw_comb) <= maxlen1:
+            refined_comb = RemoveRedundantPositions(raw_comb, complist, CPP)
+            if len(refined_comb) <= maxlen2 and sorted(refined_comb) not in bestlists:
+                bestlists.append(sorted(refined_comb))
+        n=n-1
+    bestlists.sort(key=len)
+    return bestlists
+
+#***STEP 4 ANALYSIS OF OUTPUT rDNCs
+def IndependentKey(diagnostic_combinations):#PRESENTLY NOT INVOLVED - returns independent diagnostic nucleotide combinations, and identifies key nucleotide positions
+    independent_combinations = []
+    selected_positions = []
+    for i in range(len(diagnostic_combinations)):
+        if len(selected_positions) == 0:
+            for j in range(0, i):
+                if len(set(diagnostic_combinations[i]) & set(diagnostic_combinations[j])) == 0 and len(set(diagnostic_combinations[i]) & set(selected_positions)) == 0:
+                    independent_combinations.append(diagnostic_combinations[i])
+                    independent_combinations.append(diagnostic_combinations[j])
+                    for k in range(len(diagnostic_combinations[i])):
+                        selected_positions.append(diagnostic_combinations[i][k])
+                    for l in range(len(diagnostic_combinations[j])):
+                        selected_positions.append(diagnostic_combinations[j][l])
+        else:
+            if len(set(diagnostic_combinations[i]) & set(selected_positions)) == 0:
+                independent_combinations.append(diagnostic_combinations[i])
+                for k in range(len(diagnostic_combinations[i])):
+                    selected_positions.append(diagnostic_combinations[i][k])
+    independent_combinations.sort(key=len)
+    key_positions = []
+    for pos in diagnostic_combinations[0]:
+        KP = True
+        for combination in diagnostic_combinations[1:]:
+            if pos not in combination:
+                KP = False
+                break
+            else: continue
+        if KP == True:
+            key_positions.append(pos)
+    return independent_combinations, key_positions
+
+#SPECIFIC FUNCTIONS FOR THE rDNCs
+def PositionArrays(Motifs):#VERYNEW ALL FUNCTION NEW
+    PositionArrays = []
+    VarPosList = []
+    for i in range(len(Motifs[0])):
+        Const = True
+        array = [Motifs[0][i]]
+        for j in range(len(Motifs[1:])):
+            if Motifs[j][i] != 'N':
+                if Motifs[j][i] != array[-1]:
+                    Const = False
+                array.append(Motifs[j][i])
+        PositionArrays.append(array)
+        if Const == False:
+            VarPosList.append(i)
+    return PositionArrays, VarPosList
+
+def random_sequence_new(SEQ, PositionArrays, VarPosList, Pdiff):#VERYNEW FUNCTION REVISED
+    #print(["ROUND", len(SEQ)*Pdiff/100, round(len(SEQ)*Pdiff/100)])
+    n = round(len(SEQ)*Pdiff/100)
+    N = random.sample(list(range(1, n)), 1)[0]
+    PosToChange = random.sample([p for p in VarPosList if SEQ[p] != 'D'], N)#this is a new definition to keep alignment gaps unchanged
+    NEWSEQ = ''
+    for i in range(len(SEQ)):
+        if i not in PosToChange:
+            NEWSEQ = NEWSEQ + SEQ[i]
+        else:
+            newarray = [j for j in PositionArrays[i] if j != SEQ[i]]
+            newbase = random.sample(newarray, 1)[0]
+            NEWSEQ = NEWSEQ + newbase
+    return NEWSEQ
+
+def GenerateBarcode_new(Diagnostic_combinations, length):#VERYNEW FUNCTION REVISED - This function calculates diagnostic combinations and assembles a barcode of desired length for a query taxon. First all single position DNCs are added, then based on the frequency of a nucleotide position in the DNCs of the 2 positions, and then based on the frequency of a position in longer DNCs
+    len1 = []
+    len2 = []
+    lenmore = []
+    for comb in Diagnostic_combinations:
+        if len(comb) == len(Diagnostic_combinations[0]):
+            for i in comb:
+                len1.append(i)
+        elif len(comb) == len(Diagnostic_combinations[0])+1:
+            for j in comb:
+                len2.append(j)
+        else:
+            for k in comb:
+                lenmore.append(k)
+    if len(Diagnostic_combinations[0]) == 1:
+        Setin = len1
+    else:
+        Setin = []
+        for pos in sorted(len1, key=len1.count, reverse = True):
+            if not pos in Setin:
+                Setin.append(pos)
+    for pos1 in sorted(len2, key=len2.count, reverse = True):
+        if not pos1 in Setin:
+            Setin.append(pos1)
+    for pos2 in sorted(lenmore, key=lenmore.count, reverse = True):
+        if not pos2 in Setin:
+            Setin.append(pos2)
+    return Setin[:length]
+
+def Screwed_dataset_new(raw_records, nseq_per_clade_to_screw, PositionArrays, VarPosList, Percent_difference, Taxon, Cutoff):#VERYNEW FUNCTION REVISED
+    clades=[]
+    for i in range(len(raw_records)):
+        Clade=raw_records[i][1]
+        if Clade not in clades:
+            clades.append(Clade)
+    clade_sorted_seqs = {}
+    for letter in clades:
+        clade_sorted_seqs[letter]=[]
+        for i in range(len(raw_records)):
+            if raw_records[i][1]==letter:
+                clade_sorted_seqs[letter].append(raw_records[i][2])
+    
+    for clade in clades:
+        seqlist = clade_sorted_seqs[clade]
+        newseqs = []
+        if len(seqlist) > nseq_per_clade_to_screw:
+            iSTS = random.sample(list(range(len(seqlist))), nseq_per_clade_to_screw)
+            for k in range(len(seqlist)):
+                if k in iSTS:
+                    newseq = random_sequence_new(seqlist[k], PositionArrays, VarPosList, Percent_difference)
+                else:
+                    newseq = seqlist[k]
+                newseqs.append(newseq)
+        elif len(clade_sorted_seqs[clade]) == nseq_per_clade_to_screw:
+            for k in range(len(seqlist)):
+                newseq = random_sequence_new(seqlist[k], PositionArrays, VarPosList, Percent_difference)
+                newseqs.append(newseq)
+        else:
+            for i in range(nseq_per_clade_to_screw):
+                seq = random.sample(seqlist, 1)[0]
+                newseq = random_sequence_new(seq, PositionArrays, VarPosList, Percent_difference)
+                newseqs.append(newseq)
+        clade_sorted_seqs[clade] = newseqs
+    
+    shared_positions={}
+    for key in clade_sorted_seqs:
+        sh_pos=[]
+        for i in range(len(clade_sorted_seqs[key][0])):
+            shared_nucleotide = True
+            csm = clade_sorted_seqs[key][0][i] #candidate shared nucleotide
+            for j in range(1, len(clade_sorted_seqs[key])):
+                if clade_sorted_seqs[key][j][i] != csm:
+                    shared_nucleotide = False
+                    break
+            if shared_nucleotide == True and csm != 'N':
+                sh_pos.append(i)
+        shared_positions[key]=sh_pos
+    
+    x,y,z,pures = C_VP_PP(clade_sorted_seqs, Taxon, shared_positions, Cutoff)#STEP2####
+    return x, y
+
+#NEWFUNCYIONS
+def medianSeqLen(listofseqs):#OCT2022
+    seqlens = [i.count('A')+i.count('C')+i.count('G')+i.count('T') for i in listofseqs]
+    medlen = sorted(seqlens)[int(len(seqlens)/2)]
+    medseq = listofseqs[seqlens.index(medlen)]
+    start = min([medseq.find('A'),medseq.find('C'),medseq.find('G'),medseq.count('T')])
+    if not 'N' in medseq[start:]:
+        end = len(medseq)
+    else:
+        for i in range(start, len(medseq), 1):
+            if medseq[i:].count('N') == len(medseq[i:]):
+                end = i
+                break
+    return medlen, start, end
+
+def getAllPairs(taxalist):
+    uniquetaxapairs = []
+    stl = sorted(list(set(taxalist)))
+    for i in range(len(stl)):
+        for j in range(i+1, len(stl)):
+            uniquetaxapairs.append([stl[i],stl[j]])
+    return uniquetaxapairs
+        
+
+################################################READ IN PARAMETER FILE AND DATA FILE
+
+def get_args(): #arguments needed to give to this script
+    parser = argparse.ArgumentParser(description="run MolD")
+    required = parser.add_argument_group("required arguments")
+    required.add_argument("-i", help="textfile with parameters of the analysis", required=True)
+    return parser.parse_args()
+
+def mainprocessing(gapsaschars=None, taxalist=None, taxonrank=None, cutoff=None, numnucl=None, numiter=None, maxlenraw=None, maxlenrefined=None, iref=None, pdiff=None, nmax=None, thresh=None, tmpfname=None, origfname=None):
+    ParDict = {}
+    if not(all([gapsaschars, taxalist, taxonrank, cutoff, numnucl, numiter, maxlenraw, maxlenrefined, iref, pdiff, nmax, thresh, tmpfname])):
+        args = get_args()
+        with open(args.i) as params:
+            for line in params:
+                line = line.strip()
+                if line.startswith('#'):
+                    pass
+                else:
+                    if len(line.split('=')) == 2 and len(line.split('=')[1]) != 0:
+                        ParDict[line.split('=')[0]] = line.split('=')[1].replace(' ', '')#VERYNEW
+    else:
+        ParDict['Gaps_as_chars'] = gapsaschars
+        ParDict['qTAXA'] = taxalist
+        ParDict['Taxon_rank'] = taxonrank
+        ParDict['INPUT_FILE'] = tmpfname
+        ParDict['ORIG_FNAME'] = origfname# I do not understand this ORIG_FNAME, it throws an error with the command line
+        ParDict['Cutoff'] = cutoff
+        ParDict['NumberN'] = numnucl
+        ParDict['Number_of_iterations'] = numiter
+        ParDict['MaxLen1'] = maxlenraw
+        ParDict['MaxLen2'] = maxlenrefined
+        ParDict['Iref'] = iref
+        ParDict['Pdiff'] = pdiff
+        #ParDict['PrSeq'] = prseq
+        ParDict['NMaxSeq'] = nmax
+        ParDict['Scoring'] = thresh
+        ParDict['OUTPUT_FILE'] = "str"        
+    print(ParDict)
+    ############################################# #VERYNEW HOW GAPS ARE TREATED
+    #REQUIRES A NEW FIELD IN THE GUI
+    if ParDict['Gaps_as_chars'] == 'yes':
+        gaps2D = True#VERYNEW
+    else:#VERYNEW
+        gaps2D = False#VERYNEW
+    
+    ############################################ DATA FILE
+    checkread = open(ParDict['INPUT_FILE'], 'r')
+    firstline = checkread.readline()
+    checkread.close()
+    imported=[]#set up a new list with species and identifiers
+    if '>' in firstline:
+        f = open(ParDict['INPUT_FILE'], 'r') 
+        for line in f:#VERYNEW - THE DATA READING FROM ALIGNMENT FILE IS ALL REVISED UNTIL f.close()
+            line=line.rstrip()
+            if line.startswith('>'):
+                data = line[1:].split('|')
+                if len(data) != 2:
+                    print('Check number of entries in', data[0])
+                    #break
+                data.append('')
+                imported.append(data)
+            else:
+                if gaps2D == True:#
+                    DNA = line.upper().replace('-', 'D').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW
+                else:
+                    DNA = line.upper().replace('-', 'N').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW
+                imported[-1][-1] = imported[-1][-1]+DNA
+        f.close()
+    else:
+        f = open(ParDict['INPUT_FILE'], 'r') 
+        for line in f:#VERYNEW - THE DATA READING FROM ALIGNMENT FILE IS ALL REVISED UNTIL f.close()
+            line=line.rstrip()
+            data = line.split('\t')
+            if len(data) != 3:
+                print('Check number of entries in', data[0])
+            ID = data[0]
+            thetax = data[1]
+            if gaps2D == True:#
+                DNA = data[2].upper().replace('-', 'D').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW
+            else:
+                DNA = data[2].upper().replace('-', 'N').replace('R', 'N').replace('Y', 'N').replace('S','N').replace('W','N').replace('M','N').replace('K','N')#VERYNEW
+            imported.append([ID, thetax, DNA])
+        f.close()
+    
+    if 'NumberN' in list(ParDict.keys()):#How many ambiguously called nucleotides are allowed
+        NumberN = int(ParDict['NumberN'])
+    else:
+        NumberN = 5
+    
+    if len(set([len(i[2]) for i in imported])) != 1:
+        print('Alignment contains sequences of different lengths:', set([len(i[2]) for i in imported]))
+    else:
+        mlen, sstart, send = medianSeqLen([i[2] for i in imported])#OCT2022 - start
+        if mlen + NumberN < len(imported[0][2]):
+            Slice = True
+            FragmentLen = mlen
+            corr = sstart
+        else:
+            Slice = False
+            FragmentLen = len(imported[0][2])
+            sstart = 0
+            send = FragmentLen+1
+            corr = 0
+    raw_records=[]
+    for i in imported:
+        if ParDict['Iref'] != 'NO' and ParDict['Iref'].split(',')[0] == i[0] and ParDict['Iref'].split(',')[1] in ['ex', 'excl', 'out']:
+            continue
+        else:
+            if i[2][sstart:send].count('N') < NumberN and len(i[2][sstart:send]) == FragmentLen:
+                newi = [i[0], i[1], i[2][sstart:send]]
+                raw_records.append(newi)#OCT2022 - end
+    print('\n########################## PARAMETERS ######################\n')#VERYNEW
+    #print('input file:', ParDict['ORIG_FNAME']) #Outcommented ORIG_FNAME
+    print('input file:', ParDict['INPUT_FILE']) #Replacement of the line above
+    print('Coding gaps as characters:', gaps2D)
+    print('Maximum undetermined nucleotides allowed:', NumberN)
+    print('Length of the alignment:', len(imported[0][2]),'->', FragmentLen)
+    print('Indexing reference:', ParDict['Iref'].replace('NO', 'Not set').replace('in', 'included').replace('ex', 'excluded'))
+    print('Read in', len(raw_records), 'sequences')
+    
+    PosArrays, VarPosList = PositionArrays([i[2] for i in raw_records])#VERYNEW
+   
+    #############################################READ IN OTHER ANALYSIS PARAMETERS
+    ##OCT2022 - start
+    withplus = []
+    P2 = []
+    shift = True
+    if ParDict['qTAXA'][0] == '>':#THIS OPTION DIAGNOSES ALL TAXA WITH MORE THAN USER-DEFINED NUMBER OF SEQUENCES AVAILABLE
+        NumSeq = int(ParDict['qTAXA'][1:])
+        Taxarecords = [i[1] for i in raw_records]
+        qCLADEs = []
+        for j in set(Taxarecords):
+            if Taxarecords.count(j) >= NumSeq:
+                qCLADEs.append(j)
+    elif ParDict['qTAXA'].startswith('P:'):#THIS OPTION DIAGNOSES ALL TAXA CONTAINING A USER-DEFINED PATTERN IN THE NAME
+        pattern = ParDict['qTAXA'].split(':')[1]
+        Taxarecords = [i[1] for i in raw_records]
+        qCLADEs = []
+        for j in set(Taxarecords):
+            if pattern in j:
+                qCLADEs.append(j)
+    elif ParDict['qTAXA'].startswith('P+:'):#THIS OPTION POOLS ALL TAXA CONTAINING A USER-DEFINED PATTERN IN THE NAME IN ONE TAXON AND DIAGNOSES IT
+        pattern = ParDict['qTAXA'].split(':')[1]
+        Taxarecords = set([i[1] for i in raw_records if pattern in i[1]])
+        spp = '+'.join(sorted(list(Taxarecords)))
+        qCLADEs = [spp]
+        nrecords = []
+        for rec in raw_records:
+            if rec[1] in Taxarecords:
+                nrecords.append([rec[0], spp, rec[2]])
+            else:
+                nrecords.append(rec)
+        raw_records = nrecords
+    else:#THIS OPTION DIAGNOSES ALL TAXA FROM A USER-DEFINED LIST; TAXA MAY BE COMBINED BY USING '+'
+        qCLADEs = []
+        allrecs = ParDict['qTAXA'].split(',')
+        for item in allrecs:
+            if item in ['ALL', 'All', 'all']:#THIS OPTION DIAGNOSES ALL TAXA IN THE DATASET
+                qCLADEs = list(set([i[1] for i in raw_records]))
+            elif item in [i[1] for i in raw_records]:
+                qCLADEs.append(item)
+            elif '+' in item:
+                withplus.append(item)
+            elif 'VS' in item:
+                P2.append(item.split('VS'))
+            else:
+                print('UNRECOGNIZED TAXON', item)
+    #OCT2022 - end
+    print('query taxa:', len(qCLADEs+withplus), '-', str(sorted(qCLADEs)+sorted(withplus)).replace('[','').replace(']','').replace("'", ''))#1.3
+    
+    if 'Cutoff' in list(ParDict.keys()):#CUTOFF Number of the informative positions to be considered, default 100
+        Cutoff = ParDict['Cutoff']#VERYNEW
+    else:
+        Cutoff = 100
+    print('Cutoff set as:', Cutoff)
+    if 'Number_of_iterations' in list(ParDict.keys()):#Number iterations of MolD
+        N1 = int(ParDict['Number_of_iterations'])
+    else:
+        N1 = 10000
+    print('Number iterations of MolD set as:', N1)
+    
+    if 'MaxLen1' in list(ParDict.keys()):#Maximum length for the raw mDNCs
+        MaxLen1 = int(ParDict['MaxLen1'])
+    else:
+        MaxLen1 = 12
+    print('Maximum length of raw mDNCs set as:', MaxLen1)
+    
+    if 'MaxLen2' in list(ParDict.keys()):#Maximum length for the refined mDNCs
+        MaxLen2 = int(ParDict['MaxLen2'])
+    else:
+        MaxLen2 = 7
+    print('Maximum length of refined mDNCs set as:', MaxLen2)
+    
+    if 'Pdiff' in list(ParDict.keys()):#Percent difference
+        Percent_difference = float(ParDict['Pdiff'])
+    else:
+        if int(ParDict['Taxon_rank']) == 1:#read in taxon rank to configure Pdiff parameter of artificial dataset
+            Percent_difference = 1
+        else:
+            Percent_difference = 5
+    print('simulated sequences up to', Percent_difference, 'percent divergent from original ones')
+    
+    if 'NMaxSeq' in list(ParDict.keys()):#Maximum number of sequences per taxon to be modified
+        Seq_per_clade_to_screw = int(ParDict['NMaxSeq'])
+    else:
+        Seq_per_clade_to_screw = 10####!changed value
+    print('Maximum number of sequences modified per clade', Seq_per_clade_to_screw)
+    
+    if 'Scoring' in list(ParDict.keys()):
+        if ParDict['Scoring'] == 'lousy':
+            threshold = 66####!changed value
+        elif ParDict['Scoring'] == 'moderate':
+            threshold = 75####!changed value
+        elif ParDict['Scoring'] == 'stringent':
+            threshold = 90####!changed value
+        elif ParDict['Scoring'] == 'very_stringent':
+            threshold = 95####!changed value
+        else:
+            threshold = 75####!changed value
+    else:
+        threshold = 75####!changed value
+    #print(ParDict['Scoring'], 'scoring of the rDNCs; threshold in two consequtive runs:', threshold)
+    print('scoring of the rDNCs; threshold in two consequtive runs:', threshold)
+    #OCT2022 - start
+    if corr > 1 and len(ParDict['Iref'].split(',')) == 2:
+        print('\nNOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the DNCs as in the provided reference.')
+    if corr > 1 and ParDict['Iref'] == 'NO':
+        print('\nNOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the rDNC as in the untrimmed alignment.')
+    #OCT2022 - end
+    thephrase = 'The DNA diagnosis for the taxon'
+    ###################################################IMPLEMENTATION
+    #Setting up a new class just for the convenient output formatting
+    class SortedDisplayDict(dict):#this is only to get a likable formatting of the barcode
+        def __str__(self):
+            return "[" + ", ".join("%r: %r" % (key, self[key]) for key in sorted(self)) + "]"
+    
+    class SortedDisplayDictVerbose(dict):#this is only to get a likable formatting of the barcode
+        def __str__(self):
+            return ", ".join("%r %r" % (self[key],'in the site '+str(key)) for key in sorted(self)).replace("'", '').replace("A", "'A'").replace("C", "'C'").replace("G", "'G'").replace("T", "'T'")+'.'
+    
+    #Calling functions and outputing results
+    if ParDict['OUTPUT_FILE'] == "str":
+        g = StringIO()
+    else:
+        g = open(ParDict['OUTPUT_FILE'], "w")#Initiating output file
+    #VERYNEW
+    print('<h4>########################## PARAMETERS ######################</h4>', file=g)
+    #print("<p>", 'input file:', ParDict['ORIG_FNAME'], "</p>", file=g) #outcommented ORIG_FNAME
+    print("<p>", 'input file:', ParDict['INPUT_FILE'], "</p>", file=g)
+    print("<p>", 'Coding gaps as characters:', gaps2D, "</p>", file=g)
+    print("<p>", 'Maximum undetermined nucleotides allowed:', NumberN, "</p>", file=g)
+    print("<p>", 'Length of the alignment:', len(imported[0][2]),'->', FragmentLen, "</p>", file=g)
+    print("<p>", 'Indexing reference:', ParDict['Iref'].replace('NO', 'Not set').replace('in', 'included').replace('ex', 'excluded'), "</p>", file=g)#OCT2022
+    print("<p>", 'Read in', len(raw_records), 'sequences', "</p>", file=g)
+    print("<p>", 'query taxa:', len(qCLADEs+withplus), '-', str(sorted(qCLADEs)+sorted(withplus)).replace('[','').replace(']','').replace("'", ''), "</p>", file=g)#1.3
+    print("<p>", 'Cutoff set as:', Cutoff, "</p>", file=g)
+    print("<p>", 'Number iterations of MolD set as:', N1, "</p>", file=g)
+    print("<p>", 'Maximum length of raw mDNCs set as:', MaxLen1, "</p>", file=g)
+    print("<p>", 'Maximum length of refined mDNCs set as:', MaxLen2, "</p>", file=g)
+    print("<p>", 'simulated sequences up to', Percent_difference, 'percent divergent from original ones', "</p>", file=g)
+    print("<p>", 'Maximum number of sequences modified per clade', Seq_per_clade_to_screw, "</p>", file=g)
+    #print("<p>", ParDict['Scoring'], 'scoring of the rDNCs; threshold in two consequtive runs:', threshold, "</p>", file=g)
+    print("<p>", 'scoring of the rDNCs; threshold in two consequtive runs:', threshold, "</p>", file=g)
+    if corr > 1 and len(ParDict['Iref'].split(',')) == 2:
+        print('<h4>NOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the DNCs as in the provided reference.</h4>', file=g)
+    if corr > 1 and ParDict['Iref'] == 'NO':
+        print('<h4>NOTE: The alignment was trimmed automatically to match median sequences length. The analysed slice starts from the site',str(sstart+1),'and ends on the site',str(send+1),'. The site indexing in the rDNC as in the untrimmed alignment.</h4>', file=g)
+
+    print('<h4>########################### RESULTS ##########################</h4>', file=g)
+    
+    for qCLADE in sorted(qCLADEs) + sorted(withplus):#OCT2022
+        if '+' in qCLADE:
+            if shift == True:
+                old_records = raw_records
+                shift == False
+            else:
+                raw_records = old_records
+            spp = qCLADE.split('+')
+            nrecords = []
+            for rec in raw_records:
+                if rec[1] in spp:
+                    nrecords.append([rec[0], qCLADE, rec[2]])
+                else:
+                    nrecords.append(rec)
+            raw_records = nrecords
+        print('\n**************', qCLADE, '**************')
+        print('<h4>**************', qCLADE, '**************</h4>', file=g)
+        Clades, clade_sorted_seqs, shared_positions = Step1(raw_records)#STEP1
+        x,y,z,pures = C_VP_PP(clade_sorted_seqs, qCLADE, shared_positions, Cutoff)#STEP2 ####! added pures
+        newy = {key:y[key] for key in y if not key in pures} ####! newline
+        print('Sequences analyzed:', len(clade_sorted_seqs[qCLADE]))
+        print("<p>",'Sequences analyzed:', len(clade_sorted_seqs[qCLADE]), "</p>", file=g)
+        ND_combinations = [[item] for item in pures] ####! before ND_combinations were initiated as an empty list
+        print('single nucleotide mDNCs:', len(pures), '-', str(SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in pures]}))[1:-1])#OCT2022
+        print("<p>",'single nucleotide mDNCs*:',len(pures), '-', str(SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in pures]}))[1:-1], "</p>", file=g)#OCT2022
+        N = 1 ####!
+        while N > 0:#STEP3
+            try:
+                q = Diagnostic_combinations(qCLADE, x, newy, N1, MaxLen1, MaxLen2) ####! newy instead of y
+            except IndexError:
+                print(N, 'IndexError')
+                continue
+            for comb in q:
+                if not comb in ND_combinations:
+                    ND_combinations.append(comb)
+            N-=1
+        ND_combinations.sort(key=len)
+        #################################### mDNC output
+        try:
+            Nind, KeyPos = IndependentKey(ND_combinations)#STEP4
+        except IndexError:
+            print('no mDNCs recovered for', qCLADE)#VERYNEW
+            print("<p>", 'no mDNCs recovered for', "</p>", qCLADE, file=g)#VERYNEW
+            continue
+        Allpos = []#Create list of all positions involved in mDNCs
+        for comb in ND_combinations:
+            for pos in comb:
+                if not pos in Allpos:
+                    Allpos.append(pos)
+        print('\nmDNCs retrieved:', str(len(ND_combinations)) + '; Sites involved:', str(len(Allpos)) + '; Independent mDNCs:', len(Nind))#VERYNEW
+        print("<p>", 'mDNCs* retrieved:', str(len(ND_combinations)) + '; Sites involved:', str(len(Allpos)) + '; Independent mDNCs**:', len(Nind), "</p>", file=g)#VERYNEW
+        print('Shortest retrieved mDNC:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in ND_combinations[0]]}), '\n')#OCT2022
+        print("<p>",'Shortest retrieved mDNC*:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in ND_combinations[0]]}), "</p>", file=g)#OCT2022
+        ######################################################## rDNC output
+        Barcode_scores = []#Initiate a list for rDNC scores
+        npos = len(ND_combinations[0])
+        BestBarcode = 'none'####! newline
+        while npos <= min([10, len(Allpos)]):#in this loop the positions are added one-by-one to a rDNC and the rDNC is then rated on the artificially generated datasets
+            Barcode = GenerateBarcode_new(ND_combinations, npos)#Initiate a rDNC
+            Barcode_score = 0#Initiate a score to rate a rDNC
+            N = 100
+            while N > 0:
+                NComplist, NCPP = Screwed_dataset_new(raw_records, Seq_per_clade_to_screw, PosArrays, VarPosList, Percent_difference, qCLADE, Cutoff)#Create an artificial dataset VERYNEW
+                NBarcode = [i for i in Barcode if i in list(NCPP.keys())]
+                if len(Barcode) - len(NBarcode) <= 1 and ConditionD(NBarcode, NComplist, NCPP) == True:####! new condition (first) added
+                    Barcode_score +=1
+                N -=1
+            print(npos, 'rDNC_score (100):', [k+corr+1 for k in Barcode], '-', Barcode_score)#VERYNEW
+            print("<p>", npos, 'rDNC_score (100):', [k+corr+1 for k in Barcode], '-', Barcode_score, "</p>", file=g)#OCT2022
+            if Barcode_score >= threshold and len(Barcode_scores) == 1: ###1.3
+                BestBarcode = Barcode###1.3
+            if Barcode_score >= threshold and len(Barcode_scores) > 1 and Barcode_score >= max(Barcode_scores): ###1.3
+                BestBarcode = Barcode####!newline
+            Barcode_scores.append(Barcode_score)
+            if len(Barcode_scores) >= 2 and Barcode_scores[-1] >= threshold and Barcode_scores[-2] >= threshold:#Check whether the rDNC fulfills robustnes criteria 85:85:85
+                print('Final rDNC:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022
+                print("<p>",'Final rDNC***:', SortedDisplayDict({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022
+                print('\n',thephrase, qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022
+                print("<p>", thephrase, qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022
+                break
+            else:# VERY NEW FROM HERE ONWARDS
+                npos += 1
+                if npos > min([10, len(Allpos)]):
+                    if BestBarcode != 'none':
+                        print('The highest scoring rDNC for taxon', qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}))#OCT2022
+                        print("<p>", 'The highest scoring rDNC*** for taxon', qCLADE, 'is:', SortedDisplayDictVerbose({pos+corr : y[pos-1] for pos in [i+1 for i in BestBarcode]}), "</p>", file=g)#OCT2022
+                    else:
+                        print('No sufficiently robust DNA diagnosis for taxon', qCLADE, 'was retrieved')
+                        print("<p>", 'No sufficiently robust DNA diagnosis for taxon', qCLADE, 'was retrieved', "</p>", file=g)
+    #OCT2022 - start
+    print("<h4>", '################################# EXPLANATIONS ####################################', "</h4>", file=g)
+    print("<p>", '  * mDNC -(=minimal Diagnostic nucleotide combination) is a combination of nucleotides at specified sites of the alignment,', "</p>", file=g)
+    print("<p>", '    unique for a query taxon. Therefore it is sufficient to differentiate a query taxon from all reference taxa in a dataset.', "</p>", file=g)
+    print("<p>", '    Because it comprises minimal necessary number of nucleotide sites to differentiate a query, any mutation in the mDNC in'  "</p>", file=g)
+    print("<p>", '    single specimen of a query taxon will automatically disqualify it as a diagnostic combination.', "</p>", file=g)
+    print("<p>",  "</p>", file=g)
+    print("<p>", ' ** two or more mDNCs are INDEPENDENT if they constitute non-overlapping sets of nucleotide sites.', "</p>", file=g)
+    print("<p>",  "</p>", file=g)
+    print("<p>", '*** rDNC -(=robust/redundant Diagnostic nucleotide combination) is a combination of nucleotides at specified sites of the alignment,', "</p>", file=g)
+    print("<p>", '    unique for a query taxon and (likewise mDNC) sufficient to differentiate a query taxon from all reference taxa in a dataset.', "</p>", file=g)
+    print("<p>", '    However, rDNC comprises more than a minimal necessary number of diagnostic sites, and therefore is robust to single nucleotide', "</p>", file=g)
+    print("<p>", '    replacements. Even if a mutation arises in one of the rDNC sites, the remaining ones will (with high probability) remain sufficient ', "</p>", file=g)
+    print("<p>", '    to diagnose the query taxon', "</p>", file=g)
+    print("<h4>", '    Final diagnosis corresponds to rDNC', "</h4>", file=g)
+    #OCT2022 - end
+    
+    if ParDict['OUTPUT_FILE'] == "str":
+        contents = g.getvalue()
+        os.unlink(ParDict['INPUT_FILE'])
+    else:
+        contents = None
+    g.close()
+    #OCT2022 - start
+    if len(P2) != 0:
+        ext = '.'+ParDict['OUTPUT_FILE'].split('.')[-1]
+        h = open(ParDict['OUTPUT_FILE'].replace(ext, '_pairwise'+ext), "w")#Initiating output file
+        if len(withplus) != 0:
+            raw_records = old_records
+        taxain = [i[1] for i in raw_records]
+        tpairs = []
+        for alist in P2:
+            if alist.count('all')+alist.count('All')+alist.count('ALL') == 2:
+                tpairs = getAllPairs(taxain)
+            elif alist.count('all')+alist.count('All')+alist.count('ALL') == 1:
+                thetax = [i for i in taxain if i in alist][0]
+                print(thetax)
+                for atax in sorted(list(set(taxain))):
+                    if atax != thetax:
+                        tpairs.append([thetax, atax])
+            else:
+                for apair in getAllPairs(alist):
+                    tpairs.append(apair)
+        for apair in tpairs:
+            t1 = apair[0]
+            t2 = apair[1]
+            p2records = [i for i in raw_records if i[1] in [t1, t2]]
+            print('\n**************', t1, 'VS', t2,'**************')
+            print('<h4>**************', t1, 'VS', t2, '**************</h4>', file=h)
+            C2, css2, sp2 = Step1(p2records)#STEP1
+            x2,y2,z2,pures2 = C_VP_PP(css2, t1, sp2, '>0')#STEP2 ####! added pures
+            Pairphrase = 'Each of the following '+ str(len(pures2))+' sites is invariant across sequences of '+ t1+ ' and differentiates it from '+ t2+': '
+            counterPures = {}
+            for site in pures2:
+                counterPures[site] = "'or'".join(list(set([thing[site] for thing in css2[t2] if thing[site] != 'N'])))
+                Pairphrase = Pairphrase + str(site+corr)+" ('"+str(y2[site])+"' vs '"+str(counterPures[site])+"'), "
+            print(Pairphrase[:-2])
+            print("<p>",Pairphrase[:-2],'</h4>', file=h)#OCT2022
+            x2r,y2r,z2r,pures2r = C_VP_PP(css2, t2, sp2, '>0')#STEP2 ####! added pures
+            Pairphraser = 'Each of the following '+ str(len(pures2r))+' sites is invariant across sequences of '+ t2+ ' and differentiates it from '+ t1+': '
+            counterPuresr = {}
+            for site in pures2r:
+                counterPuresr[site] = "'or'".join(list(set([thing[site] for thing in css2[t1] if thing[site] != 'N'])))
+                Pairphraser = Pairphraser + str(site+corr)+" ('"+str(y2r[site])+"' vs '"+str(counterPuresr[site])+"'), "
+            print(Pairphraser[:-2])
+            print("<p>",Pairphraser[:-2],'</h4>', file=h)#OCT2022
+        h.close()
+            
+    #OCT2022 - end
+    return contents, qCLADEs
+if __name__ == "__main__":
+    c, q = mainprocessing()
+  
\ No newline at end of file
author	itaxotools
date	Sun, 29 Jan 2023 16:25:48 +0000
parents
children