Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/GetConsensus.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
| author | iss |
|---|---|
| date | Mon, 21 Mar 2022 15:23:09 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c6bab5103a14 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 # -*- coding: utf-8 -*- | |
| 3 """ | |
| 4 ############################################################################ | |
| 5 # Istituto Superiore di Sanita' | |
| 6 # European Union Reference Laboratory (EU-RL) for Escherichia coli, including Verotoxigenic E. coli (VTEC) | |
| 7 # Developer: Arnold Knijn arnold.knijn@iss.it | |
| 8 ############################################################################ | |
| 9 """ | |
| 10 | |
| 11 import argparse | |
| 12 import sys | |
| 13 import os | |
| 14 import subprocess | |
| 15 import shutil | |
| 16 | |
| 17 def __main__(): | |
| 18 parser = argparse.ArgumentParser() | |
| 19 parser.add_argument('-i', '--input', dest='input', help='input alignment file') | |
| 20 parser.add_argument('-o', '--output', dest='output', help='output consensus file') | |
| 21 args = parser.parse_args() | |
| 22 | |
| 23 sequences = [] | |
| 24 varsequences = [] | |
| 25 # read input | |
| 26 with open(args.input) as alignments: | |
| 27 for alignment in alignments: | |
| 28 if alignment[0] != ">": | |
| 29 sequences.append(alignment.rstrip()) | |
| 30 numsequences = len(sequences) | |
| 31 for j in range(0, numsequences + 1): | |
| 32 varsequences.append("") | |
| 33 lstnumvariants = [] | |
| 34 lstnumhyphens = [] | |
| 35 # loop over the columns | |
| 36 for i in range(0, len(sequences[0])): | |
| 37 variants = [] | |
| 38 numhyphens = 0 | |
| 39 # loop over the original rows to obtain variants | |
| 40 for j in range(0, numsequences): | |
| 41 if sequences[j][i:i+1] == "-": | |
| 42 numhyphens = numhyphens + 1 | |
| 43 if sequences[j][i:i+1] not in variants and sequences[j][i:i+1] != "-": | |
| 44 variants.append(sequences[j][i:i+1]) | |
| 45 lstnumvariants.append(len(variants)) | |
| 46 lstnumhyphens.append(numhyphens) | |
| 47 # create varsequences with a template of the variants | |
| 48 for j in range(0, numsequences): | |
| 49 if lstnumhyphens[i] == 0: | |
| 50 varsequences[j] = varsequences[j] + "-" | |
| 51 elif lstnumvariants[i] < 2: | |
| 52 varsequences[j] = varsequences[j] + "-" | |
| 53 else: | |
| 54 varsequences[j] = varsequences[j] + sequences[j][i:i+1] | |
| 55 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0: | |
| 56 varsequences[numsequences] = varsequences[numsequences] + variants[0] | |
| 57 else: | |
| 58 varsequences[numsequences] = varsequences[numsequences] + "-" | |
| 59 # loop over the columns, apply single variant | |
| 60 for i in range(0, len(sequences[0])): | |
| 61 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0: | |
| 62 # loop over all the rows to apply single variant to "-" | |
| 63 for j in range(0, len(sequences)): | |
| 64 if sequences[j][i:i+1] == "-": | |
| 65 lstsequence = list(sequences[j]) | |
| 66 lstsequence[i] = varsequences[numsequences][i:i+1] | |
| 67 sequences[j] = ''.join(lstsequence) | |
| 68 # loop over the rows of the sequences, apply multiple variants | |
| 69 for j in range(0, len(sequences)): | |
| 70 # loop over the columns | |
| 71 for i in range(0, len(sequences[0])): | |
| 72 variants = [] | |
| 73 if sequences[j][i:i+1] == "-" and lstnumvariants[i] > 1: | |
| 74 # loop over the rows of the varsequences | |
| 75 for k in range(0, numsequences): | |
| 76 if varsequences[k][i:i+1] not in variants and varsequences[k][i:i+1] != "-": | |
| 77 variants.append(varsequences[k][i:i+1]) | |
| 78 if len(variants) == 1: | |
| 79 lstsequence = list(sequences[j]) | |
| 80 lstsequence[i] = variants[0] | |
| 81 sequences[j] = ''.join(lstsequence) | |
| 82 else: | |
| 83 lstsequence[i] = variants[len(variants) - 1] | |
| 84 sequences.append(''.join(lstsequence)) | |
| 85 # eliminate duplicate sequences | |
| 86 sequences_unique = list(set(sequences)) | |
| 87 # write consensus sequences to output | |
| 88 consensus = open(args.output, 'w') | |
| 89 n = 1 | |
| 90 for sequence in sequences_unique: | |
| 91 consensus.write(">consensus_" + str(n) + "\n") | |
| 92 consensus.write(sequence + "\n") | |
| 93 n = n + 1 | |
| 94 | |
| 95 | |
| 96 if __name__ == "__main__": | |
| 97 __main__() |
