comparison scripts/S02_remove_too_short_bit_or_whole_sequence.py @ 1:c79bdda8abfb draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519
author abims-sbr
date Thu, 09 Jun 2022 12:40:00 +0000
parents eb95bf7f90ae
children
comparison
equal deleted inserted replaced
0:eb95bf7f90ae 1:c79bdda8abfb
87 for fasta_name in dico_aa.keys(): 87 for fasta_name in dico_aa.keys():
88 seq = dico_aa[fasta_name] 88 seq = dico_aa[fasta_name]
89 seq_nuc = dico_nuc[fasta_name] 89 seq_nuc = dico_nuc[fasta_name]
90 90
91 if "?" in seq: 91 if "?" in seq:
92 seq = string.replace(seq, "?", "-") 92 seq = str.replace(seq, "?", "-")
93 if "?" in seq_nuc: 93 if "?" in seq_nuc:
94 seq_nuc = string.replace(seq_nuc, "?", "-") 94 seq_nuc = str.replace(seq_nuc, "?", "-")
95 95
96 ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?" 96 ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?"
97 ## aa 97 ## aa
98 list_sublist_pos = detect_short_indel(seq, MAX_LENGTH_SMALL_INDEL) ### DEF 9 ### 98 list_sublist_pos = detect_short_indel(seq, MAX_LENGTH_SMALL_INDEL) ### DEF 9 ###
99 for pos_short_indels in list_sublist_pos: 99 for pos_short_indels in list_sublist_pos:
105 for pos in pos_short_indels: 105 for pos in pos_short_indels:
106 seq_nuc = seq_nuc[:pos] + "?" + seq_nuc[pos+1:] 106 seq_nuc = seq_nuc[:pos] + "?" + seq_nuc[pos+1:]
107 107
108 ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa") 108 ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa")
109 LIST_sublist_aa=[] 109 LIST_sublist_aa=[]
110 S1 = string.split(seq, "-") 110 S1 = str.split(seq, "-")
111 for element in S1: 111 for element in S1:
112 if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa: 112 if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa:
113 LIST_sublist_aa.append(element) 113 LIST_sublist_aa.append(element)
114 114
115 ## 4.3 ## [FILTER 3] : Remove all the sequence if the total length of all subsequences < "MIN_LENGTH_ALL_aa") 115 ## 4.3 ## [FILTER 3] : Remove all the sequence if the total length of all subsequences < "MIN_LENGTH_ALL_aa")
124 seq_gap = "-" * len(seq) ## 4.4.1 ## generate a sequence with only gaps inside 124 seq_gap = "-" * len(seq) ## 4.4.1 ## generate a sequence with only gaps inside
125 seq_gap_nuc = "-" * len(seq_nuc) 125 seq_gap_nuc = "-" * len(seq_nuc)
126 126
127 for subsequence in LIST_sublist_aa: 127 for subsequence in LIST_sublist_aa:
128 ## aa 128 ## aa
129 START = string.find(seq, subsequence) 129 START = str.find(seq, subsequence)
130 END = START + len(subsequence) 130 END = START + len(subsequence)
131 seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:] ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence 131 seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:] ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence
132 ## nuc 132 ## nuc
133 START_nuc = START*3 133 START_nuc = START*3
134 END_nuc = END*3 134 END_nuc = END*3
135 seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:] 135 seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:]
136 136
137 ## 4.5 ## Save new sequence in bash if not empty 137 ## 4.5 ## Save new sequence in bash if not empty
138 seq_empty_test = string.replace(seq_gap, "-", "") 138 seq_empty_test = str.replace(seq_gap, "-", "")
139 if seq_empty_test != "": 139 if seq_empty_test != "":
140 new_bash_aa[fasta_name] = seq_gap 140 new_bash_aa[fasta_name] = seq_gap
141 141
142 seq_empty_test = string.replace(seq_gap_nuc, "-", "") 142 seq_empty_test = str.replace(seq_gap_nuc, "-", "")
143 if seq_empty_test != "": 143 if seq_empty_test != "":
144 new_bash_nuc[fasta_name] = seq_gap_nuc 144 new_bash_nuc[fasta_name] = seq_gap_nuc
145 145
146 # 4.6 ## Correct the nb of sequence in the output name, if necessary 146 # 4.6 ## Correct the nb of sequence in the output name, if necessary
147 n0 += 1 147 n0 += 1
178 else: 178 else:
179 e+=1 179 e+=1
180 180
181 ###Print 181 ###Print
182 if sys.argv[2] == "oui" : 182 if sys.argv[2] == "oui" :
183 print "\nIn locus with CDS considering Methionine : \n" 183 print("\nIn locus with CDS considering Methionine : \n")
184 else : 184 else :
185 print "\nIn locus with CDS regardless of the Methionine : \n" 185 print("\nIn locus with CDS regardless of the Methionine : \n")
186 186
187 print "\nTotal number of locus recorded = %d" %n0 187 print("\nTotal number of locus recorded = %d" %n0)