comparison scripts/S02_remove_too_short_bit_or_whole_sequence.py @ 0:eb95bf7f90ae draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:26:37 -0500
parents
children c79bdda8abfb
comparison
equal deleted inserted replaced
-1:000000000000 0:eb95bf7f90ae
1 #!/usr/bin/env python
2 # coding: utf8
3 ## Author: Eric Fontanillas
4 ## Modification: 03/09/14 by Julie BAFFARD
5 ## Last modification : 05/03/18 by Victor Mataigne
6
7 ## Description : find and remove indels
8
9 ###################
10 ###### DEF 9 ######
11 ###################
12 def detect_short_indel(seq,MAX_LENGTH_SMALL_INDEL):
13 ## 1 ## Built the list of sublist of consecutive gap position
14 LIST = []
15 sublist=[]
16 ln = len(seq)
17 i=0
18 while i < ln:
19 if seq[i] == "-":
20 sublist.append(i) ## save gaps in sublist until a aa is found => else:
21 else:
22 LIST.append(sublist) ## save the list of gap
23 sublist = [] ## create new list of gap
24 i = i+1
25 ## if gap at the end: add the last "sublist of gap" (not done in previous loop, at it add sublist (of gaps) only when in find aa, but if gap at the end, no aa after are present, so cannot add this last sublist to the LISt of gaps
26 if sublist != []:
27 LIST.append(sublist)
28
29 ## 2 ## keep only the records of the small indel (<MAX_LENGTH_SMALL_INDEL)
30 list_of_sublist_positions = []
31 for element in LIST:
32 if element != [] and len(element)<=MAX_LENGTH_SMALL_INDEL:
33 list_of_sublist_positions.append(element)
34
35 return(list_of_sublist_positions)
36 ####################################
37
38
39 #######################
40 ##### RUN RUN RUN #####
41 #######################
42 import string, os, time, re, sys
43 from dico import dico
44
45 ### 0 ### PARAMETERS
46 MIN_LENGTH_ALL_aa = int(sys.argv[3])-20
47 MIN_LENGTH_BIT_OF_SEQUENCE_aa = int(sys.argv[4])
48 MAX_LENGTH_SMALL_INDEL = 2 ## in aa
49 MAX_LENGTH_SMALL_INDEL_nuc = 6 ## in nuc
50 MIN_SPECIES_NB = int(sys.argv[1])
51 dicoco = {}
52 dico_dico = {}
53 list_new_file = []
54 n0 = 0
55 e=0
56 j=0
57 i=1
58 name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]
59
60 ### 1 ### IN
61 if sys.argv[2] == "oui" :
62 path_IN1 = "./06_CDS_with_M_aa/"
63 L_IN1 = os.listdir(path_IN1)
64 path_IN2 = "./06_CDS_with_M_nuc/"
65 L_IN2 = os.listdir(path_IN2)
66 elif sys.argv[2] == "non" :
67 path_IN1 = "./05_CDS_aa/"
68 L_IN1 = os.listdir(path_IN1)
69 path_IN2 = "./05_CDS_nuc/"
70 L_IN2 = os.listdir(path_IN2)
71
72 ## 2 ## OUT
73 os.mkdir("07_CDS_aa")
74 path_OUT1 = "07_CDS_aa"
75 os.mkdir("07_CDS_nuc")
76 path_OUT2 = "07_CDS_nuc"
77
78 for file in L_IN1:
79 file_INaa = "%s/%s" %(path_IN1, file)
80 file_INnuc = "%s/%s" %(path_IN2, file)
81
82 dico_aa = dico(file_INaa) ### DEF 0 ###
83 dico_nuc = dico(file_INnuc) ### DEF 0 ###
84
85 new_bash_aa = {}
86 new_bash_nuc = {}
87 for fasta_name in dico_aa.keys():
88 seq = dico_aa[fasta_name]
89 seq_nuc = dico_nuc[fasta_name]
90
91 if "?" in seq:
92 seq = string.replace(seq, "?", "-")
93 if "?" in seq_nuc:
94 seq_nuc = string.replace(seq_nuc, "?", "-")
95
96 ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?"
97 ## aa
98 list_sublist_pos = detect_short_indel(seq, MAX_LENGTH_SMALL_INDEL) ### DEF 9 ###
99 for pos_short_indels in list_sublist_pos:
100 for pos in pos_short_indels:
101 seq = seq[:pos] + "?" + seq[pos+1:]
102 ## nuc
103 list_sublist_pos = detect_short_indel(seq_nuc, MAX_LENGTH_SMALL_INDEL_nuc) ### DEF 9 ###
104 for pos_short_indels in list_sublist_pos:
105 for pos in pos_short_indels:
106 seq_nuc = seq_nuc[:pos] + "?" + seq_nuc[pos+1:]
107
108 ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa")
109 LIST_sublist_aa=[]
110 S1 = string.split(seq, "-")
111 for element in S1:
112 if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa:
113 LIST_sublist_aa.append(element)
114
115 ## 4.3 ## [FILTER 3] : Remove all the sequence if the total length of all subsequences < "MIN_LENGTH_ALL_aa")
116 seq_all = ""
117 for bit_of_sequence in LIST_sublist_aa:
118 seq_all = seq_all + bit_of_sequence
119
120 if len(seq_all) < MIN_LENGTH_ALL_aa:
121 LIST_sublist_aa = []
122
123 ## 4.4 ## [FILTER 4] : Detect sublist position in the original sequence, and recreate the filtered sequence from these positions:
124 seq_gap = "-" * len(seq) ## 4.4.1 ## generate a sequence with only gaps inside
125 seq_gap_nuc = "-" * len(seq_nuc)
126
127 for subsequence in LIST_sublist_aa:
128 ## aa
129 START = string.find(seq, subsequence)
130 END = START + len(subsequence)
131 seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:] ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence
132 ## nuc
133 START_nuc = START*3
134 END_nuc = END*3
135 seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:]
136
137 ## 4.5 ## Save new sequence in bash if not empty
138 seq_empty_test = string.replace(seq_gap, "-", "")
139 if seq_empty_test != "":
140 new_bash_aa[fasta_name] = seq_gap
141
142 seq_empty_test = string.replace(seq_gap_nuc, "-", "")
143 if seq_empty_test != "":
144 new_bash_nuc[fasta_name] = seq_gap_nuc
145
146 # 4.6 ## Correct the nb of sequence in the output name, if necessary
147 n0 += 1
148 name_elems[1] = file.split('_')[1]
149 #name_elems[1] = str(n0)
150 name_elems[3] = str(len(new_bash_nuc.keys()))
151 new_name = "_".join(name_elems)
152 dico_dico[new_name] = [new_bash_aa, new_bash_nuc]
153 list_new_file.append(new_name)
154
155 ## [FILTER 6]: print output only if at least "MIN_SPECIES_NB" species remaining in the alignment
156 for name in list_new_file :
157 dicoo = dico_dico[name]
158 dico_aa = dicoo[0]
159 dico_nuc = dicoo[1]
160 sp_nbre = len(dico_aa.keys())
161
162 if sp_nbre >= MIN_SPECIES_NB :
163 file_OUTaa = open("%s/%s" %(path_OUT1, name), "w")
164 file_OUTnuc = open("%s/%s" %(path_OUT2, name), "w")
165
166 for fasta_name in dico_aa.keys() :
167 seq_aa = dico_aa[fasta_name]
168 file_OUTaa.write("%s\n" %fasta_name)
169 file_OUTaa.write("%s\n" %seq_aa)
170 for fasta_name in dico_nuc.keys() :
171 seq_nuc = dico_nuc[fasta_name]
172 file_OUTnuc.write("%s\n" %fasta_name)
173 file_OUTnuc.write("%s\n" %seq_nuc)
174
175 file_OUTaa.close()
176 file_OUTnuc.close()
177
178 else:
179 e+=1
180
181 ###Print
182 if sys.argv[2] == "oui" :
183 print "\nIn locus with CDS considering Methionine : \n"
184 else :
185 print "\nIn locus with CDS regardless of the Methionine : \n"
186
187 print "\nTotal number of locus recorded = %d" %n0