annotate scripts/S02_remove_too_short_bit_or_whole_sequence.py @ 1:c79bdda8abfb draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519
author abims-sbr
date Thu, 09 Jun 2022 12:40:00 +0000
parents eb95bf7f90ae
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
1 #!/usr/bin/env python
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
2 # coding: utf8
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
3 ## Author: Eric Fontanillas
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
4 ## Modification: 03/09/14 by Julie BAFFARD
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
5 ## Last modification : 05/03/18 by Victor Mataigne
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
6
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
7 ## Description : find and remove indels
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
8
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
9 ###################
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
10 ###### DEF 9 ######
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
11 ###################
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
def detect_short_indel(seq, MAX_LENGTH_SMALL_INDEL):
    """Return the positions of every short run of gaps found in ``seq``.

    A run is a maximal stretch of consecutive "-" characters. Runs are
    reported wherever they occur, including at the very start or the very
    end of the sequence. Runs longer than ``MAX_LENGTH_SMALL_INDEL`` are
    ignored.

    seq -- sequence in which "-" marks a gap.
    MAX_LENGTH_SMALL_INDEL -- longest run length (inclusive) still reported.

    Returns a list of lists, one per short run, in left-to-right order; each
    inner list holds the 0-based positions of the gaps of that run.
    """
    short_runs = []
    run = []  # positions of the gap run currently being read
    for pos, char in enumerate(seq):
        if char == "-":
            run.append(pos)
        elif run:
            # First non-gap character after a run: the run is complete.
            if len(run) <= MAX_LENGTH_SMALL_INDEL:
                short_runs.append(run)
            run = []

    # A run that reaches the end of the sequence is never closed by a
    # non-gap character, so it has to be handled after the loop.
    if run and len(run) <= MAX_LENGTH_SMALL_INDEL:
        short_runs.append(run)

    return short_runs
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
36 ####################################
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
37
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
38
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
39 #######################
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
40 ##### RUN RUN RUN #####
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
41 #######################
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
42 import string, os, time, re, sys
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
43 from dico import dico
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
44
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
### 0 ### PARAMETERS
# Command line: <min species nb> <oui|non> <min total length> <min fragment length>
# NOTE(review): the reason for the fixed "-20" offset is not visible here -- confirm.
MIN_LENGTH_ALL_aa = int(sys.argv[3])-20
MIN_LENGTH_BIT_OF_SEQUENCE_aa = int(sys.argv[4])
MAX_LENGTH_SMALL_INDEL = 2      ## in aa
MAX_LENGTH_SMALL_INDEL_nuc = 6      ## in nuc
MIN_SPECIES_NB = int(sys.argv[1])
# Several of the names below are not read in the visible part of the script;
# they are kept because code further down may rely on them.
dicoco = {}
dico_dico = {}      ## output file name -> [aa sequences, nuc sequences]
list_new_file = []  ## output file names, in processing order
n0 = 0
e=0
j=0
i=1
## Template of the output file name; fields 1 and 3 are filled in per file.
name_elems = ["orthogroup", "0", "with", "0", "species.fasta"]

### 1 ### IN
# sys.argv[2] ("oui" / "non", French for yes / no) selects which pair of
# input directories (protein and nucleotide) is read.
if sys.argv[2] == "oui" :
    path_IN1 = "./06_CDS_with_M_aa/"
    L_IN1 = os.listdir(path_IN1)
    path_IN2 = "./06_CDS_with_M_nuc/"
    L_IN2 = os.listdir(path_IN2)
elif sys.argv[2] == "non" :
    path_IN1 = "./05_CDS_aa/"
    L_IN1 = os.listdir(path_IN1)
    path_IN2 = "./05_CDS_nuc/"
    L_IN2 = os.listdir(path_IN2)
else :
    # Without this branch the input lists stay undefined and the script only
    # fails later, with a NameError, after the output directories were created.
    sys.exit("second argument must be 'oui' or 'non', got %r" % sys.argv[2])

## 2 ## OUT
os.mkdir("07_CDS_aa")
path_OUT1 = "07_CDS_aa"
os.mkdir("07_CDS_nuc")
path_OUT2 = "07_CDS_nuc"
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
77
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
## Filter every alignment: one protein file and the nucleotide file of the
## same name are processed together, sequence by sequence.
for file in L_IN1:
    file_INaa = "%s/%s" %(path_IN1, file)
    file_INnuc = "%s/%s" %(path_IN2, file)

    # dico() comes from the project's "dico" module; it is used here as a
    # mapping fasta name -> sequence string.
    dico_aa = dico(file_INaa)   ### DEF 0 ###
    dico_nuc = dico(file_INnuc)   ### DEF 0 ###

    new_bash_aa = {}    ## filtered protein sequences kept for this file
    new_bash_nuc = {}   ## filtered nucleotide sequences kept for this file
    for fasta_name in dico_aa.keys():
        seq = dico_aa[fasta_name]
        seq_nuc = dico_nuc[fasta_name]

        # "?" is reserved below to mark short indels, so any "?" already in
        # the input is first turned into an ordinary gap.
        seq = seq.replace("?", "-")
        seq_nuc = seq_nuc.replace("?", "-")

        ## 4.1 ## [FILTER 1] : Detect short indels and replace their symbol ("-", as for longer gaps) by a "?"
        ## aa
        list_sublist_pos = detect_short_indel(seq, MAX_LENGTH_SMALL_INDEL)   ### DEF 9 ###
        for pos_short_indels in list_sublist_pos:
            for pos in pos_short_indels:
                seq = seq[:pos] + "?" + seq[pos+1:]
        ## nuc
        list_sublist_pos = detect_short_indel(seq_nuc, MAX_LENGTH_SMALL_INDEL_nuc)   ### DEF 9 ###
        for pos_short_indels in list_sublist_pos:
            for pos in pos_short_indels:
                seq_nuc = seq_nuc[:pos] + "?" + seq_nuc[pos+1:]

        ## 4.2 ## [FILTER 2] : Keep only the bits of sequence strictly longer than "MIN_LENGTH_BIT_OF_SEQUENCE_aa"
        ## ("?" positions count in the length, since only "-" splits the sequence)
        LIST_sublist_aa = []      ## kept fragments
        LIST_sublist_start = []   ## start position in seq of each kept fragment
        S1 = seq.split("-")
        offset = 0
        for element in S1:
            if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa:
                LIST_sublist_aa.append(element)
                # The offset is recorded here instead of searching for the
                # fragment afterwards: a search returns the first match, which
                # is the wrong place when a fragment also occurs earlier.
                LIST_sublist_start.append(offset)
            offset = offset + len(element) + 1   ## +1 for the "-" separator

        ## 4.3 ## [FILTER 3] : Remove the whole sequence if the total length of all kept fragments < "MIN_LENGTH_ALL_aa"
        seq_all = "".join(LIST_sublist_aa)
        if len(seq_all) < MIN_LENGTH_ALL_aa:
            LIST_sublist_aa = []
            LIST_sublist_start = []

        ## 4.4 ## [FILTER 4] : Recreate the filtered sequence from the positions of the kept fragments
        seq_gap = "-" * len(seq)   ## 4.4.1 ## generate a sequence with only gaps inside
        seq_gap_nuc = "-" * len(seq_nuc)

        for subsequence, START in zip(LIST_sublist_aa, LIST_sublist_start):
            ## aa
            END = START + len(subsequence)
            seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:]   ## 4.4.2 ## replace the corresponding gaps by the coding subsequence
            ## nuc : one amino-acid position covers three nucleotide positions
            START_nuc = START*3
            END_nuc = END*3
            seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:]

        ## 4.5 ## Save new sequence in bash if not empty
        seq_empty_test = seq_gap.replace("-", "")
        if seq_empty_test != "":
            new_bash_aa[fasta_name] = seq_gap

        seq_empty_test = seq_gap_nuc.replace("-", "")
        if seq_empty_test != "":
            new_bash_nuc[fasta_name] = seq_gap_nuc

    ## 4.6 ## Correct the number of sequences in the output name, if necessary
    n0 += 1
    name_elems[1] = file.split('_')[1]   ## second "_"-separated field of the input file name
    name_elems[3] = str(len(new_bash_nuc.keys()))
    new_name = "_".join(name_elems)
    dico_dico[new_name] = [new_bash_aa, new_bash_nuc]
    list_new_file.append(new_name)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
154
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
## [FILTER 6]: write output only if at least MIN_SPECIES_NB sequences remain in the alignment
# dico_dico[name] holds the pair [amino-acid dict, nucleotide dict]; each dict
# maps a fasta header line to its sequence. For every locus name that passes
# the threshold, one file is written under path_OUT1 (amino acids) and one
# under path_OUT2 (nucleotides), both named after the locus.
for name in list_new_file:
    dico_aa, dico_nuc = dico_dico[name]
    sp_nbre = len(dico_aa)

    if sp_nbre < MIN_SPECIES_NB:
        # Not enough sequences left: count the locus in 'e' and write nothing.
        e += 1
        continue

    # 'with' closes each handle even if a write raises (the original
    # open()/close() pairs leaked the handles on error).
    with open("%s/%s" % (path_OUT1, name), "w") as file_OUTaa:
        for fasta_name, seq_aa in dico_aa.items():
            file_OUTaa.write("%s\n" % fasta_name)
            file_OUTaa.write("%s\n" % seq_aa)

    with open("%s/%s" % (path_OUT2, name), "w") as file_OUTnuc:
        for fasta_name, seq_nuc in dico_nuc.items():
            file_OUTnuc.write("%s\n" % fasta_name)
            file_OUTnuc.write("%s\n" % seq_nuc)
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
180
eb95bf7f90ae planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
abims-sbr
parents:
diff changeset
### Print
# The header wording is selected by the second command-line argument:
# "oui" gives the "considering Methionine" text, any other value the
# "regardless of the Methionine" text.
print(
    "\nIn locus with CDS considering Methionine : \n"
    if sys.argv[2] == "oui"
    else "\nIn locus with CDS regardless of the Methionine : \n"
)

# n0 is incremented once per locus recorded earlier in the script.
print("\nTotal number of locus recorded = %d" % n0)