Mercurial > repos > abims-sbr > cds_search
changeset 1:c79bdda8abfb draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3a118aa934e6406cc8b0b24d006af6365c277519
author | abims-sbr |
---|---|
date | Thu, 09 Jun 2022 12:40:00 +0000 |
parents | eb95bf7f90ae |
children | |
files | CDS_search.xml macros.xml scripts/S01_find_orf_on_multiple_alignment.py scripts/S02_remove_too_short_bit_or_whole_sequence.py scripts/S03_remove_site_with_not_enough_species_represented.py scripts/dico.py |
diffstat | 6 files changed, 153 insertions(+), 96 deletions(-) [+] |
line wrap: on
line diff
--- a/CDS_search.xml Fri Feb 01 10:26:37 2019 -0500 +++ b/CDS_search.xml Thu Jun 09 12:40:00 2022 +0000 @@ -1,4 +1,4 @@ -<tool name="CDS_search" id="cds_search" version="2.1.2"> +<tool name="CDS_search" id="cds_search" version="2.2.2"> <description> ORF and CDS search @@ -9,7 +9,7 @@ </macros> <requirements> - <expand macro="python_required" /> + <requirement type="package" version="1.79">biopython</requirement> </requirements> <command><![CDATA[
--- a/macros.xml Fri Feb 01 10:26:37 2019 -0500 +++ b/macros.xml Thu Jun 09 12:40:00 2022 +0000 @@ -4,6 +4,10 @@ <requirement type="package" version="2.7">python</requirement> </xml> + <xml name="python3_required"> + <requirement type="package" version="1.79">biopython</requirement> + </xml> + <token name="@HELP_AUTHORS@"> .. class:: infomark
--- a/scripts/S01_find_orf_on_multiple_alignment.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/S01_find_orf_on_multiple_alignment.py Thu Jun 09 12:40:00 2022 +0000 @@ -2,74 +2,104 @@ # coding: utf8 # Author: Eric Fontanillas # Modification: 03/09/14 by Julie BAFFARD -# Last modification : 25/07/18 by Victor Mataigne +# Last modification : 10/09/21 by Charlotte Berthelier + +""" +Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria + +- CRITERIA 1 + +Get the longest part of the sequence alignemen without codon stop "*", +and test in the 3 potential ORF and check with a Blast the best coding sequence + +- CRITERIA 2 + +This longest part should be > 150nc or 50aa -# Description: Predict potential ORF on the basis of 2 criteria + 1 optional criteria - # CRITERIA 1 - Longest part of the alignment of sequence without codon stop "*", tested in the 3 potential ORF - # CRITERIA 2 - This longest part should be > 150nc or 50aa - # CRITERIA 3 - [OPTIONNAL] A codon start "M" should be present in this longuest part, before the last 50 aa - # OUTPUTs "05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA - # OUTPUTs "06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA +- CRITERIA 3 [OPTIONNAL] + +A codon start "M" should be present in this longuest part, before the last 50 aa -import string, os, time, re, zipfile, sys, argparse +OUTPUTs: +"05_CDS_aa" & "05_CDS_nuc" => NOT INCLUDE THIS CRITERIA +"06_CDS_with_M_aa" & "06_CDS_with_M_nuc" => INCLUDE THIS CRITERIA +""" + +import os +import re +import argparse + +from Bio.Blast import NCBIWWW +from Bio.Blast import NCBIXML from dico import dico -def code_universel(F1): + +def code_universel(file1): """ Creates bash for genetic code (key : codon ; value : amino-acid) """ - bash_codeUniversel = {} + bash_code_universel = {} - with open(F1, "r") as file: + with open(file1, "r") as file: for line in file.readlines(): - L1 = string.split(line, " ") - length1 = len(L1) + item = str.split(line, " ") + length1 = len(item) if length1 == 3: - key = L1[0] - value = L1[2][:-1] - bash_codeUniversel[key] = value + key = item[0] + value = item[2][:-1] + bash_code_universel[key] = value else: - key = L1[0] - value = L1[2] - bash_codeUniversel[key] = value + key = item[0] + value = item[2] + bash_code_universel[key] = value - return(bash_codeUniversel) + return bash_code_universel + def multiple3(seq): - """ Tests if the sequence is a multiple of 3, and if not removes extra-bases - !! Possible to lost a codon, when I test ORF (as I will decay the ORF) """ - - m = len(seq)%3 - if m != 0 : - return seq[:-m], m - else : - return seq, m + """ + Tests if the sequence is a multiple of 3, and if not removes extra-bases + Possible to lost a codon, when I test ORF (as I will decay the ORF) + """ -def detect_Methionine(seq_aa, Ortho, minimal_cds_length): + multiple = len(seq) % 3 + if multiple != 0: + return seq[:-multiple], multiple + return seq, multiple + + +def detect_methionine(seq_aa, ortho, minimal_cds_length): """ Detects if methionin in the aa sequence """ - ln = len(seq_aa) - CUTOFF_Last_50aa = ln - minimal_cds_length + size = len(seq_aa) + cutoff_last_50aa = size - minimal_cds_length - # Find all indices of occurances of "M" in a string of aa + # Find all indices of occurances of "M" in a string of aa list_indices = [pos for pos, char in enumerate(seq_aa) if char == "M"] - # If some "M" are present, find whether the first "M" found is not in the 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS + # If some "M" are present, find whether the first "M" found is not in the + # 50 last aa (indice < CUTOFF_Last_50aa) ==> in this case: maybenot a CDS if list_indices != []: - first_M = list_indices[0] - if first_M < CUTOFF_Last_50aa: - Ortho = 1 # means orthologs found + first_m = list_indices[0] + if first_m < cutoff_last_50aa: + ortho = 1 # means orthologs found - return(Ortho) + return ortho -def ReverseComplement2(seq): + +def reverse_complement2(seq): """ Reverse complement DNA sequence """ seq1 = 'ATCGN-TAGCN-atcgn-tagcn-' - seq_dict = { seq1[i]:seq1[i+6] for i in range(24) if i < 6 or 12<=i<=16 } + seq_dict = {seq1[i]: seq1[i + 6] + for i in range(24) if i < 6 or 12 <= i <= 16} return "".join([seq_dict[base] for base in reversed(seq)]) -def simply_get_ORF(seq_dna, gen_code): - seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i+3] for i in range(0, len(seq_dna), 3)] - seq_by_aa = [gen_code[codon] if codon in gen_code.keys() else '?' for codon in seq_by_codons] + +def simply_get_orf(seq_dna, gen_code): + """ Generate the ORF sequence from DNA sequence """ + seq_by_codons = [seq_dna.upper().replace('T', 'U')[i:i + 3] + for i in range(0, len(seq_dna), 3)] + seq_by_aa = [gen_code[codon] if codon in gen_code.keys() + else '?' for codon in seq_by_codons] return ''.join(seq_by_aa) @@ -78,29 +108,31 @@ # Criteria 1 : Get the segment in the alignment with no codon stop # 1 - Get the list of aligned aa seq for the 3 ORF: + print("1 - Get the list of aligned aa seq for the 3 ORF") bash_of_aligned_aa_seq_3ORF = {} bash_of_aligned_nuc_seq_3ORF = {} BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION = [] - + for fasta_name in bash_aligned_nc_seq.keys(): # Get sequence, chek if multiple 3, then get 6 orfs sequence_nc = bash_aligned_nc_seq[fasta_name] new_sequence_nc, modulo = multiple3(sequence_nc) - new_sequence_rev = ReverseComplement2(new_sequence_nc) + new_sequence_rev = reverse_complement2(new_sequence_nc) # For each seq of the multialignment => give the 6 ORFs (in nuc) bash_of_aligned_nuc_seq_3ORF[fasta_name] = [new_sequence_nc, new_sequence_nc[1:-2], new_sequence_nc[2:-1], new_sequence_rev, new_sequence_rev[1:-2], new_sequence_rev[2:-1]] - seq_prot_ORF1 = simply_get_ORF(new_sequence_nc, bash_codeUniversel) - seq_prot_ORF2 = simply_get_ORF(new_sequence_nc[1:-2], bash_codeUniversel) - seq_prot_ORF3 = simply_get_ORF(new_sequence_nc[2:-1], bash_codeUniversel) - seq_prot_ORF4 = simply_get_ORF(new_sequence_rev, bash_codeUniversel) - seq_prot_ORF5 = simply_get_ORF(new_sequence_rev[1:-2], bash_codeUniversel) - seq_prot_ORF6 = simply_get_ORF(new_sequence_rev[2:-1], bash_codeUniversel) - + seq_prot_ORF1 = simply_get_orf(new_sequence_nc, bash_codeUniversel) + seq_prot_ORF2 = simply_get_orf(new_sequence_nc[1:-2], bash_codeUniversel) + seq_prot_ORF3 = simply_get_orf(new_sequence_nc[2:-1], bash_codeUniversel) + seq_prot_ORF4 = simply_get_orf(new_sequence_rev, bash_codeUniversel) + seq_prot_ORF5 = simply_get_orf(new_sequence_rev[1:-2], bash_codeUniversel) + seq_prot_ORF6 = simply_get_orf(new_sequence_rev[2:-1], bash_codeUniversel) + # For each seq of the multialignment => give the 6 ORFs (in aa) bash_of_aligned_aa_seq_3ORF[fasta_name] = [seq_prot_ORF1, seq_prot_ORF2, seq_prot_ORF3, seq_prot_ORF4, seq_prot_ORF5, seq_prot_ORF6] # 2 - Test for the best ORF (Get the longuest segment in the alignment with no codon stop ... for each ORF ... the longuest should give the ORF) + print("2 - Test for the best ORF") BEST_MAX = 0 for i in [0,1,2,3,4,5]: # Test the 6 ORFs @@ -170,6 +202,7 @@ # 3 - ONCE we have this better segment (BEST CODING SEGMENT) # ==> GET THE STARTING and ENDING POSITIONS (in aa position and in nuc position) # And get the INDEX of the best ORF [0, 1, or 2] + print("3 - ONCE we have this better segment") if BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION != []: pos_MIN_aa = BEST_LONGUEST_SUBSEQUENCE_LIST_POSITION[0] pos_MIN_aa = pos_MIN_aa - 1 @@ -186,6 +219,7 @@ BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] = seq_coding # 4 - Get the corresponding position (START/END of BEST CODING SEGMENT) for nucleotides alignment + print("4 - Get the corresponding position") pos_MIN_nuc = pos_MIN_aa * 3 pos_MAX_nuc = pos_MAX_aa * 3 @@ -196,6 +230,25 @@ seq_coding = seq[pos_MIN_nuc:pos_MAX_nuc] BESTORF_bash_aligned_nc_seq[fasta_name] = seq BESTORF_bash_aligned_nc_seq_CODING[fasta_name] = seq_coding + seq_cutted = re.sub(r'^.*?[a-zA-Z]', '', seq) + sequence_for_blast=(fasta_name+'\n'+seq_cutted+'\n') + good_ORF_found = False + try: + #result_handle = "" + #blast_records = "" + # logger.debug("sequence_for_blast = %s ", sequence_for_blast) + print('sequence_for_blast = %s ',sequence_for_blast, end=' ', flush=True) + result_handle = NCBIWWW.qblast("blastn", "/db/nt/current/fasta/nt.fsa", sequence_for_blast, expect=0.001, hitlist_size=1) + blast_records = NCBIXML.parse(result_handle) + except: + good_ORF_found = False + else: + for blast_record in blast_records: + for alignment in blast_record.alignments: + for hsp in alignment.hsps: + if hsp.expect < 0.001: + good_ORF_found = True + print("good_ORF_found = %s" %good_ORF_found) else: # no CDS found BESTORF_bash_aligned_nc_seq = {} @@ -204,14 +257,14 @@ BESTORF_bash_of_aligned_aa_seq_CODING ={} # Check whether their is a "M" or not, and if at least 1 "M" is present, that it is not in the last 50 aa - + BESTORF_bash_of_aligned_aa_seq_CDS_with_M = {} BESTORF_bash_of_aligned_nuc_seq_CDS_with_M = {} Ortho = 0 for fasta_name in BESTORF_bash_of_aligned_aa_seq_CODING.keys(): seq_aa = BESTORF_bash_of_aligned_aa_seq_CODING[fasta_name] - Ortho = detect_Methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### + Ortho = detect_methionine(seq_aa, Ortho, minimal_cds_length) ### DEF6 ### # CASE 1: A "M" is present and correctly localized (not in last 50 aa) if Ortho == 1: @@ -243,7 +296,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("codeUniversel", help="File describing the genetic code (code_universel_modified.txt") - parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) + parser.add_argument("min_cds_len", help="Minmal length of a CDS (in amino-acids)", type=int) parser.add_argument("min_spec", help="Minimal number of species per alignment") parser.add_argument("list_files", help="File with all input files names") args = parser.parse_args() @@ -251,7 +304,7 @@ minimal_cds_length = int(args.min_cds_len) # in aa number bash_codeUniversel = code_universel(args.codeUniversel) minimum_species = int(args.min_spec) - + # Inputs from file containing list of species list_files = [] with open(args.list_files, 'r') as f: @@ -278,14 +331,14 @@ count_file_processed = count_file_processed + 1 nb_gp = file.split('_')[1] # Keep trace of the orthogroup number - fasta_file_path = "./%s" %file + fasta_file_path = "./%s" %file bash_fasta = dico(fasta_file_path) BESTORF_nuc, BESTORF_nuc_CODING, BESTORF_nuc_CDS_with_M, BESTORF_aa, BESTORF_aa_CODING, BESTORF_aa_CDS_with_M = find_good_ORF_criteria_3(bash_fasta, bash_codeUniversel, minimal_cds_length, minimum_species) - + name_elems[1] = nb_gp # Update counts and write group in corresponding output directory - if BESTORF_nuc != {}: + if BESTORF_nuc != {}: count_file_with_CDS += 1 if len(BESTORF_nuc.keys()) >= minimum_species : count_file_with_cds_and_enought_species += 1 @@ -305,14 +358,14 @@ write_output_file(BESTORF_nuc_CDS_with_M, name_elems, dirs[4]) write_output_file(BESTORF_aa_CDS_with_M, name_elems, dirs[5]) - print "*************** CDS detection ***************" - print "\nFiles processed: %d" %count_file_processed - print "\tFiles with CDS: %d" %count_file_with_CDS - print "\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species) - print "\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M - print "\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) - print "\tFiles without CDS: %d \n" %count_file_without_CDS - print "" + print("*************** CDS detection ***************") + print("\nFiles processed: %d" %count_file_processed) + print("\tFiles with CDS: %d" %count_file_with_CDS) + print("\tFiles wth CDS and more than %s species: %d" %(minimum_species, count_file_with_cds_and_enought_species)) + print("\t\tFiles with CDS plus M (codon start): %d" %count_file_with_CDS_plus_M) + print("\t\tFiles with CDS plus M (codon start) and more than %s species: %d" %(minimum_species,count_file_with_cds_M_and_enought_species) ) + print("\tFiles without CDS: %d \n" %count_file_without_CDS) + print("") if __name__ == '__main__': main()
--- a/scripts/S02_remove_too_short_bit_or_whole_sequence.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/S02_remove_too_short_bit_or_whole_sequence.py Thu Jun 09 12:40:00 2022 +0000 @@ -89,9 +89,9 @@ seq_nuc = dico_nuc[fasta_name] if "?" in seq: - seq = string.replace(seq, "?", "-") + seq = str.replace(seq, "?", "-") if "?" in seq_nuc: - seq_nuc = string.replace(seq_nuc, "?", "-") + seq_nuc = str.replace(seq_nuc, "?", "-") ## 4.1 ## [FILTER 1] : Detect and Replace short internal indel symbole (= "-" as for other longer gaps) by a "?" ## aa @@ -107,7 +107,7 @@ ## 4.2 ## [FILTER 2] : Remove short bits of sequence (<"MIN_LENGTH_BIT_OF_SEQUENCE_aa") LIST_sublist_aa=[] - S1 = string.split(seq, "-") + S1 = str.split(seq, "-") for element in S1: if len(element) > MIN_LENGTH_BIT_OF_SEQUENCE_aa: LIST_sublist_aa.append(element) @@ -126,7 +126,7 @@ for subsequence in LIST_sublist_aa: ## aa - START = string.find(seq, subsequence) + START = str.find(seq, subsequence) END = START + len(subsequence) seq_gap = seq_gap[:START] + seq[START:END] + seq_gap[END:] ## 4.4.2 ## and then replace the correponding gaps by coding subsequence in the sequence ## nuc @@ -135,11 +135,11 @@ seq_gap_nuc = seq_gap_nuc[:START_nuc] + seq_nuc[START_nuc:END_nuc] + seq_gap_nuc[END_nuc:] ## 4.5 ## Save new sequence in bash if not empty - seq_empty_test = string.replace(seq_gap, "-", "") + seq_empty_test = str.replace(seq_gap, "-", "") if seq_empty_test != "": new_bash_aa[fasta_name] = seq_gap - seq_empty_test = string.replace(seq_gap_nuc, "-", "") + seq_empty_test = str.replace(seq_gap_nuc, "-", "") if seq_empty_test != "": new_bash_nuc[fasta_name] = seq_gap_nuc @@ -180,8 +180,8 @@ ###Print if sys.argv[2] == "oui" : - print "\nIn locus with CDS considering Methionine : \n" + print("\nIn locus with CDS considering Methionine : \n") else : - print "\nIn locus with CDS regardless of the Methionine : \n" + print("\nIn locus with CDS regardless of the Methionine : \n") -print "\nTotal number of locus recorded = %d" %n0 \ No newline at end of file +print("\nTotal number of locus recorded = %d" %n0)
--- a/scripts/S03_remove_site_with_not_enough_species_represented.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/S03_remove_site_with_not_enough_species_represented.py Thu Jun 09 12:40:00 2022 +0000 @@ -12,7 +12,7 @@ def remove_position_with_too_much_missing_data(bash_aa, bash_nuc, MIN_SPECIES_NB): ## 1 ## Get alignment length - fasta_name0 = bash_aa.keys()[0] + fasta_name0 = list(bash_aa.keys())[0] ln_aa = len(bash_aa[fasta_name0]) ln_nuc = len(bash_nuc[fasta_name0]) @@ -23,7 +23,7 @@ i=0 while i < ln_aa: site = [] - for fasta_name in bash_aa.keys(): + for fasta_name in list(bash_aa.keys()): pos = bash_aa[fasta_name][i] if pos != "-" and pos != "?" and pos != "X": @@ -45,15 +45,15 @@ ## 4 ## Create entries for "filtered_bash" for aa & nuc filtered_bash_aa = {} filtered_bash_nuc = {} - for fasta_name in bash_aa.keys(): + for fasta_name in list(bash_aa.keys()): filtered_bash_aa[fasta_name] = "" - for fasta_name in bash_nuc.keys(): + for fasta_name in list(bash_nuc.keys()): filtered_bash_nuc[fasta_name] = "" ## 5 ## Write "filtered_bash" for aa j=0 while j < ln_aa: - for fasta_name in bash_aa.keys(): + for fasta_name in list(bash_aa.keys()): seq=filtered_bash_aa[fasta_name] pos=bash_aa[fasta_name][j] @@ -63,7 +63,7 @@ j = j + 1 ## 6 ## Remove empty sequence - for name in filtered_bash_aa.keys(): + for name in list(filtered_bash_aa.keys()): seq = filtered_bash_aa[name] if seq == '': del filtered_bash_aa[name] @@ -72,7 +72,7 @@ ## 7 ## Write "filtered_bash" for nuc j=0 while j < ln_nuc: - for fasta_name in bash_nuc.keys(): + for fasta_name in list(bash_nuc.keys()): seq=filtered_bash_nuc[fasta_name] #print seq pos=bash_nuc[fasta_name][j] @@ -83,7 +83,7 @@ j = j + 1 ## 8 ## Remove empty sequence - for name in filtered_bash_nuc.keys(): + for name in list(filtered_bash_nuc.keys()): seq = filtered_bash_nuc[name] if seq == '': del filtered_bash_nuc[name] @@ -147,7 +147,7 @@ ## 4.1 ## REMOVE POSITIONS WITH TOO MUCH MISSING DATA (i.e. not enough taxa represented at each position in the alignment) filtered_bash_aa, filtered_bash_nuc = remove_position_with_too_much_missing_data(dico_aa, dico_nuc, MIN_SPECIES_NB) ### DEF 2 ### - k = filtered_bash_nuc.keys() + k = list(filtered_bash_nuc.keys()) new_leng_nuc = 0 if k != []: k0 = k[0] @@ -158,14 +158,14 @@ n0+=1 #name_elems[1] = str(n0) name_elems[1] = file.split('_')[1] - name_elems[3] = str(len(filtered_bash_aa.keys())) + name_elems[3] = str(len(list(filtered_bash_aa.keys()))) new_name = "_".join(name_elems) ## 4.5 ## Write filtered alignment in OUTPUTs ## aa if filtered_bash_aa != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: OUTaa=open("%s/%s" %(path_OUT1, new_name), "w") - for fasta_name in filtered_bash_aa.keys(): + for fasta_name in list(filtered_bash_aa.keys()): seq_aa = filtered_bash_aa[fasta_name] OUTaa.write("%s\n" %fasta_name) OUTaa.write("%s\n" %seq_aa) @@ -174,7 +174,7 @@ if filtered_bash_nuc != {} and new_leng_nuc >= MIN_LENGTH_FINAL_ALIGNMENT_NUC: good+=1 OUTnuc=open("%s/%s" %(path_OUT2, new_name), "w") - for fasta_name in filtered_bash_nuc.keys(): + for fasta_name in list(filtered_bash_nuc.keys()): seq_nuc = filtered_bash_nuc[fasta_name] OUTnuc.write("%s\n" %fasta_name) OUTnuc.write("%s\n" %seq_nuc) @@ -184,8 +184,8 @@ ## 5 ## Print -print "*************** 2nd Filter : removal of the indel ***************" -print "\nTotal number of locus recorded = %d" %n0 -print "\tTotal number of locus with no indels (SAVED) = %d" %good -print "\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad -print "" \ No newline at end of file +print("*************** 2nd Filter : removal of the indel ***************") +print("\nTotal number of locus recorded = %d" %n0) +print("\tTotal number of locus with no indels (SAVED) = %d" %good) +print("\tTotal number of locus, when removing indel, wich are empty (EXCLUDED) = %d" %bad) +print("")
--- a/scripts/dico.py Fri Feb 01 10:26:37 2019 -0500 +++ b/scripts/dico.py Thu Jun 09 12:40:00 2022 +0000 @@ -3,10 +3,10 @@ def dico(F1): dicoco = {} with open(F1, "r") as file: - for name, query in itertools.izip_longest(*[file]*2): + for name, query in itertools.zip_longest(*[file]*2): if name[0] == ">": fasta_name_query = name[:-1] - Sn = string.split(fasta_name_query, "||") + Sn = str.split(fasta_name_query, "||") fasta_name_query = Sn[0] fasta_seq_query = query[:-1] dicoco[fasta_name_query] = fasta_seq_query