Mercurial > repos > abims-sbr > concatphyl
diff scripts/S01_concatenate.py @ 0:b186cae246bd draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:27:42 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01_concatenate.py Fri Feb 01 10:27:42 2019 -0500 @@ -0,0 +1,361 @@ +#!/usr/bin/python +## Author: Eric Fontanillas +## Last modification: 17/06/2011 +## Subject: find and remove indels + +############################### +##### DEF 0 : Dico fasta ##### +############################### +def dico(F2): + dicoco = {} + with open(F2, "r") as file: + for name, query in itertools.izip_longest(*[file]*2): + if not name: + break + if name[0] == ">": + fasta_name_query = name[:-1] + Sn = string.split(fasta_name_query, "||") + fasta_name_query = Sn[0] + fasta_seq_query = query[:-1] + dicoco[fasta_name_query]=fasta_seq_query + return dicoco +################################################################################### + + +#################### +###### DEF 11 ###### +#################### +## Concatenate sequences +########################### +def concatenate(L_IN, SPECIES_ID_LIST): + ## 4 ## Process files + ## 4.1 ## Create the bash and the fasta names entries (name of the species) + bash_concat = {} + + for species_ID in SPECIES_ID_LIST: + bash_concat[species_ID] = '' + + ln_concat = 0 + nb_locus = 0 + pos=1 + list_genes_position=[] + ## 4.2 ## Concatenate + for file in L_IN: + nb_locus=nb_locus+1 + + ## a ## Open alignments + dico_seq = dico(file) ### DEF 0 ### + ## b ## Get alignment length + genes positions for RAxML + key0 = dico_seq.keys()[0] + ln = len(dico_seq[key0]) + ln_concat = ln_concat + ln + + pos_start = pos + pos_end = pos+ln-1 + pos=pos_end+1 + position="%d-%d" %(pos_start, pos_end) + RAxML_name = file[:-6] + sublist = [RAxML_name, position] + list_genes_position.append(sublist) + + ## c ## Generate "empty" sequence with alignment length * "-" + empty_seq = "-" * ln + + ## d ## Concatenate + ## d.1 ## Detect missing species in this alignment + list_ID=[] + list_absent_ID=[] + bash_fastaName={} + for fasta_name in dico_seq: + ID = fasta_name[1:3] + list_ID.append(ID) + seq = dico_seq[fasta_name] + bash_fastaName[ID]=fasta_name + for sp_ID in SPECIES_ID_LIST: + if sp_ID not in list_ID: + list_absent_ID.append(sp_ID) + + for ID in SPECIES_ID_LIST: + if ID in list_absent_ID: + bash_concat[ID] = bash_concat[ID] + empty_seq + else: + fasta_name = bash_fastaName[ID] + seq = dico_seq[fasta_name] + bash_concat[ID] = bash_concat[ID] + seq + + return(bash_concat, ln_concat, nb_locus, list_genes_position) +#################################### + + +######################################## +##### DEF 12 : get codon position ##### +######################################## +def get_codon_position(seq_inORF): + + ln = len(seq_inORF) + + i=0 + seq_pos1="" + seq_pos2="" + seq_pos12="" + seq_pos3="" + while i<ln: + pos1 = seq_inORF[i] + pos2 = seq_inORF[i+1] + pos3 = seq_inORF[i+2] + + seq_pos1 = seq_pos1 + pos1 + seq_pos2 = seq_pos2 + pos2 + seq_pos12 = seq_pos12 + pos1 + pos2 + seq_pos3 = seq_pos3 + pos3 + + i = i+3 + + return(seq_pos1, seq_pos2, seq_pos12, seq_pos3) +############################################################################### + + + +####################### +##### RUN RUN RUN ##### +####################### +import string, os, time, re, sys, itertools + +list_species = [] +SPECIES_ID_LIST = [] +fasta = "^.*fasta$" +i=3 + +## Arguments +infiles_filter_assemblies = sys.argv[1] +format_run = sys.argv[2] + +## add file to list_species +list_species = str.split(infiles_filter_assemblies,",") + +## in SPECIES_ID_LIST, only the 2 first letters of name of species +for name in list_species : + name = name[:2] + SPECIES_ID_LIST.append(name) + +## add alignment files to L_IN +list_files = [] +with open(sys.argv[3], 'r') as f: + for line in f.readlines(): + list_files.append(line.strip('\n')) + +L_IN = [] +for file in list_files: + L_IN.append(file) + +#L_IN = str.split(input_alignments,",") +print(L_IN) + +### 1 ### Proteic +if format_run == "proteic" : + + OUT1 = open("02_Concatenation_aa.fas", "w") + OUT2 = open("02_Concatenation_aa.phy", "w") + OUT3 = open("02_Concatenation_aa.nex", "w") + OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") + + + ## Get bash with concatenation + bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## + + ## Write gene AA partition file for RAxML + for sublist in list_genes_position: + name = sublist[0] + positions=sublist[1] + OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions)) + OUT_PARTITION_gene_AA.close() + + ## Get "ntax" for NEXUS HEADER + nb_taxa = len(bash_concatenation.keys()) + + print "******************** CONCATENATION ********************\n" + print "Process amino-acid concatenation:" + print "\tNumber of taxa aligned = %d" %nb_taxa + print "\tNumber of loci concatenated = %d\n" %nb_locus + print "\tTotal length of the concatenated sequences = %d" %ln + + + ## Print NEXUS HEADER: + OUT3.write("#NEXUS\n\n") + OUT3.write("Begin data;\n") + OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) + OUT3.write("\tFormat datatype=aa gap=-;\n") + OUT3.write("\tMatrix\n") + + ## Print PHYLIP HEADER: + OUT2.write(" %d %d\n" %(nb_taxa, ln)) + + ## 3.5 ## Print outputs + for seq_name in bash_concatenation.keys(): + seq = bash_concatenation[seq_name] + + ## Filtering the sequence in case of remaining "?" + seq = string.replace(seq, "?", "-") + + #print seq FASTA FORMAT + OUT1.write(">%s\n" %seq_name) + OUT1.write("%s\n" %seq) + + #print seq PHYLIP FORMAT + OUT2.write("%s\n" %seq_name) + OUT2.write("%s\n" %seq) + + #print seq NEXUS FORMAT + OUT3.write("%s" %seq_name) + OUT3.write(" %s\n" %seq) + + OUT3.write("\t;\n") + OUT3.write("End;\n") + OUT1.close() + OUT2.close() + OUT2.close() + + +### 2 ### Nucleic +elif format_run == "nucleic" : + + OUT1 = open("03_Concatenation_nuc.fas", "w") + OUT2 = open("03_Concatenation_nuc.phy", "w") + OUT3 = open("03_Concatenation_nuc.nex", "w") + + OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w") + OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w") + OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w") + + OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w") + OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w") + OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w") + + OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") + OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") + OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") + + ## Get bash with concatenation + bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## + ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position + ln_3 = ln/3 ### length of the alignment when only the third codon position + + ## Write partition files for RAxML subsequent runs + # a # Codon partition + OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln)) + OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln)) + OUT_PARTITION_codon_12_3.close() + + # b # Gene partition + for sublist in list_genes_position: + name=sublist[0] + positions=sublist[1] + OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions)) + OUT_PARTITION_gene_NUC.close() + + # c # Mixed partition (codon + gene) + for sublist in list_genes_position: + name = sublist[0] + positions = sublist[1] + S1 = string.split(positions, "-") + pos_start1 = string.atoi(S1[0]) + pos_end = string.atoi(S1[1]) + pos_start2=pos_start1+1 + pos_start3=pos_start2+1 + partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) + partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) + OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) + OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) + + OUT_PARTITION_gene_PLUS_codon_12_3.close() + + + ## Get "ntax" for NEXUS HEADER + nb_taxa = len(bash_concatenation.keys()) + + print "******************** CONCATENATION ********************\n" + print "Process nucleotides concatenation:" + print "\tNumber of taxa aligned = %d" %nb_taxa + print "\tNumber of loci concatenated = %d\n" %nb_locus + print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln + print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12 + print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3 + + + ## Print NEXUS HEADER: + OUT3.write("#NEXUS\n\n") + OUT3.write("Begin data;\n") + OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) + OUT3.write("\tFormat datatype=dna gap=-;\n") + OUT3.write("\tMatrix\n") + + OUT3_pos12.write("#NEXUS\n\n") + OUT3_pos12.write("Begin data;\n") + OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12)) + OUT3_pos12.write("\tFormat datatype=dna gap=-;\n") + OUT3_pos12.write("\tMatrix\n") + + OUT3_pos3.write("#NEXUS\n\n") + OUT3_pos3.write("Begin data;\n") + OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3)) + OUT3_pos3.write("\tFormat datatype=dna gap=-;\n") + OUT3_pos3.write("\tMatrix\n") + + ## Print PHYLIP HEADER: + OUT2.write(" %d %d\n" %(nb_taxa, ln)) + OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12)) + OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) + + ## Print outputs + for seq_name in bash_concatenation.keys(): + seq = bash_concatenation[seq_name] + + ## Filtering the sequence in case of remaining "?" + seq = string.replace(seq, "?", "-") + + ## Get the differentes codons partitions + seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### + + #print seq FASTA FORMAT + OUT1.write(">%s\n" %seq_name) + OUT1.write("%s\n" %seq) + OUT1_pos12.write(">%s\n" %seq_name) + OUT1_pos12.write("%s\n" %seq_pos12) + OUT1_pos3.write(">%s\n" %seq_name) + OUT1_pos3.write("%s\n" %seq_pos3) + + #print seq PHYLIP FORMAT + OUT2.write("%s\n" %seq_name) + OUT2.write("%s\n" %seq) + OUT2_pos12.write("%s\n" %seq_name) + OUT2_pos12.write("%s\n" %seq_pos12) + OUT2_pos3.write("%s\n" %seq_name) + OUT2_pos3.write("%s\n" %seq_pos3) + + #print seq NEXUS FORMAT + OUT3.write("%s" %seq_name) + OUT3.write(" %s\n" %seq) + OUT3_pos12.write("%s" %seq_name) + OUT3_pos12.write(" %s\n" %seq_pos12) + OUT3_pos3.write("%s" %seq_name) + OUT3_pos3.write(" %s\n" %seq_pos3) + + + OUT3.write("\t;\n") + OUT3.write("End;\n") + OUT3_pos12.write("\t;\n") + OUT3_pos12.write("End;\n") + OUT3_pos3.write("\t;\n") + OUT3_pos3.write("End;\n") + + OUT1.close() + OUT2.close() + OUT3.close() + OUT1_pos12.close() + OUT2_pos12.close() + OUT3_pos12.close() + OUT1_pos3.close() + OUT2_pos3.close() + OUT3_pos3.close() + +print "\n\n\n******************** RAxML RUN ********************\n"