Mercurial > repos > abims-sbr > concatphyl
view scripts/S01_concatenate.py @ 0:b186cae246bd draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author | abims-sbr |
---|---|
date | Fri, 01 Feb 2019 10:27:42 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/python ## Author: Eric Fontanillas ## Last modification: 17/06/2011 ## Subject: find and remove indels ############################### ##### DEF 0 : Dico fasta ##### ############################### def dico(F2): dicoco = {} with open(F2, "r") as file: for name, query in itertools.izip_longest(*[file]*2): if not name: break if name[0] == ">": fasta_name_query = name[:-1] Sn = string.split(fasta_name_query, "||") fasta_name_query = Sn[0] fasta_seq_query = query[:-1] dicoco[fasta_name_query]=fasta_seq_query return dicoco ################################################################################### #################### ###### DEF 11 ###### #################### ## Concatenate sequences ########################### def concatenate(L_IN, SPECIES_ID_LIST): ## 4 ## Process files ## 4.1 ## Create the bash and the fasta names entries (name of the species) bash_concat = {} for species_ID in SPECIES_ID_LIST: bash_concat[species_ID] = '' ln_concat = 0 nb_locus = 0 pos=1 list_genes_position=[] ## 4.2 ## Concatenate for file in L_IN: nb_locus=nb_locus+1 ## a ## Open alignments dico_seq = dico(file) ### DEF 0 ### ## b ## Get alignment length + genes positions for RAxML key0 = dico_seq.keys()[0] ln = len(dico_seq[key0]) ln_concat = ln_concat + ln pos_start = pos pos_end = pos+ln-1 pos=pos_end+1 position="%d-%d" %(pos_start, pos_end) RAxML_name = file[:-6] sublist = [RAxML_name, position] list_genes_position.append(sublist) ## c ## Generate "empty" sequence with alignment length * "-" empty_seq = "-" * ln ## d ## Concatenate ## d.1 ## Detect missing species in this alignment list_ID=[] list_absent_ID=[] bash_fastaName={} for fasta_name in dico_seq: ID = fasta_name[1:3] list_ID.append(ID) seq = dico_seq[fasta_name] bash_fastaName[ID]=fasta_name for sp_ID in SPECIES_ID_LIST: if sp_ID not in list_ID: list_absent_ID.append(sp_ID) for ID in SPECIES_ID_LIST: if ID in list_absent_ID: bash_concat[ID] = bash_concat[ID] + empty_seq else: fasta_name = bash_fastaName[ID] seq = dico_seq[fasta_name] bash_concat[ID] = bash_concat[ID] + seq return(bash_concat, ln_concat, nb_locus, list_genes_position) #################################### ######################################## ##### DEF 12 : get codon position ##### ######################################## def get_codon_position(seq_inORF): ln = len(seq_inORF) i=0 seq_pos1="" seq_pos2="" seq_pos12="" seq_pos3="" while i<ln: pos1 = seq_inORF[i] pos2 = seq_inORF[i+1] pos3 = seq_inORF[i+2] seq_pos1 = seq_pos1 + pos1 seq_pos2 = seq_pos2 + pos2 seq_pos12 = seq_pos12 + pos1 + pos2 seq_pos3 = seq_pos3 + pos3 i = i+3 return(seq_pos1, seq_pos2, seq_pos12, seq_pos3) ############################################################################### ####################### ##### RUN RUN RUN ##### ####################### import string, os, time, re, sys, itertools list_species = [] SPECIES_ID_LIST = [] fasta = "^.*fasta$" i=3 ## Arguments infiles_filter_assemblies = sys.argv[1] format_run = sys.argv[2] ## add file to list_species list_species = str.split(infiles_filter_assemblies,",") ## in SPECIES_ID_LIST, only the 2 first letters of name of species for name in list_species : name = name[:2] SPECIES_ID_LIST.append(name) ## add alignment files to L_IN list_files = [] with open(sys.argv[3], 'r') as f: for line in f.readlines(): list_files.append(line.strip('\n')) L_IN = [] for file in list_files: L_IN.append(file) #L_IN = str.split(input_alignments,",") print(L_IN) ### 1 ### Proteic if format_run == "proteic" : OUT1 = open("02_Concatenation_aa.fas", "w") OUT2 = open("02_Concatenation_aa.phy", "w") OUT3 = open("02_Concatenation_aa.nex", "w") OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w") ## Get bash with concatenation bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## ## Write gene AA partition file for RAxML for sublist in list_genes_position: name = sublist[0] positions=sublist[1] OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions)) OUT_PARTITION_gene_AA.close() ## Get "ntax" for NEXUS HEADER nb_taxa = len(bash_concatenation.keys()) print "******************** CONCATENATION ********************\n" print "Process amino-acid concatenation:" print "\tNumber of taxa aligned = %d" %nb_taxa print "\tNumber of loci concatenated = %d\n" %nb_locus print "\tTotal length of the concatenated sequences = %d" %ln ## Print NEXUS HEADER: OUT3.write("#NEXUS\n\n") OUT3.write("Begin data;\n") OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) OUT3.write("\tFormat datatype=aa gap=-;\n") OUT3.write("\tMatrix\n") ## Print PHYLIP HEADER: OUT2.write(" %d %d\n" %(nb_taxa, ln)) ## 3.5 ## Print outputs for seq_name in bash_concatenation.keys(): seq = bash_concatenation[seq_name] ## Filtering the sequence in case of remaining "?" seq = string.replace(seq, "?", "-") #print seq FASTA FORMAT OUT1.write(">%s\n" %seq_name) OUT1.write("%s\n" %seq) #print seq PHYLIP FORMAT OUT2.write("%s\n" %seq_name) OUT2.write("%s\n" %seq) #print seq NEXUS FORMAT OUT3.write("%s" %seq_name) OUT3.write(" %s\n" %seq) OUT3.write("\t;\n") OUT3.write("End;\n") OUT1.close() OUT2.close() OUT2.close() ### 2 ### Nucleic elif format_run == "nucleic" : OUT1 = open("03_Concatenation_nuc.fas", "w") OUT2 = open("03_Concatenation_nuc.phy", "w") OUT3 = open("03_Concatenation_nuc.nex", "w") OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w") OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w") OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w") OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w") OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w") OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w") OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w") OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w") OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w") ## Get bash with concatenation bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST) ### DEF 11 ## ln_12 = ln/3*2 ### length of the alignment when only the 2 first codon position ln_3 = ln/3 ### length of the alignment when only the third codon position ## Write partition files for RAxML subsequent runs # a # Codon partition OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln)) OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln)) OUT_PARTITION_codon_12_3.close() # b # Gene partition for sublist in list_genes_position: name=sublist[0] positions=sublist[1] OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions)) OUT_PARTITION_gene_NUC.close() # c # Mixed partition (codon + gene) for sublist in list_genes_position: name = sublist[0] positions = sublist[1] S1 = string.split(positions, "-") pos_start1 = string.atoi(S1[0]) pos_end = string.atoi(S1[1]) pos_start2=pos_start1+1 pos_start3=pos_start2+1 partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end) partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end) OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1) OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2) OUT_PARTITION_gene_PLUS_codon_12_3.close() ## Get "ntax" for NEXUS HEADER nb_taxa = len(bash_concatenation.keys()) print "******************** CONCATENATION ********************\n" print "Process nucleotides concatenation:" print "\tNumber of taxa aligned = %d" %nb_taxa print "\tNumber of loci concatenated = %d\n" %nb_locus print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12 print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3 ## Print NEXUS HEADER: OUT3.write("#NEXUS\n\n") OUT3.write("Begin data;\n") OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln)) OUT3.write("\tFormat datatype=dna gap=-;\n") OUT3.write("\tMatrix\n") OUT3_pos12.write("#NEXUS\n\n") OUT3_pos12.write("Begin data;\n") OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12)) OUT3_pos12.write("\tFormat datatype=dna gap=-;\n") OUT3_pos12.write("\tMatrix\n") OUT3_pos3.write("#NEXUS\n\n") OUT3_pos3.write("Begin data;\n") OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3)) OUT3_pos3.write("\tFormat datatype=dna gap=-;\n") OUT3_pos3.write("\tMatrix\n") ## Print PHYLIP HEADER: OUT2.write(" %d %d\n" %(nb_taxa, ln)) OUT2_pos12.write(" %d %d\n" %(nb_taxa, ln_12)) OUT2_pos3.write(" %d %d\n" %(nb_taxa, ln_3)) ## Print outputs for seq_name in bash_concatenation.keys(): seq = bash_concatenation[seq_name] ## Filtering the sequence in case of remaining "?" seq = string.replace(seq, "?", "-") ## Get the differentes codons partitions seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq) ### DEF 12 ### #print seq FASTA FORMAT OUT1.write(">%s\n" %seq_name) OUT1.write("%s\n" %seq) OUT1_pos12.write(">%s\n" %seq_name) OUT1_pos12.write("%s\n" %seq_pos12) OUT1_pos3.write(">%s\n" %seq_name) OUT1_pos3.write("%s\n" %seq_pos3) #print seq PHYLIP FORMAT OUT2.write("%s\n" %seq_name) OUT2.write("%s\n" %seq) OUT2_pos12.write("%s\n" %seq_name) OUT2_pos12.write("%s\n" %seq_pos12) OUT2_pos3.write("%s\n" %seq_name) OUT2_pos3.write("%s\n" %seq_pos3) #print seq NEXUS FORMAT OUT3.write("%s" %seq_name) OUT3.write(" %s\n" %seq) OUT3_pos12.write("%s" %seq_name) OUT3_pos12.write(" %s\n" %seq_pos12) OUT3_pos3.write("%s" %seq_name) OUT3_pos3.write(" %s\n" %seq_pos3) OUT3.write("\t;\n") OUT3.write("End;\n") OUT3_pos12.write("\t;\n") OUT3_pos12.write("End;\n") OUT3_pos3.write("\t;\n") OUT3_pos3.write("End;\n") OUT1.close() OUT2.close() OUT3.close() OUT1_pos12.close() OUT2_pos12.close() OUT3_pos12.close() OUT1_pos3.close() OUT2_pos3.close() OUT3_pos3.close() print "\n\n\n******************** RAxML RUN ********************\n"