diff scripts/S01_concatenate.py @ 0:b186cae246bd draft default tip

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:27:42 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/S01_concatenate.py	Fri Feb 01 10:27:42 2019 -0500
@@ -0,0 +1,361 @@
+#!/usr/bin/python
+## Author: Eric Fontanillas
+## Last modification: 17/06/2011
+## Subject: find and remove indels
+
+###############################
+##### DEF 0 : Dico fasta  #####
+###############################
+def dico(F2):
+    dicoco = {}
+    with open(F2, "r") as file:
+        for name, query in itertools.izip_longest(*[file]*2):
+            if not name:
+                break
+            if name[0] == ">":
+                fasta_name_query = name[:-1]
+                Sn = string.split(fasta_name_query, "||")
+                fasta_name_query = Sn[0]
+                fasta_seq_query = query[:-1]
+                dicoco[fasta_name_query]=fasta_seq_query
+    return dicoco
+###################################################################################
+
+
+####################
+###### DEF 11 ######
+####################
+## Concatenate sequences
+###########################
+def concatenate(L_IN, SPECIES_ID_LIST):
+    ## 4 ## Process files
+    ## 4.1 ## Create the bash and the fasta names entries (name of the species)
+    bash_concat = {}
+
+    for species_ID in SPECIES_ID_LIST:
+        bash_concat[species_ID] = ''
+
+    ln_concat = 0
+    nb_locus = 0
+    pos=1
+    list_genes_position=[]
+    ## 4.2 ## Concatenate
+    for file in L_IN:
+        nb_locus=nb_locus+1
+
+        ## a ## Open alignments        
+        dico_seq = dico(file)   ### DEF 0 ###        
+        ## b ## Get alignment length + genes positions for RAxML
+        key0 = dico_seq.keys()[0]
+        ln = len(dico_seq[key0])
+        ln_concat = ln_concat + ln
+
+        pos_start = pos
+        pos_end = pos+ln-1
+        pos=pos_end+1
+        position="%d-%d" %(pos_start, pos_end)
+        RAxML_name = file[:-6]
+        sublist = [RAxML_name, position]
+        list_genes_position.append(sublist)
+
+        ## c ## Generate "empty" sequence with alignment length * "-"
+        empty_seq = "-" * ln
+
+        ## d ## Concatenate
+        ## d.1 ## Detect missing species in this alignment
+        list_ID=[]
+        list_absent_ID=[]
+        bash_fastaName={}
+        for fasta_name in dico_seq:
+            ID = fasta_name[1:3]
+            list_ID.append(ID)
+            seq = dico_seq[fasta_name]
+            bash_fastaName[ID]=fasta_name
+        for sp_ID in SPECIES_ID_LIST:
+            if sp_ID not in list_ID:
+                list_absent_ID.append(sp_ID)
+
+        for ID in SPECIES_ID_LIST:
+            if ID in list_absent_ID:
+                bash_concat[ID] = bash_concat[ID] + empty_seq
+            else:
+                fasta_name = bash_fastaName[ID]
+                seq = dico_seq[fasta_name]
+                bash_concat[ID] = bash_concat[ID] + seq
+
+    return(bash_concat, ln_concat, nb_locus, list_genes_position)
+####################################
+
+
+########################################
+##### DEF 12 : get codon position  #####
+########################################
+def get_codon_position(seq_inORF):
+
+    ln = len(seq_inORF)
+
+    i=0
+    seq_pos1=""
+    seq_pos2=""
+    seq_pos12=""
+    seq_pos3=""
+    while i<ln:
+       pos1 =  seq_inORF[i]
+       pos2 =  seq_inORF[i+1]
+       pos3 =  seq_inORF[i+2]
+
+       seq_pos1 = seq_pos1 + pos1
+       seq_pos2 = seq_pos2 + pos2
+       seq_pos12 = seq_pos12 + pos1 + pos2
+       seq_pos3 = seq_pos3 + pos3
+
+       i = i+3
+
+    return(seq_pos1, seq_pos2, seq_pos12, seq_pos3)
+###############################################################################
+
+
+
+#######################
+##### RUN RUN RUN #####
+#######################
+import string, os, time, re, sys, itertools
+
+list_species = []
+SPECIES_ID_LIST = []
+fasta = "^.*fasta$"
+i=3
+
+## Arguments
+infiles_filter_assemblies = sys.argv[1]
+format_run = sys.argv[2]
+
+## add file to list_species
+list_species = str.split(infiles_filter_assemblies,",")
+
+## in SPECIES_ID_LIST, only the 2 first letters of name of species
+for name in list_species :
+    name = name[:2]
+    SPECIES_ID_LIST.append(name)
+
+## add alignment files to L_IN
+list_files = []
+with open(sys.argv[3], 'r') as f:
+    for line in f.readlines():
+        list_files.append(line.strip('\n'))
+
+L_IN = []
+for file in list_files:
+    L_IN.append(file)
+
+#L_IN = str.split(input_alignments,",")
+print(L_IN)
+
+### 1 ### Proteic
+if format_run == "proteic" :
+
+    OUT1 = open("02_Concatenation_aa.fas", "w")
+    OUT2 = open("02_Concatenation_aa.phy", "w")
+    OUT3 = open("02_Concatenation_aa.nex", "w")
+    OUT_PARTITION_gene_AA = open("06_partitions_gene_AA","w")
+
+
+    ##  Get bash with concatenation
+    bash_concatenation, ln, nb_locus,list_genes_position= concatenate(L_IN, SPECIES_ID_LIST)    ### DEF 11 ##
+
+    ## Write gene AA partition file for RAxML
+    for sublist in list_genes_position:
+        name = sublist[0]
+        positions=sublist[1]
+        OUT_PARTITION_gene_AA.write("DNA,%s=%s\n"%(name,positions))
+    OUT_PARTITION_gene_AA.close()
+
+    ## Get "ntax" for NEXUS HEADER
+    nb_taxa = len(bash_concatenation.keys())
+
+    print "******************** CONCATENATION ********************\n"
+    print "Process amino-acid concatenation:"
+    print "\tNumber of taxa aligned = %d" %nb_taxa
+    print "\tNumber of loci concatenated = %d\n" %nb_locus
+    print "\tTotal length of the concatenated sequences = %d" %ln
+
+
+    ## Print NEXUS HEADER:
+    OUT3.write("#NEXUS\n\n")
+    OUT3.write("Begin data;\n")
+    OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln))
+    OUT3.write("\tFormat datatype=aa gap=-;\n")
+    OUT3.write("\tMatrix\n")
+
+    ## Print PHYLIP HEADER:
+    OUT2.write("   %d %d\n" %(nb_taxa, ln))
+
+    ## 3.5 ## Print outputs
+    for seq_name in bash_concatenation.keys():
+        seq = bash_concatenation[seq_name]
+
+        ## Filtering the sequence in case of remaining "?"
+        seq = string.replace(seq, "?", "-")
+
+        #print seq FASTA FORMAT
+        OUT1.write(">%s\n" %seq_name)
+        OUT1.write("%s\n" %seq)
+
+        #print seq PHYLIP FORMAT
+        OUT2.write("%s\n" %seq_name)
+        OUT2.write("%s\n" %seq)
+
+        #print seq NEXUS FORMAT
+        OUT3.write("%s" %seq_name)
+        OUT3.write("      %s\n" %seq)
+
+    OUT3.write("\t;\n")
+    OUT3.write("End;\n")
+    OUT1.close()
+    OUT2.close()
+    OUT2.close()
+
+
+### 2 ### Nucleic
+elif format_run == "nucleic" :
+
+    OUT1 = open("03_Concatenation_nuc.fas", "w")
+    OUT2 = open("03_Concatenation_nuc.phy", "w")
+    OUT3 = open("03_Concatenation_nuc.nex", "w")
+
+    OUT1_pos12 = open("03_Concatenation_pos12_nuc.fas", "w")
+    OUT2_pos12 = open("03_Concatenation_pos12_nuc.phy", "w")
+    OUT3_pos12 = open("03_Concatenation_pos12_nuc.nex", "w")
+
+    OUT1_pos3 = open("03_Concatenation_pos3_nuc.fas", "w")
+    OUT2_pos3 = open("03_Concatenation_pos3_nuc.phy", "w")
+    OUT3_pos3 = open("03_Concatenation_pos3_nuc.nex", "w")
+
+    OUT_PARTITION_codon_12_3 = open("05_partitions_codon12_3","w")
+    OUT_PARTITION_gene_NUC = open("05_partitions_gene_NUC","w")
+    OUT_PARTITION_gene_PLUS_codon_12_3 = open("05_partitions_gene_PLUS_codon12_3","w")
+
+    ## Get bash with concatenation
+    bash_concatenation, ln, nb_locus, list_genes_position = concatenate(L_IN, SPECIES_ID_LIST)    ### DEF 11 ##
+    ln_12 = ln/3*2   ### length of the alignment when only the 2 first codon position
+    ln_3 = ln/3      ### length of the alignment when only the third codon position
+
+    ## Write partition files for RAxML subsequent runs
+    # a # Codon partition
+    OUT_PARTITION_codon_12_3.write("DNA, p1=1-%d\\3,2-%d\\3\n" %(ln, ln))
+    OUT_PARTITION_codon_12_3.write("DNA, p2=3-%d\\3\n" %(ln))
+    OUT_PARTITION_codon_12_3.close()
+
+    # b # Gene partition
+    for sublist in list_genes_position:
+        name=sublist[0]
+        positions=sublist[1]
+        OUT_PARTITION_gene_NUC.write("DNA,%s=%s\n"%(name,positions))
+    OUT_PARTITION_gene_NUC.close()
+
+    # c # Mixed partition (codon + gene)
+    for sublist in list_genes_position:
+        name = sublist[0]
+        positions = sublist[1]
+        S1 = string.split(positions, "-")
+        pos_start1 = string.atoi(S1[0])
+        pos_end = string.atoi(S1[1])
+        pos_start2=pos_start1+1
+        pos_start3=pos_start2+1
+        partition1 = "DNA, %s_1=%d-%d\\3,%d-%d\\3\n" %(name,pos_start1, pos_end, pos_start2, pos_end)
+        partition2 = "DNA, %s_2=%d-%d\\3\n" %(name,pos_start3, pos_end)
+        OUT_PARTITION_gene_PLUS_codon_12_3.write(partition1)
+        OUT_PARTITION_gene_PLUS_codon_12_3.write(partition2)
+
+    OUT_PARTITION_gene_PLUS_codon_12_3.close()
+
+
+    ## Get "ntax" for NEXUS HEADER
+    nb_taxa = len(bash_concatenation.keys())
+
+    print "******************** CONCATENATION ********************\n"
+    print "Process nucleotides concatenation:"
+    print "\tNumber of taxa aligned = %d" %nb_taxa
+    print "\tNumber of loci concatenated = %d\n" %nb_locus
+    print "\tTotal length of the concatenated sequences [All codon positions] = %d" %ln
+    print "\t\tTotal length of the concatenated sequences [Codon positions 1 & 2] = %d" %ln_12
+    print "\t\tTotal length of the concatenated sequences [Codon position 3] = %d" %ln_3
+
+
+    ## Print NEXUS HEADER:
+    OUT3.write("#NEXUS\n\n")
+    OUT3.write("Begin data;\n")
+    OUT3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln))
+    OUT3.write("\tFormat datatype=dna gap=-;\n")
+    OUT3.write("\tMatrix\n")
+
+    OUT3_pos12.write("#NEXUS\n\n")
+    OUT3_pos12.write("Begin data;\n")
+    OUT3_pos12.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_12))
+    OUT3_pos12.write("\tFormat datatype=dna gap=-;\n")
+    OUT3_pos12.write("\tMatrix\n")
+
+    OUT3_pos3.write("#NEXUS\n\n")
+    OUT3_pos3.write("Begin data;\n")
+    OUT3_pos3.write("\tDimensions ntax=%d nchar=%d;\n" %(nb_taxa, ln_3))
+    OUT3_pos3.write("\tFormat datatype=dna gap=-;\n")
+    OUT3_pos3.write("\tMatrix\n")
+
+    ## Print PHYLIP HEADER:
+    OUT2.write("   %d %d\n" %(nb_taxa, ln))
+    OUT2_pos12.write("   %d %d\n" %(nb_taxa, ln_12))
+    OUT2_pos3.write("   %d %d\n" %(nb_taxa, ln_3))
+
+    ## Print outputs
+    for seq_name in bash_concatenation.keys():
+        seq = bash_concatenation[seq_name]
+
+        ## Filtering the sequence in case of remaining "?"
+        seq = string.replace(seq, "?", "-")
+
+        ## Get the differentes codons partitions
+        seq_pos1, seq_pos2, seq_pos12, seq_pos3 = get_codon_position(seq)    ### DEF 12 ###
+
+        #print seq FASTA FORMAT
+        OUT1.write(">%s\n" %seq_name)
+        OUT1.write("%s\n" %seq)
+        OUT1_pos12.write(">%s\n" %seq_name)
+        OUT1_pos12.write("%s\n" %seq_pos12)
+        OUT1_pos3.write(">%s\n" %seq_name)
+        OUT1_pos3.write("%s\n" %seq_pos3)
+
+        #print seq PHYLIP FORMAT
+        OUT2.write("%s\n" %seq_name)
+        OUT2.write("%s\n" %seq)
+        OUT2_pos12.write("%s\n" %seq_name)
+        OUT2_pos12.write("%s\n" %seq_pos12)
+        OUT2_pos3.write("%s\n" %seq_name)
+        OUT2_pos3.write("%s\n" %seq_pos3)
+
+        #print seq NEXUS FORMAT
+        OUT3.write("%s" %seq_name)
+        OUT3.write("      %s\n" %seq)
+        OUT3_pos12.write("%s" %seq_name)
+        OUT3_pos12.write("      %s\n" %seq_pos12)
+        OUT3_pos3.write("%s" %seq_name)
+        OUT3_pos3.write("      %s\n" %seq_pos3)
+
+
+    OUT3.write("\t;\n")
+    OUT3.write("End;\n")
+    OUT3_pos12.write("\t;\n")
+    OUT3_pos12.write("End;\n")
+    OUT3_pos3.write("\t;\n")
+    OUT3_pos3.write("End;\n")
+
+    OUT1.close()
+    OUT2.close()
+    OUT3.close()
+    OUT1_pos12.close()
+    OUT2_pos12.close()
+    OUT3_pos12.close()
+    OUT1_pos3.close()
+    OUT2_pos3.close()
+    OUT3_pos3.close()
+
+print "\n\n\n******************** RAxML RUN ********************\n"