Mercurial > repos > estrain > seqsero_v1
diff SeqSero/libs/BWA_analysis_O_new_dependent.py @ 0:c577b57b7c74 draft
Uploaded
author | estrain |
---|---|
date | Wed, 06 Dec 2017 15:59:29 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SeqSero/libs/BWA_analysis_O_new_dependent.py Wed Dec 06 15:59:29 2017 -0500 @@ -0,0 +1,280 @@ +#!/usr/bin/env python +#tyr_of_O2_O9.fasta should be in the same directory, in it, O9 should be first then O2 + +import os +from Bio import SeqIO +import sys +from Initial_functions import Uniq +from Bio.Blast import NCBIXML + +def BWA_O_analysis(sra_name,additional_file,database,mapping_mode,file_mode): + if file_mode=="1":#interleaved + if sra_name[-3:]=="sra": + os.system("fastq-dump --split-files "+sra_name) + del_fastq=1 + for_fq=sra_name.replace(".sra","_1.fastq") + rev_fq=sra_name.replace(".sra","_2.fastq") + for_sai=sra_name.replace(".sra","_1.sai") + rev_sai=sra_name.replace(".sra","_2.sai") + sam=sra_name.replace(".sra",".sam") + bam=sra_name.replace(".sra",".bam") + else: + del_fastq=0 + core_id=sra_name.split(".fastq")[0] + try: + os.system("gunzip "+sra_name) + except: + pass + dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__))) + os.system("perl "+dirpath+"/split_interleaved_fastq.pl --input "+core_id+".fastq --output "+core_id.replace(".","_")+".fastq")#######03152016 + ori_size=os.path.getsize(core_id+".fastq")#######03152016 + os.system("mv "+core_id.replace(".","_")+"-read1.fastq"+" "+core_id+"-read1.fastq")#######03152016 + os.system("mv "+core_id.replace(".","_")+"-read2.fastq"+" "+core_id+"-read2.fastq")#######03152016 + for_fq=core_id+"-read1.fastq"#######03152016 + rev_fq=core_id+"-read2.fastq"#######03152016 + if float(os.path.getsize(for_fq))/ori_size<=0.1 or float(os.path.getsize(rev_fq))/ori_size<=0.1:#09092015#12292015#######03152016 + os.system("echo haha")#09092015 + os.system("perl "+dirpath+"/splitPairedEndReads.pl "+core_id+".fastq")#09092015 + os.system("mv "+core_id+".fastq_1 "+for_fq)##09092015 + os.system("mv "+core_id+".fastq_2 "+rev_fq)##09092015 + else:#09092015 + os.system("echo hehe")#09092015 + for_sai=core_id+"_1.sai" + rev_sai=core_id+"_2.sai" + sam=core_id+".sam" + bam=core_id+".bam" + elif file_mode=="2":#seperated + forword_seq=sra_name + reverse_seq=additional_file + try: + os.system("gunzip "+forword_seq) + except: + pass + try: + os.system("gunzip "+reverse_seq) + except: + pass + for_core_id=forword_seq.split(".fastq")[0] + re_core_id=reverse_seq.split(".fastq")[0] + for_fq=for_core_id+".fastq" + rev_fq=re_core_id+".fastq" + dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))#######03152016 + print "check fastq id and make them in accordance with each other...please wait..." + os.system("python "+dirpath+"/compare_and_change_two_fastq_id.py "+for_fq+" "+rev_fq)#######03152016 + for_sai=for_core_id+".sai" + rev_sai=re_core_id+".sai" + sam=for_core_id+".sam" + bam=sam.replace(".sam",".bam") + elif file_mode=="3":#single-end + if sra_name[-3:]=="sra": + os.system("fastq-dump --split-files "+sra_name)###01/28/2015 + del_fastq=1 + for_fq=sra_name.replace(".sra","_1.fastq") + for_sai=sra_name.replace(".sra","_1.sai") + sam=sra_name.replace(".sra",".sam") + bam=sra_name.replace(".sra",".bam") + else: + del_fastq=0 + core_id=sra_name.split(".fastq")[0] + try: + os.system("gunzip "+sra_name) + except: + pass + for_fq=core_id+".fastq" + for_sai=core_id+"_1.sai" + sam=core_id+".sam" + bam=core_id+".bam" + + os.system("bwa index "+database) + if file_mode!="3": + if mapping_mode=="sam": + os.system("bwa aln "+database+" "+for_fq+" > "+for_sai) + os.system("bwa aln "+database+" "+rev_fq+" > "+rev_sai) + os.system("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+for_fq+" "+rev_fq+" > "+sam) + elif mapping_mode=="mem": + os.system("bwa mem "+database+" "+for_fq+" "+rev_fq+" > "+sam) #2014/12/23 + elif mapping_mode=="nanopore": ## + os.system("bwa mem -x ont2d "+database+" "+for_fq+" "+rev_fq+" > "+sam)## + else: + if mapping_mode=="mem": + os.system("bwa mem "+database+" "+for_fq+" > "+sam) #2014/12/23 + elif mapping_mode=="sam": + os.system("bwa aln "+database+" "+for_fq+" > "+for_sai) + os.system("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam) + elif mapping_mode=="nanopore":## + os.system("bwa mem -x ont2d "+database+" "+for_fq+" > "+sam)## + os.system("samtools view -F 4 -Sbh "+sam+" > "+bam) + os.system("samtools view -h -o "+sam+" "+bam) + + file=open(sam,"r") + handle=file.readlines() + name_list=[] + for line in handle: + if len(line)>300: + name_list.append(line.split("\t")[2]) + a,b=Uniq(name_list) + c=dict(zip(a,b)) + final_O=sorted(c.iteritems(), key=lambda d:d[1], reverse = True) #order from frequency high to low, but tuple while not list + Sero_list_O=[] + print "Final_Otype_list:" + print final_O + num_1=0#new inserted + O9_wbav=0 + O310_wzx=0 + O946_wzy=0 + if len(final_O)>0: #new inserted + for x in final_O:#new inserted + num_1=num_1+x[1]#new inserted + if "O-9,46_wbaV" in x[0]: + O9_wbaV=x[1] + if "O-3,10_wzx" in x[0]: + O310_wzx=x[1] + if "O-9,46_wzy" in x[0]: + O946_wzy=x[1] + if "O-3,10_not_in_1,3,19" in x[0]: + O310_no_1319=x[1] + if "O-9,46,27_partial_wzy" in x[0]: + O94627=x[1] + O_list=[] + O_choice="" + + + print "$$$Genome:",sra_name + if len(final_O)==0: + print "$$$No Otype, due to no hit" + else: + if final_O[0][1]<8: + print "$$$No Otype, due to the hit reads number is small." + else: + for x in final_O: + if x[1]>5: + O_list.append(x[0]) + qq=1# + for x in final_O:# + if "sdf" in x[0] and x[1]>3:# + qq=0# + print "$$$",x[0],"got a hit, reads:",x[1]# + final_O.remove(x) + if qq!=0:# + print "$$$No sdf exists"# + + if "O-9,46_wbaV" in O_list and float(O9_wbaV)/float(num_1) > 0.1: + if "O-9,46_wzy" in O_list and float(O946_wzy)/float(num_1) > 0.1: + O_choice="O-9,46" + print "$$$Most possilble Otype: O-9,46" + elif "O-9,46,27_partial_wzy" in O_list and float(O94627)/float(num_1) > 0.1: + O_choice="O-9,46,27" + print "$$$Most possilble Otype: O-9,46,27" + else: + O_choice="O-9" + if file_mode=="3": + rev_fq="" + rev_sai="" + assembly(sra_name,O_choice,for_fq,rev_fq,for_sai,rev_sai,sam,bam,mapping_mode) + else: + assembly(sra_name,O_choice,for_fq,rev_fq,for_sai,rev_sai,sam,bam,mapping_mode) + elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1: + if "O-3,10_not_in_1,3,19" in O_list and float(O310_no_1319)/float(num_1) > 0.1: + O_choice="O-3,10" + print "$$$Most possilble Otype: O-3,10" + else: + O_choice="O-1,3,19" + print "$$$Most possilble Otype: O-1,3,19" + else: + try: + O_choice=final_O[0][0].split("_")[0] + if O_choice=="O-1,3,19": + O_choice=final_O[1][0].split("_")[0] + print "$$$Most possilble Otype: ",O_choice + except: + print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)" + + +def assembly(sra_name,potential_choice,for_fq,rev_fq,for_sai,rev_sai,sam,bam,mapping_mode): + database="ParaA_rfb.fasta" + os.system("bwa index database/"+database)###01/28/2015 + if rev_fq=="":#2015/09/09 + if mapping_mode=="mem": + os.system("bwa mem database/"+database+" "+for_fq+" > "+sam) #2014/12/23 + elif mapping_mode=="sam": + os.system("bwa aln database/"+database+" "+for_fq+" > "+for_sai) + os.system("bwa samse database/"+database+" "+for_sai+" "+for_fq+" > "+sam) + elif mapping_mode=="nanopore":## + os.system("bwa mem -x ont2d database/"+database+" "+for_fq+" > "+sam)## + else: + if mapping_mode=="mem": + os.system("bwa mem database/"+database+" "+for_fq+" "+rev_fq+" > "+sam) #2014/12/23 + elif mapping_mode=="sam": + os.system("bwa aln database/"+database+" "+for_fq+" > "+for_sai) + os.system("bwa aln database/"+database+" "+rev_fq+" > "+rev_sai) + os.system("bwa sampe database/"+database+" "+for_sai+" "+ rev_sai+" "+for_fq+" "+rev_fq+" > "+sam) + elif mapping_mode=="nanopore": + os.system("bwa mem -x ont2d database/"+database+" "+for_fq+" "+rev_fq+" > "+sam) + os.system("samtools view -F 4 -Sbh "+sam+" > "+bam) + os.system("samtools view -h -o "+sam+" "+bam) + os.system("cat "+sam+"|awk '{if ($5>0) {print $10}}'>"+sam+"_seq.txt") + os.system("cat "+sam+"|awk '{if ($5>0) {print $1}}'>"+sam+"_title.txt") + file1=open(sam+"_title.txt","r") + file2=open(sam+"_seq.txt","r") + file1=file1.readlines() + file2=file2.readlines() + file=open(sam+".fasta","w") + for i in range(len(file1)): + title=">"+file1[i] + seq=file2[i] + if len(seq)>=50 and len(title)>6:#generally,can be adjusted + file.write(title) + file.write(seq) + file.close() + database2="tyr_of_O2_O9.fasta" + os.system('makeblastdb -in database/'+database2+' -out '+database2+'_db '+'-dbtype nucl') + os.system("blastn -query "+sam+".fasta"+" -db "+database2+"_db -out "+sam+"_vs_O29.xml -outfmt 5") + handle=open(sam+"_vs_O29.xml") + handle=NCBIXML.parse(handle) + handle=list(handle) + O9_bigger=0 + O2_bigger=0 + for x in handle: + O9_score=0 + O2_score=0 + try: + if 'O-9' in x.alignments[0].hit_def: + O9_score=x.alignments[0].hsps[0].bits + O2_score=x.alignments[1].hsps[0].bits + elif 'O-2' in x.alignments[0].hit_def: + O9_score=x.alignments[1].hsps[0].bits + O2_score=x.alignments[0].hsps[0].bits + if O9_score>O2_score: + O9_bigger+=1 + if O9_score<O2_score: + O2_bigger+=1 + except: + continue + print "$$$Genome:",sra_name + if O9_bigger>O2_bigger: + print "$$$Most possible Otype is O-9" + elif O9_bigger<O2_bigger: + print "$$$Most possible Otype is O-2" + else: + print "$$$No suitable one, because can't distinct it's O-9 or O-2, but ",potential_choice," has a more possibility." + print "O-9 number is:",O9_bigger + print "O-2 number is:",O2_bigger + + os.system("rm "+sam+"_title.txt")###01/28/2015 + os.system("rm "+sam+"_seq.txt")###01/28/2015 + os.system("rm "+sam+".fasta")###01/28/2015 + os.system("rm "+database2+"_db.*")###01/28/2015 + os.system("rm "+sam+"_vs_O29.xml")###01/28/2015 + + +target=sys.argv[1] #should be sra format +data_base=sys.argv[2] +mapping_mode=sys.argv[3] +if sys.argv[4] not in ("1","2","3"): + additional_file=sys.argv[4] + file_mode=sys.argv[5] +else: + additional_file="" + file_mode=sys.argv[4] + +BWA_O_analysis(target,additional_file,data_base,mapping_mode,file_mode)