diff SeqSero/libs/H_combination_output_analysis.py @ 0:c577b57b7c74 draft

Uploaded
author estrain
date Wed, 06 Dec 2017 15:59:29 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SeqSero/libs/H_combination_output_analysis.py	Wed Dec 06 15:59:29 2017 -0500
@@ -0,0 +1,296 @@
+#!/usr/bin/env python
+# "H_combination_output_analysis.py target.fasta fliCdatabase.fasta fljBdatabase.fasta"
+# must have ispcr and primers of fliC and fljB at the same directory
+
+
+import os
+from Bio import SeqIO
+import sys
+from Bio.Blast import NCBIXML
+from Initial_Conditions import phase1
+from Initial_Conditions import phase2
+
+
+
+target=sys.argv[1]
+database_fliC=sys.argv[2]
+database_fljB=sys.argv[3]
+output=target.split('.')[0]+'_out.fasta'
+
+dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))###01/27/2015
+database_path="database"###01/27/2015,database_path=dirpath+"/database"
+os.system(dirpath+'/isPcr maxSize=3000 tileSize=7 minPerfect=7 minGood=7 '+target+' '+dirpath+'/../primers/seq_primer_fliC.txt '+target+'_fliC.fa')
+os.system(dirpath+'/isPcr maxSize=3000 tileSize=7 minPerfect=7 minGood=7 '+target+' '+dirpath+'/../primers/seq_primer_fljB.txt '+target+'_fljB.fa')
+fliC=target+'_fliC.fa'
+fljB=target+'_fljB.fa'
+
+if os.path.getsize(fliC)>10:
+  os.system('makeblastdb -in '+database_fliC+' -out '+database_fliC+'_db '+'-dbtype prot')###01/28/2015,no need to add fljB address,  because input is abs address already
+  os.system('blastx -seg=no -query '+fliC+' -db '+database_fliC+'_db '+'-out '+'FliC_Htype_'+target+'.xml '+'-outfmt 5')
+  print target
+  fliC_XML='FliC_Htype_'+target+'.xml'
+  fliC_handle=open(fliC_XML)
+  records=NCBIXML.parse(fliC_handle)
+  fliC_records=list(records)
+  E_thresh=1e-10
+  hspbit=[]
+  alignmentlist=[]
+  for record in fliC_records:
+    for alignment in record.alignments:
+      hsp_bit_score=0
+      startlist=[]#the percentage algorithm don't consider one situation, the new hsp cover old hsp
+      endlist=[]#
+      for hsp in alignment.hsps:
+        start=hsp.query_start#
+        end=hsp.query_end#
+        leng=abs(start-end)#
+        if hsp.expect<E_thresh:#
+          if start>end:#
+            temp=start#
+            start=end#
+            end=start#
+          if len(startlist)==0:#
+            hsp_bit_score=hsp_bit_score+hsp.bits#
+            startlist.append(start)#
+            endlist.append(end)#
+          else:#
+            for i in range(len(startlist)):#
+              if startlist[i]<start<endlist[i]:#
+                start=endlist[i]+1#
+              if startlist[i]<end<endlist[i]:#03112016
+                end=startlist[i]-1#
+            if end<start:#the new hsp was included in old hsp#
+              percentage=0#
+            else:
+              percentage=float(end-start)/leng#
+              startlist.append(start)#
+              endlist.append(end)#
+            hsp_bit_score=hsp_bit_score+percentage*hsp.bits#          
+      alignment=alignment.hit_def+':'+str(hsp_bit_score)
+      hspbit.append(hsp_bit_score)
+      alignmentlist.append(alignment)
+  scorelist=dict(zip(alignmentlist,hspbit))
+  score=0
+  serotype=[]
+  seroscore=[]
+  for Htype in scorelist:
+    if scorelist[Htype]>score:
+      First_Choice=Htype
+      score=scorelist[Htype]
+  if  locals().has_key('First_Choice'):
+    serotype.append(First_Choice.split("__")[0])
+    seroscore.append(score)
+    secscore=0
+    for Htype in scorelist:
+
+      if scorelist[Htype]>secscore and (Htype.split("__")[0] not in serotype):
+        Sec_Choice=Htype
+        secscore=scorelist[Htype]
+    if locals().has_key('Sec_Choice'):  
+      serotype.append(Sec_Choice.split("__")[0])
+      seroscore.append(secscore)
+      thirdscore=0
+      for Htype in scorelist:
+        if scorelist[Htype]>thirdscore and (Htype.split("__")[0] not in serotype):
+          Third_Choice=Htype
+          thirdscore=scorelist[Htype]
+      if locals().has_key('Third_Choice'):
+        serotype.append(Third_Choice.split("__")[0])
+        seroscore.append(thirdscore)
+  print serotype,seroscore
+  if score>100:
+    print '#',target,'$$$ Most possible H_fliC_type: ',First_Choice,'\n'
+    print '$$$ bit_score:',score,'\n'
+    if locals().has_key('secscore'):
+      if secscore>100:
+        print '#',target,'$$$ Second possible H_fliC_type: ',Sec_Choice,'\n'
+        print '$$$ Second bit_score:',secscore,'\n'
+        if locals().has_key('thirdscore'):
+          if thirdscore>100:
+            print '#',target,'$$$ Third possible H_fliC_type: ',Third_Choice,'\n'
+            print '$$$ Third bit_score:',thirdscore,'\n'
+  else:
+    print '$$$ No fliC in',target
+else:
+  score=1
+  print '$$$ No fliC (no file created) in',target
+
+
+
+if os.path.getsize(fljB)>10:
+  os.system('makeblastdb -in '+database_fljB+' -out '+database_fljB+'_db '+'-dbtype prot')###01/28/2015,no need to add fljB address,  because input is abs address already
+  os.system('blastx -query '+fljB+' -db '+database_fljB+'_db '+'-out '+'FljB_Htype_'+target+'.xml '+'-outfmt 5')
+  print target
+  fljB_XML='FljB_Htype_'+target+'.xml'
+  fljB_handle=open(fljB_XML)
+  records=NCBIXML.parse(fljB_handle)
+  fljB_records=list(records)
+  E_thresh=1e-10
+  hspbit=[]
+  alignmentlist=[]
+  for record in fljB_records:
+    for alignment in record.alignments:
+      hsp_bit_score=0
+      startlist=[]#
+      endlist=[]#
+      for hsp in alignment.hsps:
+        start=hsp.query_start#
+        end=hsp.query_end#
+        leng=abs(start-end)#
+        if hsp.expect<E_thresh:#
+          if start>end:#
+            temp=start#
+            start=end#
+            end=start#
+          if len(startlist)==0:#
+            hsp_bit_score=hsp_bit_score+hsp.bits#
+            startlist.append(start)#
+            endlist.append(end)#
+          else:#
+            for i in range(len(startlist)):#
+              if startlist[i]<start<endlist[i]:#
+                start=endlist[i]+1#
+              if startlist[i]<end<endlist[i]:#03112016
+                end=startlist[i]-1#
+            if end<start:#the new hsp was included in old hsp#
+              percentage=0#
+            else:
+              percentage=float(end-start)/leng#
+              startlist.append(start)#
+              endlist.append(end)#
+            hsp_bit_score=hsp_bit_score+percentage*hsp.bits#   
+      alignment=alignment.hit_def+':'+str(hsp_bit_score)
+      hspbit.append(hsp_bit_score)
+      alignmentlist.append(alignment)
+  fljB_scorelist=dict(zip(alignmentlist,hspbit))
+
+  fljB_score=0
+  fljB_serotype=[]
+  fljB_seroscore=[]
+  for Htype in fljB_scorelist:
+    if fljB_scorelist[Htype]>fljB_score:
+      fljB_First_Choice=Htype
+      fljB_score=fljB_scorelist[Htype]
+  if  locals().has_key('fljB_First_Choice'):
+    fljB_serotype.append(fljB_First_Choice.split("__")[0])
+    fljB_seroscore.append(fljB_score)
+    fljB_secscore=0
+    for Htype in fljB_scorelist:
+      if fljB_scorelist[Htype]>fljB_secscore and (Htype.split("__")[0] not in fljB_serotype):
+        fljB_Sec_Choice=Htype
+        fljB_secscore=fljB_scorelist[Htype]
+    if locals().has_key('fljB_Sec_Choice'):  
+      fljB_serotype.append(fljB_Sec_Choice.split("__")[0])
+      fljB_seroscore.append(fljB_secscore)
+      fljB_thirdscore=0
+      for Htype in fljB_scorelist:
+        if fljB_scorelist[Htype]>fljB_thirdscore and (Htype.split("__")[0] not in fljB_serotype):
+          fljB_Third_Choice=Htype
+          fljB_thirdscore=fljB_scorelist[Htype]
+      if locals().has_key('fljB_Third_Choice'):
+        fljB_serotype.append(fljB_Third_Choice.split("__")[0])
+        fljB_seroscore.append(fljB_thirdscore)
+
+  if fljB_score>100:
+    print '#',target,'$$$ Most possible H_fljB_type: ',fljB_First_Choice,'\n'
+    print '$$$ Most bit_score:',fljB_score,'\n'
+    if locals().has_key('fljB_secscore'):
+      if fljB_secscore>100:
+        print '#',target,'$$$ Second possible H_fljB_type: ',fljB_Sec_Choice,'\n'
+        print '$$$ Second bit_score:',fljB_secscore,'\n'
+        if locals().has_key('fljB_thirdscore'):
+          if fljB_thirdscore>100:
+            print '#',target,'$$$ Third possible H_fljB_type: ',fljB_Third_Choice,'\n'
+            print '$$$ Third bit_score:',fljB_thirdscore,'\n'
+  else:
+    print '$$$ No fljB in',target
+else:
+  fljB_score=1
+  print '$$$ No fljB (no file created) in',target
+
+
+if score>100 and fljB_score>100:
+  fliC_sero=dict(zip(serotype,seroscore))
+  fljB_sero=dict(zip(fljB_serotype,fljB_seroscore))
+  combination=[]
+  combination_score=[]
+  for seroname in fliC_sero:
+    for fljB_seroname in fljB_sero:
+      for i in range(len(phase1)):
+        if phase1[i]==seroname and phase2[i]==fljB_seroname:
+          name=seroname+"_"+fljB_seroname
+          score=fliC_sero[seroname]+fljB_sero[fljB_seroname]
+          combination.append(name)
+          combination_score.append(score)
+  combinationlist=dict(zip(combination,combination_score))  #we can do the filteration here
+  final_dict=sorted(combinationlist.iteritems(), key=lambda d:d[1], reverse = True)
+  print "$$_H:Order:",final_dict
+elif score>100 and fljB_score<100:
+  print "$$_H:No fljB, only fliC, and its order:",First_Choice,Sec_Choice,Third_Choice
+elif score<100 and fljB_score>100:
+  print "$$_H:No fliC, only fljB, and its order:",fljB_First_Choice,fljB_Sec_Choice,fljB_Third_Choice
+elif score==1 and fljB_score>100:
+  print "$$_H:No fliC (file) existed, only fljB, and its order:",fljB_First_Choice,fljB_Sec_Choice,fljB_Third_Choice
+elif score==1 and fljB_score<100:
+  print "$$_H:No fliC (file) existed, and no fljB"
+elif score>100 and fljB_score==1:
+  print "$$_H:No fljB (file) existed, only fliC, and its order:",First_Choice,Sec_Choice,Third_Choice
+elif score<100 and fljB_score==1:
+  print "$$_H:No fljB (file) existed, and no fliC"
+else:
+  print "$$_H:No fliC and fljB"
+
+
+'''
+E_thresh=1e-10
+hspbit=[]
+alignmentlist=[]
+for record in fliC_records:
+  for alignment in record.alignments:
+    hsp_bit_score=0
+    for hsp in alignment.hsps:
+      if hsp.expect<E_thresh:
+        hsp_bit_score=hsp_bit_score+hsp.bits
+    alignment=alignment.hit_def+':'+str(hsp_bit_score)
+    hspbit.append(hsp_bit_score)
+    alignmentlist.append(alignment)
+
+scorelist=dict(zip(alignmentlist,hspbit))
+score=0
+for Htype in scorelist:
+  if scorelist[Htype]>score:
+    First_Choice=Htype
+    score=scorelist[Htype]
+if score>100:
+  print '#',target,'Most possible H_fliC_type: ',First_Choice,'\n'
+  print '#bit_score:',score,'\n'
+else:
+  print '#No fliC in',target
+
+
+E_thresh=1e-10
+hspbit=[]
+alignmentlist=[]
+for record in fljB_records:
+  for alignment in record.alignments:
+    hsp_bit_score=0
+    for hsp in alignment.hsps:
+      if hsp.expect<E_thresh:
+        hsp_bit_score=hsp_bit_score+hsp.bits
+    alignment=alignment.hit_def+':'+str(hsp_bit_score)
+    hspbit.append(hsp_bit_score)
+    alignmentlist.append(alignment)
+
+scorelist=dict(zip(alignmentlist,hspbit))
+fljB_score=0
+for Htype in scorelist:
+  if scorelist[Htype]>fljB_score:
+    First_Choice=Htype
+    fljB_score=scorelist[Htype]
+if fljB_score>100:
+  print '#',target,'Most possible H_fljB_type: ',First_Choice,'\n'
+  print '#bit_score:',fljB_score,'\n'
+else:
+  print '#No fljB in',target
+'''
+