Mercurial > repos > siyuan > prada
view pyPRADA_1.2/prada-guess-if @ 1:03815b87eb65 draft
Uploaded
author | siyuan |
---|---|
date | Fri, 21 Feb 2014 18:08:37 -0500 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#!/usr/bin/env python #GUESS-if is to find abnormal intragenic fusions. #It is an extension of the GUESS-ft method. import pysam import subprocess import os import time import sys import bioclass import ioprada args=sys.argv #Get all parameters help_menu='''\nUsage: prada-guess-if Gene -conf xx.txt -inputbam X -mm 1 -minmapq 30 -junL X -outdir ./ -unmap X\n **Parameters**: -conf the configure file. see prada-fusion -conf for details -inputbam the input bam file -mm number of mismatch allowed -minmapq mininum mapping quality for reads to be used in fusion finding -junL length of exons to be used for junctions. see prada-fusion -outdir output directory -unmap the numapped reads. useful if user need to run guess-ft multiple times ''' if '-h' in args or '-help' in args or len(args)==1: print help_menu sys.exit(0) ######################################################################### target=args[1] if '-inputbam' not in args: sys.exit('Input BAM needed') else: i=args.index('-inputbam') sourcefile=args[i+1] if '-outdir' not in args: outdir='./' else: i=args.index('-outdir') outdir=args[i+1] if not os.path.exists(outdir): os.mkdir(outdir) if '-unmap' not in args: unmapbam='%s/one.end.unmapped.bam'%outdir extract_mask=1 else: i=args.index('-unmap') unmapbam=args[i+1] extract_mask=0 if '-mm' not in args: mm=1 else: i=args.index('-mm') mm=int(args[i+1]) #minimum mapping quality for reads as fusion evidences if '-minmapq' not in args: minmapq=30 else: i=args.index('-minmapq') minmapq=int(args[i+1]) if '-junL' not in args: sys.exit('-junL needed') else: i=args.index('-junL') junL=int(args[i+1]) prada_path=os.path.dirname(os.path.abspath(__file__)) #### ref_search_path=[prada_path,os.getcwd()] #search path for ref file if not specified in command line if '-conf' in args: i=args.index('-conf') reffile=args[i+1] if os.path.exists(reffile): pass else: for pth in ref_search_path: new_reffile='%s/%s'%(pth, os.path.basename(reffile)) if os.path.exists(new_reffile): reffile=new_reffile break else: sys.exit('ERROR: ref file %s not found'%reffile) else: reffile='%s/conf.txt'%prada_path if not os.path.exists(reffile): sys.exit('ERROR: No default conf.txt found and none specified') #reference files refdict=ioprada.read_conf(reffile) ref_anno=refdict['--REF--']['ref_anno'] ref_map=refdict['--REF--']['ref_map'] ref_fasta=refdict['--REF--']['ref_fasta'] featurefile=refdict['--REF--']['feature_file'] samtools='%s/tools/samtools-0.1.16/samtools'%prada_path bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path ######################################################################### print 'GUESS-if start: %s'%time.ctime() print 'CMD: %s'%('\t'.join(args)) ##get gene position information gobj=ioprada.read_feature_genes(featurefile,target)[0] if gobj is None: sys.exit('%s not found'%target) gchr=gobj.strand gchr_rev=True if gchr=='-1' else False #Generate unmapped reads. if extract_mask: cmd='%s view -b -f 4 -F 8 %s > %s'%(samtools,sourcefile,unmapbam) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: print 'Extracting unmapped reads...' time.sleep(120) pass if cmdout.poll()==0: print 'Extracted unmapped reads' break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error extracting unmapped reads') #Generate junction db juncfile='%s.junction.fasta'%target cmd='perl %s/make_intragenic_junctions.pl %s %s %s %s %s %d > %s/%s'%(prada_path,target,target,ref_anno,ref_map,ref_fasta,junL,outdir,juncfile) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: print 'Generating junction db. Waiting...' time.sleep(20) pass if cmdout.poll()==0: print 'Generated Junction DB.' break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error generated Junction DB.') #scan BAM file for mapping reads. samfile=pysam.Samfile(sourcefile,'rb') reads_se=[] reads_as=[] for alignedread in samfile.fetch(gobj.chr,gobj.start-1,gobj.end): if alignedread.mapq >= minmapq: mmf=[x[1] for x in alignedread.tags if x[0]=='NM'][0] if mmf <= mm: if alignedread.is_reverse==gchr_rev: #sense reads_se.append(alignedread) else: #anti-sense reads_as.append(alignedread) samfile.close() seonly,asonly=[],[] for rd in reads_se: if rd.mate_is_unmapped: seonly.append(rd) for rd in reads_as: if rd.mate_is_unmapped: asonly.append(rd) seunmap=[x.qname for x in seonly] asunmap=[x.qname for x in asonly] #find read sequences print 'Extracting unmapped read sequences.' print 'mate unmapped reads for sense strand:',len(seonly) print 'mate unmapped reads for antisense strand:',len(asonly) samfile=pysam.Samfile(unmapbam,'rb') resfq_a=open('%s/%s_se_unmap.fq'%(outdir,target),'w') resfq_b=open('%s/%s_as_unmap.fq'%(outdir,target),'w') for item in samfile: if item.qname in seunmap: resfq_a.write('@%s\n'%item.qname) resfq_a.write('%s\n'%item.seq) resfq_a.write('+\n') resfq_a.write('%s\n'%item.qual) if item.qname in asunmap: resfq_b.write('@%s\n'%item.qname) resfq_b.write('%s\n'%item.seq) resfq_b.write('+\n') resfq_b.write('%s\n'%item.qual) resfq_a.close() resfq_b.close() samfile.close() ##indexing junction db print 'Aligning reads to junction db' cmd='%s index %s/%s'%(bwa,outdir,juncfile) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: time.sleep(3) pass if cmdout.poll()==0: print 'Junction DB indexed.' break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error building junction db index') taga='%s_se'%target tagb='%s_as'%target for rs in [taga,tagb]: cmd='%s aln -n %d -R 100 %s/%s %s/%s_unmap.fq > %s/%s_unmap.sai'%(bwa,mm,outdir,juncfile,outdir,rs,outdir,rs) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: time.sleep(5) pass if cmdout.poll()==0: print 'Aligned unmapped reads group %s'%rs break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error aligning unmapped reads for group %s'%rs) cmd='%s samse -n 1000 %s/%s %s/%s_unmap.sai %s/%s_unmap.fq > %s/%s_unmap.sam'%(bwa,outdir,juncfile,outdir,rs,outdir,rs,outdir,rs) cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) while True: if cmdout.poll() is None: time.sleep(2) pass if cmdout.poll()==0: print 'Converting to sam for group %s'%rs break if cmdout.poll() is not None and cmdout.poll() != 0: raise Exception('Error converting to sam for group %s'%rs) qualrd_a=[] junc_a=[] samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,taga),'r') for rd in samfile: if not rd.is_unmapped and rd.is_reverse: qualrd_a.append(rd) junc_a.append(samfile.getrname(rd.tid)) samfile.close() qualrd_b=[] junc_b=[] samfile=pysam.Samfile('%s/%s_unmap.sam'%(outdir,tagb),'r') for rd in samfile: if not rd.is_unmapped and not rd.is_reverse: qualrd_b.append(rd) junc_b.append(samfile.getrname(rd.tid)) samfile.close() junc_span=[] junc_span.extend(qualrd_a) junc_span.extend(qualrd_b) junc_name=[] junc_name.extend(junc_a) junc_name.extend(junc_b) #Generate a summary report sumfile=open('%s/%s.GUESS-IF.summary.txt'%(outdir,target),'w') sumfile.write('%s\n'%(target)) sumfile.write('\n') sumfile.write('>spanning\n') for i in range(len(junc_span)): rd=junc_span[i] jname=junc_name[i] mm_j=[x[1] for x in rd.tags if x[0]=='NM'][0] ss='%s\t%s.mm%d'%(rd.qname,jname,mm_j) sumfile.write('%s\n'%ss) sumfile.write('\n') sumfile.write('>junction\n') juncol=[] for item in set(junc_name): nn=junc_name.count(item) juncol.append([item,nn]) juncol=sorted(juncol,key=lambda x:x[1],reverse=True) for item in juncol: sumfile.write('%s\t%s\n'%(item[0],item[1])) sumfile.write('\n') sumfile.write('>summary\n') sumfile.write('Number of Fusion Reads = %d\n'%len(junc_span)) sumfile.write('Number of Distinct Junctions = %d\n'%len(set(junc_name))) sumfile.close() print 'Done: %s'%time.ctime()