diff pyPRADA_1.2/prada-preprocess-unc @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyPRADA_1.2/prada-preprocess-unc	Thu Feb 20 00:44:58 2014 -0500
@@ -0,0 +1,324 @@
+#!/usr/bin/env python
+
+#This is a program to implement the PRADA pipeline "process subsection", based on the work of Rahul Vegesna and Wandaliz Torres-Garcia
+#It merely generates a PBS file, depending on the user input entry point (step) 
+#Author: Siyuan Zheng, szheng2@mdanderson.org
+#Copy right belongs to Roel Verhaak's lab from MD Anderson Cancer Center, Department of Bioinformatics and Computational Biology. 
+#Last revision: 02/17/2014
+
+import subprocess
+import os,os.path
+import sys
+import time
+import ioprada
+import re
+
+########################################################################################
+args=sys.argv
+
+help_menu='''\nPipeline for RNAseq Data Analaysis - preprocessing pipeline (PRADA).
+\t**Usage**:
+\tprada-preprocess-bi -conf xx.txt -inputdir .. -sample XX -tag TCGA-XX -platform illumina -step 1_1 -intermediate no -pbs xxx -outdir ... -submit no
+\t**Parameters**:
+\t-h		print help message
+\t-step_info	print complete steps curated in the module.
+\t-inputdir	the dir where the input bam or fastq can be found.
+\t-sample		input sample name. PRADA searches for sample.bam or sample.end1.fastq/sample.end2.fastq, etc, depending on the 
+\t		initiating step number. See step_info for more information. 
+\t-conf		config file for references and parameters. Default is conf.txt in py-PRADA installation folder. 
+\t-tag		a tag to describe the sample, likely sample ID, such as TCGA-LGG-01; no default.
+\t-platform	only illumina at present (default). 
+\t-step		values: 1_1/2,2_e1/2_1/2/3/4,3_e1/2_1/2,4_1/2,5,6_1/2,7,8; example 2_e1_1; no default.
+\t-outdir		output dir. Default is the directory where the input bam is. 
+\t-pbs		name for output pbs file and log file. Default (time-stamp) is used if no input. 
+\t-intermediate	values:yes/no; if intermediate files should be kept. Default is not. 
+\t-submit		if submit the job to HPC, default is no. If yes, ppn is set to 12.
+\t-v		print version information. 
+'''
+
+steps_info='''
+Command orders (sample XX)
+step 1_1	-->  XX.sorted.bam [sort input bam by name]
+step 1_2	-->  XX.end1/2.fastq [extract reads from bam]
+step 2_e1_1	-->  XX.end1.sai [realign end1 reads to composite reference]
+step 2_e1_2	-->  XX.end1.sam [generate sam]
+step 2_e1_3	-->  XX.end1.bam [generate bam]
+step 2_e1_4	-->  XX.end1.sorted.bam [sort bam]
+step 2_e2_1	-->  XX.end2.sai [realign end2 reads to composite reference]
+step 2_e2_2	-->  XX.end2.sam [generate sam]
+step 2_e2_3	-->  XX.end2.bam [generate bam]
+step 2_e2_4	-->  XX.end2.sorted.bam [sort bam]
+step 3_e1_1	-->  XX.end1.remapped.bam [remap end1 to genome]
+step 3_e1_2	-->  XX.end1.remapped.sorted.bam [sort remapped bam]
+step 3_e2_1	-->  XX.end2.remapped.bam [remap end2 to genome]
+step 3_e2_2	-->  XX.end2.remapped.sorted.bam [sort remapped bam]
+step 4_1	-->  XX.paired.bam [pair end1 and end1]
+step 4_2	-->  XX.paired.sorted.bam [sort paired bam]
+step 5		-->  XX.withRG.paired.sorted.bam [add read group]
+step 6_1	-->  XX.orig.csv [prepair recalibration table]
+step 6_2	-->  XX.withRG.GATKRecalibrated.bam [recalibration]
+step 7		-->  XX.withRG.GATKRecalibrated.flagged.bam [flag duplication reads]
+step 8		-->  folder XX for gene expression, QC metrics etc. [generate QC and expression]
+'''
+
+if '-h' in args or '-help' in args or len(args)==1:
+    print help_menu
+    sys.exit(0)
+if '-step_info' in args:
+    print steps_info
+    sys.exit(0)
+if '-v' in args:
+    import version
+    print version.version
+    sys.exit(0)
+
+if '-sample' not in args:
+    sys.exit('ERROR: Sample name is needed')
+if '-step' not in args:
+    sys.exit('ERROR: Step number is needed')
+if '-tag' not in args:
+    sys.exit('ERROR: A tag is needed')
+
+i=args.index('-sample')
+sample=args[i+1]
+if '-inputdir' not in args:
+    inputpath=os.path.abspath('./')
+else:
+    i=args.index('-inputdir')
+    inputpath=args[i+1]
+bampath=os.path.abspath(inputpath)+'/%s.bam'%sample
+
+if '-outdir' not in args:
+    outpath=os.path.dirname(bampath)
+else:
+    i=args.index('-outdir')
+    outpath=os.path.abspath(args[i+1])
+if not os.path.exists(outpath):
+    os.mkdir(outpath)
+
+#bam=os.path.basename(bampath)
+#sample=bam[:-4]
+i=args.index('-step')
+step=args[i+1]
+i=args.index('-tag')
+tag=args[i+1]
+
+prada_path=os.path.dirname(os.path.abspath(__file__))   ####
+ref_search_path=[prada_path,os.getcwd()]                #search path for ref file if not specified in command line
+
+if '-conf' in args:
+    i=args.index('-conf')
+    reffile=args[i+1]
+    if os.path.exists(reffile):
+        pass
+    else:
+        for pth in ref_search_path:
+            new_reffile='%s/%s'%(pth, os.path.basename(reffile))
+            if os.path.exists(new_reffile):
+                reffile=new_reffile
+                break
+        else:
+            sys.exit('ERROR: conf file %s not found'%reffile)
+else:
+    reffile='%s/conf.txt'%prada_path
+    if not os.path.exists(reffile):
+        sys.exit('ERROR: No default conf.txt found and none specified')
+
+if '-platform' in args:
+    i=args.index('-platform')
+    plat=args[i+1]
+else:
+    plat='illumina'
+if '-submit' in args:
+    i=args.index('-submit')
+    submit=args[i+1]
+else:
+    submit='no'
+if '-intermediate' in args:
+    i=args.index('-intermediate')
+    keepmed=args[i+1]
+else:
+    keepmed='False'
+
+if keepmed in ['False','FALSE','false','F','NO','No','no','n']:
+    keepmed_flag='no'
+elif keepmed in ['True','TRUE','true','T','YES','Yes','yes','y']:
+    keepmed_flag='yes'
+else:
+    sys.exit('ERROR: -intermediate value not recognized')
+
+if '-pbs' in args:
+    i=args.index('-pbs')
+    docstr=args[i+1]
+else:
+    a=time.ctime().split()
+    b=time.time()
+    timestamp='_'.join([a[-1],a[1],a[2]])+'.'+str(b)
+    docstr='prada_prep_'+timestamp
+logfilename=docstr+'.log'
+pbsfilename=docstr+'.pbs'
+pbspath=outpath+'/'+pbsfilename
+logpath=outpath+'/'+logfilename
+########################################################################################
+
+########################################################################################
+#underlying utilities, automatically detected
+samtools='%s/tools/samtools-0.1.16/samtools'%prada_path
+bwa='%s/tools/bwa-0.5.7-mh/bwa'%prada_path
+gatk='%s/tools/GATK/'%prada_path
+picard='%s/tools/Picard/'%prada_path
+seqc='%s/tools/RNA-SeQC_v1.1.7.jar'%prada_path
+sam2fastq='%s/tools/ubu-1.2-jar-with-dependencies.jar'%prada_path
+#Default uses 12 nodes in HPC
+#########################################################################################
+
+#########################################################################################
+#reference files
+refdict=ioprada.read_conf(reffile)
+genome_gtf=refdict['--REF--']['genome_gtf']
+compdb_fasta=refdict['--REF--']['compdb_fasta']
+compdb_map=refdict['--REF--']['compdb_map']
+genome_fasta=refdict['--REF--']['genome_fasta']
+dbsnp_vcf=refdict['--REF--']['dbsnp_vcf']
+select_tx=refdict['--REF--']['select_tx']
+pat=re.compile('ppn=(\d*)')
+parallel_n=pat.search(refdict['--PBS--']['-l']).groups()[0]
+#########################################################################################
+
+#########################################################################################
+#pipeline command lines.
+##Cleaning up steps: if -intermediate is yes, none will be executed.
+step_1_1_cmd=['%s sort -n -m 1000000000 %s %s.sorted'%(samtools,bampath,sample)]
+post_1_1_clean=[]
+step_1_2_cmd=['java -Xmx8g -jar %s INPUT=%s.sorted.bam FASTQ=%s.end1.fastq SECOND_END_FASTQ=%s.end2.fastq INCLUDE_NON_PF_READS=true VALIDATION_STRINGENCY=SILENT TMP_DIR=tmp/'%(sam2fastq,sample,sample,sample)]
+post_1_2_clean=['rm -f %s.sorted.bam'%sample]
+alnparams=' '.join([' '.join(x) for x in refdict['--BWA aln--'].items()])
+samseparams=' '.join([' '.join(x) for x in refdict['--BWA samse--'].items()])
+if step == '2_e1_1' or step == '2_e2_1':
+    step_2_e1_1_cmd=['%s aln %s %s %s > %s.end1.sai'%(bwa,alnparams,compdb_fasta,fq1path,sample)]
+    post_2_e1_1_clean=[]
+    step_2_e1_2_cmd=['%s samse -s %s %s %s.end1.sai %s > %s.end1.sam'%(bwa,samseparams,compdb_fasta,sample,fq1path,sample)]
+    post_2_e1_2_clean=['rm -f %s.end1.sai'%sample]
+    step_2_e2_1_cmd=['%s aln %s %s %s > %s.end2.sai'%(bwa,alnparams,compdb_fasta,fq2path,sample)]
+    post_2_e2_1_clean=[]
+    step_2_e2_2_cmd=['%s samse -s %s %s %s.end2.sai %s > %s.end2.sam'%(bwa,samseparams,compdb_fasta,sample,fq2path,sample)]
+    post_2_e2_2_clean=['rm -f %s.end2.sai'%sample]
+else:
+    step_2_e1_1_cmd=['%s aln %s %s %s.end1.fastq > %s.end1.sai'%(bwa,alnparams,compdb_fasta,sample,sample)]
+    post_2_e1_1_clean=[]
+    step_2_e1_2_cmd=['%s samse -s %s %s %s.end1.sai %s.end1.fastq > %s.end1.sam'%(bwa,samseparams,compdb_fasta,sample,sample,sample)]
+    post_2_e1_2_clean=['rm -f %s.end1.sai'%sample,'rm -f %s.end1.fastq'%sample]
+    step_2_e2_1_cmd=['%s aln %s %s %s.end2.fastq > %s.end2.sai'%(bwa,alnparams,compdb_fasta,sample,sample)]
+    post_2_e2_1_clean=[]
+    step_2_e2_2_cmd=['%s samse -s %s %s %s.end2.sai %s.end2.fastq > %s.end2.sam'%(bwa,samseparams,compdb_fasta,sample,sample,sample)]
+    post_2_e2_2_clean=['rm -f %s.end2.sai'%sample,'rm -f %s.end2.fastq'%sample]
+step_2_e1_3_cmd=['%s view -bS -o %s.end1.bam %s.end1.sam'%(samtools,sample,sample)]
+post_2_e1_3_clean=['rm -f %s.end1.sam'%sample]
+step_2_e1_4_cmd=['%s sort -n -m 1000000000 %s.end1.bam %s.end1.sorted'%(samtools,sample,sample)]
+post_2_e1_4_clean=['rm -f %s.end1.bam'%sample]
+step_2_e2_3_cmd=['%s view -bS -o %s.end2.bam %s.end2.sam'%(samtools,sample,sample)]
+post_2_e2_3_clean=['rm -f %s.end2.sam'%sample]
+step_2_e2_4_cmd=['%s sort -n -m 1000000000 %s.end2.bam %s.end2.sorted'%(samtools,sample,sample)]
+post_2_e2_4_clean=['rm -f %s.end2.bam'%sample]
+step_3_e1_1_cmd=['java -Djava.io.tmpdir=tmp/ -cp %s/RemapAlignments.jar -Xmx8g org.broadinstitute.cga.tools.gatk.rna.RemapAlignments M=%s IN=%s.end1.sorted.bam OUT=%s.end1.remapped.bam R=%s REDUCE=TRUE'%(gatk,compdb_map,sample,sample,genome_fasta)]
+post_3_e1_1_clean=['rm -f %s.end1.sorted.bam'%sample]
+step_3_e1_2_cmd=['%s sort -n -m 1000000000 %s.end1.remapped.bam %s.end1.remapped.sorted'%(samtools,sample,sample)]
+post_3_e1_2_clean=['rm -f %s.end1.remapped.bam'%sample]
+step_3_e2_1_cmd=['java -Djava.io.tmpdir=tmp/ -cp %s/RemapAlignments.jar -Xmx8g org.broadinstitute.cga.tools.gatk.rna.RemapAlignments M=%s IN=%s.end2.sorted.bam OUT=%s.end2.remapped.bam R=%s REDUCE=TRUE'%(gatk,compdb_map,sample,sample,genome_fasta)]
+post_3_e2_1_clean=['rm -f %s.end2.sorted.bam'%sample]
+step_3_e2_2_cmd=['%s sort -n -m 1000000000 %s.end2.remapped.bam %s.end2.remapped.sorted'%(samtools,sample,sample)]
+post_3_e2_2_clean=['rm -f %s.end2.remapped.bam'%sample]
+step_4_1_cmd=['java -Djava.io.tmpdir=tmp/ -Xmx8g -jar %s/PairMaker.jar IN1=%s.end1.remapped.sorted.bam IN2=%s.end2.remapped.sorted.bam OUTPUT=%s.paired.bam TMP_DIR=tmp/'%(gatk,sample,sample,sample)]
+post_4_1_clean=['rm -f %s.end1.remapped.sorted.bam'%sample,'rm -f %s.end2.remapped.sorted.bam'%sample]
+step_4_2_cmd=['%s sort -m 1000000000 %s.paired.bam %s.paired.sorted'%(samtools,sample,sample)]
+post_4_2_clean=['rm -f %s.paired.bam'%sample]
+step_5_cmd=['java -Xmx8g -jar %s/AddOrReplaceReadGroups.jar I=%s.paired.sorted.bam O=%s.withRG.paired.sorted.bam RGLB=%s RGPL=%s RGPU=%s RGSM=%s'%(picard,sample,sample,tag,plat,tag,tag)]
+post_5_clean=['rm -f %s.paired.sorted.bam'%sample]
+step_6_1_cmd=['%s index %s.withRG.paired.sorted.bam'%(samtools,sample),'java -Xmx8g -jar %s/GenomeAnalysisTK.jar -l INFO -R %s --default_platform %s --knownSites %s -I %s.withRG.paired.sorted.bam --downsample_to_coverage 10000 -T CountCovariates -cov ReadGroupCovariate -cov QualityScoreCovariate -cov CycleCovariate -cov DinucCovariate -nt %s -recalFile %s.orig.csv'%(gatk,genome_fasta,plat,dbsnp_vcf,sample,parallel_n,sample)]
+post_6_1_clean=[]
+step_6_2_cmd=['java -Xmx8g -jar %s/GenomeAnalysisTK.jar -l INFO -R %s --default_platform %s -I %s.withRG.paired.sorted.bam -T TableRecalibration --out %s.withRG.GATKRecalibrated.bam -recalFile %s.orig.csv'%(gatk,genome_fasta,plat,sample,sample,sample)]
+post_6_2_clean=['rm -f %s.withRG.paired.sorted.bam'%sample,'rm -f %s.withRG.paired.sorted.bam.bai'%sample,'rm -f %s.orig.csv'%sample]
+step_7_cmd=['java -Xmx8g -jar %s/MarkDuplicates.jar I=%s.withRG.GATKRecalibrated.bam O=%s.withRG.GATKRecalibrated.flagged.bam METRICS_FILE=%s.Duplicates_metrics.txt VALIDATION_STRINGENCY=SILENT TMP_DIR=tmp/'%(picard,sample,sample,sample),'%s index %s.withRG.GATKRecalibrated.flagged.bam'%(samtools,sample)]
+post_7_clean=['rm -f %s.withRG.GATKRecalibrated.bam'%sample,'rm -f %s.withRG.GATKRecalibrated.bai'%sample,'rm -f %s.Duplicates_metrics.txt'%sample]
+step_8_cmd=["java -Xmx8g -jar %s -ttype 2 -t %s -r %s -s '%s|%s.withRG.GATKRecalibrated.flagged.bam|Disc' -o %s/"%(seqc,genome_gtf,genome_fasta,sample,sample,sample)]
+post_8_clean=[]
+
+cmdset=[]
+for item in globals().keys():
+    if item.endswith('_cmd'):
+        cmdset.append(item)
+cmdset.sort()
+
+#########################################################################################
+#write PBS file
+
+###############Entry Point
+try:
+    cmd_entry=cmdset.index('step_'+step+'_cmd')    ##entry point
+except ValueError:
+    sys.exit('ERROR: STEP not recognized')
+
+def _parsecmd(cmd):
+    '''pase cmd str for step information'''
+    info=cmd.split('_')
+    cstep='_'.join(info[1:-1])
+    return cstep
+
+###########################
+##headers
+outfile=open(pbspath,'w')
+outfile.write('#! /bin/sh\n')
+outfile.write('#PBS -V\n')
+outfile.write('#PBS -N %s\n'%tag)
+outfile.write('#PBS -j oe\n')
+outfile.write('#PBS -o %s\n'%logpath)
+for item in refdict['--PBS--']:
+    outfile.write('#PBS %s %s\n'%(item,refdict['--PBS--'][item]))
+outfile.write('#PBS -d %s\n'%outpath)
+
+#commands
+outfile.write('echo "Job start: `date`"\n')
+
+for i in range(cmd_entry,len(cmdset)):
+    cmd=cmdset[i]
+    cstep=_parsecmd(cmd)
+    clean_flag=1
+    outfile.write('echo "step %s start: `date`"\n'%cstep)
+    outfile.write('if\n')
+    outfile.write('\t%s\n'%('\n'.join(eval(cmd))))
+    outfile.write('then\n')
+    outfile.write('\techo "step %s done: `date`"\n'%cstep)
+    outfile.write('else\n')
+    outfile.write('\techo "step %s ERROR"\n'%cstep)
+    outfile.write('\texit\n')
+    outfile.write('fi\n')
+
+    if keepmed_flag=='yes': #if so, keep files by force
+        clean_flag=0
+    
+    if clean_flag==1:
+        outfile.write('\n'.join(eval('post_%s_clean'%cstep)))
+        outfile.write('\n')
+
+outfile.write('echo "PIPELINE FINISHED"\n')
+outfile.close()
+######################################################################
+
+if submit in ['False','FALSE','false','F','NO','No','no','n']:
+    jid='Not_Submitted'
+    logpath='None'
+elif submit in ['True','TRUE','true','T','YES','Yes','yes','y']:
+    cmdstr='qsub %s'%pbspath
+    cmd=cmdstr.split()
+    cmdout=subprocess.Popen(cmd,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+    jid=cmdout.stdout.read().strip()   ##JOB ID
+else:
+    sys.exit('ERROR: submit parameter not recognized')
+ 
+print '#!#%s'%tag
+print 'BAM\t%s'%bampath
+print 'Entry\t%s'%step
+print 'PBS\t%s'%pbspath
+print 'JOB\t%s'%jid
+print 'LOG\t%s'%logpath
+