annotate tools/ngs_rna/cufflinks_wrapper.py @ 2:c2a356708570

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:42 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 import optparse, os, shutil, subprocess, sys, tempfile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 def stop_err( msg ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 sys.stderr.write( "%s\n" % msg )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 # Copied from sam_to_bam.py:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 def check_seq_file( dbkey, cached_seqs_pointer_file ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 seq_path = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 for line in open( cached_seqs_pointer_file ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 if line and not line.startswith( '#' ) and line.startswith( 'index' ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 fields = line.split( '\t' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 if len( fields ) < 3:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 if fields[1] == dbkey:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 seq_path = fields[2].strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 return seq_path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 #Parse Command Line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 parser = optparse.OptionParser()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 parser.add_option( '-1', '--input', dest='input', help=' file of RNA-Seq read alignments in the SAM format. SAM is a standard short read alignment, that allows aligners to attach custom tags to individual alignments, and Cufflinks requires that the alignments you supply have some of these tags. Please see Input formats for more details.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 parser.add_option( '-s', '--inner-dist-std-dev', dest='inner_dist_std_dev', help='The standard deviation for the distribution on inner distances between mate pairs. The default is 20bp.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 parser.add_option( '-I', '--max-intron-length', dest='max_intron_len', help='The minimum intron length. Cufflinks will not report transcripts with introns longer than this, and will ignore SAM alignments with REF_SKIP CIGAR operations longer than this. The default is 300,000.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 parser.add_option( '-F', '--min-isoform-fraction', dest='min_isoform_fraction', help='After calculating isoform abundance for a gene, Cufflinks filters out transcripts that it believes are very low abundance, because isoforms expressed at extremely low levels often cannot reliably be assembled, and may even be artifacts of incompletely spliced precursors of processed transcripts. This parameter is also used to filter out introns that have far fewer spliced alignments supporting them. The default is 0.05, or 5% of the most abundant isoform (the major isoform) of the gene.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 parser.add_option( '-j', '--pre-mrna-fraction', dest='pre_mrna_fraction', help='Some RNA-Seq protocols produce a significant amount of reads that originate from incompletely spliced transcripts, and these reads can confound the assembly of fully spliced mRNAs. Cufflinks uses this parameter to filter out alignments that lie within the intronic intervals implied by the spliced alignments. The minimum depth of coverage in the intronic region covered by the alignment is divided by the number of spliced reads, and if the result is lower than this parameter value, the intronic alignments are ignored. The default is 5%.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 parser.add_option( '-m', '--inner-mean-dist', dest='inner_mean_dist', help='This is the expected (mean) inner distance between mate pairs. \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 For, example, for paired end runs with fragments selected at 300bp, \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 where each end is 50bp, you should set -r to be 200. The default is 45bp.')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 parser.add_option( '-G', '--GTF', dest='GTF', help='Tells Cufflinks to use the supplied reference annotation to estimate isoform expression. It will not assemble novel transcripts, and the program will ignore alignments not structurally compatible with any reference transcript.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 parser.add_option( '-g', '--GTF-guide', dest='GTFguide', help='use reference transcript annotation to guide assembly' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 # Normalization options.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 parser.add_option( "-N", "--quartile-normalization", dest="do_normalization", action="store_true" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 # Wrapper / Galaxy options.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 parser.add_option( '-A', '--assembled-isoforms-output', dest='assembled_isoforms_output_file', help='Assembled isoforms output file; formate is GTF.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 # Advanced Options:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 parser.add_option( '--num-importance-samples', dest='num_importance_samples', help='Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 parser.add_option( '--max-mle-iterations', dest='max_mle_iterations', help='Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 # Bias correction options.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 parser.add_option( '-b', dest='do_bias_correction', action="store_true", help='Providing Cufflinks with a multifasta file via this option instructs it to run our new bias detection and correction algorithm which can significantly improve accuracy of transcript abundance estimates.')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 (options, args) = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 # output version # of tool
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 tmp = tempfile.NamedTemporaryFile().name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 tmp_stdout = open( tmp, 'wb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 proc = subprocess.Popen( args='cufflinks --no-update-check 2>&1', shell=True, stdout=tmp_stdout )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 tmp_stdout.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 returncode = proc.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 stdout = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 for line in open( tmp_stdout.name, 'rb' ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 if line.lower().find( 'cufflinks v' ) >= 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 stdout = line.strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 if stdout:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 sys.stdout.write( '%s\n' % stdout )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 raise Exception
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 sys.stdout.write( 'Could not determine Cufflinks version\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 # If doing bias correction, set/link to sequence file.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 if options.do_bias_correction:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 if not os.path.exists( cached_seqs_pointer_file ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 # and the equCab2.fa file will contain fasta sequences.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 if options.ref_file != 'None':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 # Create symbolic link to ref_file so that index will be created in working directory.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 seq_path = "ref.fa"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 os.symlink( options.ref_file, seq_path )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 # Build command.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 # Base; always use quiet mode to avoid problems with storing log output.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 cmd = "cufflinks -q --no-update-check"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 # Add options.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 if options.inner_dist_std_dev:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 cmd += ( " -s %i" % int ( options.inner_dist_std_dev ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 if options.max_intron_len:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 cmd += ( " -I %i" % int ( options.max_intron_len ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 if options.min_isoform_fraction:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 cmd += ( " -F %f" % float ( options.min_isoform_fraction ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 if options.pre_mrna_fraction:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 cmd += ( " -j %f" % float ( options.pre_mrna_fraction ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 if options.num_threads:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 cmd += ( " -p %i" % int ( options.num_threads ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 if options.inner_mean_dist:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 cmd += ( " -m %i" % int ( options.inner_mean_dist ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 if options.GTF:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 cmd += ( " -G %s" % options.GTF )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 if options.GTFguide:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 cmd += ( " -g %s" % options.GTFguide )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 if options.num_importance_samples:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 cmd += ( " --num-importance-samples %i" % int ( options.num_importance_samples ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 if options.max_mle_iterations:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 cmd += ( " --max-mle-iterations %i" % int ( options.max_mle_iterations ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 if options.do_normalization:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 cmd += ( " -N" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 if options.do_bias_correction:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 cmd += ( " -b %s" % seq_path )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 # Debugging.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 print cmd
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 # Add input files.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 cmd += " " + options.input
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 # Run command.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 tmp_stderr = open( tmp_name, 'wb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 returncode = proc.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 # Get stderr, allowing for case where it's very large.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 tmp_stderr = open( tmp_name, 'rb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 stderr = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 buffsize = 1048576
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 while True:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 stderr += tmp_stderr.read( buffsize )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 if not stderr or len( stderr ) % buffsize != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 except OverflowError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 tmp_stderr.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 # Copy outputs.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 shutil.copyfile( "transcripts.gtf" , options.assembled_isoforms_output_file )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 # Error checking.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 if returncode != 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 raise Exception, stderr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 stop_err( 'Error running cufflinks. ' + str( e ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 if __name__=="__main__": __main__()