annotate cuffmerge_wrapper.py @ 0:dbbd37e013aa

Uploaded tool tarball.
author devteam
date Tue, 01 Oct 2013 12:57:24 -0400
parents
children 5b285b6e4ee3
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
1 #!/usr/bin/env python
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
2
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
3 # Supports Cuffmerge versions 1.3 and newer.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
4
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
5 import optparse, os, shutil, subprocess, sys, tempfile
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
6
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
7 def stop_err( msg ):
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
8 sys.stderr.write( '%s\n' % msg )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
9 sys.exit()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
10
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
11 # Copied from sam_to_bam.py:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
12 def check_seq_file( dbkey, cached_seqs_pointer_file ):
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
13 seq_path = ''
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
14 for line in open( cached_seqs_pointer_file ):
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
15 line = line.rstrip( '\r\n' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
16 if line and not line.startswith( '#' ) and line.startswith( 'index' ):
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
17 fields = line.split( '\t' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
18 if len( fields ) < 3:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
19 continue
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
20 if fields[1] == dbkey:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
21 seq_path = fields[2].strip()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
22 break
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
23 return seq_path
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
24
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
25 def __main__():
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
26 #Parse Command Line
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
27 parser = optparse.OptionParser()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
28 parser.add_option( '-g', dest='ref_annotation', help='An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
29 parser.add_option( '-s', dest='use_seq_data', action="store_true", help='Causes cuffmerge to look into for fasta files with the underlying genomic sequences (one file per contig) against which your reads were aligned for some optional classification functions. For example, Cufflinks transcripts consisting mostly of lower-case bases are classified as repeats. Note that <seq_dir> must contain one fasta file per reference chromosome, and each file must be named after the chromosome, and have a .fa or .fasta extension.')
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
30 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
31
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
32
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
33 # Wrapper / Galaxy options.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
34 parser.add_option( '', '--dbkey', dest='dbkey', help='The build of the reference dataset' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
35 parser.add_option( '', '--index_dir', dest='index_dir', help='GALAXY_DATA_INDEX_DIR' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
36 parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
37
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
38 # Outputs.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
39 parser.add_option( '', '--merged-transcripts', dest='merged_transcripts' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
40
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
41 (options, args) = parser.parse_args()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
42
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
43 # output version # of tool
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
44 try:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
45 tmp = tempfile.NamedTemporaryFile().name
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
46 tmp_stdout = open( tmp, 'wb' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
47 proc = subprocess.Popen( args='cuffmerge -v 2>&1', shell=True, stdout=tmp_stdout )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
48 tmp_stdout.close()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
49 returncode = proc.wait()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
50 stdout = None
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
51 for line in open( tmp_stdout.name, 'rb' ):
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
52 if line.lower().find( 'merge_cuff_asms v' ) >= 0:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
53 stdout = line.strip()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
54 break
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
55 if stdout:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
56 sys.stdout.write( '%s\n' % stdout )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
57 else:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
58 raise Exception
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
59 except:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
60 sys.stdout.write( 'Could not determine Cuffmerge version\n' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
61
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
62 # Set/link to sequence file.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
63 if options.use_seq_data:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
64 if options.ref_file != 'None':
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
65 # Sequence data from history.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
66 # Create symbolic link to ref_file so that index will be created in working directory.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
67 seq_path = "ref.fa"
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
68 os.symlink( options.ref_file, seq_path )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
69 else:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
70 # Sequence data from loc file.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
71 cached_seqs_pointer_file = os.path.join( options.index_dir, 'sam_fa_indices.loc' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
72 if not os.path.exists( cached_seqs_pointer_file ):
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
73 stop_err( 'The required file (%s) does not exist.' % cached_seqs_pointer_file )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
74 # If found for the dbkey, seq_path will look something like /galaxy/data/equCab2/sam_index/equCab2.fa,
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
75 # and the equCab2.fa file will contain fasta sequences.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
76 seq_path = check_seq_file( options.dbkey, cached_seqs_pointer_file )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
77 if seq_path == '':
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
78 stop_err( 'No sequence data found for dbkey %s, so sequence data cannot be used.' % options.dbkey )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
79
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
80 # Build command.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
81
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
82 # Base.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
83 cmd = "cuffmerge -o cm_output "
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
84
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
85 # Add options.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
86 if options.num_threads:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
87 cmd += ( " -p %i " % int ( options.num_threads ) )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
88 if options.ref_annotation:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
89 cmd += " -g %s " % options.ref_annotation
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
90 if options.use_seq_data:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
91 cmd += " -s %s " % seq_path
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
92
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
93 # Add input files to a file.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
94 inputs_file_name = tempfile.NamedTemporaryFile( dir="." ).name
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
95 inputs_file = open( inputs_file_name, 'w' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
96 for arg in args:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
97 inputs_file.write( arg + "\n" )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
98 inputs_file.close()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
99 cmd += inputs_file_name
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
100
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
101 # Debugging.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
102 print cmd
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
103
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
104 # Run command.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
105 try:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
106 tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
107 tmp_stderr = open( tmp_name, 'wb' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
108 proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
109 returncode = proc.wait()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
110 tmp_stderr.close()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
111
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
112 # Get stderr, allowing for case where it's very large.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
113 tmp_stderr = open( tmp_name, 'rb' )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
114 stderr = ''
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
115 buffsize = 1048576
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
116 try:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
117 while True:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
118 stderr += tmp_stderr.read( buffsize )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
119 if not stderr or len( stderr ) % buffsize != 0:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
120 break
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
121 except OverflowError:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
122 pass
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
123 tmp_stderr.close()
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
124
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
125 # Error checking.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
126 if returncode != 0:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
127 raise Exception, stderr
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
128
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
129 if len( open( "cm_output/merged.gtf", 'rb' ).read().strip() ) == 0:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
130 raise Exception, 'The output file is empty, there may be an error with your input file or settings.'
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
131
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
132 # Copy outputs.
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
133 shutil.copyfile( "cm_output/merged.gtf" , options.merged_transcripts )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
134
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
135 except Exception, e:
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
136 stop_err( 'Error running cuffmerge. ' + str( e ) )
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
137
dbbd37e013aa Uploaded tool tarball.
devteam
parents:
diff changeset
138 if __name__=="__main__": __main__()