comparison defuse_trinity_analysis.py @ 11:b22f8634ff84 draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 23b94b5747c6956360cd2eca0a07a669929ea141-dirty
author jjohnson
date Sun, 17 Jan 2016 14:11:06 -0500
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2014, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15
16 """
17 This tool takes the deFuse results.tsv tab-delimited file and the Trinity assembly
18 (transcripts and ORF peptides) and creates a tabular report.
19
20 Would it be possible to create 2 additional files from the deFuse-Trinity comparison program?
21 One containing all the Trinity records matched to deFuse records (with the deFuse ID number),
22 and the other with the ORF records matching back to the Trinity records in the first file?
23
24 M045_Report.csv
25 "","deFuse_subset.count","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","deFuse.gene_chromosome1","deFuse.gene_location1","deFuse.gene_chromosome2","deFuse.gene_location2","deFuse_subset.type"
26 "1",1,"Rps6","Dennd4c",7,0.814853504,"4","coding","4","coding","TIC "
27
28
29
30 OS03_Matched_Rev.csv
31 "count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein"
32
33 "","deFuse.splitr_sequence","deFuse.gene_chromosome1","deFuse.gene_chromosome2","deFuse.gene_location1","deFuse.gene_location2","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","word1","word2","fusion_part_1","fusion_part_2","fusion_point","fusion_point_rc","count","transcript"
34
35 """
36
37 import sys,re,os.path,math
38 import textwrap
39 import optparse
40 from optparse import OptionParser
41
42 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1])
43
44 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
45 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
46 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
47 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
48 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
49 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
50 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
51 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
52 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
53 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
54 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
55 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
56 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
57 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
58 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
59 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
60
61 def translate(seq) :
62 rna = seq.upper().replace('T','U')
63 aa = []
64 for i in range(0,len(rna) - 2, 3):
65 codon = rna[i:i+3]
66 aa.append(codon_map[codon] if codon in codon_map else 'X')
67 return ''.join(aa)
68
69 def get_stop_codons(seq) :
70 rna = seq.upper().replace('T','U')
71 stop_codons = []
72 for i in range(0,len(rna) - 2, 3):
73 codon = rna[i:i+3]
74 aa = codon_map[codon] if codon in codon_map else 'X'
75 if aa == '*':
76 stop_codons.append(codon)
77 return stop_codons
78
79 def read_fasta(fp):
80 name, seq = None, []
81 for line in fp:
82 line = line.rstrip()
83 if line.startswith(">"):
84 if name: yield (name, ''.join(seq))
85 name, seq = line, []
86 else:
87 seq.append(line)
88 if name: yield (name, ''.join(seq))
89
90
91 def test_rcomplement(seq, target):
92 try:
93 comp = revcompl(seq)
94 return comp in target
95 except:
96 pass
97 return False
98
99 def test_reverse(seq,target):
100 return options.test_reverse and seq and seq[::-1] in target
101
102 def cmp_alphanumeric(s1,s2):
103 if s1 == s2:
104 return 0
105 a1 = re.findall("\d+|[a-zA-Z]+",s1)
106 a2 = re.findall("\d+|[a-zA-Z]+",s2)
107 for i in range(min(len(a1),len(a2))):
108 if a1[i] == a2[i]:
109 continue
110 if a1[i].isdigit() and a2[i].isdigit():
111 return int(a1[i]) - int(a2[i])
112 return 1 if a1[i] > a2[i] else -1
113 return len(a1) - len(a2)
114
115 def parse_defuse_results(inputFile):
116 defuse_results = []
117 columns = []
118 coltype_int = ['expression1', 'expression2', 'gene_start1', 'gene_start2', 'gene_end1', 'gene_end2', 'genomic_break_pos1', 'genomic_break_pos2', 'breakpoint_homology', 'span_count', 'splitr_count', 'splice_score']
119 coltype_float = ['probability']
120 coltype_yn = [ 'orf', 'exonboundaries', 'read_through', 'interchromosomal', 'adjacent', 'altsplice', 'deletion', 'eversion', 'inversion']
121 try:
122 for linenum,line in enumerate(inputFile):
123 ## print >> sys.stderr, "%d: %s\n" % (linenum,line)
124 fields = line.strip().split('\t')
125 if line.startswith('cluster_id'):
126 columns = fields
127 ## print >> sys.stderr, "columns: %s\n" % columns
128 continue
129 elif fields and len(fields) == len(columns):
130 cluster_id = fields[columns.index('cluster_id')]
131 cluster = dict()
132 flags = []
133 defuse_results.append(cluster)
134 for i,v in enumerate(columns):
135 if v in coltype_int:
136 cluster[v] = int(fields[i])
137 elif v in coltype_float:
138 cluster[v] = float(fields[i])
139 elif v in coltype_yn:
140 cluster[v] = fields[i] == 'Y'
141 if cluster[v]:
142 flags.append(columns[i])
143 else:
144 cluster[v] = fields[i]
145 cluster['flags'] = ','.join(flags)
146 except Exception, e:
147 print >> sys.stderr, "failed to read cluster_dict: %s" % e
148 exit(1)
149 return defuse_results
150
151 ## deFuse params to the mapping application?
152
153 def __main__():
154 #Parse Command Line
155 parser = optparse.OptionParser()
156 # files
157 parser.add_option( '-i', '--input', dest='input', default=None, help='The input defuse results.tsv file (else read from stdin)' )
158 parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' )
159 parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' )
160 parser.add_option( '-o', '--output', dest='output', default=None, help='The output report (else write to stdout)' )
161 parser.add_option( '-m', '--matched', dest='matched', default=None, help='The output matched report' )
162 parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', default=None, help='The output alignment file' )
163 parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', default=None, help='The output ORF alignment file' )
164 parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' )
165 parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' )
166 parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified as a Transcription-induced chimera (TIC)' )
167 parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceding the fusion point' )
168 parser.add_option( '-I', '--incomplete_orfs', dest='incomplete_orfs', action='store_true', default=False, help='Count incomplete ORFs' )
169 parser.add_option( '-O', '--orf_type', dest='orf_type', action='append', default=['complete','5prime_partial'], choices=['complete','5prime_partial','3prime_partial','internal'], help='ORF types to report' )
170 parser.add_option( '-r', '--readthrough', dest='readthrough', type='int', default=3, help='Number of stop_codons to read through' )
171 # min_orf_len
172 # split_na_len
173 # tic_len = 1000000
174 # prior
175 # deFuse direction reversed
176 # in frame ?
177 # contain known protein elements
178 # what protein change
179 # trinity provides the full transcript, defuse doesn't show the full sequence
180 #parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference fasta' )
181 #parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='The genomic reference gtf feature file')
182 (options, args) = parser.parse_args()
183
184 # results.tsv input
185 if options.input != None:
186 try:
187 inputPath = os.path.abspath(options.input)
188 inputFile = open(inputPath, 'r')
189 except Exception, e:
190 print >> sys.stderr, "failed: %s" % e
191 exit(2)
192 else:
193 inputFile = sys.stdin
194 # report output
195 if options.output != None:
196 try:
197 outputPath = os.path.abspath(options.output)
198 outputFile = open(outputPath, 'w')
199 except Exception, e:
200 print >> sys.stderr, "failed: %s" % e
201 exit(3)
202 else:
203 outputFile = sys.stdout
204 outputTxFile = None
205 outputOrfFile = None
206 if options.transcript_alignment:
207 try:
208 outputTxFile = open(options.transcript_alignment,'w')
209 except Exception, e:
210 print >> sys.stderr, "failed: %s" % e
211 exit(3)
212 if options.orf_alignment:
213 try:
214 outputOrfFile = open(options.orf_alignment,'w')
215 except Exception, e:
216 print >> sys.stderr, "failed: %s" % e
217 exit(3)
218 # Add percent match after transcript
219 #report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2']  # superseded by the version below
220 report_fields = ['cluster_id','gene_name1','gene_name2','span_count','probability','genomic_bkpt1','gene_location1','genomic_bkpt2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2']
221 report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point', 'cluster_id':'cluster_id', 'splitr_sequence':'splitr_sequence', 'splitr_count':'splitr_count', 'splitr_span_pvalue':'splitr_span_pvalue', 'splitr_pos_pvalue':'splitr_pos_pvalue', 'splitr_min_pvalue':'splitr_min_pvalue', 'adjacent':'adjacent', 'altsplice':'altsplice', 'break_adj_entropy1':'break_adj_entropy1', 'break_adj_entropy2':'break_adj_entropy2', 'break_adj_entropy_min':'break_adj_entropy_min', 'breakpoint_homology':'breakpoint_homology', 'breakseqs_estislands_percident':'breakseqs_estislands_percident', 'cdna_breakseqs_percident':'cdna_breakseqs_percident', 'deletion':'deletion', 'est_breakseqs_percident':'est_breakseqs_percident', 'eversion':'eversion', 'exonboundaries':'exonboundaries', 'expression1':'expression1', 'expression2':'expression2', 'gene1':'gene1', 'gene2':'gene2', 'gene_align_strand1':'gene_align_strand1', 'gene_align_strand2':'gene_align_strand2', 'gene_end1':'gene_end1', 'gene_end2':'gene_end2', 'gene_start1':'gene_start1', 'gene_start2':'gene_start2', 'gene_strand1':'gene_strand1', 'gene_strand2':'gene_strand2', 'genome_breakseqs_percident':'genome_breakseqs_percident', 'genomic_break_pos1':'genomic_break_pos1', 'genomic_break_pos2':'genomic_break_pos2', 'genomic_strand1':'genomic_strand1', 'genomic_strand2':'genomic_strand2', 'interchromosomal':'interchromosomal', 'interrupted_index1':'interrupted_index1', 'interrupted_index2':'interrupted_index2', 'inversion':'inversion', 'library_name':'library_name', 'max_map_count':'max_map_count', 'max_repeat_proportion':'max_repeat_proportion', 'mean_map_count':'mean_map_count', 'min_map_count':'min_map_count', 'num_multi_map':'num_multi_map', 'num_splice_variants':'num_splice_variants', 'orf':'orf', 'read_through':'read_through', 'repeat_proportion1':'repeat_proportion1', 'repeat_proportion2':'repeat_proportion2', 'span_coverage1':'span_coverage1', 'span_coverage2':'span_coverage2', 'span_coverage_max':'span_coverage_max', 'span_coverage_min':'span_coverage_min', 'splice_score':'splice_score', 'splicing_index1':'splicing_index1', 'splicing_index2':'splicing_index2', 'fusion_type':'Type', 'coverage':'fusion%','Transcript':'Transcript?','Protein':'Protein?','flags':'descriptions','fwd_seq':'fusion','alignments1':'alignments1','alignments2':'alignments2','genomic_bkpt1':'From Chr', 'genomic_bkpt2':'To Chr'}
222
223 ## Read defuse results
224 fusions = parse_defuse_results(inputFile)
225 ## Create a field with the 12 nt before and after the fusion point.
226 ## Create a field with the reverse complement of the 24 nt fusion point field.
227 ## Add a fusion type field (inter, intra, TIC)
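## A toy illustration of the breakpoint window built below (hypothetical
## splitr_sequence; with the default nbases=12):
##   split_seqs = 'ACGTACGTACGTACGT|TTGGCCAATTGGCCAA'.split('|')
##   fwd_seq = split_seqs[0][-12:] + split_seqs[1][:12]   # 24 nt spanning the '|'
##   rev_seq = revcompl(fwd_seq)                          # reverse complement of the window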
228 for i,fusion in enumerate(fusions):
229 fusion['ordinal'] = i + 1
230 fusion['genomic_bkpt1'] = "%s:%d" % (fusion['gene_chromosome1'], fusion['genomic_break_pos1'])
231 fusion['genomic_bkpt2'] = "%s:%d" % (fusion['gene_chromosome2'], fusion['genomic_break_pos2'])
232 fusion['alignments1'] = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'])
233 fusion['alignments2'] = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
234 split_seqs = fusion['splitr_sequence'].split('|')
235 fusion['split_seqs'] = split_seqs
236 fusion['split_seqs'] = split_seqs
237 fusion['split_seq_lens'] = [len(split_seqs[0]),len(split_seqs[1])]
238 fusion['split_max_lens'] = [len(split_seqs[0]),len(split_seqs[1])]
239 fwd_off = min(abs(options.nbases),len(split_seqs[0]))
240 rev_off = min(abs(options.nbases),len(split_seqs[1]))
241 fusion['fwd_off'] = fwd_off
242 fusion['rev_off'] = rev_off
243 fwd_seq = split_seqs[0][-fwd_off:] + split_seqs[1][:rev_off]
244 rev_seq = revcompl(fwd_seq)
245 fusion['fwd_seq'] = fwd_seq
246 fusion['rev_seq'] = rev_seq
247 fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC'
248 fusion['fusion_type'] = fusion_type
249 fusion['transcripts'] = dict()
250 fusion['Transcript'] = 'No'
251 fusion['coverage'] = 0
252 fusion['Protein'] = 'No'
253 # print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2'])
254 inputFile.close()
255
256 ## Process Trinity data and compare to deFuse
257 matched_transcripts = dict()
258 matched_orfs = dict()
259 transcript_orfs = dict()
260 fusions_with_transcripts = set()
261 fusions_with_orfs = set()
262 ## fusion['transcripts'][tx_id] { revcompl:?, bkpt:n, seq1: , seq2: , match1:n, match2:n}
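## e.g. (hypothetical values, keyed by the Trinity transcript id):
##   fusion['transcripts']['comp100000_c5_seq2'] = {
##       'full_id': '>comp100000_c5_seq2 len=1025', 'seq': 'ACGT...',
##       'bkpt': 512, 'revcompl': False,
##       'seq1': 'ACGT...', 'seq2': 'TTGC...', 'match1': 230, 'match2': 198 }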
263 n = 0
264 if options.transcripts:
265 with open(options.transcripts) as fp:
266 for tx_full_id, seq in read_fasta(fp):
267 n += 1
268 for i,fusion in enumerate(fusions):
269 if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq:
270 fusions_with_transcripts.add(i)
271 fusion['Transcript'] = 'Yes'
272 tx_id = tx_full_id.lstrip('>').split()[0]
273 matched_transcripts[tx_full_id] = seq
274 fusion['transcripts'][tx_id] = dict()
275 fusion['transcripts'][tx_id]['seq'] = seq
276 fusion['transcripts'][tx_id]['full_id'] = tx_full_id
277 pos = seq.find(fusion['fwd_seq'])
278 if pos >= 0:
279 tx_bkpt = pos + fusion['fwd_off']
280 # fusion['transcripts'][tx_full_id] = tx_bkpt
281 if tx_bkpt > fusion['split_max_lens'][0]:
282 fusion['split_max_lens'][0] = tx_bkpt
283 len2 = len(seq) - tx_bkpt
284 if len2 > fusion['split_max_lens'][1]:
285 fusion['split_max_lens'][1] = len2
286 fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt
287 fusion['transcripts'][tx_id]['revcompl'] = False
288 fusion['transcripts'][tx_id]['seq1'] = seq[:tx_bkpt]
289 fusion['transcripts'][tx_id]['seq2'] = seq[tx_bkpt:]
290 else:
291 pos = seq.find(fusion['rev_seq'])
292 tx_bkpt = pos + fusion['rev_off']
293 # fusion['transcripts'][tx_full_id] = -tx_bkpt
294 if tx_bkpt > fusion['split_max_lens'][1]:
295 fusion['split_max_lens'][1] = tx_bkpt
296 len2 = len(seq) - tx_bkpt
297 if len2 > fusion['split_max_lens'][0]:
298 fusion['split_max_lens'][0] = len2
299 rseq = revcompl(seq)
300 pos = rseq.find(fusion['fwd_seq'])
301 tx_bkpt = pos + fusion['fwd_off']
302 fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt
303 fusion['transcripts'][tx_id]['revcompl'] = True
304 fusion['transcripts'][tx_id]['seq1'] = rseq[:tx_bkpt]
305 fusion['transcripts'][tx_id]['seq2'] = rseq[tx_bkpt:]
306 fseq = fusion['split_seqs'][0]
307 tseq = fusion['transcripts'][tx_id]['seq1']
308 mlen = min(len(fseq),len(tseq))
309 fusion['transcripts'][tx_id]['match1'] = mlen
310 for j in range(1,mlen+1):
311 if fseq[-j] != tseq[-j]:
312 fusion['transcripts'][tx_id]['match1'] = j - 1
313 break
314 fseq = fusion['split_seqs'][1]
315 tseq = fusion['transcripts'][tx_id]['seq2']
316 mlen = min(len(fseq),len(tseq))
317 fusion['transcripts'][tx_id]['match2'] = mlen
318 for j in range(mlen):
319 if fseq[j] != tseq[j]:
320 fusion['transcripts'][tx_id]['match2'] = j
321 break
322 # coverage = math.floor(float(fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 100. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1]))
323 coverage = int((fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 1000. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1])) * .1
324 # print >> sys.stderr, "%s\t%d\t%d\t%d\%s\t\t%d\t%d\t%d\t%d" % (tx_id,fusion['transcripts'][tx_id]['match1'],fusion['transcripts'][tx_id]['match2'],len(fusion['split_seqs'][0]+fusion['split_seqs'][1]),coverage,len( fusion['split_seqs'][0]),len(fusion['transcripts'][tx_id]['seq1']),len(fusion['split_seqs'][1]),len(fusion['transcripts'][tx_id]['seq2']))
325 fusion['coverage'] = max(coverage,fusion['coverage'])
326 print >> sys.stdout, "fusions_with_transcripts: %d %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts))
327 ##for i,fusion in enumerate(fusions):
328 ## print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts'])
329 ## Process ORFs and compare to matched deFuse and Trinity data.
330 ## Proteins must be at least 100 aa long (min_pep_len), start at the first "M", and end with an "*".
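## A header such as the examples shown further below ('... type:complete len:237 ...')
## yields orf_type == 'complete', which the default --orf_type filter
## (complete, 5prime_partial) keeps.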
331 if options.peptides:
332 with open(options.peptides) as fp:
333 for orf_full_id, seq in read_fasta(fp):
334 n += 1
335 if len(seq) < options.min_pep_len:
336 continue
337 orf_type = re.match('^.* type:(\S+) .*$',orf_full_id).groups()[0]
338 ## if not seq[-1] == '*' and not options.incomplete_orfs:
339 ## if not orf_type 'complete' and not options.incomplete_orfs:
340 if orf_type not in options.orf_type:
341 continue
342 for i,fusion in enumerate(fusions):
343 if len(fusion['transcripts']) > 0:
344 for tx_id in fusion['transcripts']:
345 ## >m.196252 g.196252 ORF g.196252 m.196252 type:complete len:237 (+) comp100000_c5_seq2:315-1025(+)
346 ## >m.134565 g.134565 ORF g.134565 m.134565 type:5prime_partial len:126 (-) comp98702_c1_seq21:52-429(-)
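## For the first example header above (assuming tx_id == 'comp100000_c5_seq2'),
## the re.match below should capture ('315', '1025', '+'), which become
## tx_start, tx_end and tx_strand.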
347 if tx_id+':' not in orf_full_id:
348 continue
349 m = re.match("^.*%s:(\d+)-(\d+)[(]([+-])[)].*" % re.sub('([|.{}()$?^])','[\\1]',tx_id),orf_full_id)
350 if m:
351 if not m.groups() or len(m.groups()) < 3 or m.groups()[0] == None:
352 print >> sys.stderr, "Error:\n%s\n%s\n" % (tx_id,orf_full_id)
353 orf_id = orf_full_id.lstrip('>').split()[0]
354 if not tx_id in transcript_orfs:
355 transcript_orfs[tx_id] = []
356 alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
357 # print >> sys.stdout, "%d %s bkpt:%d %s rc:%s (%s) %s" % (fusion['ordinal'], tx_id, int(fusion['transcripts'][tx_id]['bkpt']), str(m.groups()), str(fusion['transcripts'][tx_id]['revcompl']), alignments, orf_full_id)
358 start = seq.find('M')
359 pep_len = len(seq)
360 if pep_len - start < options.min_pep_len:
361 continue
362 orf_dict = dict()
363 transcript_orfs[tx_id].append(orf_dict)
364 fusions_with_orfs.add(i)
365 matched_orfs[orf_full_id] = seq
366 fusion['Protein'] = 'Yes'
367 tx_start = int(m.groups()[0])
368 tx_end = int(m.groups()[1])
369 tx_strand = m.groups()[2]
370 tx_bkpt = fusion['transcripts'][tx_id]['bkpt']
371 orf_dict['orf_id'] = orf_id
372 orf_dict['tx_start'] = tx_start
373 orf_dict['tx_end'] = tx_end
374 orf_dict['tx_strand'] = tx_strand
375 orf_dict['tx_bkpt'] = tx_bkpt
376 orf_dict['seq'] = seq[:start].lower() + seq[start:] if start > 0 else seq
377 ## >m.208656 g.208656 ORF g.208656 m.208656 type:5prime_partial len:303 (+) comp100185_c2_seq9:2-910(+)
378 ## translate(tx34[1:910])
379 ## translate(tx34[1:2048])
380 ## comp99273_c1_seq1 len=3146 (-2772)
381 ## >m.158338 g.158338 ORF g.158338 m.158338 type:complete len:785 (-) comp99273_c1_seq1:404-2758(-)
382 ## translate(tx[-2758:-403])
383 ## comp100185_c2_seq9 len=2048 (904)
384 ## novel protein sequence
385 ## find first novel AA
386 ## get prior n AAs
387 ## get novel AA seq thru n stop codons
388 ### tx_seq = matched_transcripts[tx_full_id] if tx_bkpt >= 0 else revcompl(tx_seq)
389 tx_seq = fusion['transcripts'][tx_id]['seq']
390 orf_dict['tx_seq'] = tx_seq
391 novel_tx_seq = tx_seq[tx_start - 1:] if tx_strand == '+' else revcompl(tx_seq[:tx_end])
392 read_thru_pep = translate(novel_tx_seq)
393 # fusion['transcripts'][tx_id]['revcompl'] = True
394 # tx_bkpt = fusion['transcripts'][tx_id]['bkpt']
395 # bkpt_aa_pos = tx_bkpt - tx_start - 1
396 # bkpt_aa_pos = (tx_bkpt - tx_start - 1) / 3 if tx_strand == '+' else tx_end
397 # print >> sys.stdout, "%s\n%s" % (seq,read_thru_pep)
398 stop_codons = get_stop_codons(novel_tx_seq)
399 if options.readthrough:
400 readthrough = options.readthrough + 1
401 read_thru_pep = '*'.join(read_thru_pep.split('*')[:readthrough])
402 stop_codons = stop_codons[:readthrough]
403 orf_dict['read_thru_pep'] = read_thru_pep
404 orf_dict['stop_codons'] = ','.join(stop_codons)
405 print >> sys.stdout, "fusions_with_orfs: %d %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs))
406 ## Alignments: 3 columns; seq columns padded out to the longest seq; matching bases UPPERCASE, diffs lowercase
407 ### defuse_id pre_split_seq post_split_seq
408 ### trinity_id pre_split_seq post_split_seq
409 ## Transcripts alignment output
410 ## Peptide alignment output
411 ## Write reports
412 ## OS03_Matched_Rev.csv
413 ## "count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein"
414 if options.transcripts and options.matched:
415 #match_fields = ['ordinal','gene_name1','gene_name2','fwd_seq']
416 outputMatchFile = open(options.matched,'w')
417 #print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","read_through","stop_codons"])
418 print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","stop_codons"])
419 for i,fusion in enumerate(fusions):
420 if len(fusion['transcripts']) > 0:
421 for tx_id in fusion['transcripts'].keys():
422 if tx_id in transcript_orfs:
423 for orf_dict in transcript_orfs[tx_id]:
424 if 'tx_seq' not in orf_dict:
425 print >> sys.stderr, "orf_dict %s" % orf_dict
426 #fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['seq'],orf_dict['read_thru_pep'],orf_dict['stop_codons']]
427 fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['read_thru_pep'],orf_dict['stop_codons']]
428 print >> outputMatchFile, '\t'.join(fields)
429 outputMatchFile.close()
430 if options.transcripts and options.transcript_alignment:
431 if outputTxFile:
432 id_fields = ['gene_name1','alignments1','gene_name2','alignments2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein','flags']
433 fa_width = 80
434 for i,fusion in enumerate(fusions):
435 if len(fusion['transcripts']) > 0:
436 alignments1 = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'])
437 alignments2 = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
438 alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
439 fusion_id = "%s (%s) %s" % (i + 1,alignments,' '.join([str(fusion[x]) for x in report_fields]))
440 for tx_id in fusion['transcripts'].keys():
441 m1 = fusion['transcripts'][tx_id]['match1']
442 f_seq1 = fusion['split_seqs'][0][:-m1].lower() + fusion['split_seqs'][0][-m1:]
443 t_seq1 = fusion['transcripts'][tx_id]['seq1'][:-m1].lower() + fusion['transcripts'][tx_id]['seq1'][-m1:]
444 if len(f_seq1) > len(t_seq1):
445 t_seq1 = t_seq1.rjust(len(f_seq1),'.')
446 elif len(f_seq1) < len(t_seq1):
447 f_seq1 = f_seq1.rjust(len(t_seq1),'.')
448 m2 = fusion['transcripts'][tx_id]['match2']
449 f_seq2 = fusion['split_seqs'][1][:m2] + fusion['split_seqs'][1][m2:].lower()
450 t_seq2 = fusion['transcripts'][tx_id]['seq2'][:m2] + fusion['transcripts'][tx_id]['seq2'][m2:].lower()
451 if len(f_seq2) > len(t_seq2):
452 t_seq2 = t_seq2.ljust(len(f_seq2),'.')
453 elif len(f_seq2) < len(t_seq2):
454 f_seq2 = f_seq2.ljust(len(t_seq2),'.')
455 print >> outputTxFile, ">%s\n%s\n%s" % (fusion_id,'\n'.join(textwrap.wrap(f_seq1,fa_width)),'\n'.join(textwrap.wrap(f_seq2,fa_width)))
456 print >> outputTxFile, "%s bkpt:%d rev_compl:%s\n%s\n%s" % (fusion['transcripts'][tx_id]['full_id'],fusion['transcripts'][tx_id]['bkpt'],str(fusion['transcripts'][tx_id]['revcompl']),'\n'.join(textwrap.wrap(t_seq1,fa_width)),'\n'.join(textwrap.wrap(t_seq2,fa_width)))
457 """
458 if options.peptides and options.orf_alignment:
459 pass
460 """
461 print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields]))
462 for i,fusion in enumerate(fusions):
463 print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields]))
464
465 if __name__ == "__main__" : __main__()
466