comparison map_peptides_to_bed.py @ 3:704ea6303c4c draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/map_peptides_to_bed commit 2a470e2c775a7427aa530e058510e4dc7b6d8e80"
author galaxyp
date Tue, 07 Apr 2020 11:41:15 -0400
parents db90662d26f9
children
comparison
equal deleted inserted replaced
2:78b8213e122d 3:704ea6303c4c
8 # Author: 8 # Author:
9 # 9 #
10 # James E Johnson 10 # James E Johnson
11 # 11 #
12 #------------------------------------------------------------------------------ 12 #------------------------------------------------------------------------------
13 Input: list of protein_accessions, peptide_sequence
14 GFF3 with fasta
15 Output: GFF3 of peptides
16
17 Filter: Must cross splice boundary
13 """ 18 """
14 19
15 """
16 Input: list of protein_accessions, peptide_sequence
17 GFF3 with fasta
18 Output: GFF3 of peptides
19
20 Filter: Must cross splice boundary
21
22 """
23
24 import sys,re,os.path
25 import tempfile
26 import optparse 20 import optparse
27 from optparse import OptionParser 21 import os.path
28 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate 22 import sys
29 23
30 class BedEntry( object ): 24 from Bio.Seq import (
31 def __init__(self, line): 25 reverse_complement,
32 self.line = line 26 translate
33 try: 27 )
34 fields = line.rstrip('\r\n').split('\t') 28
35 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = fields[0:12] 29
36 seq = fields[12] if len(fields) > 12 else None 30 class BedEntry(object):
37 self.chrom = chrom 31 def __init__(self, line):
38 self.chromStart = int(chromStart) 32 self.line = line
39 self.chromEnd = int(chromEnd) 33 try:
40 self.name = name 34 fields = line.rstrip('\r\n').split('\t')
41 self.score = int(score) 35 (chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts) = fields[0:12]
42 self.strand = strand 36 seq = fields[12] if len(fields) > 12 else None
43 self.thickStart = int(thickStart) 37 self.chrom = chrom
44 self.thickEnd = int(thickEnd) 38 self.chromStart = int(chromStart)
45 self.itemRgb = itemRgb 39 self.chromEnd = int(chromEnd)
46 self.blockCount = int(blockCount) 40 self.name = name
47 self.blockSizes = [int(x) for x in blockSizes.split(',')] 41 self.score = int(score)
48 self.blockStarts = [int(x) for x in blockStarts.split(',')] 42 self.strand = strand
49 self.seq = seq 43 self.thickStart = int(thickStart)
50 except Exception, e: 44 self.thickEnd = int(thickEnd)
51 print >> sys.stderr, "Unable to read Bed entry" % e 45 self.itemRgb = itemRgb
52 exit(1) 46 self.blockCount = int(blockCount)
53 def __str__(self): 47 self.blockSizes = [int(x) for x in blockSizes.split(',')]
54 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % ( 48 self.blockStarts = [int(x) for x in blockStarts.split(',')]
55 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, 49 self.seq = seq
56 ','.join([str(x) for x in self.blockSizes]), 50 except Exception as e:
57 ','.join([str(x) for x in self.blockStarts]), 51 sys.stderr.write("Unable to read Bed entry %s \n" % e)
58 '\t%s' % self.seq if self.seq else '') 52 exit(1)
59 def get_splice_junctions(self): 53
60 splice_juncs = [] 54 def __str__(self):
61 for i in range(self.blockCount - 1): 55 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % (
62 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) 56 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount,
63 splice_juncs.append(splice_junc) 57 ','.join([str(x) for x in self.blockSizes]),
64 return splice_juncs 58 ','.join([str(x) for x in self.blockStarts]),
65 def get_exon_seqs(self): 59 '\t%s' % self.seq if self.seq else '')
66 exons = [] 60
67 for i in range(self.blockCount): 61 def get_splice_junctions(self):
68 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) 62 splice_juncs = []
69 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]]) 63 for i in range(self.blockCount - 1):
70 if self.strand == '-': #reverse complement 64 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i + 1])
71 exons.reverse() 65 splice_juncs.append(splice_junc)
72 for i,s in enumerate(exons): 66 return splice_juncs
73 exons[i] = reverse_complement(s) 67
74 return exons 68 def get_exon_seqs(self):
75 def get_spliced_seq(self): 69 exons = []
76 seq = ''.join(self.get_exon_seqs()) 70 for i in range(self.blockCount):
77 return seq 71 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1])
78 def get_translation(self,sequence=None): 72 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]])
79 translation = None 73 if self.strand == '-': # reverse complement
80 seq = sequence if sequence else self.get_spliced_seq() 74 exons.reverse()
81 if seq: 75 for i, s in enumerate(exons):
82 seqlen = len(seq) / 3 * 3; 76 exons[i] = reverse_complement(s)
83 if seqlen >= 3: 77 return exons
84 translation = translate(seq[:seqlen]) 78
85 return translation 79 def get_spliced_seq(self):
86 def get_translations(self): 80 seq = ''.join(self.get_exon_seqs())
87 translations = [] 81 return seq
88 seq = self.get_spliced_seq() 82
89 if seq: 83 def get_translation(self, sequence=None):
90 for i in range(3): 84 translation = None
91 translation = self.get_translation(sequence=seq[i:]) 85 seq = sequence if sequence else self.get_spliced_seq()
92 if translation: 86 if seq:
93 translations.append(translation) 87 seqlen = int(len(seq) / 3) * 3
94 return translations 88 if seqlen >= 3:
95 ## (start,end) 89 translation = translate(seq[:seqlen])
96 def get_subrange(self,tstart,tstop): 90 return translation
97 chromStart = self.chromStart 91
98 chromEnd = self.chromEnd 92 def get_translations(self):
99 r = range(self.blockCount) 93 translations = []
100 if self.strand == '-': 94 seq = self.get_spliced_seq()
101 r.reverse() 95 if seq:
102 bStart = 0 96 for i in range(3):
103 for x in r: 97 translation = self.get_translation(sequence=seq[i:])
104 bEnd = bStart + self.blockSizes[x] 98 if translation:
105 ## print >> sys.stderr, "%d chromStart: %d chromEnd: %s bStart: %s bEnd: %d" % (x,chromStart,chromEnd,bStart,bEnd) 99 translations.append(translation)
106 if bStart <= tstart < bEnd: 100 return translations
107 if self.strand == '+': 101
108 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart) 102 def get_subrange(self, tstart, tstop):
109 else: 103 """
110 chromEnd = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstart - bStart) 104 (start, end)
111 if bStart <= tstop < bEnd: 105 """
112 if self.strand == '+': 106 chromStart = self.chromStart
113 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart) 107 chromEnd = self.chromEnd
114 else: 108 r = range(self.blockCount)
115 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart) 109 if self.strand == '-':
116 bStart += self.blockSizes[x] 110 r = list(r)
117 return(chromStart,chromEnd) 111 r.reverse()
118 #get the blocks for sub range 112 bStart = 0
119 def get_blocks(self,chromStart,chromEnd): 113 for x in r:
120 tblockCount = 0 114 bEnd = bStart + self.blockSizes[x]
121 tblockSizes = [] 115 # print >> sys.stderr, "%d chromStart: %d chromEnd: %s bStart: %s bEnd: %d\n" % (x, chromStart, chromEnd, bStart, bEnd)
122 tblockStarts = [] 116 if bStart <= tstart < bEnd:
123 for x in range(self.blockCount): 117 if self.strand == '+':
124 bStart = self.chromStart + self.blockStarts[x] 118 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart)
125 bEnd = bStart + self.blockSizes[x] 119 else:
126 if bStart > chromEnd: 120 chromEnd = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstart - bStart)
127 break 121 if bStart <= tstop < bEnd:
128 if bEnd < chromStart: 122 if self.strand == '+':
129 continue 123 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart)
130 cStart = max(chromStart,bStart) 124 else:
131 tblockStarts.append(cStart - chromStart) 125 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart)
132 tblockSizes.append(min(chromEnd,bEnd) - cStart) 126 bStart += self.blockSizes[x]
133 tblockCount += 1 127 return(chromStart, chromEnd)
134 print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes) 128
135 return (tblockCount,tblockSizes,tblockStarts) 129 def get_blocks(self, chromStart, chromEnd):
136 130 """
137 ## [[start,end,seq,blockCount,blockSizes,blockStarts],[start,end,seq,blockCount,blockSizes,blockStarts],[start,end,seq,blockCount,blockSizes,blockStarts]] 131 get the blocks for sub range
138 ## filter: ignore translation if stop codon in first exon after ignore_left_bp 132 """
139 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0): 133 tblockCount = 0
140 translations = [None,None,None,None,None,None] 134 tblockSizes = []
141 seq = self.get_spliced_seq() 135 tblockStarts = []
142 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3 136 for x in range(self.blockCount):
143 block_sum = sum(self.blockSizes)
144 exon_sizes = self.blockSizes
145 if self.strand == '-':
146 exon_sizes.reverse()
147 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))]
148 print >> sys.stderr, "splice_sites: %s" % splice_sites
149 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
150 if seq:
151 for i in range(3):
152 translation = self.get_translation(sequence=seq[i:])
153 if translation:
154 tstart = 0
155 tstop = len(translation)
156 if not untrimmed:
157 tstart = translation.rfind('*',0,junc) + 1
158 stop = translation.find('*',junc)
159 tstop = stop if stop >= 0 else len(translation)
160 if filtering and tstart > ignore:
161 continue
162 trimmed = translation[tstart:tstop]
163 #get genomic locations for start and end
164 offset = (block_sum - i) % 3
165 print >> sys.stderr, "tstart: %d tstop: %d offset: %d" % (tstart,tstop,offset)
166 if self.strand == '+':
167 chromStart = self.chromStart + i + (tstart * 3)
168 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3
169 else:
170 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3
171 chromEnd = self.chromEnd - i - (tstart * 3)
172 #get the blocks for this translation
173 tblockCount = 0
174 tblockSizes = []
175 tblockStarts = []
176 for x in range(self.blockCount):
177 bStart = self.chromStart + self.blockStarts[x] 137 bStart = self.chromStart + self.blockStarts[x]
178 bEnd = bStart + self.blockSizes[x] 138 bEnd = bStart + self.blockSizes[x]
179 if bStart > chromEnd: 139 if bStart > chromEnd:
180 break 140 break
181 if bEnd < chromStart: 141 if bEnd < chromStart:
182 continue 142 continue
183 cStart = max(chromStart,bStart) 143 cStart = max(chromStart, bStart)
184 tblockStarts.append(cStart - chromStart) 144 tblockStarts.append(cStart - chromStart)
185 tblockSizes.append(min(chromEnd,bEnd) - cStart) 145 tblockSizes.append(min(chromEnd, bEnd) - cStart)
186 tblockCount += 1 146 tblockCount += 1
187 print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes) 147 sys.stderr.write("tblockCount: %d tblockStarts: %s tblockSizes: %s\n" % (tblockCount, tblockStarts, tblockSizes))
188 translations[i] = [chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts] 148 return (tblockCount, tblockSizes, tblockStarts)
189 return translations 149
190 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): 150 def get_filterd_translations(self, untrimmed=False, filtering=True, ignore_left_bp=0, ignore_right_bp=0):
191 ## Ensembl fasta ID format 151 """
192 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT 152 [[start, end, seq, blockCount, blockSizes, blockStarts], [start, end, seq, blockCount, blockSizes, blockStarts], [start, end, seq, blockCount, blockSizes, blockStarts]]
193 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding 153 filter: ignore translation if stop codon in first exon after ignore_left_bp
194 frame_name = '' 154 """
195 chromStart = self.chromStart 155 translations = [None, None, None, None, None, None]
196 chromEnd = self.chromEnd 156 seq = self.get_spliced_seq()
197 strand = 1 if self.strand == '+' else -1 157 ignore = int((ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3)
198 if frame != None: 158 block_sum = sum(self.blockSizes)
199 block_sum = sum(self.blockSizes) 159 exon_sizes = self.blockSizes
200 offset = (block_sum - frame) % 3 160 if self.strand == '-':
201 frame_name = '_' + str(frame + 1) 161 exon_sizes.reverse()
202 if self.strand == '+': 162 splice_sites = [int(sum(exon_sizes[:x]) / 3) for x in range(1, len(exon_sizes))]
203 chromStart += frame 163 sys.stderr.write("splice_sites: %s\n" % splice_sites)
204 chromEnd -= offset 164 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
205 else: 165 if seq:
206 chromStart += offset 166 for i in range(3):
207 chromEnd -= frame 167 translation = self.get_translation(sequence=seq[i:])
208 location = "chromosome:%s:%s:%s:%s:%s" % (reference,self.chrom,chromStart,chromEnd,strand) 168 if translation:
209 seq_id = "%s%s %s %s" % (self.name,frame_name,seqtype,location) 169 tstart = 0
210 return seq_id 170 tstop = len(translation)
211 def get_line(self, start_offset = 0, end_offset = 0): 171 if not untrimmed:
212 if start_offset or end_offset: 172 tstart = translation.rfind('*', 0, junc) + 1
213 s_offset = start_offset if start_offset else 0 173 stop = translation.find('*', junc)
214 e_offset = end_offset if end_offset else 0 174 tstop = stop if stop >= 0 else len(translation)
215 if s_offset > self.chromStart: 175 if filtering and tstart > ignore:
216 s_offset = self.chromStart 176 continue
217 chrStart = self.chromStart - s_offset 177 trimmed = translation[tstart:tstop]
218 chrEnd = self.chromEnd + e_offset 178 # get genomic locations for start and end
219 blkSizes = self.blockSizes 179 offset = (block_sum - i) % 3
220 blkSizes[0] += s_offset 180 sys.stderr.write("tstart: %d tstop: %d offset: %d\n" % (tstart, tstop, offset))
221 blkSizes[-1] += e_offset 181 if self.strand == '+':
222 blkStarts = self.blockStarts 182 chromStart = self.chromStart + i + (tstart * 3)
223 for i in range(1,self.blockCount): 183 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3
224 blkStarts[i] += s_offset 184 else:
225 items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]] 185 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3
226 return '\t'.join(items) + '\n' 186 chromEnd = self.chromEnd - i - (tstart * 3)
227 return self.line 187 # get the blocks for this translation
188 tblockCount = 0
189 tblockSizes = []
190 tblockStarts = []
191 for x in range(self.blockCount):
192 bStart = self.chromStart + self.blockStarts[x]
193 bEnd = bStart + self.blockSizes[x]
194 if bStart > chromEnd:
195 break
196 if bEnd < chromStart:
197 continue
198 cStart = max(chromStart, bStart)
199 tblockStarts.append(cStart - chromStart)
200 tblockSizes.append(min(chromEnd, bEnd) - cStart)
201 tblockCount += 1
202 sys.stderr.write("tblockCount: %d tblockStarts: %s tblockSizes: %s\n" % (tblockCount, tblockStarts, tblockSizes))
203 translations[i] = [chromStart, chromEnd, trimmed, tblockCount, tblockSizes, tblockStarts]
204 return translations
205
206 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None):
207 """
208 # Ensembl fasta ID format
209 >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
210 >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding
211 """
212 frame_name = ''
213 chromStart = self.chromStart
214 chromEnd = self.chromEnd
215 strand = 1 if self.strand == '+' else -1
216 if frame is not None:
217 block_sum = sum(self.blockSizes)
218 offset = (block_sum - frame) % 3
219 frame_name = '_' + str(frame + 1)
220 if self.strand == '+':
221 chromStart += frame
222 chromEnd -= offset
223 else:
224 chromStart += offset
225 chromEnd -= frame
226 location = "chromosome:%s:%s:%s:%s:%s" % (reference, self.chrom, chromStart, chromEnd, strand)
227 seq_id = "%s%s %s %s" % (self.name, frame_name, seqtype, location)
228 return seq_id
229
230 def get_line(self, start_offset=0, end_offset=0):
231 if start_offset or end_offset:
232 s_offset = start_offset if start_offset else 0
233 e_offset = end_offset if end_offset else 0
234 if s_offset > self.chromStart:
235 s_offset = self.chromStart
236 chrStart = self.chromStart - s_offset
237 chrEnd = self.chromEnd + e_offset
238 blkSizes = self.blockSizes
239 blkSizes[0] += s_offset
240 blkSizes[-1] += e_offset
241 blkStarts = self.blockStarts
242 for i in range(1, self.blockCount):
243 blkStarts[i] += s_offset
244 items = [str(x) for x in [self.chrom, chrStart, chrEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, ','.join([str(x) for x in blkSizes]), ','.join([str(x) for x in blkStarts])]]
245 return '\t'.join(items) + '\n'
246 return self.line
247
228 248
229 def __main__(): 249 def __main__():
230 #Parse Command Line 250 # Parse Command Line
231 parser = optparse.OptionParser() 251 parser = optparse.OptionParser()
232 parser.add_option( '-t', '--translated_bed', dest='translated_bed', default=None, help='A bed file with added 13th column having a translation' ) 252 parser.add_option('-t', '--translated_bed', dest='translated_bed', default=None, help='A bed file with added 13th column having a translation')
233 parser.add_option( '-i', '--input', dest='input', default=None, help='Tabular file with peptide_sequence column' ) 253 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file with peptide_sequence column')
234 parser.add_option( '-p', '--peptide_column', type='int', dest='peptide_column', default=1, help='column ordinal with peptide sequence' ) 254 parser.add_option('-p', '--peptide_column', type='int', dest='peptide_column', default=1, help='column ordinal with peptide sequence')
235 parser.add_option( '-n', '--name_column', type='int', dest='name_column', default=2, help='column ordinal with protein name' ) 255 parser.add_option('-n', '--name_column', type='int', dest='name_column', default=2, help='column ordinal with protein name')
236 parser.add_option( '-s', '--start_column', type='int', dest='start_column', default=None, help='column with peptide start position in protein' ) 256 parser.add_option('-s', '--start_column', type='int', dest='start_column', default=None, help='column with peptide start position in protein')
237 parser.add_option( '-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation' ) 257 parser.add_option('-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation')
238 ## parser.add_option( '-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file' ) 258 # parser.add_option('-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file')
239 ## parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='Protein fasta' ) 259 # parser.add_option('-f', '--fasta', dest='fasta', default=None, help='Protein fasta')
240 parser.add_option( '-T', '--gffTags', dest='gffTags', action='store_true', default=False, help='Add #gffTags to bed output for IGV' ) 260 parser.add_option('-T', '--gffTags', dest='gffTags', action='store_true', default=False, help='Add #gffTags to bed output for IGV')
241 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr' ) 261 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
242 (options, args) = parser.parse_args() 262 (options, args) = parser.parse_args()
243 # Input files 263 # Input files
244 if options.input != None: 264 if options.input is not None:
265 try:
266 inputPath = os.path.abspath(options.input)
267 inputFile = open(inputPath, 'r')
268 except Exception as e:
269 sys.stderr("failed: %s\n" % e)
270 exit(2)
271 else:
272 inputFile = sys.stdin
273 inputBed = None
274 if options.translated_bed is not None:
275 inputBed = open(os.path.abspath(options.translated_bed), 'r')
276 peptide_column = options.peptide_column - 1
277 name_column = options.name_column - 1 if options.name_column else None
278 start_column = options.start_column - 1 if options.start_column else None
279 # Read in peptides
280 # peps[prot_name] = [seq]
281 prot_peps = dict()
282 unassigned_peps = set()
245 try: 283 try:
246 inputPath = os.path.abspath(options.input) 284 for i, line in enumerate(inputFile):
247 inputFile = open(inputPath, 'r') 285 # print >> sys.stderr, "%3d\t%s\n" % (i, line)
248 except Exception, e: 286 if line.startswith('#'):
249 print >> sys.stderr, "failed: %s" % e 287 continue
250 exit(2) 288 fields = line.rstrip('\r\n').split('\t')
251 else: 289 # print >> sys.stderr, "%3d\t%s\n" % (i, fields)
252 inputFile = sys.stdin 290 if peptide_column < len(fields):
253 inputBed = None 291 peptide = fields[peptide_column]
254 if options.translated_bed != None: 292 prot_name = fields[name_column] if name_column is not None and name_column < len(fields) else None
255 inputBed = open(os.path.abspath(options.translated_bed),'r') 293 if prot_name:
256 peptide_column = options.peptide_column - 1 294 offset = fields[start_column] if start_column is not None and start_column < len(fields) else -1
257 name_column = options.name_column - 1 if options.name_column else None 295 if prot_name not in prot_peps:
258 start_column = options.start_column - 1 if options.start_column else None 296 prot_peps[prot_name] = dict()
259 # Read in peptides 297 prot_peps[prot_name][peptide] = offset
260 # peps[prot_name] = [seq] 298 else:
261 prot_peps = dict() 299 unassigned_peps.add(peptide)
262 unassigned_peps = set() 300 if options.debug:
263 try: 301 sys.stderr.write("prot_peps: %s\n" % prot_peps)
264 for i, line in enumerate( inputFile ): 302 sys.stderr.write("unassigned_peps: %s\n" % unassigned_peps)
265 ## print >> sys.stderr, "%3d\t%s" % (i,line) 303 except Exception as e:
266 if line.startswith('#'): 304 sys.stderr.write("failed: Error reading %s - %s\n" % (options.input if options.input else 'stdin', e))
267 continue 305 exit(1)
268 fields = line.rstrip('\r\n').split('\t') 306 # Output files
269 ## print >> sys.stderr, "%3d\t%s" % (i,fields) 307 bed_fh = None
270 if peptide_column < len(fields): 308 if options.bed:
271 peptide = fields[peptide_column] 309 bed_fh = open(options.bed, 'w')
272 prot_name = fields[name_column] if name_column is not None and name_column < len(fields) else None 310 bed_fh.write('track name="%s" type=bedDetail description="%s" \n' % ('novel_junction_peptides', 'test'))
273 if prot_name: 311 if options.gffTags:
274 offset = fields[start_column] if start_column is not None and start_column < len(fields) else -1 312 bed_fh.write('#gffTags\n')
275 if prot_name not in prot_peps: 313 # if options.gff:
276 prot_peps[prot_name] = dict() 314 # gff_fh = open(options.gff, 'w')
277 prot_peps[prot_name][peptide] = offset 315 # gff_fh.write("##gff-version 3.2.1\n")
278 else: 316 # if options.reference:
279 unassigned_peps.add(peptide) 317 # gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference))
280 if options.debug: 318 try:
281 print >> sys.stderr, "prot_peps: %s" % prot_peps 319 for i, line in enumerate(inputBed):
282 print >> sys.stderr, "unassigned_peps: %s" % unassigned_peps 320 # print >> sys.stderr, "%3d:\t%s\n" % (i, line)
283 except Exception, e: 321 if line.startswith('track'):
284 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) 322 continue
285 exit(1) 323 entry = BedEntry(line)
286 # Output files 324 if entry.name in prot_peps:
287 bed_fh = None 325 for (peptide, offset) in prot_peps[entry.name].items():
288 ## gff_fh = None 326 if offset < 0:
289 ## gff_fa_file = None 327 offset = entry.seq.find(peptide)
290 gff_fa = None 328 if options.debug:
291 outFile = None 329 sys.stderr.write("%s\t%s\t%d\t%s\n" % (entry.name, peptide, offset, entry.seq))
292 if options.bed: 330 if offset >= 0:
293 bed_fh = open(options.bed,'w') 331 tstart = offset * 3
294 bed_fh.write('track name="%s" type=bedDetail description="%s" \n' % ('novel_junction_peptides','test')) 332 tstop = tstart + len(peptide) * 3
295 if options.gffTags: 333 if options.debug:
296 bed_fh.write('#gffTags\n') 334 sys.stderr.write("%d\t%d\t%d\n" % (offset, tstart, tstop))
297 ## if options.gff: 335 (pepStart, pepEnd) = entry.get_subrange(tstart, tstop)
298 ## gff_fh = open(options.gff,'w') 336 if options.debug:
299 ## gff_fh.write("##gff-version 3.2.1\n") 337 sys.stderr.write("%d\t%d\t%d\n" % (offset, pepStart, pepEnd))
300 ## if options.reference: 338 if bed_fh:
301 ## gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference)) 339 entry.thickStart = pepStart
302 try: 340 entry.thickEnd = pepEnd
303 for i, line in enumerate( inputBed ): 341 bedfields = str(entry).split('\t')
304 ## print >> sys.stderr, "%3d:\t%s" % (i,line) 342 if options.gffTags:
305 if line.startswith('track'): 343 bedfields[3] = "ID=%s;Name=%s" % (entry.name, peptide)
306 continue 344 bed_fh.write("%s\t%s\t%s\n" % ('\t'.join(bedfields[:12]), peptide, entry.seq))
307 entry = BedEntry(line) 345 except Exception as e:
308 if entry.name in prot_peps: 346 sys.stderr.write("failed: Error reading %s - %s\n" % (options.input if options.input else 'stdin', e))
309 for (peptide,offset) in prot_peps[entry.name].iteritems(): 347 raise
310 if offset < 0: 348
311 offset = entry.seq.find(peptide) 349
312 if options.debug: 350 if __name__ == "__main__":
313 print >> sys.stderr, "%s\t%s\t%d\t%s\n" % (entry.name, peptide,offset,entry.seq) 351 __main__()
314 if offset >= 0:
315 tstart = offset * 3
316 tstop = tstart + len(peptide) * 3
317 if options.debug:
318 print >> sys.stderr, "%d\t%d\t%d" % (offset,tstart,tstop)
319 (pepStart,pepEnd) = entry.get_subrange(tstart,tstop)
320 if options.debug:
321 print >> sys.stderr, "%d\t%d\t%d" % (offset,pepStart,pepEnd)
322 if bed_fh:
323 entry.thickStart = pepStart
324 entry.thickEnd = pepEnd
325 bedfields = str(entry).split('\t')
326 if options.gffTags:
327 bedfields[3] = "ID=%s;Name=%s" % (entry.name,peptide)
328 bed_fh.write("%s\t%s\t%s\n" % ('\t'.join(bedfields[:12]),peptide,entry.seq))
329 except Exception, e:
330 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e)
331
332 if __name__ == "__main__" : __main__()
333