comparison smRtools.py @ 0:a2f293717ce3 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_signature commit fe40dec87779c1fcfbd03330e653aa886f4a2cda
author drosofff
date Wed, 21 Oct 2015 11:35:25 -0400
parents
children 6218b518cd16
comparison
equal deleted inserted replaced
-1:000000000000 0:a2f293717ce3
1 #!/usr/bin/python
2 # version 1 7-5-2012 unification of the SmRNAwindow class
3
4 import sys, subprocess
5 from collections import defaultdict
6 from numpy import mean, median, std
7 ##Disable scipy import temporarily, as no working scipy on toolshed.
8 ##from scipy import stats
9
def get_fasta(index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
    '''Return a dictionary {fasta_identifier: sequence} for every item of a
    bowtie index.  `index` is the path prefix of the bowtie index files.

    Sequences are recovered with `bowtie-inspect -a 0`, which outputs each
    sequence on a single line; identifiers are truncated at the first
    whitespace because bowtie itself splits headers.
    '''
    # universal_newlines=True keeps the output as text (not bytes) under Python 3
    p = subprocess.Popen(args=["bowtie-inspect", "-a", "0", index],
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         universal_newlines=True)
    outputlines = p.stdout.readlines()
    p.wait()
    item_dic = {}
    current_item = None
    stringlist = []
    for line in outputlines:
        if line[0] == ">":
            if current_item is not None:
                # dump the sequence of the previous item
                item_dic[current_item] = "".join(stringlist)
            # keep only the first word of the header (bowtie splits headers)
            current_item = line[1:].rstrip().split()[0]
            item_dic[current_item] = ""
            stringlist = []
        else:
            stringlist.append(line.rstrip())
    if current_item is not None:
        item_dic[current_item] = "".join(stringlist)  # for the last item
    return item_dic
29
def get_fasta_headers(index):
    '''Return a dict whose keys are the fasta identifiers of a bowtie index
    (all values are 1).  Identifiers are cut at the first whitespace because
    bowtie splits headers.'''
    # bowtie-inspect -n prints one header per line
    process = subprocess.Popen(args=["bowtie-inspect", "-n", index],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    header_lines = process.stdout.readlines()
    process.wait()
    headers = {}
    for raw_line in header_lines:
        headers[raw_line.rstrip().split()[0]] = 1
    return headers
39
40
def get_file_sample(file, numberoflines):
    '''Return `numberoflines` lines sampled at random (without replacement)
    from `file`, or the string "sample size exceeds file size" when the file
    has fewer lines than requested.'''
    import random  # was missing: the old docstring asked callers to import it
    F = open(file)
    try:
        fullfile = F.read().splitlines()
    finally:
        F.close()  # close the handle even if the read fails
    if len(fullfile) < numberoflines:
        return "sample size exceeds file size"
    return random.sample(fullfile, numberoflines)
49
def get_fasta_from_history(file):
    '''Parse a (multi-)fasta file into a dictionary {identifier: sequence}.
    Identifiers are truncated at the first whitespace because bowtie splits
    headers.'''
    item_dic = {}
    current_item = None
    stringlist = []
    F = open(file, "r")
    for line in F:
        if line[0] == ">":
            if current_item is not None:
                # dump the sequence of the previous item
                item_dic[current_item] = "".join(stringlist)
            # rstrip("\n") instead of [:-1]: the last line of a file may lack
            # a trailing newline, and [:-1] would silently drop its last base
            current_item = line[1:].rstrip("\n").split()[0]
            item_dic[current_item] = ""
            stringlist = []
        else:
            stringlist.append(line.rstrip("\n"))
    F.close()
    if current_item is not None:
        item_dic[current_item] = "".join(stringlist)  # for the last item
    return item_dic
65
def antipara(sequence):
    '''Return the reverse complement of a DNA sequence (A/T/G/C/N alphabet).'''
    complement = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
    return "".join(complement[base] for base in reversed(sequence))
70
def RNAtranslate(sequence):
    '''Return the RNA spelling of a DNA sequence: any base that is not
    A, G, C or N (i.e. T) becomes U.'''
    rna_chars = []
    for nucleotide in sequence:
        rna_chars.append(nucleotide if nucleotide in "AGCN" else "U")
    return "".join(rna_chars)
def DNAtranslate(sequence):
    '''Return the DNA spelling of an RNA sequence: any base that is not
    A, G, C or N (i.e. U) becomes T.'''
    translated = [base if base in "AGCN" else "T" for base in sequence]
    return "".join(translated)
75
def RNAfold (sequence_list):
    '''Fold every sequence of `sequence_list` with the external RNAfold
    program (ViennaRNA) and return a dictionary mapping each input sequence
    (converted back to the DNA alphabet, see DNAtranslate) to its minimum
    free energy, as a string.

    NOTE(review): assumes the `RNAfold` executable is on the PATH and that
    the pipes carry text (Python 2 semantics) -- confirm before running
    under Python 3, where communicate() would expect bytes here.
    '''
    thestring= "\n".join(sequence_list)
    # --noPS prevents RNAfold from writing postscript structure plot files
    p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output=p.communicate(thestring)[0]
    p.wait()
    output=output.split("\n")
    if not output[-1]: output = output[:-1] # nasty patch to remove last empty line
    buffer=[]
    for line in output:
        # sequence lines: RNAfold echoes the input; convert back to DNA alphabet
        if line[0] in ["N","A","T","U","G","C"]:
            buffer.append(DNAtranslate(line))
        # structure lines look like "...((..))... (-12.30)": the energy is
        # the text after the last "(" with the closing ")" stripped
        if line[0] in ["(",".",")"]:
            fields=line.split("(")
            energy= fields[-1]
            energy = energy[:-1] # remove the ) parenthesis
            energy=float(energy)
            buffer.append(str(energy))
    # buffer alternates sequence, energy, sequence, energy, ...
    return dict(zip(buffer[::2], buffer[1::2]))
94
def extractsubinstance(start, end, instance):
    '''Return a new SmRNAwindow covering instance coordinates [start, end]
    (1-based, inclusive), carrying over the reads mapping inside the window
    on either strand.  Kept as a module-level function (rather than a
    method) to save memory.'''
    subinstance = SmRNAwindow(instance.gene, instance.sequence[start - 1:end], start)
    subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset,
                                     subinstance.windowoffset + subinstance.size - 1)
    # `in` replaces deprecated dict.has_key (removed in Python 3)
    for i in range(start, end + 1):
        if i in instance.readDict:
            subinstance.readDict[i] = instance.readDict[i]
        if -i in instance.readDict:
            subinstance.readDict[-i] = instance.readDict[-i]
    return subinstance
106
class HandleSmRNAwindows:
    '''Parse a small-RNA alignment file (bowtie tabular or sam/bam) against a
    reference (bowtie index or fasta file) and build one SmRNAwindow
    instance per reference item, in self.instanceDict.'''

    def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~",
                 genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None,
                 size_sup=1000, norm=1.0):
        self.biosample = biosample
        self.alignmentFile = alignmentFile
        self.alignmentFileFormat = alignmentFileFormat  # "tabular", "sam" or "bam"
        self.genomeRefFile = genomeRefFile
        self.genomeRefFormat = genomeRefFormat  # "bowtieIndex" or "fastaSource"
        self.alignedReads = 0
        self.instanceDict = {}
        self.size_inf = size_inf  # optional lower bound on read size (None = no filter)
        self.size_sup = size_sup  # upper bound on read size when size_inf is set
        self.norm = norm          # normalization factor propagated to every window
        if genomeRefFormat == "bowtieIndex":
            self.itemDict = get_fasta(genomeRefFile)
        elif genomeRefFormat == "fastaSource":
            self.itemDict = get_fasta_from_history(genomeRefFile)
        # create as many SmRNAwindow instances as there are reference items
        for item in self.itemDict:
            self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item],
                                                  windowoffset=1, biosample=self.biosample,
                                                  norm=self.norm)
        self.readfile()

    def _addread(self, gene, polarity, offset, size):
        '''Register one aligned read with the matching window, honouring the
        optional size filter.  `offset` must already be 1-based.'''
        if self.size_inf and not (self.size_inf <= size <= self.size_sup):
            return
        self.instanceDict[gene].addread(polarity, offset, size)
        self.alignedReads += 1

    def readfile(self):
        '''Dispatch on the alignment format and feed every aligned read to
        its SmRNAwindow.  Returns self.instanceDict.'''
        if self.alignmentFileFormat == "tabular":
            F = open(self.alignmentFile, "r")
            for line in F:
                fields = line.split()
                polarity = fields[1]
                gene = fields[2]
                offset = int(fields[3])
                size = len(fields[4])
                # +1 converts bowtie 0-based offsets to the 1-based
                # coordinates used by SmRNAwindow
                self._addread(gene, polarity, offset + 1, size)
            F.close()
            return self.instanceDict
        elif self.alignmentFileFormat in ("bam", "sam"):
            import pysam
            samfile = pysam.Samfile(self.alignmentFile)
            for read in samfile:
                if read.tid == -1:
                    continue  # filter out unaligned reads
                polarity = "-" if read.is_reverse else "+"
                gene = samfile.getrname(read.tid)
                # pysam positions are 0-based: +1 for SmRNAwindow coordinates
                self._addread(gene, polarity, read.pos + 1, read.qlen)
            return self.instanceDict

    def size_histogram(self):  # in HandleSmRNAwindows
        '''Merge the per-window size histograms into a single histogram with
        'F' (forward, positive counts), 'R' (reverse, negative counts) and
        'both' (F - R, i.e. total magnitude) entries.'''
        size_dict = {}
        size_dict['F'] = defaultdict(float)
        size_dict['R'] = defaultdict(float)
        size_dict['both'] = defaultdict(float)
        for item in self.instanceDict:
            buffer_dict = self.instanceDict[item].size_histogram()
            for polarity in ["F", "R"]:
                for size in buffer_dict[polarity]:
                    size_dict[polarity][size] += buffer_dict[polarity][size]
            for size in buffer_dict["both"]:
                size_dict["both"][size] += buffer_dict["F"][size] - buffer_dict["R"][size]
        return size_dict

    def CountFeatures(self, GFF3="path/to/file"):
        '''Count reads overlapping each feature type of a GFF3 file.
        Uses the "destructive" readcount method so that each read is counted
        for at most one feature; the underlying readDicts are consumed.'''
        featureDict = defaultdict(int)
        F = open(GFF3, "r")
        for line in F:
            if line[0] == "#":
                continue
            fields = line[:-1].split()
            chrom, feature = fields[0], fields[2]
            leftcoord, rightcoord = int(fields[3]), int(fields[4])
            featureDict[feature] += self.instanceDict[chrom].readcount(
                upstream_coord=leftcoord, downstream_coord=rightcoord,
                polarity="both", method="destructive")
        F.close()
        return featureDict
228
class SmRNAwindow:
    '''A genomic window (gene, miRNA locus, chromosome...) plus the small-RNA
    reads mapping onto it.  Reads are stored in self.readDict as
    {offset: [size1, size2, ...]}: positive offsets for forward reads,
    negative offsets for reverse reads (1-based coordinates; a reverse read
    is keyed by the negative coordinate of its 5' end).'''

    def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0):
        self.biosample = biosample
        self.sequence = sequence
        self.gene = gene
        self.windowoffset = windowoffset  # 1-based coordinate of the window's first base
        self.size = len(sequence)
        self.readDict = defaultdict(list)  # {+/-offset: [size1, size2, ...], ...}
        self.matchedreadsUp = 0    # number of forward reads
        self.matchedreadsDown = 0  # number of reverse reads
        self.norm = norm           # normalization factor used by plotting methods

    def addread(self, polarity, offset, size):
        '''Register one read.  `offset` must already be 1-based (the 0->1
        conversion is done at read-parsing time).  Reverse reads are keyed
        by the negative coordinate of their 5' end.'''
        if polarity == "+":
            self.readDict[offset].append(size)
            self.matchedreadsUp += 1
        else:
            self.readDict[-(offset + size - 1)].append(size)
            self.matchedreadsDown += 1
        return

    def barycenter(self, upstream_coord=None, downstream_coord=None):
        '''Return (forward_barycenter, reverse_barycenter): read-count
        weighted average positions on each strand, offset-shifted and scaled
        by the window size.  Offsets are filtered to
        [upstream_coord, downstream_coord] (defaults: whole window).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        window_size = downstream_coord - upstream_coord + 1

        def weigthAverage(TuppleList):
            # weighted mean of (position, count) tuples; 0 when there is no read
            weightSum = 0
            PonderWeightSum = 0
            for couple in TuppleList:
                PonderWeightSum += couple[0] * couple[1]
                weightSum += couple[1]
            if weightSum > 0:
                return PonderWeightSum / float(weightSum)
            else:
                return 0

        forwardTuppleList = [(k, len(self.readDict[k])) for k in self.readDict.keys()
                             if (k > 0 and upstream_coord <= abs(k) <= downstream_coord)]
        reverseTuppleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys()
                             if (k < 0 and upstream_coord <= abs(k) <= downstream_coord)]
        Fbarycenter = (weigthAverage(forwardTuppleList) - upstream_coord) / window_size
        Rbarycenter = (weigthAverage(reverseTuppleList) - upstream_coord) / window_size
        return Fbarycenter, Rbarycenter

    def correlation_mapper(self, reference, window_size):
        '''Slide a window along the object and compute Spearman correlations
        between local read densities and a reference profile (tabular file:
        coordinate, forward count, reverse count).  Returns a list of
        [coordinate, forward_cor, reverse_cor] for windows where either
        correlation exceeds 0.2.'''
        from scipy import stats
        if window_size > self.size:
            return []
        F = open(reference, "r")
        reference_forward = []
        reference_reverse = []
        for line in F:
            fields = line.split()
            reference_forward.append(int(float(fields[1])))
            reference_reverse.append(int(float(fields[2])))
        F.close()
        # convert the sparse readDict into dense per-coordinate counts
        local_object_forward = []
        local_object_reverse = []
        for i in range(1, self.size + 1):
            local_object_forward.append(len(self.readDict[i]))
            local_object_reverse.append(len(self.readDict[-i]))
        results = []
        for coordinate in range(self.size - window_size):
            local_forward = local_object_forward[coordinate:coordinate + window_size]
            local_reverse = local_object_reverse[coordinate:coordinate + window_size]
            if sum(local_forward) == 0 or sum(local_reverse) == 0:
                continue
            try:
                cor_forward = stats.spearmanr(local_forward, reference_forward)
                cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
                if cor_forward[0] > 0.2 or cor_reverse[0] > 0.2:
                    results.append([coordinate + 1, cor_forward[0], cor_reverse[0]])
            except Exception:  # was a bare except: keep the best-effort behaviour
                pass
        return results

    def readcount(self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None,
                  polarity="both", method="conservative"):
        '''Count reads of size in [size_inf, size_sup] whose 5' offset falls
        in [upstream_coord, downstream_coord] (defaults: whole window).
        `polarity` is "both", "forward" or "reverse".  When method is not
        "conservative" the counted offsets are DELETED from readDict (used
        by CountFeatures so a read is counted at most once).

        NOTE(review): the whole-window fast path below ignores the size
        filter, as in the original code -- confirm this is intended.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        if upstream_coord == 1 and downstream_coord == self.windowoffset + self.size - 1:
            if polarity == "both":
                return self.matchedreadsUp + self.matchedreadsDown
            if polarity == "forward":
                return self.matchedreadsUp
            if polarity == "reverse":
                return self.matchedreadsDown
        n = 0
        if polarity == "both":
            for offset in range(upstream_coord, downstream_coord + 1):
                if offset in self.readDict:
                    for read in self.readDict[offset]:
                        if size_inf <= read <= size_sup:
                            n += 1
                    if method != "conservative":
                        del self.readDict[offset]  # careful! precludes re-use of readDict
                if -offset in self.readDict:
                    for read in self.readDict[-offset]:
                        if size_inf <= read <= size_sup:
                            n += 1
                    if method != "conservative":
                        del self.readDict[-offset]
            return n
        elif polarity == "forward":
            for offset in range(upstream_coord, downstream_coord + 1):
                if offset in self.readDict:
                    for read in self.readDict[offset]:
                        if size_inf <= read <= size_sup:
                            n += 1
            return n
        elif polarity == "reverse":
            for offset in range(upstream_coord, downstream_coord + 1):
                if -offset in self.readDict:
                    for read in self.readDict[-offset]:
                        if size_inf <= read <= size_sup:
                            n += 1
            return n

    def readsizes(self):
        '''Return a dictionary {read_size: number_of_reads}.  Sizes between
        the minimal and maximal observed sizes that have no read are filled
        with 0 so that downstream histograms have no holes.'''
        dicsize = {}
        for offset in self.readDict:
            for size in self.readDict[offset]:
                dicsize[size] = dicsize.get(size, 0) + 1
        # bug fix: the original wrote dicsize[size] with the stale inner-loop
        # variable, so the hole-filling loop was a no-op
        for size in range(min(dicsize.keys()), max(dicsize.keys()) + 1):
            dicsize[size] = dicsize.get(size, 0)  # fill missing sizes with 0
        return dicsize

    def size_histogram(self, minquery=None, maxquery=None):  # in SmRNAwindow
        '''Return {'F': ..., 'R': ..., 'both': ...} histograms of read sizes,
        scaled by self.norm.  'R' counts are negative and 'both' is F - R.
        When minquery/maxquery are given, every size of that range is
        present (filled with 0) so R-lattice plots are never empty.'''
        norm = self.norm
        size_dict = {}
        size_dict['F'] = defaultdict(float)
        size_dict['R'] = defaultdict(float)
        size_dict['both'] = defaultdict(float)
        for offset in self.readDict:
            for size in self.readDict[offset]:
                if offset < 0:
                    size_dict['R'][size] = size_dict['R'][size] - 1 * norm
                else:
                    size_dict['F'][size] = size_dict['F'][size] + 1 * norm
        # patch to avoid missing graphs when parsed by R-lattice (27-08-2014)
        if not size_dict['F'] and not size_dict['R']:
            size_dict['F'][21] = 0
            size_dict['R'][21] = 0
        # list() calls keep the key concatenation working under Python 2 and 3
        allSizeKeys = list(set(list(size_dict['F'].keys()) + list(size_dict['R'].keys())))
        for size in allSizeKeys:
            size_dict['both'][size] = size_dict['F'][size] - size_dict['R'][size]
        if minquery:
            for polarity in size_dict.keys():
                for size in range(minquery, maxquery + 1):
                    if size not in size_dict[polarity]:
                        size_dict[polarity][size] = 0
        return size_dict

    def statsizes(self, upstream_coord=None, downstream_coord=None):
        '''Return (mean, median, standard deviation) of the sizes of reads
        whose offset falls in [upstream_coord, downstream_coord]
        (defaults: whole window).'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        L = []
        for offset in self.readDict:
            if abs(offset) < upstream_coord or abs(offset) > downstream_coord:
                continue
            L.extend(self.readDict[offset])
        return mean(L), median(L), std(L)

    def foldEnergy(self, upstream_coord=None, downstream_coord=None):
        '''Return the RNAfold minimum free energy of the sub-sequence
        [upstream_coord, downstream_coord] (defaults: whole window).
        Requires the external RNAfold program.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        subsequence = self.sequence[upstream_coord - 1:downstream_coord]
        Energy = RNAfold([subsequence])
        return float(Energy[subsequence])

    def Ufreq(self, size_scope, upstream_coord=None, downstream_coord=None):
        '''Return the percentage of reads (size in the `size_scope` iterable,
        offset within the optional window) whose 5' base is a U (T in DNA
        space), or "." when there is no such read.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        freqDic = {"A": 0, "T": 0, "G": 0, "C": 0, "N": 0}
        convertDic = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        for offset in self.readDict:
            if abs(offset) < upstream_coord or abs(offset) > downstream_coord:
                continue
            for size in self.readDict[offset]:
                if size in size_scope:
                    startbase = self.sequence[abs(offset) - self.windowoffset]
                    if offset < 0:
                        startbase = convertDic[startbase]  # complement for reverse reads
                    freqDic[startbase] += 1
        base_sum = float(sum(freqDic.values()))
        if base_sum == 0:
            return "."
        else:
            return freqDic["T"] / base_sum * 100

    def Ufreq_stranded(self, size_scope, upstream_coord=None, downstream_coord=None):
        '''Like Ufreq() but strand-aware: returns "forward% | reverse%",
        with "." for a strand that has no read.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        freqDic = {"Afor": 0, "Tfor": 0, "Gfor": 0, "Cfor": 0, "Nfor": 0,
                   "Arev": 0, "Trev": 0, "Grev": 0, "Crev": 0, "Nrev": 0}
        convertDic = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        for offset in self.readDict:
            if abs(offset) < upstream_coord or abs(offset) > downstream_coord:
                continue
            for size in self.readDict[offset]:
                if size in size_scope:
                    startbase = self.sequence[abs(offset) - self.windowoffset]
                    if offset < 0:
                        startbase = convertDic[startbase]
                        freqDic[startbase + "rev"] += 1
                    else:
                        freqDic[startbase + "for"] += 1
        forward_sum = float(freqDic["Afor"] + freqDic["Tfor"] + freqDic["Gfor"] + freqDic["Cfor"] + freqDic["Nfor"])
        reverse_sum = float(freqDic["Arev"] + freqDic["Trev"] + freqDic["Grev"] + freqDic["Crev"] + freqDic["Nrev"])
        if forward_sum == 0 and reverse_sum == 0:
            return ". | ."
        elif reverse_sum == 0:
            return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
        elif forward_sum == 0:
            return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
        else:
            return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)

    def readplot(self):
        '''Return tab-separated "gene, offset, normalized count, strand"
        lines for plotting; reverse counts are negative.  A single dummy
        line is emitted when there is no read (R-lattice patch, 27-08-2014).'''
        norm = self.norm
        readmap = {}
        for offset in self.readDict.keys():
            readmap[abs(offset)] = (len(self.readDict.get(-abs(offset), [])) * norm,
                                    len(self.readDict.get(abs(offset), [])) * norm)
        mylist = []
        for offset in sorted(readmap):
            if readmap[offset][1] != 0:
                mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F"))
            if readmap[offset][0] != 0:
                mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R"))
        if not mylist:
            # patch to avoid missing graphs when parsed by R-lattice
            mylist.append("%s\t%s\t%s\t%s" % (self.gene, 1, 0, "F"))
        return mylist

    def readcoverage(self, upstream_coord=None, downstream_coord=None, windowName=None):
        '''Used by the MirParser tool.  Compute per-base coverage for the
        majority strand (the one with the most reads) and return it as
        tab-separated lines: biosample, windowName, position, relative
        position, coverage, relative coverage, major strand.'''
        upstream_coord = upstream_coord or 1
        downstream_coord = downstream_coord or self.size
        windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
        forORrev_coverage = dict([(i, 0) for i in range(1, downstream_coord - upstream_coord + 1)])
        totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
        totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
        if totalforward > totalreverse:
            majorcoverage = "forward"
            for offset in self.readDict.keys():
                if (offset > 0) and ((offset - upstream_coord + 1) in forORrev_coverage):
                    for read in self.readDict[offset]:
                        for i in range(read):
                            try:
                                forORrev_coverage[offset - upstream_coord + 1 + i] += 1
                            except KeyError:
                                continue  # a sense read may span over the downstream limit
        else:
            majorcoverage = "reverse"
            for offset in self.readDict.keys():
                if (offset < 0) and (-offset - upstream_coord + 1 in forORrev_coverage):
                    for read in self.readDict[offset]:
                        for i in range(read):
                            try:
                                # positive coordinates in the instance; reverse coverage
                                forORrev_coverage[-offset - upstream_coord - i] += 1
                            except KeyError:
                                continue  # an antisense read may span over the upstream limit
        output_list = []
        maximum = max(forORrev_coverage.values()) or 1
        for n in sorted(forORrev_coverage):
            output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                self.biosample, windowName, n,
                float(n) / (downstream_coord - upstream_coord + 1),
                forORrev_coverage[n], float(forORrev_coverage[n]) / maximum, majorcoverage))
        return "\n".join(output_list)

    def signature(self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no",
                  upstream_coord=None, downstream_coord=None):
        '''Ping-pong-like overlap signature: for each relative overlap i in
        `scope`, count pairings between query reads (size in
        [minquery, maxquery]) and target reads (size in [mintarget,
        maxtarget]) on opposite strands.  When the query and target ranges
        are identical each pair is counted twice, hence the division by 2.
        With zscore="yes" the table is z-standardized.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        query_range = range(minquery, maxquery + 1)
        target_range = range(mintarget, maxtarget + 1)
        Query_table = {}
        Target_table = {}
        frequency_table = dict([(i, 0) for i in scope])
        for offset in self.readDict:
            if abs(offset) < upstream_coord or abs(offset) > downstream_coord:
                continue
            for size in self.readDict[offset]:
                if size in query_range:
                    Query_table[offset] = Query_table.get(offset, 0) + 1
                if size in target_range:
                    Target_table[offset] = Target_table.get(offset, 0) + 1
        for offset in Query_table:
            for i in scope:
                # a query at `offset` pairs with targets whose 5' end sits at
                # -offset - i + 1 on the opposite strand (i-nt 5' overlap)
                frequency_table[i] += min(Query_table[offset], Target_table.get(-offset - i + 1, 0))
        if minquery == mintarget and maxquery == maxtarget:
            # same population on both sides: each pair was counted twice
            frequency_table = dict([(i, frequency_table[i] / 2) for i in frequency_table])
        if zscore == "yes":
            # list() keeps numpy happy with dict views under Python 3
            z_mean = mean(list(frequency_table.values()))
            z_std = std(list(frequency_table.values()))
            if z_std == 0:
                frequency_table = dict([(i, 0) for i in frequency_table])
            else:
                frequency_table = dict([(i, (frequency_table[i] - z_mean) / z_std)
                                        for i in frequency_table])
        return frequency_table

    def hannon_signature(self, minquery, maxquery, mintarget, maxtarget, scope,
                         upstream_coord=None, downstream_coord=None):
        '''Hannon-lab style overlap probability: for each query read, the
        probability that one of its targets overlaps by i nt, averaged over
        all query reads.  `scope` is an iterable of relative overlaps.'''
        upstream_coord = upstream_coord or self.windowoffset
        downstream_coord = downstream_coord or self.windowoffset + self.size - 1
        query_range = range(minquery, maxquery + 1)
        target_range = range(mintarget, maxtarget + 1)
        Query_table = {}
        Target_table = {}
        Total_Query_Numb = 0
        general_frequency_table = dict([(i, 0) for i in scope])
        # filtering the appropriate reads for the study
        for offset in self.readDict:
            if abs(offset) < upstream_coord or abs(offset) > downstream_coord:
                continue
            for size in self.readDict[offset]:
                if size in query_range:
                    Query_table[offset] = Query_table.get(offset, 0) + 1
                    Total_Query_Numb += 1
                if size in target_range:
                    Target_table[offset] = Target_table.get(offset, 0) + 1
        for offset in Query_table:
            frequency_table = dict([(i, 0) for i in scope])  # per-offset local table
            number_of_targets = 0
            for i in scope:
                frequency_table[i] += Query_table[offset] * Target_table.get(-offset - i + 1, 0)
                number_of_targets += Target_table.get(-offset - i + 1, 0)
            for i in scope:
                try:
                    general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
                except ZeroDivisionError:
                    continue
        return general_frequency_table

    def phasing(self, size_range, scope):
        '''Autocorrelation-like signal: probability that two same-strand
        reads (size in `size_range`) are separated by i nt, averaged over
        all reads.  `scope` must be a python iterable of relative offsets.'''
        read_table = {}
        total_read_number = 0
        general_frequency_table = dict([(i, 0) for i in scope])
        # read input filtering
        for offset in self.readDict:
            for size in self.readDict[offset]:
                if size in size_range:
                    read_table[offset] = read_table.get(offset, 0) + 1
                    total_read_number += 1
        # per-offset read phasing computing
        for offset in read_table:
            frequency_table = dict([(i, 0) for i in scope])  # local frequency table
            number_of_targets = 0
            for i in scope:
                if offset > 0:
                    frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
                    number_of_targets += read_table.get(offset + i, 0)
                else:
                    frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
                    number_of_targets += read_table.get(offset - i, 0)
            # inclusion of the local table in the general table (all-offset average)
            for i in scope:
                try:
                    general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
                except ZeroDivisionError:
                    continue
        return general_frequency_table

    def z_signature(self, minquery, maxquery, mintarget, maxtarget, scope):
        '''z-score standardized version of the signature() table over the
        relative offsets of `scope` (uses numpy mean/std).'''
        frequency_table = self.signature(minquery, maxquery, mintarget, maxtarget, scope)
        frequency_list = [frequency_table[i] for i in sorted(frequency_table)]
        if std(frequency_list):
            meanlist = mean(frequency_list)
            stdlist = std(frequency_list)
            z_list = [(i - meanlist) / stdlist for i in frequency_list]
            return dict(zip(sorted(frequency_table), z_list))
        else:
            # flat signal: all z-scores are 0
            return dict(zip(sorted(frequency_table), [0 for i in frequency_table]))

    def percent_signature(self, minquery, maxquery, mintarget, maxtarget, scope):
        '''Signature expressed as a percentage of the reads whose size is in
        the query or target ranges.  NOTE(review): the size sets use
        range(min, max) and therefore EXCLUDE maxquery/maxtarget -- confirm
        whether that exclusion is intended.'''
        frequency_table = self.signature(minquery, maxquery, mintarget, maxtarget, scope)
        sizes = self.readsizes()  # hoisted out of the loop below
        total = float(sum([sizes.get(i, 0)
                           for i in set(list(range(minquery, maxquery)) + list(range(mintarget, maxtarget)))]))
        if total == 0:
            return dict([(i, 0) for i in scope])
        return dict([(i, frequency_table[i] / total * 100) for i in scope])

    def pairer(self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Return paired sequences ("+..."/"-...", RNA alphabet) for
        query/target reads overlapping by `overlap` nt; each query read is
        paired with at most one target read (min of both counts).'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range(int(minquery), int(maxquery) + 1)
        target_range = range(int(mintarget), int(maxtarget) + 1)
        paired_sequences = []
        for offset in self.readDict:  # selection of data
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        for offset in queryhash:
            # the same formula holds for both strands (the original had two
            # identical branches here)
            matched_offset = -offset - overlap + 1
            if targethash[matched_offset]:
                paired = min(len(queryhash[offset]), len(targethash[matched_offset]))
                if offset >= 0:
                    for i in range(paired):
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[offset:offset + queryhash[offset][i]]))
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-matched_offset - targethash[matched_offset][i] + 1:-matched_offset + 1])))
                else:
                    for i in range(paired):
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-offset - queryhash[offset][i] + 1:-offset + 1])))
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[matched_offset:matched_offset + targethash[matched_offset][i]]))
        return paired_sequences

    def pairable(self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Like pairer() but returns ALL pairable query and target sequences
        (not a 1:1 pairing).'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range(int(minquery), int(maxquery) + 1)
        target_range = range(int(mintarget), int(maxtarget) + 1)
        paired_sequences = []
        for offset in self.readDict:  # selection of data
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        for offset in queryhash:
            matched_offset = -offset - overlap + 1
            if targethash[matched_offset]:
                if offset >= 0:
                    for i in queryhash[offset]:
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[offset:offset + i]))
                    for i in targethash[matched_offset]:
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-matched_offset - i + 1:-matched_offset + 1])))
                else:
                    for i in queryhash[offset]:
                        paired_sequences.append("-%s" % RNAtranslate(antipara(self.sequence[-offset - i + 1:-offset + 1])))
                    for i in targethash[matched_offset]:
                        paired_sequences.append("+%s" % RNAtranslate(self.sequence[matched_offset:matched_offset + i]))
        return paired_sequences

    def newpairable_bowtie(self, overlap, minquery, maxquery, mintarget, maxtarget):
        '''Revision of pairable (3-12-2012) focused on the offset shift:
        bowtie coordinates are 1-based while python strings are 0-based.
        Returns bowtie-like tabular lines for the pairable QUERY reads.'''
        queryhash = defaultdict(list)
        targethash = defaultdict(list)
        query_range = range(int(minquery), int(maxquery) + 1)
        target_range = range(int(mintarget), int(maxtarget) + 1)
        bowtie_output = []
        for offset in self.readDict:  # selection of data
            for size in self.readDict[offset]:
                if size in query_range:
                    queryhash[offset].append(size)
                if size in target_range:
                    targethash[offset].append(size)
        counter = 0
        for offset in queryhash:
            matched_offset = -offset - overlap + 1
            if targethash[matched_offset]:
                if offset >= 0:
                    for i in queryhash[offset]:
                        counter += 1
                        # -1 converts the 1-based offset to a 0-based string index
                        bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset - 1, self.sequence[offset - 1:offset - 1 + i]))
                else:
                    for i in queryhash[offset]:
                        counter += 1
                        bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset - i, self.sequence[-offset - i:-offset]))
        return bowtie_output
737
738
def __main__(bowtie_index_path, bowtie_output_path):
    '''Stand-alone entry point: load sequences from a bowtie index, parse a
    bowtie tabular alignment and print, for each gene, its 19-nt-overlap
    pairs between 19-23 nt reads.

    NOTE(review): offsets are passed to addread() as-is, whereas
    HandleSmRNAwindows.readfile() converts bowtie 0-based offsets with +1 --
    confirm which coordinate convention this entry point should follow.
    '''
    sequenceDic = get_fasta(bowtie_index_path)
    objDic = {}
    F = open(bowtie_output_path, "r")  # F is the bowtie output taken as input
    for line in F:
        fields = line.split()
        polarity = fields[1]
        gene = fields[2]
        offset = int(fields[3])
        size = len(fields[4])
        try:
            objDic[gene].addread(polarity, offset, size)
        except KeyError:
            # first read seen for this gene: create its window on the fly
            objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
            objDic[gene].addread(polarity, offset, size)
    F.close()
    for gene in objDic:
        # function-style print works under both Python 2 and Python 3
        # (the old `print gene, ...` statement is a syntax error on Python 3)
        print("%s %s" % (gene, objDic[gene].pairer(19, 19, 23, 19, 23)))

if __name__ == "__main__":
    __main__(sys.argv[1], sys.argv[2])