comparison smRtools.py @ 0:234b83159ea8 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/small_read_size_histograms commit ab983b2e57321e8913bd4d5f8fc89c3223c69869
author artbio
date Tue, 11 Jul 2017 11:44:36 -0400
1 #!/usr/bin/python
2 # version 1 7-5-2012 unification of the SmRNAwindow class
3
4 import sys, subprocess, random  # random is required by get_file_sample
5 from collections import defaultdict
6 from numpy import mean, median, std
7 ## The scipy import is temporarily disabled: no working scipy build is available on the toolshed.
8 ##from scipy import stats
9
10 def get_fasta (index="/home/galaxy/galaxy-dist/bowtie/5.37_Dmel/5.37_Dmel"):
11 '''Return a dictionary with fasta identifiers as keys and the corresponding
12 sequences as values. index must be the path to a bowtie index.'''
13 p = subprocess.Popen(args=["bowtie-inspect","-a", "0", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect outputs sequences on single lines
14 outputlines = p.stdout.readlines()
15 p.wait()
16 item_dic = {}
17 for line in outputlines:
18 if (line[0] == ">"):
19 try:
20 item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item; the first header raises NameError since there is no previous item yet
21 except NameError: pass
22 current_item = line[1:].rstrip().split()[0] # keep only the first word: bowtie truncates headers at the first whitespace
23 item_dic[current_item] = ""
24 stringlist=[]
25 else:
26 stringlist.append(line.rstrip() )
27 item_dic[current_item] = "".join(stringlist) # for the last item
28 return item_dic
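# Illustrative usage (hypothetical index path; assumes bowtie-inspect is available on the PATH):
#   genome = get_fasta("/path/to/bowtie/index/prefix")
#   genome["2L"]  # full sequence string of reference item "2L", if present in the index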
29
30 def get_fasta_headers (index):
31 p = subprocess.Popen(args=["bowtie-inspect","-n", index], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # bowtie-inspect -n outputs the reference sequence names
32 outputlines = p.stdout.readlines()
33 p.wait()
34 item_dic = {}
35 for line in outputlines:
36 header = line.rstrip().split()[0] # keep only the first word: bowtie truncates headers at the first whitespace
37 item_dic[header] = 1
38 return item_dic
39
40
41 def get_file_sample (file, numberoflines):
42 '''return a random sample of numberoflines lines from file (random is imported at module level)'''
43 F=open(file)
44 fullfile = F.read().splitlines()
45 F.close()
46 if len(fullfile) < numberoflines:
47 return "sample size exceeds file size"
48 return random.sample(fullfile, numberoflines)
49
50 def get_fasta_from_history (file):
51 F = open (file, "r")
52 item_dic = {}
53 for line in F:
54 if (line[0] == ">"):
55 try:
56 item_dic[current_item] = "".join(stringlist) # dump the sequence of the previous item; the first header raises NameError since there is no previous item yet
57 except NameError: pass
58 current_item = line[1:-1].split()[0] # keep only the first word: bowtie truncates headers at the first whitespace
59 item_dic[current_item] = ""
60 stringlist=[]
61 else:
62 stringlist.append(line[:-1])
63 item_dic[current_item] = "".join(stringlist) # for the last item
64 return item_dic
65
66 def antipara (sequence):
67 antidict = {"A":"T", "T":"A", "G":"C", "C":"G", "N":"N"}
68 revseq = sequence[::-1]
69 return "".join([antidict[i] for i in revseq])
70
71 def RNAtranslate (sequence):
72 return "".join([i if i in "AGCN" else "U" for i in sequence])
73 def DNAtranslate (sequence):
74 return "".join([i if i in "AGCN" else "T" for i in sequence])
75
76 def RNAfold (sequence_list):
77 thestring= "\n".join(sequence_list)
78 p = subprocess.Popen(args=["RNAfold","--noPS"], stdin= subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
79 output=p.communicate(thestring)[0]
80 p.wait()
81 output=output.split("\n")
82 if not output[-1]: output = output[:-1] # nasty patch to remove last empty line
83 buffer=[]
84 for line in output:
85 if line[0] in ["N","A","T","U","G","C"]:
86 buffer.append(DNAtranslate(line))
87 if line[0] in ["(",".",")"]:
88 fields=line.split("(")
89 energy= fields[-1]
90 energy = energy[:-1] # remove the ) parenthesis
91 energy=float(energy)
92 buffer.append(str(energy))
93 return dict(zip(buffer[::2], buffer[1::2]))
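# Illustrative sketch (assumes the RNAfold binary from the ViennaRNA package is installed):
#   RNAfold(["ATGCATGCATGCATGCATGC"])  # -> {"ATGCATGCATGCATGCATGC": "-1.2"} (hypothetical energy value)
# Sequences echoed by RNAfold are DNA-translated (U -> T) and used as dictionary keys;
# energies are kept as strings and converted to float by the foldEnergy method.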
94
95 def extractsubinstance (start, end, instance):
96 ''' Testing whether this can be a function external to the class, to save memory'''
97 subinstance = SmRNAwindow (instance.gene, instance.sequence[start-1:end], start)
98 subinstance.gene = "%s %s %s" % (subinstance.gene, subinstance.windowoffset, subinstance.windowoffset + subinstance.size - 1)
99 upcoordinate = [i for i in range(start,end+1) if instance.readDict.has_key(i) ]
100 downcoordinate = [-i for i in range(start,end+1) if instance.readDict.has_key(-i) ]
101 for i in upcoordinate:
102 subinstance.readDict[i]=instance.readDict[i]
103 for i in downcoordinate:
104 subinstance.readDict[i]=instance.readDict[i]
105 return subinstance
106
107 class HandleSmRNAwindows:
108 def __init__(self, alignmentFile="~", alignmentFileFormat="tabular", genomeRefFile="~", genomeRefFormat="bowtieIndex", biosample="undetermined", size_inf=None, size_sup=1000, norm=1.0):
109 self.biosample = biosample
110 self.alignmentFile = alignmentFile
111 self.alignmentFileFormat = alignmentFileFormat # can be "tabular" or "sam"
112 self.genomeRefFile = genomeRefFile
113 self.genomeRefFormat = genomeRefFormat # can be "bowtieIndex" or "fastaSource"
114 self.alignedReads = 0
115 self.instanceDict = {}
116 self.size_inf=size_inf
117 self.size_sup=size_sup
118 self.norm=norm
119 if genomeRefFormat == "bowtieIndex":
120 self.itemDict = get_fasta (genomeRefFile)
121 elif genomeRefFormat == "fastaSource":
122 self.itemDict = get_fasta_from_history (genomeRefFile)
123 for item in self.itemDict:
124 self.instanceDict[item] = SmRNAwindow(item, sequence=self.itemDict[item], windowoffset=1, biosample=self.biosample, norm=self.norm) # create one instance per reference item
125 self.readfile()
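# Illustrative usage (hypothetical file paths):
#   handler = HandleSmRNAwindows(alignmentFile="bowtie_output.tabular", alignmentFileFormat="tabular",
#                                genomeRefFile="/path/to/bowtie/index/prefix", genomeRefFormat="bowtieIndex")
#   handler.instanceDict  # one SmRNAwindow instance per reference item, populated by readfile()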
126
127 def readfile (self) :
128 if self.alignmentFileFormat == "tabular":
129 F = open (self.alignmentFile, "r")
130 for line in F:
131 fields = line.split()
132 polarity = fields[1]
133 gene = fields[2]
134 offset = int(fields[3])
135 size = len (fields[4])
136 if self.size_inf:
137 if (size>=self.size_inf and size<= self.size_sup):
138 self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
139 self.alignedReads += 1
140 else:
141 self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
142 self.alignedReads += 1
143 F.close()
144 return self.instanceDict
145 elif self.alignmentFileFormat == "bam" or self.alignmentFileFormat == "sam":
146 import pysam
147 samfile = pysam.Samfile(self.alignmentFile)
148 for read in samfile:
149 if read.tid == -1:
150 continue # filter out unaligned reads
151 if read.is_reverse:
152 polarity="-"
153 else:
154 polarity="+"
155 gene = samfile.getrname(read.tid)
156 offset = read.pos
157 size = read.qlen
158 if self.size_inf:
159 if (size>=self.size_inf and size<= self.size_sup):
160 self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
161 self.alignedReads += 1
162 else:
163 self.instanceDict[gene].addread (polarity, offset+1, size) # to correct to 1-based coordinates of SmRNAwindow
164 self.alignedReads += 1
165 return self.instanceDict
166
167 def size_histogram (self): # in HandleSmRNAwindows
168 '''refactored on 7-9-2014 to debug size_histogram tool'''
169 size_dict={}
170 size_dict['F']= defaultdict (float)
171 size_dict['R']= defaultdict (float)
172 size_dict['both'] = defaultdict (float)
173 for item in self.instanceDict:
174 buffer_dict = self.instanceDict[item].size_histogram()
175 for polarity in ["F", "R"]:
176 for size in buffer_dict[polarity]:
177 size_dict[polarity][size] += buffer_dict[polarity][size]
178 for size in buffer_dict["both"]:
179 size_dict["both"][size] += buffer_dict["F"][size] - buffer_dict["R"][size]
180 return size_dict
181
182 def CountFeatures (self, GFF3="path/to/file"):
183 featureDict = defaultdict(int)
184 F = open (GFF3, "r")
185 for line in F:
186 if line[0] == "#": continue
187 fields = line[:-1].split()
188 chrom, feature, leftcoord, rightcoord, polarity = fields[0], fields[2], fields[3], fields[4], fields[6]
189 featureDict[feature] += self.instanceDict[chrom].readcount(upstream_coord=int(leftcoord), downstream_coord=int(rightcoord), polarity="both", method="destructive")
190 F.close()
191 return featureDict
192
193 class SmRNAwindow:
194
195 def __init__(self, gene, sequence="ATGC", windowoffset=1, biosample="Undetermined", norm=1.0):
196 self.biosample = biosample
197 self.sequence = sequence
198 self.gene = gene
199 self.windowoffset = windowoffset
200 self.size = len(sequence)
201 self.readDict = defaultdict(list) # with a {+/-offset:[size1, size2, ...], ...}
202 self.matchedreadsUp = 0
203 self.matchedreadsDown = 0
204 self.norm=norm
205
206 def addread (self, polarity, offset, size):
207 '''Attention: the 0-based to 1-based offset conversion was removed from this method;
208 it is now performed during read parsing, so offsets passed here must already be 1-based.'''
209 if polarity == "+":
210 self.readDict[offset].append(size)
211 self.matchedreadsUp += 1
212 else:
213 self.readDict[-(offset + size -1)].append(size)
214 self.matchedreadsDown += 1
215 return
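# Offset convention: forward reads are keyed by their 1-based 5' position (+offset), reverse reads
# by the negated 5' position on the reverse strand, i.e. -(offset + size - 1).
# Illustrative example: addread("-", 100, 21) appends 21 to self.readDict[-120].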
216
217 def barycenter (self, upstream_coord=None, downstream_coord=None):
218 '''refactored 24-12-2013 to save memory and introduce offset filtering; see the readcount method for further discussion.
219 In this version, the dictionary structure is replaced by a list of tuples to save memory.'''
220 upstream_coord = upstream_coord or self.windowoffset
221 downstream_coord = downstream_coord or self.windowoffset+self.size-1
222 window_size = downstream_coord - upstream_coord +1
223 def weightAverage (tupleList):
224 weightSum = 0
225 PonderWeightSum = 0
226 for coord, weight in tupleList:
227 PonderWeightSum += coord * weight
228 weightSum += weight
229 if weightSum > 0:
230 return PonderWeightSum / float(weightSum)
231 else:
232 return 0
233 forwardTupleList = [(k, len(self.readDict[k])) for k in self.readDict.keys() if (k > 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # forward reads within the offset window
234 reverseTupleList = [(-k, len(self.readDict[k])) for k in self.readDict.keys() if (k < 0 and abs(k) >= upstream_coord and abs(k) <= downstream_coord)] # reverse reads within the offset window
235 Fbarycenter = (weightAverage (forwardTupleList) - upstream_coord) / window_size
236 Rbarycenter = (weightAverage (reverseTupleList) - upstream_coord) / window_size
237 return Fbarycenter, Rbarycenter
238
239 def correlation_mapper (self, reference, window_size):
240 '''to map correlation with a sliding window 26-2-2013'''
241 from scipy import stats
242
243 if window_size > self.size:
244 return []
245 F=open(reference, "r")
246 reference_forward = []
247 reference_reverse = []
248 for line in F:
249 fields=line.split()
250 reference_forward.append(int(float(fields[1])))
251 reference_reverse.append(int(float(fields[2])))
252 F.close()
253 local_object_forward=[]
254 local_object_reverse=[]
255 ## Dict to list for the local object
256 for i in range(1, self.size+1):
257 local_object_forward.append(len(self.readDict[i]))
258 local_object_reverse.append(len(self.readDict[-i]))
259 ## start compiling results by slides
260 results=[]
261 for coordinate in range(self.size - window_size):
262 local_forward=local_object_forward[coordinate:coordinate + window_size]
263 local_reverse=local_object_reverse[coordinate:coordinate + window_size]
264 if sum(local_forward) == 0 or sum(local_reverse) == 0:
265 continue
266 try:
267 reference_to_local_cor_forward = stats.spearmanr(local_forward, reference_forward)
268 reference_to_local_cor_reverse = stats.spearmanr(local_reverse, reference_reverse)
269 if (reference_to_local_cor_forward[0] > 0.2 or reference_to_local_cor_reverse[0]>0.2):
270 results.append([coordinate+1, reference_to_local_cor_forward[0], reference_to_local_cor_reverse[0]])
271 except:
272 pass
273 return results
274
275 def readcount (self, size_inf=0, size_sup=1000, upstream_coord=None, downstream_coord=None, polarity="both", method="conservative"):
276 '''refactored 24-12-2013 to save memory and introduce offset filtering.
277 Note that the default coordinate parameters cannot be defined relative to the instance, as they are evaluated before instantiation;
278 the trick is to pass None and then test.
279 The polarity parameter can take "both", "forward" or "reverse" as value'''
280 upstream_coord = upstream_coord or self.windowoffset
281 downstream_coord = downstream_coord or self.windowoffset+self.size-1
282 if size_inf == 0 and size_sup == 1000 and upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "both": # shortcuts are only valid when no size filtering is requested
283 return self.matchedreadsUp + self.matchedreadsDown
284 if size_inf == 0 and size_sup == 1000 and upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "forward":
285 return self.matchedreadsUp
286 if size_inf == 0 and size_sup == 1000 and upstream_coord == 1 and downstream_coord == self.windowoffset+self.size-1 and polarity == "reverse":
287 return self.matchedreadsDown
288 n=0
289 if polarity == "both":
290 for offset in xrange(upstream_coord, downstream_coord+1):
291 if self.readDict.has_key(offset):
292 for read in self.readDict[offset]:
293 if (read>=size_inf and read<= size_sup):
294 n += 1
295 if method != "conservative":
296 del self.readDict[offset] ## Careful! the destructive method removes counted offsets and precludes re-use of self.readDict
297 if self.readDict.has_key(-offset):
298 for read in self.readDict[-offset]:
299 if (read>=size_inf and read<= size_sup):
300 n += 1
301 if method != "conservative":
302 del self.readDict[-offset]
303 return n
304 elif polarity == "forward":
305 for offset in xrange(upstream_coord, downstream_coord+1):
306 if self.readDict.has_key(offset):
307 for read in self.readDict[offset]:
308 if (read>=size_inf and read<= size_sup):
309 n += 1
310 return n
311 elif polarity == "reverse":
312 for offset in xrange(upstream_coord, downstream_coord+1):
313 if self.readDict.has_key(-offset):
314 for read in self.readDict[-offset]:
315 if (read>=size_inf and read<= size_sup):
316 n += 1
317 return n
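# Illustrative call: window.readcount(size_inf=19, size_sup=23, polarity="forward") counts forward
# reads of 19-23 nt over the whole window; with method != "conservative" (and polarity "both"),
# counted offsets are deleted from readDict and the instance cannot be re-counted afterwards.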
318
319 def readsizes (self):
320 '''return a dictionary of number of reads by size (the keys)'''
321 dicsize = {}
322 for offset in self.readDict:
323 for size in self.readDict[offset]:
324 dicsize[size] = dicsize.get(size, 0) + 1
325 for size in range (min(dicsize.keys()), max(dicsize.keys())+1):
326 dicsize[size] = dicsize.get(size, 0) # fill in missing sizes with zero counts
327 return dicsize
328
329
330 def size_histogram(self, minquery=None, maxquery=None): # in SmRNAwindow
331 '''refactored on 7-9-2014 to debug size_histogram tool'''
332 norm=self.norm
333 size_dict={}
334 size_dict['F']= defaultdict (float)
335 size_dict['R']= defaultdict (float)
336 size_dict['both']= defaultdict (float)
337 for offset in self.readDict:
338 for size in self.readDict[offset]:
339 if offset < 0:
340 size_dict['R'][size] = size_dict['R'][size] - 1*norm
341 else:
342 size_dict['F'][size] = size_dict['F'][size] + 1*norm
343 ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate !
344 if not (size_dict['F']) and (not size_dict['R']):
345 size_dict['F'][21] = 0
346 size_dict['R'][21] = 0
347 ##
348 allSizeKeys = list (set (size_dict['F'].keys() + size_dict['R'].keys() ) )
349 for size in allSizeKeys:
350 size_dict['both'][size] = size_dict['F'][size] - size_dict['R'][size]
351 if minquery:
352 for polarity in size_dict.keys():
353 for size in xrange(minquery, maxquery+1):
354 if not size in size_dict[polarity].keys():
355 size_dict[polarity][size]=0
356 return size_dict
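# The returned mapping looks like {'F': {21: 10.0, ...}, 'R': {21: -3.0, ...}, 'both': {21: 13.0, ...}}:
# forward counts are positive, reverse counts are negative (both scaled by norm), and 'both' = F - R.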
357
358 def statsizes (self, upstream_coord=None, downstream_coord=None):
359 ''' migration to memory saving by specifying possible subcoordinates
360 see the readcount method for further discussion'''
361 upstream_coord = upstream_coord or self.windowoffset
362 downstream_coord = downstream_coord or self.windowoffset+self.size-1
363 L = []
364 for offset in self.readDict:
365 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
366 for size in self.readDict[offset]:
367 L.append(size)
368 meansize = mean(L)
369 stdv = std(L)
370 mediansize = median(L)
371 return meansize, mediansize, stdv
372
373 def foldEnergy (self, upstream_coord=None, downstream_coord=None):
374 ''' migration to memory saving by specifying possible subcoordinates
375 see the readcount method for further discussion'''
376 upstream_coord = upstream_coord or self.windowoffset
377 downstream_coord = downstream_coord or self.windowoffset+self.size-1
378 Energy = RNAfold ([self.sequence[upstream_coord-1:downstream_coord] ])
379 return float(Energy[self.sequence[upstream_coord-1:downstream_coord]])
380
381 def Ufreq (self, size_scope, upstream_coord=None, downstream_coord=None):
382 ''' migration to memory saving by specifying possible subcoordinates
383 see the readcount method for further discussion. size_scope must be an iterable'''
384 upstream_coord = upstream_coord or self.windowoffset
385 downstream_coord = downstream_coord or self.windowoffset+self.size-1
386 freqDic = {"A":0,"T":0,"G":0,"C":0, "N":0}
387 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
388 for offset in self.readDict:
389 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
390 for size in self.readDict[offset]:
391 if size in size_scope:
392 startbase = self.sequence[abs(offset)-self.windowoffset]
393 if offset < 0:
394 startbase = convertDic[startbase]
395 freqDic[startbase] += 1
396 base_sum = float ( sum( freqDic.values()) )
397 if base_sum == 0:
398 return "."
399 else:
400 return freqDic["T"] / base_sum * 100
401
402 def Ufreq_stranded (self, size_scope, upstream_coord=None, downstream_coord=None):
403 ''' migration to memory saving by specifying possible subcoordinates
404 see the readcount method for further discussion. size_scope must be an iterable.
405 This method is similar to the Ufreq method but takes strandedness into account'''
406 upstream_coord = upstream_coord or self.windowoffset
407 downstream_coord = downstream_coord or self.windowoffset+self.size-1
408 freqDic = {"Afor":0,"Tfor":0,"Gfor":0,"Cfor":0, "Nfor":0,"Arev":0,"Trev":0,"Grev":0,"Crev":0, "Nrev":0}
409 convertDic = {"A":"T","T":"A","G":"C","C":"G","N":"N"}
410 for offset in self.readDict:
411 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
412 for size in self.readDict[offset]:
413 if size in size_scope:
414 startbase = self.sequence[abs(offset)-self.windowoffset]
415 if offset < 0:
416 startbase = convertDic[startbase]
417 freqDic[startbase+"rev"] += 1
418 else:
419 freqDic[startbase+"for"] += 1
420 forward_sum = float ( freqDic["Afor"]+freqDic["Tfor"]+freqDic["Gfor"]+freqDic["Cfor"]+freqDic["Nfor"])
421 reverse_sum = float ( freqDic["Arev"]+freqDic["Trev"]+freqDic["Grev"]+freqDic["Crev"]+freqDic["Nrev"])
422 if forward_sum == 0 and reverse_sum == 0:
423 return ". | ."
424 elif reverse_sum == 0:
425 return "%s | ." % (freqDic["Tfor"] / forward_sum * 100)
426 elif forward_sum == 0:
427 return ". | %s" % (freqDic["Trev"] / reverse_sum * 100)
428 else:
429 return "%s | %s" % (freqDic["Tfor"] / forward_sum * 100, freqDic["Trev"] / reverse_sum * 100)
430
431 def readplot (self):
432 norm=self.norm
433 readmap = {}
434 for offset in self.readDict.keys():
435 readmap[abs(offset)] = ( len(self.readDict.get(-abs(offset),[]))*norm , len(self.readDict.get(abs(offset),[]))*norm )
436 mylist = []
437 for offset in sorted(readmap):
438 if readmap[offset][1] != 0:
439 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, readmap[offset][1], "F") )
440 if readmap[offset][0] != 0:
441 mylist.append("%s\t%s\t%s\t%s" % (self.gene, offset, -readmap[offset][0], "R") )
442 ## patch to avoid missing graphs when parsed by R-lattice. 27-08-2014. Test and validate !
443 if not mylist:
444 mylist.append("%s\t%s\t%s\t%s" % (self.gene, 1, 0, "F") )
445 ###
446 return mylist
447
448 def readcoverage (self, upstream_coord=None, downstream_coord=None, windowName=None):
449 '''Used by the MirParser tool'''
450 upstream_coord = upstream_coord or 1
451 downstream_coord = downstream_coord or self.size
452 windowName = windowName or "%s_%s_%s" % (self.gene, upstream_coord, downstream_coord)
453 forORrev_coverage = dict ([(i,0) for i in xrange(1, downstream_coord-upstream_coord+1)])
454 totalforward = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="forward")
455 totalreverse = self.readcount(upstream_coord=upstream_coord, downstream_coord=downstream_coord, polarity="reverse")
456 if totalforward > totalreverse:
457 majorcoverage = "forward"
458 for offset in self.readDict.keys():
459 if (offset > 0) and ((offset-upstream_coord+1) in forORrev_coverage.keys() ):
460 for read in self.readDict[offset]:
461 for i in xrange(read):
462 try:
463 forORrev_coverage[offset-upstream_coord+1+i] += 1
464 except KeyError:
465 continue # a sense read may span over the downstream limit
466 else:
467 majorcoverage = "reverse"
468 for offset in self.readDict.keys():
469 if (offset < 0) and (-offset-upstream_coord+1 in forORrev_coverage.keys() ):
470 for read in self.readDict[offset]:
471 for i in xrange(read):
472 try:
473 forORrev_coverage[-offset-upstream_coord-i] += 1 ## positive coordinates in the instance, with + for forward coverage and - for reverse coverage
474 except KeyError:
475 continue # an antisense read may span over the upstream limit
476 output_list = []
477 maximum = max (forORrev_coverage.values()) or 1
478 for n in sorted (forORrev_coverage):
479 output_list.append("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (self.biosample, windowName, n, float(n)/(downstream_coord-upstream_coord+1), forORrev_coverage[n], float(forORrev_coverage[n])/maximum, majorcoverage))
480 return "\n".join(output_list)
481
482
483 def signature (self, minquery, maxquery, mintarget, maxtarget, scope, zscore="no", upstream_coord=None, downstream_coord=None):
484 ''' migration to memory saving by specifying possible subcoordinates
485 see the readcount method for further discussion
486 scope must be a python iterable; scope defines the *relative* offset range to be computed'''
487 upstream_coord = upstream_coord or self.windowoffset
488 downstream_coord = downstream_coord or self.windowoffset+self.size-1
489 query_range = range (minquery, maxquery+1)
490 target_range = range (mintarget, maxtarget+1)
491 Query_table = {}
492 Target_table = {}
493 frequency_table = dict ([(i, 0) for i in scope])
494 for offset in self.readDict:
495 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
496 for size in self.readDict[offset]:
497 if size in query_range:
498 Query_table[offset] = Query_table.get(offset, 0) + 1
499 if size in target_range:
500 Target_table[offset] = Target_table.get(offset, 0) + 1
501 for offset in Query_table:
502 for i in scope:
503 frequency_table[i] += min(Query_table[offset], Target_table.get(-offset -i +1, 0))
504 if minquery==mintarget and maxquery==maxtarget: ## added to incorporate the division by 2 in the method (26/11/2013), see signature_options.py and lattice_signature.py
505 frequency_table = dict([(i,frequency_table[i]/2) for i in frequency_table])
506 if zscore == "yes":
507 z_mean = mean(frequency_table.values() )
508 z_std = std(frequency_table.values() )
509 if z_std == 0:
510 frequency_table = dict([(i,0) for i in frequency_table] )
511 else:
512 frequency_table = dict([(i, (frequency_table[i]- z_mean)/z_std) for i in frequency_table] )
513 return frequency_table
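# For each relative overlap i in scope, the signature counts query/target read pairs whose 5' ends
# overlap by i nucleotides on opposite strands (a query at +offset pairs with a target stored at
# -(offset + i - 1)); a peak at i = 10 is commonly read as a ping-pong signature.
# Illustrative call: window.signature(23, 29, 23, 29, range(1, 27), zscore="yes")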
514
515 def hannon_signature (self, minquery, maxquery, mintarget, maxtarget, scope, upstream_coord=None, downstream_coord=None):
516 ''' migration to memory saving by specifying possible subcoordinates see the readcount method for further discussion
517 note that scope must be an iterable (a list or a tuple), which specifies the relative offsets that will be computed'''
518 upstream_coord = upstream_coord or self.windowoffset
519 downstream_coord = downstream_coord or self.windowoffset+self.size-1
520 query_range = range (minquery, maxquery+1)
521 target_range = range (mintarget, maxtarget+1)
522 Query_table = {}
523 Target_table = {}
524 Total_Query_Numb = 0
525 general_frequency_table = dict ([(i,0) for i in scope])
526 ## filtering the appropriate reads for the study
527 for offset in self.readDict:
528 if (abs(offset) < upstream_coord or abs(offset) > downstream_coord): continue
529 for size in self.readDict[offset]:
530 if size in query_range:
531 Query_table[offset] = Query_table.get(offset, 0) + 1
532 Total_Query_Numb += 1
533 if size in target_range:
534 Target_table[offset] = Target_table.get(offset, 0) + 1
535 for offset in Query_table:
536 frequency_table = dict ([(i,0) for i in scope])
537 number_of_targets = 0
538 for i in scope:
539 frequency_table[i] += Query_table[offset] * Target_table.get(-offset -i +1, 0)
540 number_of_targets += Target_table.get(-offset -i +1, 0)
541 for i in scope:
542 try:
543 general_frequency_table[i] += (1. / number_of_targets / Total_Query_Numb) * frequency_table[i]
544 except ZeroDivisionError :
545 continue
546 return general_frequency_table
547
548 def phasing (self, size_range, scope):
549 '''to calculate an autocorrelation-like signal - scope must be a python iterable'''
550 read_table = {}
551 total_read_number = 0
552 general_frequency_table = dict ([(i, 0) for i in scope])
553 ## read input filtering
554 for offset in self.readDict:
555 for size in self.readDict[offset]:
556 if size in size_range:
557 read_table[offset] = read_table.get(offset, 0) + 1
558 total_read_number += 1
559 ## per offset read phasing computing
560 for offset in read_table:
561 frequency_table = dict ([(i, 0) for i in scope]) # local frequency table
562 number_of_targets = 0
563 for i in scope:
564 if offset > 0:
565 frequency_table[i] += read_table[offset] * read_table.get(offset + i, 0)
566 number_of_targets += read_table.get(offset + i, 0)
567 else:
568 frequency_table[i] += read_table[offset] * read_table.get(offset - i, 0)
569 number_of_targets += read_table.get(offset - i, 0)
570 ## inclusion of local frequency table in the general frequency table (all offsets average)
571 for i in scope:
572 try:
573 general_frequency_table[i] += (1. / number_of_targets / total_read_number) * frequency_table[i]
574 except ZeroDivisionError :
575 continue
576 return general_frequency_table
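# For every read 5' end, the phasing profile accumulates the count-weighted number of same-strand
# read 5' ends found at each distance i in scope (measured in one direction along each strand),
# normalised per offset and by the total read number.
# Illustrative call: window.phasing(range(23, 30), range(1, 101))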
577
578
579
580 def z_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
581 '''Requires mean and std from numpy (imported at module level); scope must be a python iterable and defines the relative offsets to compute'''
582 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
583 z_table = {}
584 frequency_list = [frequency_table[i] for i in sorted (frequency_table)]
585 if std(frequency_list):
586 meanlist = mean(frequency_list)
587 stdlist = std(frequency_list)
588 z_list = [(i-meanlist)/stdlist for i in frequency_list]
589 return dict (zip (sorted(frequency_table), z_list) )
590 else:
591 return dict (zip (sorted(frequency_table), [0 for i in frequency_table]) )
592
593 def percent_signature (self, minquery, maxquery, mintarget, maxtarget, scope):
594 frequency_table = self.signature (minquery, maxquery, mintarget, maxtarget, scope)
595 total = float(sum ([self.readsizes().get(i,0) for i in set(range(minquery,maxquery+1)+range(mintarget,maxtarget+1))]) ) # inclusive size ranges, consistent with query_range/target_range elsewhere
596 if total == 0:
597 return dict( [(i,0) for i in scope])
598 return dict( [(i, frequency_table[i]/total*100) for i in scope])
599
600 def pairer (self, overlap, minquery, maxquery, mintarget, maxtarget):
601 queryhash = defaultdict(list)
602 targethash = defaultdict(list)
603 query_range = range (int(minquery), int(maxquery)+1)
604 target_range = range (int(mintarget), int(maxtarget)+1)
605 paired_sequences = []
606 for offset in self.readDict: # selection of data
607 for size in self.readDict[offset]:
608 if size in query_range:
609 queryhash[offset].append(size)
610 if size in target_range:
611 targethash[offset].append(size)
612 for offset in queryhash:
613 # the matched offset formula is identical for both polarities
614 matched_offset = -offset - overlap + 1
615 if targethash[matched_offset]:
616 paired = min ( len(queryhash[offset]), len(targethash[matched_offset]) )
617 if offset >= 0:
618 for i in range (paired):
619 paired_sequences.append("+%s" % RNAtranslate ( self.sequence[offset:offset+queryhash[offset][i]]) )
620 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-targethash[matched_offset][i]+1:-matched_offset+1]) ) )
621 if offset < 0:
622 for i in range (paired):
623 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-queryhash[offset][i]+1:-offset+1]) ) )
624 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+targethash[matched_offset][i]] ) )
625 return paired_sequences
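# pairer() returns the paired read sequences themselves (RNA-translated, prefixed "+" or "-" for
# polarity), pairing at most min(query count, target count) reads per matching offset;
# pairable() below is the uncapped variant that reports every read found at a pairable offset.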
626
627 def pairable (self, overlap, minquery, maxquery, mintarget, maxtarget):
628 queryhash = defaultdict(list)
629 targethash = defaultdict(list)
630 query_range = range (int(minquery), int(maxquery)+1)
631 target_range = range (int(mintarget), int(maxtarget)+1)
632 paired_sequences = []
633
634 for offset in self.readDict: # selection of data
635 for size in self.readDict[offset]:
636 if size in query_range:
637 queryhash[offset].append(size)
638 if size in target_range:
639 targethash[offset].append(size)
640
641 for offset in queryhash:
642 matched_offset = -offset - overlap + 1
643 if targethash[matched_offset]:
644 if offset >= 0:
645 for i in queryhash[offset]:
646 paired_sequences.append("+%s" % RNAtranslate (self.sequence[offset:offset+i]) )
647 for i in targethash[matched_offset]:
648 paired_sequences.append( "-%s" % RNAtranslate (antipara (self.sequence[-matched_offset-i+1:-matched_offset+1]) ) )
649 if offset < 0:
650 for i in queryhash[offset]:
651 paired_sequences.append("-%s" % RNAtranslate (antipara (self.sequence[-offset-i+1:-offset+1]) ) )
652 for i in targethash[matched_offset]:
653 paired_sequences.append("+%s" % RNAtranslate (self.sequence[matched_offset:matched_offset+i] ) )
654 return paired_sequences
655
656 def newpairable_bowtie (self, overlap, minquery, maxquery, mintarget, maxtarget):
657 '''revision of pairable on 3-12-2012, with focus on the offset shift problem (bowtie uses 1-based coordinates whereas python strings are 0-based)'''
658 queryhash = defaultdict(list)
659 targethash = defaultdict(list)
660 query_range = range (int(minquery), int(maxquery)+1)
661 target_range = range (int(mintarget), int(maxtarget)+1)
662 bowtie_output = []
663
664 for offset in self.readDict: # selection of data
665 for size in self.readDict[offset]:
666 if size in query_range:
667 queryhash[offset].append(size)
668 if size in target_range:
669 targethash[offset].append(size)
670 counter = 0
671 for offset in queryhash:
672 matched_offset = -offset - overlap + 1
673 if targethash[matched_offset]:
674 if offset >= 0:
675 for i in queryhash[offset]:
676 counter += 1
677 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "+", self.gene, offset-1, self.sequence[offset-1:offset-1+i]) ) # mind the 1-based (bowtie) vs 0-based (python) offset
678 if offset < 0:
679 for i in queryhash[offset]:
680 counter += 1
681 bowtie_output.append("%s\t%s\t%s\t%s\t%s" % (counter, "-", self.gene, -offset-i, self.sequence[-offset-i:-offset])) # mind the 1-based (bowtie) vs 0-based (python) offset
682 return bowtie_output
683
684
685 def __main__(bowtie_index_path, bowtie_output_path):
686 sequenceDic = get_fasta (bowtie_index_path)
687 objDic = {}
688 F = open (bowtie_output_path, "r") # F is the bowtie output taken as input
689 for line in F:
690 fields = line.split()
691 polarity = fields[1]
692 gene = fields[2]
693 offset = int(fields[3])
694 size = len (fields[4])
695 try:
696 objDic[gene].addread (polarity, offset, size)
697 except KeyError:
698 objDic[gene] = SmRNAwindow(gene, sequenceDic[gene])
699 objDic[gene].addread (polarity, offset, size)
700 F.close()
701 for gene in objDic:
702 print gene, objDic[gene].pairer(19,19,23,19,23)
703
704 if __name__ == "__main__" : __main__(sys.argv[1], sys.argv[2])
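# Command-line usage (hypothetical paths): python smRtools.py /path/to/bowtie/index/prefix bowtie_output.tabular
# This test entry point prints, for each reference item, the 19-23 nt read pairs sharing a 19 nt overlap.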