comparison antaRNA.py @ 0:fcf4719d3831 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/antarna/ commit 71414cf7f040d610afc3f02be31446efc3a82a40-dirty
author rnateam
date Wed, 13 May 2015 11:02:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:fcf4719d3831
1 import numpy
2 import sys
3 import random
4 import subprocess
5 import re
6 import decimal
7 import math
8 import os
9 import shutil
10 import time
11 import types
12 import argparse
13 #from argparse import RawTextHelpFormatter
14
15 #############################
16 # FUNCTIONS
17
18 def print2file(f, i, m):
19 """
20 print content i to file f in mode m
21 """
22 line = str(i)
23 if m == "a":
24 call = "echo \"" + line + "\" >> " + f
25 elif m == "w":
26 call = "echo \"" + line + "\" > " + f
27 os.system(call)
28
29 # checking and correcting the alphabet of the constraint sequence
30 def checkSequenceConstraint(SC):
31 """
32 Checks the Sequence constraint for illegal nucleotide characters
33 """
34 out = ""
35 for c in SC:
36 c = c.upper()
37 if c not in "ACGURYSWKMBDHVN":
38 # and c!= "R" and c != "Y" and c != "S" and c != "W" and c != "K" and c != "M" and c != "B" and c != "D" and c != "H" and c != "V":
39 if c == "T":
40 c = "U"
41 else:
42 print "\tIllegal Character in the constraint sequence!"
43 print "\tPlease use the IUPAC nomenclature for defining nucleotides in the constraint sequence!"
44 print "\tA Adenine"
45 print "\tC Cytosine"
46 print "\tG Guanine"
47 print "\tT/U Thymine/Uracil"
48 print "\tR A or G"
49 print "\tY C or T/U"
50 print "\tS G or C"
51 print "\tW A or T/U"
52 print "\tK G or T/U"
53 print "\tM A or C"
54 print "\tB C or G or T/U"
55 print "\tD A or G or T/U"
56 print "\tH A or C or T/U"
57 print "\tV A or C or G"
58 print "\tN any base"
59 exit(0)
60 out += c
61 return (1, out)
62
63
64 def transform(seq):
65 """
66 Transforms "U" to "T" for the processing is done on DNA alphabet
67 """
68 S = ""
69 for s in seq:
70 if s == "T":
71 S += "U"
72 else:
73 S += s
74 return S
75
76
77 def checkSimilarLength(s, SC):
78 """
79 Compares sequence and structure constraint length
80 """
81 if len(s) == len(SC):
82 return 1
83 else:
84 return 0
85
86
87 def isStructure(s):
88 """
89 Checks if the structure constraint only contains "(", ")", and "." and legal fuzzy structure constraint characters.
90 """
91 returnvalue = 1
92 for a in range(0,len(s)):
93 if s[a] not in ".()[]{}<>":
94 if s[a] not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
95 returnvalue = 0
96 return returnvalue
97
98
99 def isBalanced(s):
100 """
101 Check if the structure s is of a balanced nature
102 """
103
104 balance = 1
105 for bracket in ["()", "[]", "{}", "<>"]:
106 counter = 0
107 for a in xrange(len(s)):
108 if s[a] in bracket[0]:
109 counter += 1
110 elif s[a] in bracket[1]:
111 counter -= 1
112 if counter != 0:
113 balance = 0
114 return balance
115
116
117
118 def fulfillsHairpinRule(s):
119 """
120 CHECKING FOR THE 3 nt LOOP INTERSPACE
121 for all kind of basepairs, even wihtin the pdeudoknots
122 """
123
124 fulfillsRules = 1
125 for bracket in ["()", "[]", "{}", "<>"]:
126 last_opening_char = 0
127 check = 0
128 for a in xrange(len(s)):
129 if s[a] == bracket[0]:
130 last_opening_char = a
131 check = 1
132 elif s[a] == bracket[1] and check == 1:
133 check = 0
134 if a - last_opening_char < 4:
135 return 0
136 return 1
137
138
139 def isValidStructure(s):
140 """
141 Checks, if the structure s is a valid structure
142 """
143
144 Structure = isStructure(s)
145 Balanced = isBalanced(s)
146 HairpinRule = fulfillsHairpinRule(s)
147
148 if Structure == 1 and Balanced == 1 and HairpinRule == 1:
149 return 1
150 else:
151 print Structure, Balanced, HairpinRule
152 return 0
153
154 def loadIUPACcompatibilities(IUPAC, useGU):
155 """
156 Generating a hash containing all compatibilities of all IUPAC RNA NUCLEOTIDES
157 """
158 compatible = {}
159 for nuc1 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
160 sn1 = list(IUPAC[nuc1])
161 for nuc2 in IUPAC: # ITERATING OVER THE DIFFERENT GROUPS OF IUPAC CODE
162 sn2 = list(IUPAC[nuc2])
163 compatib = 0
164 for c1 in sn1: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
165 for c2 in sn2: # ITERATING OVER THE SINGLE NUCLEOTIDES WITHIN THE RESPECTIVE IUPAC CODE:
166 # CHECKING THEIR COMPATIBILITY
167 if useGU == True:
168 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C") or (c1 == "G" and c2 == "U") or (c1 == "U" and c2 == "G"):
169 compatib = 1
170 else:
171 if (c1 == "A" and c2 == "U") or (c1 == "U" and c2 == "A") or (c1 == "C" and c2 == "G") or (c1 == "G" and c2 == "C"):
172 compatib = 1
173 compatible[nuc1 + "_" + nuc2] = compatib # SAVING THE RESPECTIVE GROUP COMPATIBILITY, REVERSE SAVING IS NOT REQUIRED, SINCE ITERATING OVER ALL AGAINST ALL
174 return compatible
175
176 def isCompatibleToSet(c1, c2, IUPAC_compatibles):
177 """
178 Checks compatibility of c1 wihtin c2
179 """
180 compatible = True
181 for setmember in c2:
182 #print setmember
183 if isCompatible(c1, setmember, IUPAC_compatibles) == False:
184 return False
185 return compatible
186
187
188 def isCompatible(c1, c2, IUPAC_compatibles):
189 """
190 Checks compatibility between character c1 and c2
191 """
192 if IUPAC_compatibles[c1 + "_" + c2] == 1:
193 return True
194 else:
195 return False
196
197
198 def isStructureCompatible(lp1, lp2 ,bp):
199 """
200 Checks, if the region within lp1 and lp2 is structurally balanced
201 """
202 x = lp1 + 1
203 while (x < lp2):
204 if (bp[x] <= lp1 or bp[x] > lp2):
205 return False
206 if x == bp[x]:
207 x += 1
208 else:
209 x = bp[x] + 1
210 return x == lp2
211
212
213 def checkConstaintCompatibility(basepairstack, sequenceconstraint, IUPAC_compatibles):
214 """
215 Checks if the constraints are compatible to each other
216 """
217 returnstring = ""
218 compatible = 1
219 for id1 in basepairstack: # key = (constraint , (pos, constraint)))
220 constr1 = basepairstack[id1][0]
221 id2 = basepairstack[id1][1][0]
222 constr2 = basepairstack[id1][1][1]
223
224 if id1 != id2 and not isCompatible(constr1, constr2, IUPAC_compatibles):
225
226 compatible = 0
227 returnstring += "nucleotide constraint " + str(constr1) + " at position " + str(id1) + " is not compatible with nucleotide constraint " + str(constr2) + " at position " + str(id2) + "\n"
228 #if not isCompatible(basepairstack[basepair][0], basepairstack[basepair][1][1]):
229
230 #compatible = 0
231 #else:
232 #returnstring += "nucleotide constraint " + str(basepairstack[basepair][0]) + " at position " + str(basepair) + " is compatible with nucleotide constraint " + str(basepairstack[basepair][1][1]) + " at position " + str(basepairstack[basepair][1][0]) + "\n"
233 return (compatible, returnstring)
234
235
236 def getLP(BPSTACK):
237 """
238 Retreives valid lonley base pairs from a base pair stack
239 """
240 #20 ('N', (>BLOCK<, 'N'))
241
242 # geting single base pairs
243 stack = {}
244 LP = {}
245 if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
246 for i in BPSTACK.keys():
247 #if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
248 stack[i] = int(BPSTACK[i][1][0])
249 #print i , BPSTACK[i][1][0]
250 else:
251 for i in BPSTACK.keys():
252 #if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
253 stack[i] = BPSTACK[i]
254
255 # removing redundant base pair indices
256 for i in stack.keys():
257 if i >= stack[i]:
258 del stack[i]
259
260 # actual checking for single lonley base pairs
261 for i in stack.keys():
262 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and not (i+1 in stack and stack[i+1] == stack[i] - 1):
263 LP[i] = stack[i]
264
265 ##actual removal of 2er lonley base pairs
266 for i in stack.keys():
267 if not (i-1 in stack and stack[i-1] == stack[i] + 1) and (i+1 in stack and stack[i+1] == stack[i] - 1) and not (i+2 in stack and stack[i+2] == stack[i] - 2):
268 LP[i] = stack[i]
269 LP[i+1] = stack[i+1]
270
271
272 #if type(BPSTACK[random.choice(BPSTACK.keys())]) == types.TupleType:
273 #for i in BPSTACK.keys():
274
275 ##if str(BPSTACK[i][1][0]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
276 #stack[i] = int(BPSTACK[i][1][0])
277 ##print i , BPSTACK[i][1][0]
278 #else:
279 #for i in BPSTACK.keys():
280 ##if str(BPSTACK[i]) not in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
281 #stack[i] = BPSTACK[i]
282
283 #for i in stack.keys():
284 #if i >= stack[i]:
285 #del stack[i]
286
287
288
289 return LP
290
291
292 def getBPStack(s, seq):
293 """
294 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
295 """
296 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
297 bpstack = {}
298 for i in xrange(len(s)):
299
300 # REGULAR SECONDARY STRUCTURE DETECTION
301 if s[i] in "(){}[]<>":
302
303 no = 0
304 ### opening
305 if s[i] in "([{<":
306 if s[i] == "(":
307 tmp_stack["()"].append((i, seq[i]))
308 elif s[i] == "[":
309 tmp_stack["[]"].append((i, seq[i]))
310 elif s[i] == "{":
311 tmp_stack["{}"].append((i, seq[i]))
312 elif s[i] == "<":
313 tmp_stack["<>"].append((i, seq[i]))
314
315 #closing
316 elif s[i] in ")]}>":
317 if s[i] == ")":
318 no, constr = tmp_stack["()"].pop()
319 elif s[i] == "]":
320 no, constr = tmp_stack["[]"].pop()
321 elif s[i] == "}":
322 no, constr = tmp_stack["{}"].pop()
323 elif s[i] == ">":
324 no, constr = tmp_stack["<>"].pop()
325 bpstack[no] = (constr, (i, seq[i]))
326 bpstack[i] = (seq[i] ,(no, constr))
327
328 elif s[i] == ".":
329 bpstack[i] = (seq[i], (i, seq[i]))
330 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
331 bpstack[i] = (seq[i], (i, seq[i]))
332
333 return (bpstack, getLP(bpstack))
334
335
336 def getbpStack(s):
337 """
338 Returns a dictionary of the corresponding basepairs of the structure s and the sequence constraint seq.
339 """
340 tmp_stack = {"()":[], "{}":[], "[]":[], "<>":[]}
341 bpstack = {}
342
343 for i in xrange(len(s)):
344 if s[i] in "(){}[]<>":
345
346 no = 0
347 ### opening
348 if s[i] in "([{<":
349 if s[i] == "(":
350 tmp_stack["()"].append(i)
351 elif s[i] == "[":
352 tmp_stack["[]"].append(i)
353 elif s[i] == "{":
354 tmp_stack["{}"].append(i)
355 elif s[i] == "<":
356 tmp_stack["<>"].append(i)
357
358 #closing
359 elif s[i] in ")]}>":
360 if s[i] == ")":
361 no = tmp_stack["()"].pop()
362 elif s[i] == "]":
363 no = tmp_stack["[]"].pop()
364 elif s[i] == "}":
365 no = tmp_stack["{}"].pop()
366 elif s[i] == ">":
367 no = tmp_stack["<>"].pop()
368 bpstack[no] = i # save basepair in the format {opening base id (opening seq constr,(closing base id, closing seq constr))}
369 bpstack[i] = no # save basepair in the format {closing base id (closing seq constr,(opening base id, opening seq constr))}
370
371 elif s[i] == ".": # no structural constaint given: produce entry, which references itself as a base pair partner....
372 bpstack[i] = i
373
374 elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
375 bpstack[i] = i
376
377 #elif s[i] in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
378 ## per position, assigned to a certain block, the target nucleotide, with whcih it should interact is marked with the specific
379 ## block character
380 #bpstack[i] = s[i]
381
382 return (bpstack, getLP(bpstack))
383
384 def maprange( a, b, s):
385 """
386 Mapping function
387 """
388 (a1, a2), (b1, b2) = a, b
389 return b1 + ((s - a1) * (b2 - b1) / (a2 - a1))
390
391
392 def applyGCcontributionPathAdjustment(pathlength, tmpGC, nt):
393 """
394 GC path length contribution calculation.
395 """
396 GCadjustment = 1.5
397 minimum = 0.5
398 upper = GCadjustment
399 lower = minimum
400
401 if nt == "A" or nt == "U":
402 pathlength = pathlength * maprange( (0, 1) , (lower, upper), tmpGC)
403
404 if nt == "G" or nt == "C":
405 #pathlength = pathlength * (float(1-tmpGC))
406 pathlength = pathlength * maprange( (1, 0) , (lower, upper), tmpGC)
407 return pathlength
408
409 def getConstraint(TE, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
410 """
411 Dependend on the situation in the constraint an the respective path section, setting wether a specific constraint can be given or not (for that path section)
412 """
413 # TE :: transition element / path section under dispute
414 # id1 :: id of the position of the caharacter to which the transition is leading to
415 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
416 # val :: BPstack information of the specific position
417 # constr1 :: constraining character of pos id1
418 # constr2 :: constraining character of pos id2
419
420 id1 = int(TE.split(".")[0])
421 val = BPstack[id1] # check out the value of the destination character in the basepair/constraint stack
422 constr1 = val[0] # getting the constraint character of position id1
423 id2 = int(val[1][0]) # getting position id2
424 constr2 = val[1][1] # getting the sequence constraint for position id2
425 targetNucleotide = TE.split(".")[1][-1:] # where the edge is leading to
426
427 c1 = set(IUPAC[constr1]) # getting all explicit symbols of c1
428 c2 = set(IUPAC_reverseComplements[constr2]) # getting the reverse complement explicit symbols of c2
429
430 if targetNucleotide in c1:
431 if id1 == id2:
432 return 1
433 else:
434 if targetNucleotide in c2:
435 return 1
436 else:
437 return 0
438 else:
439 return 0
440
441 """
442 def getConstraint(TE, BPstack):
443 # TE :: transition element / path section
444 # id1 :: id of the position of the caharacter to which the transition is leading to
445 # id2 :: id of the position of the character, which is listed in the BPinformation, it can be id1 as well, when no bp is present
446 # val :: BPstack information of the specific position
447 # constr1 :: constraining character of pos id1
448 # constr2 :: constraining character of pos id2
449
450 ### BPstack [id1] = (constr1, (id2, constr2))
451
452 id1 = TE.split(".")[0]
453 #print id1
454 #id1 = TE.find(TE.strip("_")) # strip the path section and getting the position of the section
455 #if len(TE.strip("_")) == 2: # check if the path section is from an internal and not an initial transition
456 #id1 += 1 # increase position id1 by 1, since the last character of the section is the destination character
457 val = BPstack[int(id1)] # check out the value of the destination character in the basepair/constraint stack
458 constr1 = val[0] # getting the constraint character of position id1
459 id2 = val[1][0] # getting position id2
460 constr2 = val[1][1] # getting the sequence constraint for position id2
461 #print TE, id1, constr1, id2, constr2,
462
463 #TE.split(".")[1][-1:]
464 if id1 == id2: # both ids were the same with either character, sequential or no sequential constraint -> no basepair constraint
465 if constr1 == TE.split(".")[1][-1:] and constr2 == TE.split(".")[1][-1:]: # case if the single base constraints on position id1 == id2 are the same as the destination character on id1
466 #print 1
467 return 1
468 elif constr1 == constr2 == "N": # case if the single base constraints on position id1 == id2 has no constraint
469 #print 1
470 return 1
471 else: # single base sequence constraints differ
472 #print 0
473 return 0
474
475 elif id1 != id2: # showing differentq ids, indicating a bp, (basepair structural constraint)
476 if constr1 == "N" and constr2 == "N": # no sequence constraint
477 #print 1
478 return 1
479 if constr1 == "N" and constr2 != "N": # c1 has no constraint, c2 has character constraint (sequence constraint of closing bases)
480 if TE.split(".")[1][-1:] == complementBase(constr2): # the current path section destination base is equal to the complement base of the mentioned sequence constraint in constr2
481 #print 1
482 return 1
483 else: # case if the current path section destination base is not equeal to the mentioned complement sequence constraint in constr2
484 #print 0
485 return 0
486 if constr1 != "N" and constr2 == "N": # c1 has character constraint, c2 has no character constraint (sequence constraint in the opening base)
487 if TE.split(".")[1][-1:] == constr1: # the current path section destination base is as constrained with constr1
488 #print 1
489 return 1
490 else: # the current path section destination base is not as constrained in constr1
491 #print 0
492 return 0
493 if constr1 != "N" and constr2 != "N": # both positions have sequential constraint
494 if TE.split(".")[1][-1:] == constr1:
495 #print 1
496 return 1
497 else:
498 #print 0
499 return 0
500 """
501
502 def applyTerrainModification(terrain, s, tmpGC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements):
503 #nucleotides = {'A': 0, 'C': 1,'G': 2,'T': 3}
504
505 dels = []
506 for terrainelement in sorted(terrain):
507 pheromone, pathlength = terrain[terrainelement]
508 pheromone = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
509 pathlength = getConstraint(terrainelement, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
510 pathlength = applyGCcontributionPathAdjustment(pathlength, tmpGC,terrainelement.split(".")[1][-1:])
511 if pheromone * pathlength == 0: dels.append(terrainelement)
512 terrain[terrainelement] = (pheromone, pathlength,[])
513 further_dels = {}
514 for terrainelement in sorted(dels):
515 pos, nucs = terrainelement.split(".")
516 if int(pos) < len(s)-1:
517 to_nt = nucs[-1:]
518 successor_pos = int(pos) + 1
519 for i in ["A", "C", "G", "U"]:
520 del_element = str(successor_pos) + "." + to_nt + i
521 further_dels[del_element] = 1
522 further_dels[terrainelement] = 1
523 # deleting the inbound and outbound edges, which are forbidden
524 for terrainelement in further_dels:
525 del terrain[terrainelement]
526 # allocate the appropriate children of edges
527 for terrainelement in terrain:
528 pheromone, pathlength, children = terrain[terrainelement]
529 pos, nucs = terrainelement.split(".")
530 if int(pos) < len(s):
531 to_nt = nucs[-1:]
532 successor_pos = int(pos) + 1
533 for i in ["A", "C", "G", "U"]:
534 if str(successor_pos) + "." + to_nt + i in terrain:
535 children.append(str(successor_pos) + "." + to_nt + i)
536 terrain[terrainelement] = (pheromone, pathlength,children)
537 starts = []
538 for i in ["A", "C", "G", "U"]:
539 if str(0) + "." + i in terrain:
540 starts.append(str(0) + "." + i)
541 terrain["00.XY"] = (1, 1, starts)
542 return (terrain, BPstack)
543
544
545 def initTerrain(s): # THE CLASSIC
546 """
547 Initialization of the terrain with graph like terrain... vertices are modeled implicitly
548 """
549 nt = ["A","C","G","U"]
550 nt2 = ["AA","AC","AG","AU","CA","CC","CG","CU","GA","GC","GG","GU","UA","UC","UG","UU"] # Allowed dinucleotides
551 e = {}
552 pathlength = 1
553 pheromone = 1
554 for p in xrange(len(s)):
555 if p == 0:
556 for i in nt:
557 e["%s.%s"%(p,i)] = (pheromone, pathlength)
558 elif p > 0:
559 for n in nt2:
560 e["%s.%s"%(p,n)] = (pheromone, pathlength)
561 return e
562
563
564
565 def complementBase(c):
566 """
567 Returns the complement RNA character of c (without GU base pairs)
568 """
569 retChar = ""
570 if c == "A" :
571 retChar = "U"
572 elif c == "U":
573 retChar = "A"
574 elif c == "C":
575 retChar = "G"
576 elif c == "G":
577 retChar = "C"
578 return retChar
579
580 def printTerrain(terrain):
581 #print sorted(terrain.keys())
582 tmp_i = "0"
583 tmp_c = 0
584 terrain = terrain[0]
585
586 for a, i in enumerate(sorted(terrain.keys())):
587 #print a
588 if i.split(".")[0] != tmp_i:
589 print "\nElements:", tmp_c,"\n#########################\n", i, terrain[i]
590
591 tmp_c = 1
592 tmp_i = i.split(".")[0]
593 else:
594 print i, terrain[i]
595 tmp_c += 1
596
597 print "\nElements:", tmp_c
598 print "#########################"
599 print len(terrain)
600
601 def pickStep(tmp_steps, summe):
602 """
603 Selects a step within the terrain
604 """
605 if len(tmp_steps) == 1:
606 return tmp_steps[0][1] # returning the nucleotide of the only present step
607 else:
608 rand = random.random() # draw random number
609 mainval = 0
610 for choice in xrange(len(tmp_steps)):
611 val, label = tmp_steps[choice]
612 mainval += val/float(summe)
613 if mainval > rand: # as soon, as the mainval gets larger than the random value the assignment is done
614 return label
615
616 def getPath(s, tmp_terrain, tmp_BPstack, alpha, beta, IUPAC, IUPAC_reverseComplements):
617 """
618 Performs a walk through the terrain and assembles a sequence, while respecting the structure constraint and IUPAC base complementarity
619 of the base pairs GU, GC and AT
620 """
621 nt = ["A","C","G","U"]
622 prev_edge = "00.XY"
623 sequence = ""
624 while len(sequence) < len(s):
625 coming_from = sequence[-1:]
626 summe = 0
627 steps = []
628 i = len(sequence)
629 allowed_nt = "ACGU"
630 # base pair closing case check, with subsequent delivery of a reduced allowed nt set
631
632 if i > tmp_BPstack[i][1][0]:
633 jump = tmp_BPstack[i][1][0]
634 nuc_at_jump = sequence[jump]
635 allowed_nt = IUPAC_reverseComplements[nuc_at_jump]
636
637 #allowed_nt = complementBase(nuc_at_jump)
638
639 # Checking for every possible nt if it is suitable for the selection procedure
640 for edge in tmp_terrain[prev_edge][-1]:
641
642 if edge[-1:] in allowed_nt:
643 pheromone, PL , children = tmp_terrain[edge]
644 #if PL > 0:
645 value = ((float(pheromone * alpha)) + ((1/float(PL)) * beta))
646 summe += value
647 steps.append((value, edge))
648 prev_edge = pickStep(steps, summe)
649 sequence += prev_edge[-1:]
650
651 return sequence
652
653
654 ###
655 # STRUCTURE PREDICTORS
656 ###
657 def getPKStructure(sequence, temperature, mode = "A"):
658 """
659 Initialization pKiss mfe pseudoknot prediction
660 """
661 p2p = "pKiss"
662 #p2p = "/usr/local/pkiss/2014-03-17/bin/pKiss_mfe"
663 strategy = "--strategy "
664 t = "--temperature " + str(temperature)
665
666 if mode == "A": strategy += "A"
667 elif mode == "B": strategy += "B"
668 elif mode == "C": strategy += "C"
669 elif mode == "D": strategy += "D"
670 elif mode == "P": strategy += "P"
671
672 p = subprocess.Popen( ([p2p, "--mode mfe", strategy, t]),
673 #shell = True,
674 stdin = subprocess.PIPE,
675 stdout = subprocess.PIPE,
676 stderr = subprocess.PIPE,
677 close_fds = True)
678 #print p.stderr.readline()
679
680 p.stdin.write(sequence+'\n')
681 pks = p.communicate()
682 structure = "".join(pks[0].split("\n")[2].split(" ")[-1:])
683 return structure
684
685 def init_RNAfold(version, temperature, paramFile = ""):
686 """
687 Initialization RNAfold listener
688 """
689 p2p = ""
690 t = "-T " + str(temperature)
691 P = ""
692 if paramFile != "":
693 P = "-P " + paramFile
694 if version == 185:
695 p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAfold"
696 p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]),
697 shell = True,
698 stdin = subprocess.PIPE,
699 stdout = subprocess.PIPE,
700 stderr = subprocess.PIPE,
701 close_fds = True)
702 return p
703 elif version == 213:
704 p2p = "RNAfold"
705 p = subprocess.Popen( ([p2p, '--noPS', '-d 2', t, P]),
706 #shell = True,
707 stdin = subprocess.PIPE,
708 stdout = subprocess.PIPE,
709 stderr = subprocess.PIPE,
710 close_fds = True)
711 return p
712 else:
713 exit(0)
714
715 def consult_RNAfold(seq, p):
716 """
717 Consults RNAfold listener
718 """
719 p.stdin.write(seq+'\n')
720 out = ""
721 for i in xrange(2):
722 out += p.stdout.readline()
723 return out
724
725
726 def getRNAfoldStructure(struct2, process1):
727 """
728 Retrieves folded structure of a RNAfold call
729 """
730
731 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
732 #RNAdist_pattern = re.compile('.*\s([\d]+)')
733 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(struct2, process1))
734 current_structure = ""
735 #if RNAfold_match:
736 return RNAfold_match.group(1)
737
738
739 def init_RNAdistance():
740 """
741 Initialization of RNAdistance listener
742 """
743 #p2p = "/home/rk/Software/ViennaRNA/ViennaRNA-1.8.5/Progs/RNAdistance"
744 p2p = "RNAdistance"
745 p = subprocess.Popen( ([p2p]),
746 #shell = True,
747 stdin = subprocess.PIPE,
748 stdout = subprocess.PIPE,
749 stderr = subprocess.PIPE,
750 close_fds = True)
751 return p
752
753
754 def consult_RNAdistance(s1, s2, p):
755 """
756 Consulting the RNAdistance listener
757 """
758 p.stdin.write(s1+'\n')
759 p.stdin.write(s2+'\n')
760 out = ""
761 out_tmp = p.stdout.readline().strip()
762 if out_tmp != "":
763 out += out_tmp
764 return out
765
766 def getInducingSequencePositions(Cseq, degreeOfSequenceInducement):
767 """
768 Delimiting the degree of structure inducement by the supplied sequence constraint.
769 0 : no sequence induced structure constraint
770 1 : "ACGT" induce structure (explicit nucleotide structure inducement level)
771 2 : "MWKSYR" and "ACGT" (explicit and double instances)
772 3 : "BDHV" , "MWKSYR" and "ACGT" (explicit, double, and triple instances)
773 """
774 setOfNucleotides = "" # resembling the "0"-case
775 if degreeOfSequenceInducement == 1:
776 setOfNucleotides = "ACGU"
777 elif degreeOfSequenceInducement == 2:
778 setOfNucleotides = "ACGUMWKSYR"
779 elif degreeOfSequenceInducement == 3:
780 setOfNucleotides = "ACGUMWKSYRBDHV"
781 #elif degreeOfSequenceInducement == 4:
782 #setOfNucleotides = "ACGTMWKSYRBDHVN"
783
784 tmpSeq = ""
785 listset = setOfNucleotides
786 for pos in Cseq:
787 if pos not in listset:
788 tmpSeq += "N"
789 else:
790 tmpSeq += pos
791
792 return setOfNucleotides, tmpSeq
793
794
795 def getBPDifferenceDistance(stack1, stack2):
796 """
797 Based on the not identical amount of base pairs within both structure stacks
798 """
799 d = 0
800 for i in stack1.keys():
801 # check base pairs in stack 1
802 if i < stack1[i] and stack1[i] != stack2[i]:
803 d += 1
804 # check base pairs in stack 2
805 for i in stack2.keys():
806 if i < stack2[i] and stack1[i] != stack2[i]:
807 d += 1
808 return d
809
810
811 def getStructuralDistance(target_structure, Cseq, path, RNAfold, verbose, LP, BP, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy):
812 """
813 Calculator for Structural Distance
814 """
815 # fold the current solution's sequence to obtain the structure
816
817 current_structure = ""
818
819 if pseudoknots:
820 current_structure = getPKStructure(path,strategy)
821 else:
822 RNAfold_match = RNAfold_pattern.match(consult_RNAfold(path, RNAfold))
823 current_structure = RNAfold_match.group(1)
824
825 # generate the current structure's base-pair stack
826 bp = getbpStack(current_structure)[0]
827 # add case-dependend structural constraints in case of lonley basepairs formation
828 tmp_target_structure_bp = getbpStack(target_structure)[0]
829
830 for lp in LP:
831 if bp[lp] == LP[lp]: # if the base pair is within the current solution structure, re-add the basepair into the constraint structure.
832 #tmp_target_structure[lp] = "("
833 #tmp_target_structure[LP[lp]] = ")"
834 tmp_target_structure_bp[lp] = LP[lp]
835 tmp_target_structure_bp[LP[lp]] = lp
836
837 # REMOVE BLOCK CONSTRAINT AND SUBSTITUTE IT WITH SINGLE STRAND INFORMATION repsective with brackets, if allowed base pairs occure
838 # check for all allowed implicit constraint block declarators
839 for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
840 occurances = []
841 for m in re.finditer(c, target_structure): # search for a declarator in the requested structure
842 occurances.append(m.start()) # save the corresponding index
843
844 # transform declarator into single stranded request
845 for i in occurances:
846 #tmp_target_structure[i] = "."
847 tmp_target_structure_bp[i] = i
848 # infer a base pair within the block declarated positions, if the current structure provides it.
849 for i in occurances:
850 for j in occurances:
851 if i < j:
852 if bp[i] == j:
853 #tmp_target_structure[i] = "("
854 #tmp_target_structure[bp[i]] = ")"
855
856 tmp_target_structure_bp[i] = bp[i]
857 tmp_target_structure_bp[bp[i]] = i
858
859 # CHECK FOR SEQUENCE CONSTRAINT WHICH INDUCES STRUCTURE CONSTRAINT IN THE MOMENTARY SITUATION
860 #print "Checking Cseq influence and it's induced basepairs..."
861 IUPACinducers, tmp_Cseq = getInducingSequencePositions(Cseq, degreeOfSequenceInducement)
862 if len(Cseq.strip("N")) > 0:
863 #print "Processing Cseq influence"
864 # Iterate over all positions within the Base Pair stack
865 for i in BP: # Check for each base index i
866
867 if i < bp[i]: # if the current index is samller that the affiliated in the basepair stack of the current solution
868
869 bp_j = bp[i] # Actual j index of the current solution
870 BP_j = BP[i][1][0] # j index of the requested structure
871 if (i != bp_j and i == BP_j and BP[i][0] in IUPACinducers ): # if i pairs with some other base in the current structure, and i is requested single stranded and the Sequence constraint is allowed to induce...
872 if (BP[bp_j][1][0] == bp_j and BP[bp_j][0] in IUPACinducers):# If position j is requested singlestranded and position j nucleotide can induce base pairs
873 #if isCompatible(bp[i][0], bp[i][1][1], IUPAC_compatibles): # If both nucleotides, i and j are actually compatible
874 #tmp_target_structure[i] = "("
875 #tmp_target_structure[bp_j] = ")"
876
877 tmp_target_structure_bp[i] = bp[i]
878 tmp_target_structure_bp[bp_j] = i
879
880 #tts = "".join(tmp_target_structure)
881 dsreg = getBPDifferenceDistance(tmp_target_structure_bp, bp)
882
883 # CHECK FOR ALL DETERMINED LONELY BASE PAIRS (i<j), if they are formed
884 failLP = 0
885 for lp in LP:
886
887 if bp[lp] != LP[lp]:
888
889 isComp = isCompatible(path[lp],path[LP[lp]], IUPAC_compatibles)
890 isStru = isStructureCompatible(lp, LP[lp] ,bp)
891 if not ( isStru and isStru ): # check if the bases at the specific positions are compatible and check if the
892 # basepair can be formed according to pseudoknot free restriction. If one fails, a penalty distance is raised for that base pair
893 failLP += 1
894
895 #print dsreg, failLP, float(len(tmp_target_structure_bp))
896 dsLP = float(failLP)
897
898 return (dsreg + dsLP) /float(len(tmp_target_structure_bp))
899
900
901 def getGC(sequence):
902 """
903 Calculate GC content of a sequence
904 """
905 GC = 0
906 for nt in sequence:
907 if nt == "G" or nt == "C":
908 GC = GC + 1
909 GC = GC/float(len(sequence))
910 return GC
911
912
913 def getGCDistance(tGC, gc2, L):
914 """
915 Calculate the pseudo GC content distance
916 """
917 nt_coeff = L * tGC
918 pc_nt = (1/float(L))*100
919 #
920 d = gc2 - tGC
921 d = d * 100
922
923 f = math.floor(nt_coeff)
924 c = math.ceil(nt_coeff)
925
926 if d < 0: #
927 #print "case x",(abs(nt_coeff - f)), pc_nt, (abs(nt_coeff - f)) * pc_nt,
928 d = d + (abs(nt_coeff - f)) * pc_nt
929 elif d > 0: # case y
930 #print "case y", abs(nt_coeff - c), pc_nt, abs(nt_coeff - c) * pc_nt,
931 d = d - abs(nt_coeff - c) * pc_nt
932 elif d == 0:
933 pass
934
935 d = round(d, 7)
936 #d = max(0, abs(d)- ( max ( abs( math.ceil(nt_coeff)-(nt_coeff)) , abs(math.floor(nt_coeff)-(nt_coeff)) )/L)*100 )
937 return abs(d)
938
939
940 def getSequenceEditDistance(SC, path):
941 """
942 Calculate sequence edit distance of a solution to the constraint
943 """#
944 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
945 edit = 0
946 for i in xrange(len(SC)):
947 if path[i] not in IUPAC[SC[i]]:
948 edit += 1
949 return edit/float(len(path))
950
951
952
953 def getTransitions(p):
954 """
955 Retreive transitions of a specific path/sequence
956 """
957 transitions = []
958 for pos in xrange(len(p)):
959 if pos == 0:
960 transitions.append(str(pos) + "." + p[pos])
961
962 else:
963 insert = p[pos-1] + p[pos]
964 transitions.append(str(pos) + "." + insert)
965
966 return transitions
967
968
969 def evaporate(t, er):
970 """
971 Evaporate the terrain's pheromone trails
972 """
973 terr, BP = t
974 c = 1
975 for key in terr:
976 p,l,c = terr[key]
977 p *= (1-er)
978 terr[key] = (p, l, c)
979
980
981 def updateValue(distance, correction_term, omega):
982 """
983 Retrieves a distance dependend pheromone value
984 """
985 if correction_term == 0:
986 return 0
987 else:
988 if distance == 0:
989 return omega * correction_term
990 else:
991 return (1/float(distance)) * correction_term
992
993
994 def trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose):
995 """
996 Pheromone Update function accorinding to the quality of the solution
997 """
998 terr, BP = t
999 bpstack, LP = getbpStack(c_s)
1000
1001 struct_correction_term , GC_correction_term, seq_correction_term = correction_terms
1002 omega = 2.23
1003
1004 bs = updateValue(ds, struct_correction_term, omega)
1005 bGC = updateValue(dgc, GC_correction_term, omega)
1006 if dseq != "n.a.":
1007 bSeq = updateValue(dseq, seq_correction_term, omega)
1008 d = bs + bGC + bSeq
1009 else:
1010 d = bs + bGC
1011 transitions = getTransitions(p)
1012
1013 for trans in xrange(len(transitions)): # for each transition in the path
1014 id1 = int(transitions[trans].split(".")[0])
1015 tar_id2 = int(BPstack[id1][1][0]) # getting requested position id2
1016 curr_id2 = int(bpstack[id1]) # getting the current situation
1017 multiplicator = 0
1018 if tar_id2 == curr_id2 and id1 != tar_id2 and id1 != curr_id2: # case of a base pair, having both brackets on the correct position
1019 multiplicator = 1
1020 elif tar_id2 == curr_id2 and id1 == tar_id2 and id1 == curr_id2: # case of a single stranded base in both structures
1021 multiplicator = 1
1022 p, l, c = terr[transitions[trans]] # getting the pheromone and the length value of the single path transition
1023 p += d * multiplicator
1024 terr[transitions[trans]] = (p, l, c) # updating the values wihtin the terrain's
1025 t = (terr, BP)
1026
1027
1028 def updateTerrain(p, c_s, s, ds, dgc, dseq, dn, t, er, correction_terms, BPstack, verbose, ant_count):
1029 """
1030 General updating function
1031 """
1032 evaporate(t,er)
1033 trailBlaze(p, c_s, s, ds, dgc, dseq, dn, t, correction_terms, BPstack, verbose)
1034
1035
1036 def getUsedTime(start_time):
1037 """
1038 Return the used time between -start time- and now.
1039 """
1040 end_time = time.time()
1041 return end_time - start_time
1042
1043
1044 def good2Go(SC, L, CC, STR):
1045 """
1046 Check, if all input is correct and runnable
1047 """
1048 if (SC == 1 and L == 1 and CC == 1 and STR == 1):
1049 return True
1050 else:
1051 print SC,L,CC,STR
1052 return False
1053
1054
1055 def getPathFromSelection( aps, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy):
1056 """
1057 Returns the winning path from a selection of pathes...
1058 """
1059 terr, BPs = terrain
1060 win_path = 0
1061 for i in xrange(aps):
1062 # Generate Sequence
1063 path = getPath(s, terr, BPs, alpha, beta, IUPAC, IUPAC_reverseComplements)
1064 # Measure sequence features and transform them into singular distances
1065 distance_structural = float(getStructuralDistance(s, SC , path, RNAfold, verbose, LP, BPs, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
1066 distance_GC = float(getGCDistance(GC,getGC(path), len(path)))
1067 distance_seq = float(getSequenceEditDistance(SC, path))
1068 # Calculate Distance Score
1069 D = distance_structural + distance_GC + distance_seq
1070
1071 # SELECT THE BEST-OUT-OF-k-SOLUTIONS according to distance score
1072 if i == 0:
1073 win_path = (path, D, distance_structural, distance_GC, distance_seq)
1074 else:
1075 if D < win_path[1]:
1076 win_path = (path, D, distance_structural, distance_GC, distance_seq)
1077 return win_path
1078
1079
1080 def substr(x, string, subst):
1081 """
1082 Classical substring function
1083 """
1084 s1 = string[:x-1]
1085
1086 s2 = string[x-1:x]
1087 s3 = string[x:]
1088 #s2 = s[x+len(string)-x-1:]
1089
1090 return s1 + subst + s3
1091
1092
1093 def inConvergenceCorridor(d_struct, d_gc, BS_d_struct, BS_d_gc):
1094 """
1095 Check if a solutions qualities are within the convergence corridor
1096 """
1097 struct_var = ((BS_d_struct/float(4)) + 3 ) * 4
1098 gc_var = (BS_d_gc + 1/float(100) * 5) + BS_d_gc + 1
1099
1100 if d_struct <= struct_var and d_gc <= gc_var:
1101 return True
1102 else:
1103 return False
1104
1105 def getGCSamplingValue(GC, tGCmax, tGCvar):
1106 """
1107 Returns a suitable GC value, dependend on the user input: Either returning the single GC value,
1108 which the user entered, or a smpled GC value
1109 from a designated distribution in it's interavals
1110 """
1111 returnval = 0
1112 if tGCmax == -1.0 and tGCvar == -1.0: # regular plain tGC value as requested
1113 return GC
1114 elif tGCmax != -1.0 and tGCvar == -1.0: # uniform distribution tGC value sampling
1115 if GC < tGCmax:
1116 tmp_GC = tGCmax
1117 tGCmax = GC
1118 GC = tmp_GC
1119 while returnval <= 0:
1120 returnval = float(numpy.random.uniform(low=GC, high=tGCmax, size=1))
1121 return returnval
1122 elif tGCmax == -1.0 and tGCvar != -1.0: # normal distribution tGC value sampling
1123 while returnval <= 0:
1124 returnval = float(numpy.random.normal(GC, tGCvar, 1))
1125 return returnval
1126
1127
1128 def reachableGC(C_struct):
1129 """
1130 Checks if a demanded GC target content is reachable in dependence with the given sequence constraint.
1131 """
1132 AU = 0
1133 for i in C_struct:
1134 if i == "A" or i == "U":
1135 AU += 1
1136 maxGC = 1 - (AU / float(len(C_struct))) # 1 - min_GC
1137 return maxGC
1138
1139
1140 def runColony(s, SC, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy):
1141 """
1142 Execution function of a single ant colony finding one solution sequence
1143 """
1144 retString = ""
1145 retString2 = []
1146 BPstack, LP = getBPStack(s, SC)
1147
1148 rGC = reachableGC(SC)
1149 GC_message = ""
1150 if GC > rGC:
1151 print >> sys.stderr, "WARNING: Chosen target GC %s content is not reachable due to sequence constraint! Sequence Constraint GC-content is: %s" % (GC, rGC)
1152 GC = rGC
1153
1154 # Initial Constraint Checks prior to execution
1155 STR = isValidStructure(s)
1156 START_SC , SC = checkSequenceConstraint(str(SC))
1157 START_LENGTH = checkSimilarLength(str(s), str(SC))
1158 START_constraint_compatibility , CompReport = checkConstaintCompatibility(BPstack, SC, IUPAC_compatibles)
1159
1160 g2g = good2Go(START_SC, START_LENGTH, START_constraint_compatibility, STR)
1161 if (g2g == 1):
1162 start_time = time.time()
1163 max_time = 600 # seconds
1164
1165
1166
1167
1168 ####
1169 # INITIALIZATION OF THE RNA TOOLs
1170 #
1171 RNAfold = init_RNAfold(213, temperature, paramFile)
1172 #RNAdistance = init_RNAdistance()
1173 RNAfold_pattern = re.compile('.+\n([.()]+)\s.+')
1174 #RNAdist_pattern = re.compile('.*\s([\d]+)')
1175 #
1176 ####
1177
1178 terrain = initTerrain(s)
1179 #print len(terrain),
1180 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
1181 #print len(terrain[0])
1182 #printTerrain(terrain)
1183 #exit(0)
1184 global_ant_count = 0
1185 global_best_ants = 0
1186 criterion = False
1187 met = True
1188 ant_no = 1
1189 prev_res = 0
1190 seq = ""
1191
1192 counter = 0
1193
1194 dstruct_log = []
1195 dGC_log = []
1196
1197
1198 distance_structural = 1000
1199 distance_GC = 1000
1200 distance_seq = 1000
1201
1202 convergence = convergence_count
1203 convergence_counter = 0
1204
1205 resets = 0
1206
1207 path = ""
1208 curr_structure = ""
1209
1210 Dscore = 100000
1211 distance_structural = 10000
1212 distance_GC = 10000
1213 distance_seq = 10000
1214 best_solution = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1215 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1216
1217 best_solution_since = 0
1218
1219 ants_per_selection = 10
1220 if len(LP) > 0 :
1221 for lp in LP:
1222 s = substr(lp + 1, s, ".")
1223 s = substr(LP[lp] + 1, s, ".")
1224
1225 init = 1
1226 while criterion != met and getUsedTime(start_time) < max_time:
1227 iteration_start = time.time()
1228 global_ant_count += 1
1229 global_best_ants += 1
1230
1231 path_info = getPathFromSelection(ants_per_selection, s, terrain, alpha, beta, RNAfold, RNAfold_pattern, GC, SC, LP, verbose, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, IUPAC, pseudoknots, strategy)
1232
1233 distance_structural_prev = distance_structural
1234 distance_GC_prev = distance_GC
1235 distance_seq_prev = distance_seq
1236
1237 path, Dscore , distance_structural, distance_GC, distance_seq = path_info
1238 curr_structure = ""
1239 if pseudoknots:
1240 curr_structure = getPKStructure(path, strategy)
1241 else:
1242 curr_structure = getRNAfoldStructure(path, RNAfold)
1243
1244 curr_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1245 # BEST SOLUTION PICKING
1246 if improve == "h": # hierarchical check
1247 # for the global best solution
1248 if distance_structural < best_solution[3] or (distance_structural == best_solution[3] and distance_GC < best_solution[4]):
1249 best_solution = curr_solution
1250 ant_no = 1
1251 # for the local (reset) best solution
1252 if distance_structural < best_solution_local[3] or (distance_structural == best_solution_local[3] and distance_GC < best_solution_local[4]):
1253 best_solution_local = curr_solution
1254
1255 elif improve == "s": #score based check
1256 # store best global solution
1257 if Dscore < best_solution[2]:
1258 best_solution = curr_solution
1259 ant_no = 1
1260 # store best local solution for this reset
1261 if Dscore < best_solution_local[2]:
1262 best_solution_local = curr_solution
1263
1264 # OLD ' BEST SOLUTION ' PICKING
1265 # if Dscore < best_solution[2]:
1266 # best_solution = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1267 #
1268 # if Dscore < best_solution_local[2]:
1269 # best_solution_local = (path,curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1270
1271
1272 distance_DN = 0
1273
1274 if verbose:
1275 print "SCORE " + str(Dscore) + " Resets " + str(resets) + " #Ant " + str(global_ant_count) + " out of " + str(ants_per_selection) + " cc " + str(convergence_counter)
1276
1277 print s, " <- target struct"
1278 print best_solution[0] , " <- BS since ", str(best_solution_since), "Size of Terrrain:", len(terrain[0])
1279 print best_solution[1] , " <- BS Dscore " + str(best_solution[2]) + " ds " + str(best_solution[3]) + " dGC " + str(best_solution[4]) + " dseq " + str(best_solution[5])+ " LP " + str(len(LP)) + " <- best solution stats"
1280 print curr_structure, " <- CS"
1281 print path,
1282 print " <- CS", "Dscore", str(Dscore), "ds", distance_structural, "dGC", distance_GC, "GC", getGC(path)*100, "Dseq", distance_seq
1283
1284 #### UPDATING THE TERRAIN ACCORDING TO THE QUALITY OF THE CURRENT BESTO-OUT-OF-k SOLUTION
1285 updateTerrain(path, curr_structure, s, distance_structural,distance_GC, distance_seq, distance_DN, terrain, evaporation_rate, correction_terms, BPstack, verbose, global_ant_count)
1286 ####
1287 if verbose: print "Used time for one iteration", time.time() - iteration_start
1288
1289
1290 # CONVERGENCE AND TERMINATION CRITERION MANAGEMENT
1291 #print distance_structural, distance_GC, best_solution_local[3], best_solution_local[4]
1292 if inConvergenceCorridor(curr_solution[3], curr_solution[4], best_solution_local[3], best_solution_local[4]):
1293 convergence_counter += 1
1294 if distance_structural_prev == distance_structural and distance_GC_prev == distance_GC:
1295 convergence_counter += 1
1296
1297 if best_solution[3] == objective_to_target_distance:
1298 if best_solution[4] == 0.0:
1299 break
1300 ant_no = ant_no + 1
1301 convergence_counter -= 1
1302 else:
1303 ant_no = 1
1304
1305
1306 if ant_no == termination_convergence or resets >= reset_limit or global_ant_count >= 100000 or best_solution_since == 5:
1307 break
1308
1309 # RESET
1310 if ant_no < termination_convergence and convergence_counter >= convergence:
1311
1312 terrain = initTerrain(s)
1313 terrain = applyTerrainModification(terrain, s, GC, SC, BPstack, IUPAC, IUPAC_compatibles, IUPAC_reverseComplements)
1314 criterion = False
1315 met = True
1316 ant_no = 1
1317 prev_res = 0
1318 pre_path = "_" * len(s)
1319 path = ""
1320 curr_structure = ""
1321 counter = 0
1322 Dscore = 100000
1323 distance_structural = 1000
1324 distance_GC = 1000
1325 distance_seq = 1000
1326 best_solution_local = (path, curr_structure, Dscore, distance_structural, distance_GC, distance_seq)
1327 convergence = convergence_count
1328 convergence_counter = 0
1329
1330 if resets == 0:
1331 sentinel_solution = best_solution
1332 best_solution_since += 1
1333 else:
1334 if best_solution[2] < sentinel_solution[2]:
1335 sentinel_solution = best_solution
1336 best_solution_since = 0
1337 else:
1338 best_solution_since += 1
1339
1340 resets += 1
1341
1342 duration = getUsedTime(start_time)
1343
1344 retString += "|Ants:" + str(global_ant_count)
1345 retString += "|Resets:" + str(resets) + "/" + str(reset_limit)
1346 retString += "|AntsTC:" + str(termination_convergence)
1347 retString += "|CC:" + str(convergence_count)
1348 retString += "|IP:" + str(improve)
1349 retString += "|BSS:" + str(best_solution_since)
1350 #if GC_message != "":
1351 # retString += GC_message + "\n"
1352
1353 sequence = best_solution[0]
1354 struct = best_solution[1]
1355
1356 retString += "|LP:" + str(len(LP))
1357 retString += "|ds:" + str(getStructuralDistance(s,SC, sequence, RNAfold, verbose, LP, BPstack, RNAfold_pattern, IUPAC_compatibles, degreeOfSequenceInducement, pseudoknots, strategy))
1358 retString += "|dGC:" + str(best_solution[4])
1359 retString += "|GC:" + str(getGC(sequence)*100)
1360 retString += "|dseq:" + str(getSequenceEditDistance(SC, sequence))
1361 retString += "|L:" + str(len(sequence))
1362 retString += "|Time:" + str(duration)
1363
1364 retString2.append(struct)
1365 retString2.append(sequence)
1366
1367 # CLOSING THE PIPES TO THE PROGRAMS
1368 RNAfold.communicate()
1369 #RNAdistance.communicate()
1370
1371 else: # Structural premisses are not met, htherefore the program will halt with a failure message
1372 retString += "\nSome mistake detected\n"
1373 retString += "SequenceConstraintCheck: " + str(START_SC) + "\nSequenceConstraint: " + str(SC) + "\nLengthCheck: " + str(START_LENGTH) + "\nConstraintCompatibility: " + str(START_constraint_compatibility)+ "\n" + CompReport + "\n"
1374
1375 return (retString, retString2)
1376
1377 def findSequence(structure, Cseq, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU, return_mod = False):
1378 """
1379 MAIN antaRNA - ant assembled RNA
1380 """
1381
1382 if seed != "none":
1383 random.seed(seed)
1384
1385 if Cseq == "":
1386 sequenceconstraint = "N" * len(structure)
1387 else:
1388 sequenceconstraint = str(Cseq)
1389
1390 alpha = float(alpha)
1391 beta = float(beta)
1392 tGC = float(tGC)
1393 evaporation_rate = float(evaporation_rate)
1394 struct_correction_term = float(struct_correction_term)
1395 GC_correction_term = float(GC_correction_term)
1396 seq_correction_term = float(seq_correction_term)
1397 colonies = int(colonies)
1398 file_id = str(file_id)
1399 tmp_verbose = verbose
1400 tmp_output_verbose = output_verbose
1401 verbose = tmp_output_verbose # Due to later change, this is a twistaround and a switching of purpose
1402 output_verbose = tmp_verbose # Due to later change, this is a twistaround and a switching of purpose
1403 correction_terms = struct_correction_term, GC_correction_term, seq_correction_term
1404 temperature = float(temperature)
1405 print_to_STDOUT = (file_id == "STDOUT")
1406
1407 useGU = useGU
1408
1409 if return_mod == False:
1410 if print_to_STDOUT == False:
1411 outfolder = '/'.join(file_id.strip().split("/")[:-1])
1412 curr_dir = os.getcwd()
1413 if not os.path.exists(outfolder):
1414 os.makedirs(outfolder)
1415 os.chdir(outfolder)
1416
1417
1418 sequenceconstraint = transform(sequenceconstraint)
1419 ###############################################
1420
1421 # Allowed deviation from the structural target:
1422 objective_to_target_distance = 0.0
1423
1424 # Loading the IUPAC copatibilities of nuleotides and their abstract representing symbols
1425 IUPAC = {"A":"A", "C":"C", "G":"G", "U":"U", "R":"AG", "Y":"CU", "S":"GC", "W":"AU","K":"GU", "M":"AC", "B":"CGU", "D":"AGU", "H":"ACU", "V":"ACG", "N":"ACGU"}
1426 IUPAC_compatibles = loadIUPACcompatibilities(IUPAC, useGU)
1427
1428 IUPAC_reverseComplements = {}
1429 if useGU == False: ## Without the GU basepair
1430 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"C", "U":"A", "R":"UC", "Y":"AG", "S":"GC", "W":"UA","K":"CA", "M":"UG", "B":"AGC", "D":"ACU", "H":"UGA", "V":"UGC", "N":"ACGU"}
1431 else: ## allowing the GU basepair
1432 IUPAC_reverseComplements = {"A":"U", "C":"G", "G":"UC", "U":"AG", "R":"UC", "Y":"AG", "S":"UGC", "W":"UAG","K":"UCAG", "M":"UG", "B":"AGCU", "D":"AGCU", "H":"UGA", "V":"UGC", "N":"ACGU"}
1433
1434 result = []
1435 for col in xrange(colonies):
1436 # Checking the kind of taget GC value should be used
1437 GC = getGCSamplingValue(tGC, tGCmax, tGCvar)
1438
1439 # Actual execution of a ant colony procesdure
1440 output_v, output_w = runColony(structure, sequenceconstraint, objective_to_target_distance, GC, alpha, beta, evaporation_rate, correction_terms, verbose, IUPAC, IUPAC_compatibles, degreeOfSequenceInducement, IUPAC_reverseComplements, termination_convergence, convergence_count, reset_limit, improve, temperature, paramFile, pseudoknots, strategy)
1441
1442 # Post-Processing the output of a ant colony procedure
1443 line = ">" + name + str(col)
1444 if output_verbose:
1445 line += "|Cstr:" + structure + "|Cseq:" + sequenceconstraint + "|Alpha:" + str(alpha) + "|Beta:" + str(beta) + "|tGC:" + str(GC) + "|ER:" + str(evaporation_rate) + "|Struct_CT:" + str(struct_correction_term) + "|GC_CT:" + str(GC_correction_term) + "|Seq_CT:" + str(seq_correction_term) + output_v + "\n" + "\n".join(output_w)
1446 else:
1447 line += "\n" + output_w[1]
1448 if return_mod == False:
1449 if print_to_STDOUT:
1450 print line
1451 else:
1452 if col == 0:
1453 print2file(file_id, line, 'w')
1454 else:
1455 print2file(file_id, line, 'a')
1456 else:
1457 result.append(line)
1458
1459 if return_mod == True:
1460 return result
1461 if print_to_STDOUT == False:
1462 os.chdir(curr_dir)
1463
1464 def execute(args):
1465 """
1466 CHECK AND PARSE THE COMMAND LINE STUFF
1467 """
1468
1469 # Checking the arguments, parsed from the shell
1470 ###############################################
1471 name = args.name
1472 structure = args.Cstr
1473
1474 if args.Cseq == "":
1475 sequenceconstraint = "N" * len(structure)
1476 else:
1477 sequenceconstraint = args.Cseq
1478
1479 seed = args.seed
1480
1481
1482 alpha = args.alpha
1483 beta = args.beta
1484 tGC = args.tGC
1485 if tGC < 0 or tGC > 1:
1486 print "Error: Chosen tGC not in range [0,1]"
1487 exit(1)
1488 evaporation_rate = args.ER
1489 struct_correction_term = args.Cstrweight
1490 GC_correction_term = args.Cgcweight
1491 seq_correction_term = args.Cseqweight
1492 colonies = args.noOfColonies
1493 degreeOfSequenceInducement = args.level
1494 file_id = args.output_file
1495 verbose = args.verbose
1496 output_verbose = args.output_verbose
1497
1498 tGCmax = args.tGCmax
1499 tGCvar = args.tGCvar
1500
1501 termination_convergence = args.antsTerConv
1502 convergence_count = args.ConvergenceCount
1503 temperature = args.temperature
1504 reset_limit = args.Resets
1505
1506 improve = args.improve_procedure
1507
1508 ### RNAfold parameterfile
1509 paramFile = args.paramFile
1510
1511 # Using the pkiss program under user changeable parameters
1512 pseudoknots = args.pseudoknots
1513
1514 # Loading the optimized parameters for pseudoknots and ignore user input
1515 if args.pseudoknot_parameters:
1516 alpha = 1.0
1517 beta = 0.1
1518 evaporation_rate = 0.2
1519 struct_correction_term = 0.1
1520 GC_correction_term = 1.0
1521 seq_correction_term = 0.5
1522 termination_convergence = 50
1523 convergence_count = 100
1524
1525
1526 strategy = args.strategy
1527 useGU = args.useGUBasePair
1528
1529 checkForViennaTools()
1530 if pseudoknots:
1531 checkForpKiss()
1532 findSequence(structure, sequenceconstraint, tGC, colonies, name, alpha, beta, evaporation_rate, struct_correction_term, GC_correction_term, seq_correction_term, degreeOfSequenceInducement, file_id, verbose, output_verbose, tGCmax, tGCvar, termination_convergence, convergence_count, reset_limit, improve, seed, temperature, paramFile, pseudoknots, strategy, useGU)
1533
1534
1535 def exe():
1536 """
1537 MAIN EXECUTABLE WHICH PARSES THE INPUT LINE
1538 """
1539
1540 argument_parser = argparse.ArgumentParser(
1541 description = """
1542
1543 #########################################################################
1544 # antaRNA - ant assembled RNA #
1545 # -> Ant Colony Optimized RNA Sequence Design #
1546 # ------------------------------------------------------------ #
1547 # Robert Kleinkauf (c) 2015 #
1548 # Bioinformatics, Albert-Ludwigs University Freiburg, Germany #
1549 #########################################################################
1550
1551 - For antaRNA only the VIENNNA RNA Package must be installed on your linux system.
1552 antaRNA will only check, if the executables of RNAfold and RNAdistance of the ViennaRNA package can be found. If those programs are
1553 not installed correctly, no output will be generated, an also no warning will be prompted.
1554 So the binary path of the Vienna Tools must be set up correctly in your system's PATH variable in order to run antaRNA correctly!
1555
1556 - antaRNA was only tested under Linux.
1557
1558 - For questions and remarks please feel free to contact us at http://www.bioinf.uni-freiburg.de/
1559
1560 """,
1561
1562 epilog = """
1563 Example calls:
1564 python antaRNA.py --Cstr "...(((...)))..." --tGC 0.5 -n 2
1565 python antaRNA.py --Cstr ".........AAA(((...)))AAA........." --tGC 0.5 -n 10 --output_file /path/to/antaRNA_TESTRUN -ov
1566 python antaRNA.py --Cstr "BBBBB....AAA(((...)))AAA....BBBBB" --Cseq "NNNNANNNNNCNNNNNNNNNNNGNNNNNNUNNN" --tGC 0.5 -n 10
1567
1568 #########################################################################
1569 # --- Hail to the Queen!!! All power to the swarm!!! --- #
1570 #########################################################################
1571 """,
1572 #formatter_class=RawTextHelpFormatter
1573 )
1574
1575 # mandatorys
1576 argument_parser.add_argument("-Cstr", "--Cstr", help="Structure constraint using RNA dotbracket notation with fuzzy block constraint. \n(TYPE: %(type)s)\n\n", type=str, required=True)
1577 argument_parser.add_argument("-tGC", "--tGC", help="Objective target GC content in [0,1].\n(TYPE: %(type)s)\n\n", type=float, required=True)
1578 argument_parser.add_argument("-n", "--noOfColonies", help="Number of sequences which shall be produced. \n(TYPE: %(type)s)\n\n\n\n", type=int, default=1)
1579 argument_parser.add_argument("-GU", "--useGUBasePair", help="Allowing GU base pairs. \n\n", action="store_true")
1580
1581 argument_parser.add_argument("-s", "--seed", help = "Provides a seed value for the used pseudo random number generator.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="none")
1582 argument_parser.add_argument("-ip", "--improve_procedure", help = "Select the improving method. h=hierarchical, s=score_based.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="s")
1583 argument_parser.add_argument("-r", "--Resets", help = "Amount of maximal terrain resets, until the best solution is retuned as solution.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=5)
1584 argument_parser.add_argument("-CC", "--ConvergenceCount", help = "Delimits the convergence count criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=130)
1585 argument_parser.add_argument("-aTC", "--antsTerConv", help = "Delimits the amount of internal ants for termination convergence criterion for a reset.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=int, default=50)
1586
1587 argument_parser.add_argument("-p", "--pseudoknots", help = "Switch to pseudoknot based prediction using pKiss. Check the pseudoknot parameter usage!!!\n\n", action="store_true")
1588 argument_parser.add_argument("-pkPar", "--pseudoknot_parameters", help = "Enable optimized parameters for the usage of pseudo knots (Further parameter input ignored).\n\n", action="store_true")
1589 argument_parser.add_argument("--strategy", help = "Defining the pKiss folding strategy.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="A")
1590
1591 # Mutual Exclusiv target GC distribution variables
1592 #tGCgroup = argument_parser.add_mutually_exclusive_group()
1593 argument_parser.add_argument("-tGCmax", "--tGCmax", help = "Provides a maximum tGC value [0,1] for the case of uniform distribution sampling. The regular tGC value serves as minimum value.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
1594 argument_parser.add_argument("-tGCvar", "--tGCvar", help = "Provides a tGC variance (sigma square) for the case of normal distribution sampling. The regular tGC value serves as expectation value (mu).\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=-1.0)
1595
1596 argument_parser.add_argument("-t", "--temperature", help = "Provides a temperature for the folding algorithms.\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=37.0)
1597 argument_parser.add_argument("-P", "--paramFile", help = "Changes the energy parameterfile of RNAfold. If using this explicitly, please provide a suitable energy file delivered by RNAfold. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="")
1598 argument_parser.add_argument("-of","--output_file", help="Provide a path and an output file, e.g. \"/path/to/the/target_file\". \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="STDOUT")
1599 argument_parser.add_argument("-Cseq", "--Cseq", help="Sequence constraint using RNA nucleotide alphabet {A,C,G,U} and wild-card \"N\". \n(TYPE: %(type)s)\n\n", type=str, default = "")
1600 argument_parser.add_argument("-l", "--level", help="Sets the level of allowed influence of sequence constraint on the structure constraint [0:no influence; 3:extensive influence].\n(TYPE: %(type)s)\n\n", type=int, default = 1)
1601 argument_parser.add_argument("--name", help="Defines a name which is used in the sequence output. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=str, default="antaRNA_")
1602 argument_parser.add_argument("-a", "--alpha", help="Sets alpha, probability weight for terrain pheromone influence. [0,1] \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
1603 argument_parser.add_argument("-b", "--beta", help="Sets beta, probability weight for terrain path influence. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=1.0)
1604 argument_parser.add_argument("-er", "--ER", help="Pheromone evaporation rate. \n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.2)
1605 argument_parser.add_argument("-Cstrw", "--Cstrweight", help="Structure constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=0.5)
1606 argument_parser.add_argument("-Cgcw", "--Cgcweight", help="GC content constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n", type=float, default=5.0)
1607 argument_parser.add_argument("-Cseqw", "--Cseqweight", help="Sequence constraint quality weighting factor. [0,1]\n(DEFAULT: %(default)s, TYPE: %(type)s)\n\n\n", type=float, default=1.0)
1608 argument_parser.add_argument("-ov", "--output_verbose", help="Displayes intermediate output.\n\n", action="store_true")
1609 argument_parser.add_argument("-v", "--verbose", help="Prints additional features and stats to the headers of the produced sequences. Also adds the structure of the sequence.\n\n", action="store_true")
1610
1611 args = argument_parser.parse_args()
1612
1613 execute(args)
1614
1615 def checkForViennaTools():
1616 """
1617 Checking for the presence of the Vienna tools in the system by which'ing for RNAfold and RNAdistance
1618 """
1619 RNAfold_output = subprocess.Popen(["which", "RNAfold"], stdout=subprocess.PIPE).communicate()[0].strip()
1620 if len(RNAfold_output) > 0 and RNAfold_output.find("found") == -1 and RNAfold_output.find(" no ") == -1:
1621 return True
1622 else:
1623 print "It seems the Vienna RNA Package is not installed on your machine. Please do so!"
1624 print "You can get it at http://www.tbi.univie.ac.at/"
1625 exit(0)
1626
1627
1628 def checkForpKiss():
1629 """
1630 Checking for the presence of pKiss
1631 """
1632 pKiss_output = subprocess.Popen(["which", "pKiss"], stdout=subprocess.PIPE).communicate()[0].strip()
1633 if len(pKiss_output) > 0 and pKiss_output.find("found") == -1 and pKiss_output.find(" no ") == -1:
1634 return True
1635 else:
1636 print "It seems that pKiss is not installed on your machine. Please do so!"
1637 print "You can get it at http://bibiserv2.cebitec.uni-bielefeld.de/pkiss"
1638 exit(0)
1639
1640
1641
1642 if __name__ == "__main__":
1643
1644 exe()
1645