annotate _modules/common_readsCoverage_processing.py @ 0:69e8f12c8b31 draft

"planemo upload"
author bioit_sciensano
date Fri, 11 Mar 2022 15:06:20 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
1 ## @file common_readsCoverage_processing.py
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
2 #
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
3 # VL: here I gathered functions that are common to both GPU and mono/multi CPU versions.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
4 # These functions are called after the mapping is done and all the counters are filled from mapping output results.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
5 from __future__ import print_function
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
6
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
7 from time import gmtime, strftime
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
8 import heapq
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
9 import itertools
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
10
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
11 import numpy as np
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
12 import pandas as pd
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
13 # Statistics
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
14 from scipy import stats
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
15 from statsmodels.sandbox.stats.multicomp import multipletests
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
16 from sklearn.tree import DecisionTreeRegressor #TODO VL: fix issue on importing that
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
17
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
18 from _modules.utilities import checkReportTitle
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
19 from _modules.SeqStats import SeqStats
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
20
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
21 import os
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
22
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
23
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
24 k_no_match_for_contig=1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
25
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
26 def wholeCov(whole_coverage, gen_len):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
27 """Calculate the coverage for whole read alignments and its average"""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
28 if gen_len == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
29 return whole_coverage, 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
30 total_cov = sum(whole_coverage[0]) + sum(whole_coverage[1])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
31 ave_whole_cov = float(total_cov) / (2 * float(gen_len))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
32 added_whole_coverage = [x + y for x, y in zip(whole_coverage[0], whole_coverage[1])]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
33 return added_whole_coverage, ave_whole_cov
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
34
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
35 def testwholeCov(added_whole_coverage, ave_whole_cov, test):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
36 """Return information about whole coverage."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
37 if test:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
38 return ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
39 if ave_whole_cov < 50:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
40 print("\nWARNING: average coverage is under the limit of the software (50)")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
41 elif ave_whole_cov < 200:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
42 print("\nWARNING: average coverage is low (<200), Li's method is presumably unreliable\n")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
43 drop_cov = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
44 start_pos = last_pos = count_pos = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
45 for pos in range(len(added_whole_coverage)):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
46 if added_whole_coverage[pos] < (ave_whole_cov / 1.5):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
47 if pos == last_pos+1:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
48 count_pos += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
49 last_pos = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
50 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
51 if count_pos > 100:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
52 drop_cov.append( (start_pos,last_pos+1) )
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
53 last_pos = start_pos = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
54 count_pos = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
55 last_pos = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
56 return drop_cov
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
57
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
58 def maxPaired(paired_whole_coverage, whole_coverage):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
59 """Max paired coverage using whole coverage, counter edge effect with paired-ends."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
60 pwc = paired_whole_coverage[:]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
61 wc = whole_coverage[:]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
62 for i in range(len(pwc)):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
63 for j in range(len(pwc[i])):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
64 if pwc[i][j] < wc[i][j]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
65 pwc[i][j] = wc[i][j]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
66 return pwc
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
67
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
68 def replaceNormMean(norm_cov):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
69 """Replace the values not normalised due to covLimit by mean."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
70 nc_sum = nc_count = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
71 for nc in norm_cov:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
72 if nc > 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
73 nc_sum += nc
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
74 nc_count += 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
75 if nc_count == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
76 mean_nc = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
77 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
78 mean_nc = nc_sum / float(nc_count)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
79 for i in range(len(norm_cov)):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
80 if norm_cov[i] == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
81 norm_cov[i] = mean_nc
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
82 return norm_cov, mean_nc
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
83
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
84 def normCov(termini_coverage, whole_coverage, covLimit, edge):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
85 """Return the termini_coverage normalised by the whole coverage (% of coverage due to first base)."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
86 normalised_coverage = [len(termini_coverage[0])*[0], len(termini_coverage[0])*[0]]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
87 termini_len = len(termini_coverage[0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
88 mean_nc = [1,1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
89 for i in range(len(termini_coverage)):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
90 for j in range(len(termini_coverage[i])):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
91 if j < edge or j > termini_len-edge:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
92 continue
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
93 if whole_coverage[i][j] >= covLimit:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
94 if float(whole_coverage[i][j]) != 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
95 normalised_coverage[i][j] = float(termini_coverage[i][j]) / float(whole_coverage[i][j])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
96 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
97 normalised_coverage[i][j] = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
98 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
99 normalised_coverage[i][j] = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
100 normalised_coverage[i], mean_nc[i] = replaceNormMean(normalised_coverage[i])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
101 return normalised_coverage, mean_nc
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
102
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
103 def RemoveEdge(tableau, edge):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
104 return tableau[edge:-edge]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
105
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
106 def usedReads(coverage, tot_reads):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
107 """Retrieve the number of reads after alignment and calculate the percentage of reads lost."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
108 used_reads = sum(coverage[0]) + sum(coverage[1])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
109 lost_reads = tot_reads - used_reads
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
110 lost_perc = (float(tot_reads) - float(used_reads))/float(tot_reads) * 100
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
111 return used_reads, lost_reads, lost_perc
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
112
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
113 ### PEAK functions
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
114 def picMax(coverage, nbr_pic):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
115 """COORDINATES (coverage value, position) of the nbr_pic largest coverage value."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
116 if coverage == [[],[]] or coverage == []:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
117 return "", "", ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
118 picMaxPlus = heapq.nlargest(nbr_pic, zip(coverage[0], itertools.count()))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
119 picMaxMinus = heapq.nlargest(nbr_pic, zip(coverage[1], itertools.count()))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
120 TopFreqH = max(max(np.array(list(zip(*picMaxPlus))[0])), max(np.array(list(zip(*picMaxMinus))[0])))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
121 return picMaxPlus, picMaxMinus, TopFreqH
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
122
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
123 def RemoveClosePicMax(picMax, gen_len, nbr_base):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
124 """Remove peaks that are too close of the maximum (nbr_base around)"""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
125 if nbr_base == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
126 return picMax[1:], [picMax[0]]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
127 picMaxRC = picMax[:]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
128 posMax = picMaxRC[0][1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
129 LimSup = posMax + nbr_base
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
130 LimInf = posMax - nbr_base
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
131 if LimSup < gen_len and LimInf >= 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
132 PosOut = list(range(LimInf,LimSup))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
133 elif LimSup >= gen_len:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
134 TurnSup = LimSup - gen_len
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
135 PosOut = list(range(posMax,gen_len))+list(range(0,TurnSup)) + list(range(LimInf,posMax))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
136 elif LimInf < 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
137 TurnInf = gen_len + LimInf
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
138 PosOut = list(range(0,posMax))+list(range(TurnInf,gen_len)) + list(range(posMax,LimSup))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
139 picMaxOK = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
140 picOUT = []
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
141 for peaks in picMaxRC:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
142 if peaks[1] not in PosOut:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
143 picMaxOK.append(peaks)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
144 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
145 picOUT.append(peaks)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
146 return picMaxOK, picOUT
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
147
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
148 def addClosePic(picList, picClose, norm = 0):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
149 """Add coverage value of close peaks to the top peak. Remove picClose in picList if exist."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
150 if norm:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
151 if picClose[0][0] >= 0.5:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
152 return picList, [picClose[0]]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
153 picListOK = picList[:]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
154 cov_add = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
155 for cov in picClose:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
156 cov_add += cov[0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
157 picListOK[cov[1]] = 0.01
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
158 picListOK[picClose[0][1]] = cov_add
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
159 return picListOK, picClose
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
160
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
161 def remove_pics(arr,n):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
162 '''Removes the n highest values from the array'''
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
163 arr=np.array(arr)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
164 pic_pos=arr.argsort()[-n:][::-1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
165 arr2=np.delete(arr,pic_pos)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
166 return arr2
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
167
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
168 def gamma(X):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
169 """Apply a gamma distribution."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
170 X = np.array(X, dtype=np.int64)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
171 v = remove_pics(X, 3)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
172
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
173 dist_max = float(max(v))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
174 if dist_max == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
175 return np.array([1.00] * len(X))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
176
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
177 actual = np.bincount(v)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
178 fit_alpha, fit_loc, fit_beta = stats.gamma.fit(v)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
179 expected = stats.gamma.pdf(np.arange(0, dist_max + 1, 1), fit_alpha, loc=fit_loc, scale=fit_beta) * sum(actual)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
180
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
181 return stats.gamma.pdf(X, fit_alpha, loc=fit_loc, scale=fit_beta)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
182
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
183
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
184 # STATISTICS
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
185 def test_pics_decision_tree(whole_coverage, termini_coverage, termini_coverage_norm, termini_coverage_norm_close):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
186 """Fits a gamma distribution using a decision tree."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
187 L = len(whole_coverage[0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
188 res = pd.DataFrame({"Position": np.array(range(L)) + 1, "termini_plus": termini_coverage[0],
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
189 "SPC_norm_plus": termini_coverage_norm[0], "SPC_norm_minus": termini_coverage_norm[1],
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
190 "SPC_norm_plus_close": termini_coverage_norm_close[0],
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
191 "SPC_norm_minus_close": termini_coverage_norm_close[1], "termini_minus": termini_coverage[1],
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
192 "cov_plus": whole_coverage[0], "cov_minus": whole_coverage[1]})
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
193
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
194 res["cov"] = res["cov_plus"].values + res["cov_minus"].values
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
195
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
196 res["R_plus"] = list(map(float, termini_coverage[0])) // np.mean(termini_coverage[0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
197 res["R_minus"] = list(map(float, termini_coverage[1])) // np.mean(termini_coverage[1])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
198
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
199 regr = DecisionTreeRegressor(max_depth=3, min_samples_leaf=100)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
200 X = np.arange(L)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
201 X = X[:, np.newaxis]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
202 y = res["cov"].values
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
203 regr.fit(X, y)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
204
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
205 # Predict
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
206 y_1 = regr.predict(X)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
207 res["covnode"] = y_1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
208 covnodes = np.unique(y_1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
209 thres = np.mean(whole_coverage[0]) / 2
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
210 covnodes = [n for n in covnodes if n > thres]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
211
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
212 for node in covnodes:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
213 X = res[res["covnode"] == node]["termini_plus"].values
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
214 res.loc[res["covnode"] == node, "pval_plus"] = gamma(X)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
215 X = res[res["covnode"] == node]["termini_minus"].values
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
216 res.loc[res["covnode"] == node, "pval_minus"] = gamma(X)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
217
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
218 res.loc[res.pval_plus > 1, 'pval_plus'] = 1.00
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
219 res.loc[res.pval_minus > 1, 'pval_minus'] = 1.00
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
220 res = res.fillna(1.00)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
221
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
222 res['pval_plus_adj'] = multipletests(res["pval_plus"].values, alpha=0.01, method="bonferroni")[1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
223 res['pval_minus_adj'] = multipletests(res["pval_minus"].values, alpha=0.01, method="bonferroni")[1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
224
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
225 res = res.fillna(1.00)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
226
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
227 res_plus = pd.DataFrame(
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
228 {"Position": res['Position'], "SPC_std": res['SPC_norm_plus'] * 100, "SPC": res['SPC_norm_plus_close'] * 100,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
229 "pval_gamma": res['pval_plus'], "pval_gamma_adj": res['pval_plus_adj']})
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
230 res_minus = pd.DataFrame(
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
231 {"Position": res['Position'], "SPC_std": res['SPC_norm_minus'] * 100, "SPC": res['SPC_norm_minus_close'] * 100,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
232 "pval_gamma": res['pval_minus'], "pval_gamma_adj": res['pval_minus_adj']})
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
233
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
234 res_plus.sort_values("SPC", ascending=False, inplace=True)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
235 res_minus.sort_values("SPC", ascending=False, inplace=True)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
236
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
237 res_plus.reset_index(drop=True, inplace=True)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
238 res_minus.reset_index(drop=True, inplace=True)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
239
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
240 return res, res_plus, res_minus
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
241
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
242 ### SCORING functions
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
243 # Li's methodology
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
244 def ratioR1(TopFreqH, used_reads, gen_len):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
245 """Calculate the ratio H/A (R1) = highest frequency/average frequency. For Li's methodology."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
246 AveFreq = (float(used_reads)/float(gen_len)/2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
247 if AveFreq == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
248 return 0, 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
249 R1 = float(TopFreqH)/float(AveFreq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
250 return R1, AveFreq
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
251
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
252 def ratioR(picMax):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
253 """Calculate the T1/T2 = Top 1st frequency/Second higher frequency. For Li's methodology."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
254 T1 = picMax[0][0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
255 T2 = max(1,picMax[1][0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
256 R = float(T1)/float(T2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
257 return round(R)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
258
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
259
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
260 def packMode(R1, R2, R3):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
261 """Make the prognosis about the phage packaging mode and termini type. For Li's methodology."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
262 packmode = "OTHER"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
263 termini = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
264 forward = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
265 reverse = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
266
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
267 if R1 < 30:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
268 termini = "Absence"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
269 if R2 < 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
270 forward = "No Obvious Termini"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
271 if R3 < 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
272 reverse = "No Obvious Termini"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
273 elif R1 > 100:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
274 termini = "Fixed"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
275 if R2 < 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
276 forward = "Multiple-Pref. Term."
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
277 if R3 < 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
278 reverse = "Multiple-Pref. Term."
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
279 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
280 termini = "Preferred"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
281 if R2 < 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
282 forward = "Multiple-Pref. Term."
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
283 if R3 < 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
284 reverse = "Multiple-Pref. Term."
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
285
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
286 if R2 >= 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
287 forward = "Obvious Termini"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
288 if R3 >= 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
289 reverse = "Obvious Termini"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
290
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
291 if R2 >= 3 and R3 >= 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
292 packmode = "COS"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
293 if R2 >= 3 and R3 < 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
294 packmode = "PAC"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
295 if R2 < 3 and R3 >= 3:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
296 packmode = "PAC"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
297 return packmode, termini, forward, reverse
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
298
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
299 ### PHAGE Information
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
300 def orientation(picMaxPlus, picMaxMinus):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
301 """Return phage termini orientation."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
302 if not picMaxPlus and not picMaxMinus:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
303 return "NA"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
304 if picMaxPlus and not picMaxMinus:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
305 return "Forward"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
306 if not picMaxPlus and picMaxMinus:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
307 return "Reverse"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
308 if picMaxPlus and picMaxMinus:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
309 if picMaxPlus[0][0] > picMaxMinus[0][0]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
310 return "Forward"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
311 elif picMaxMinus[0][0] > picMaxPlus[0][0]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
312 return "Reverse"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
313 elif picMaxMinus[0][0] == picMaxPlus[0][0]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
314 return "NA"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
315
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
316
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
317 def typeCOS(PosPlus, PosMinus, nbr_lim):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
318 """ Return type of COS sequence."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
319 if PosPlus < PosMinus and abs(PosPlus-PosMinus) < nbr_lim:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
320 return "COS (5')", "Lambda"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
321 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
322 return "COS (3')", "HK97"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
323
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
324 def sequenceCohesive(Packmode, refseq, picMaxPlus, picMaxMinus, nbr_lim):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
325 """Return cohesive sequence for COS phages."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
326 if Packmode != 'COS':
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
327 return '', Packmode
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
328 PosPlus = picMaxPlus[0][1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
329 PosMinus = picMaxMinus[0][1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
330
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
331 SC_class, SC_type = typeCOS(PosPlus, PosMinus, nbr_lim)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
332
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
333 if SC_class == "COS (5')":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
334 if abs(PosMinus - PosPlus) < nbr_lim:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
335 seqcoh = refseq[min(PosPlus, PosMinus):max(PosPlus, PosMinus) + 1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
336 return seqcoh, Packmode
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
337 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
338 seqcoh = refseq[max(PosPlus, PosMinus) + 1:] + refseq[:min(PosPlus, PosMinus)]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
339 return seqcoh, Packmode
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
340
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
341 elif SC_class == "COS (3')":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
342 if abs(PosMinus - PosPlus) < nbr_lim:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
343 seqcoh = refseq[min(PosPlus, PosMinus) + 1:max(PosPlus, PosMinus)]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
344 return seqcoh, Packmode
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
345 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
346 seqcoh = refseq[max(PosPlus, PosMinus) + 1:] + refseq[:min(PosPlus, PosMinus)]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
347 return seqcoh, Packmode
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
348 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
349 return '', Packmode
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
350
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
351 def selectSignificant(table, pvalue, limit):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
352 """Return significant peaks over a limit"""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
353 table_pvalue = table.loc[lambda df: df.pval_gamma_adj < pvalue, :]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
354 table_pvalue_limit = table_pvalue.loc[lambda df: df.SPC > limit, :]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
355 table_pvalue_limit.reset_index(drop=True, inplace=True)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
356 return table_pvalue_limit
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
357
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
358 def testMu(paired, list_hybrid, gen_len, used_reads, seed, insert, phage_hybrid_coverage, Mu_threshold, hostseq):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
359 """Return Mu if enough hybrid reads compared to theory."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
360 if hostseq == "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
361 return 0, -1, -1, ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
362 if paired != "" and len(insert) != 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
363 insert_mean = sum(insert) / len(insert)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
364 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
365 insert_mean = max(100, seed+10)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
366 Mu_limit = ((insert_mean - seed) / float(gen_len)) * used_reads/2
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
367 test = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
368 Mu_term_plus = "Random"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
369 Mu_term_minus = "Random"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
370 picMaxPlus_Mu, picMaxMinus_Mu, TopFreqH_phage_hybrid = picMax(phage_hybrid_coverage, 1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
371 picMaxPlus_Mu = picMaxPlus_Mu[0][1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
372 picMaxMinus_Mu = picMaxMinus_Mu[0][1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
373
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
374 # Orientation
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
375 if list_hybrid[0] > list_hybrid[1]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
376 P_orient = "Forward"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
377 elif list_hybrid[1] > list_hybrid[0]:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
378 P_orient = "Reverse"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
379 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
380 P_orient = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
381
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
382 # Termini
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
383 if list_hybrid[0] > ( Mu_limit * Mu_threshold ):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
384 test = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
385 pos_to_check = range(picMaxPlus_Mu+1,gen_len) + range(0,100)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
386 for pos in pos_to_check:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
387 if phage_hybrid_coverage[0][pos] >= max(1,phage_hybrid_coverage[0][picMaxPlus_Mu]/4):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
388 Mu_term_plus = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
389 picMaxPlus_Mu = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
390 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
391 Mu_term_plus = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
392 break
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
393 # Reverse
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
394 if list_hybrid[1] > ( Mu_limit * Mu_threshold ):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
395 test = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
396 pos_to_check = range(0,picMaxMinus_Mu)[::-1] + range(gen_len-100,gen_len)[::-1]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
397 for pos in pos_to_check:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
398 if phage_hybrid_coverage[1][pos] >= max(1,phage_hybrid_coverage[1][picMaxMinus_Mu]/4):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
399 Mu_term_minus = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
400 picMaxMinus_Mu = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
401 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
402 Mu_term_minus = pos
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
403 break
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
404 return test, Mu_term_plus, Mu_term_minus, P_orient
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
405
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
406 ### DECISION Process
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
407 def decisionProcess(plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
408 used_reads, seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
409 """ ."""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
410 P_orient = "NA"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
411 P_seqcoh = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
412 P_concat = ""
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
413 P_type = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
414 Mu_like = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
415 P_left = "Random"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
416 P_right = "Random"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
417 # 2 peaks sig.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
418 if not plus_significant.empty and not minus_significant.empty:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
419 # Multiple
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
420 if (len(plus_significant["SPC"]) > 1 or len(minus_significant["SPC"]) > 1):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
421 if not (plus_significant["SPC"][0] > limit_fixed or minus_significant["SPC"][0] > limit_fixed):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
422 Redundant = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
423 P_left = "Multiple"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
424 P_right = "Multiple"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
425 Permuted = "Yes"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
426 P_class = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
427 P_type = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
428 return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
429
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
430 dist_peak = abs(plus_significant['Position'][0] - minus_significant['Position'][0])
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
431 dist_peak_over = abs(abs(plus_significant['Position'][0] - minus_significant['Position'][0]) - gen_len)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
432 P_left = plus_significant['Position'][0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
433 P_right = minus_significant['Position'][0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
434 # COS
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
435 if (dist_peak <= 2) or (dist_peak_over <= 2):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
436 Redundant = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
437 Permuted = "No"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
438 P_class = "COS"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
439 P_type = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
440 elif (dist_peak < 20 and dist_peak > 2) or (dist_peak_over < 20 and dist_peak_over > 2):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
441 Redundant = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
442 Permuted = "No"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
443 P_class, P_type = typeCOS(plus_significant["Position"][0], minus_significant["Position"][0], gen_len / 2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
444 P_seqcoh, packstat = sequenceCohesive("COS", refseq, [
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
445 ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]),
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
446 (
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
447 minus_significant["Position"][
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
448 0]) - 1)], gen_len / 2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
449 # DTR
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
450 elif (dist_peak <= 1000) or (dist_peak_over <= 1000):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
451 Redundant = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
452 Permuted = "No"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
453 P_class = "DTR (short)"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
454 P_type = "T7"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
455 P_seqcoh, packstat = sequenceCohesive("COS", refseq, [
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
456 ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]),
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
457 (
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
458 minus_significant["Position"][
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
459 0]) - 1)], gen_len / 2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
460 elif (dist_peak <= 0.1 * gen_len) or (dist_peak_over <= 0.1 * gen_len):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
461 Redundant = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
462 Permuted = "No"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
463 P_class = "DTR (long)"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
464 P_type = "T5"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
465 P_seqcoh, packstat = sequenceCohesive("COS", refseq, [
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
466 ((plus_significant["SPC"][0]), (plus_significant["Position"][0]) - 1)], [((minus_significant["SPC"][0]),
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
467 (
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
468 minus_significant["Position"][
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
469 0]) - 1)], gen_len / 2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
470 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
471 Redundant = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
472 Permuted = "No"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
473 P_class = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
474 P_type = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
475 # 1 peak sig.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
476 elif not plus_significant.empty and minus_significant.empty or plus_significant.empty and not minus_significant.empty:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
477 Redundant = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
478 Permuted = "Yes"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
479 P_class = "Headful (pac)"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
480 P_type = "P1"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
481 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
482 if R1 == 0 or len(insert) == 0:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
483 P_concat = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
484 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
485 P_concat = round((sum(insert) / len(insert)) / R1) - 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
486 if not plus_significant.empty:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
487 P_left = plus_significant['Position'][0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
488 P_right = "Distributed"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
489 P_orient = "Forward"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
490 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
491 P_left = "Distributed"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
492 P_right = minus_significant['Position'][0]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
493 P_orient = "Reverse"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
494 # 0 peak sig.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
495 elif plus_significant.empty and minus_significant.empty:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
496 Mu_like, Mu_term_plus, Mu_term_minus, P_orient = testMu(paired, list_hybrid, gen_len, used_reads, seed, insert,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
497 phage_hybrid_coverage, Mu_threshold, hostseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
498 if Mu_like:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
499 Redundant = 0
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
500 Permuted = "No"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
501 P_class = "Mu-like"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
502 P_type = "Mu"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
503 P_left = Mu_term_plus
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
504 P_right = Mu_term_minus
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
505 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
506 Redundant = 1
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
507 Permuted = "Yes"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
508 P_class = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
509 P_type = "-"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
510
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
511 return Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
512
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
513 # Processes coverage values for a sequence.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
514 def processCovValuesForSeq(refseq,hostseq,refseq_name,refseq_liste,seed,analysis_name,tot_reads,results_pos,test_run, paired,edge,host,test, surrounding,limit_preferred,limit_fixed,Mu_threshold,\
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
515 termini_coverage,whole_coverage,paired_whole_coverage,phage_hybrid_coverage,host_hybrid_coverage, host_whole_coverage,insert,list_hybrid,reads_tested,DR,DR_path=None):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
516
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
517 print("\n\nFinished calculating coverage values, the remainder should be completed rapidly\n",
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
518 strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
519
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
520 # WHOLE Coverage : Average, Maximum and Minimum
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
521 added_whole_coverage, ave_whole_cov = wholeCov(whole_coverage, len(refseq))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
522 added_paired_whole_coverage, ave_paired_whole_cov = wholeCov(paired_whole_coverage, len(refseq))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
523 added_host_whole_coverage, ave_host_whole_cov = wholeCov(host_whole_coverage, len(hostseq))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
524
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
525 drop_cov = testwholeCov(added_whole_coverage, ave_whole_cov, test_run)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
526
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
527 # NORM pic by whole coverage (1 base)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
528 if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
529 #paired_whole_coverage_test = maxPaired(paired_whole_coverage, whole_coverage)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
530 termini_coverage_norm, mean_nc = normCov(termini_coverage, paired_whole_coverage, max(10, ave_whole_cov / 1.5),
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
531 edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
532 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
533 termini_coverage_norm, mean_nc = normCov(termini_coverage, whole_coverage, max(10, ave_whole_cov / 1.5), edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
534
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
535 # REMOVE edge
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
536 termini_coverage[0] = RemoveEdge(termini_coverage[0], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
537 termini_coverage[1] = RemoveEdge(termini_coverage[1], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
538 termini_coverage_norm[0] = RemoveEdge(termini_coverage_norm[0], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
539 termini_coverage_norm[1] = RemoveEdge(termini_coverage_norm[1], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
540 whole_coverage[0] = RemoveEdge(whole_coverage[0], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
541 whole_coverage[1] = RemoveEdge(whole_coverage[1], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
542 paired_whole_coverage[0] = RemoveEdge(paired_whole_coverage[0], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
543 paired_whole_coverage[1] = RemoveEdge(paired_whole_coverage[1], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
544 added_whole_coverage = RemoveEdge(added_whole_coverage, edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
545 added_paired_whole_coverage = RemoveEdge(added_paired_whole_coverage, edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
546 added_host_whole_coverage = RemoveEdge(added_host_whole_coverage, edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
547 phage_hybrid_coverage[0] = RemoveEdge(phage_hybrid_coverage[0], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
548 phage_hybrid_coverage[1] = RemoveEdge(phage_hybrid_coverage[1], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
549 host_whole_coverage[0] = RemoveEdge(host_whole_coverage[0], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
550 host_whole_coverage[1] = RemoveEdge(host_whole_coverage[1], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
551 host_hybrid_coverage[0] = RemoveEdge(host_hybrid_coverage[0], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
552 host_hybrid_coverage[1] = RemoveEdge(host_hybrid_coverage[1], edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
553 refseq = RemoveEdge(refseq, edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
554 if host != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
555 hostseq = RemoveEdge(hostseq, edge)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
556 gen_len = len(refseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
557 host_len = len(hostseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
558 if test == "DL":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
559 gen_len = 100000
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
560
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
561 # READS Total, Used and Lost
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
562 used_reads, lost_reads, lost_perc = usedReads(termini_coverage, reads_tested)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
563
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
564 # PIC Max
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
565 picMaxPlus, picMaxMinus, TopFreqH = picMax(termini_coverage, 5)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
566 picMaxPlus_norm, picMaxMinus_norm, TopFreqH_norm = picMax(termini_coverage_norm, 5)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
567 picMaxPlus_host, picMaxMinus_host, TopFreqH_host = picMax(host_whole_coverage, 5)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
568
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
569 ### ANALYSIS
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
570
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
571 ## Close Peaks
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
572 picMaxPlus, picOUT_forw = RemoveClosePicMax(picMaxPlus, gen_len, surrounding)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
573 picMaxMinus, picOUT_rev = RemoveClosePicMax(picMaxMinus, gen_len, surrounding)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
574 picMaxPlus_norm, picOUT_norm_forw = RemoveClosePicMax(picMaxPlus_norm, gen_len, surrounding)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
575 picMaxMinus_norm, picOUT_norm_rev = RemoveClosePicMax(picMaxMinus_norm, gen_len, surrounding)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
576
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
577 termini_coverage_close = termini_coverage[:]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
578 termini_coverage_close[0], picOUT_forw = addClosePic(termini_coverage[0], picOUT_forw)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
579 termini_coverage_close[1], picOUT_rev = addClosePic(termini_coverage[1], picOUT_rev)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
580
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
581 termini_coverage_norm_close = termini_coverage_norm[:]
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
582 termini_coverage_norm_close[0], picOUT_norm_forw = addClosePic(termini_coverage_norm[0], picOUT_norm_forw, 1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
583 termini_coverage_norm_close[1], picOUT_norm_rev = addClosePic(termini_coverage_norm[1], picOUT_norm_rev, 1)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
584
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
585 ## Statistical Analysis
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
586 picMaxPlus_norm_close, picMaxMinus_norm_close, TopFreqH_norm = picMax(termini_coverage_norm_close, 5)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
587 phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
588 termini_coverage_norm,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
589 termini_coverage_norm_close)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
590 # VL: comment that since the 2 different conditions lead to the execution of the same piece of code...
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
591 # if paired != "":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
592 # phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(paired_whole_coverage, termini_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
593 # termini_coverage_norm,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
594 # termini_coverage_norm_close)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
595 # else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
596 # phage_norm, phage_plus_norm, phage_minus_norm = test_pics_decision_tree(whole_coverage, termini_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
597 # termini_coverage_norm,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
598 # termini_coverage_norm_close)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
599
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
600
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
601 ## LI Analysis
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
602 picMaxPlus_close, picMaxMinus_close, TopFreqH = picMax(termini_coverage_close, 5)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
603
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
604 R1, AveFreq = ratioR1(TopFreqH, used_reads, gen_len)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
605 R2 = ratioR(picMaxPlus_close)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
606 R3 = ratioR(picMaxMinus_close)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
607
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
608 ArtPackmode, termini, forward, reverse = packMode(R1, R2, R3)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
609 ArtOrient = orientation(picMaxPlus_close, picMaxMinus_close)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
610 ArtcohesiveSeq, ArtPackmode = sequenceCohesive(ArtPackmode, refseq, picMaxPlus_close, picMaxMinus_close,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
611 gen_len / 2)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
612
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
613 ### DECISION Process
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
614
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
615 # PEAKS Significativity
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
616 plus_significant = selectSignificant(phage_plus_norm, 1.0 / gen_len, limit_preferred)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
617 minus_significant = selectSignificant(phage_minus_norm, 1.0 / gen_len, limit_preferred)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
618
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
619 # DECISION
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
620 Redundant, Permuted, P_class, P_type, P_seqcoh, P_concat, P_orient, P_left, P_right, Mu_like = decisionProcess(
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
621 plus_significant, minus_significant, limit_fixed, gen_len, paired, insert, R1, list_hybrid, used_reads,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
622 seed, phage_hybrid_coverage, Mu_threshold, refseq, hostseq)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
623
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
624 ### Results
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
625 if len(refseq_liste) != 1:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
626 if P_class == "-":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
627 if P_left == "Random" and P_right == "Random":
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
628 P_class = "UNKNOWN"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
629 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
630 P_class = "NEW"
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
631 DR[P_class][checkReportTitle(refseq_name[results_pos])] = {"analysis_name": analysis_name, "seed": seed,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
632 "added_whole_coverage": added_whole_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
633 "Redundant": Redundant, "P_left": P_left,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
634 "P_right": P_right, "Permuted": Permuted,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
635 "P_orient": P_orient,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
636 "termini_coverage_norm_close": termini_coverage_norm_close,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
637 "picMaxPlus_norm_close": picMaxPlus_norm_close,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
638 "picMaxMinus_norm_close": picMaxMinus_norm_close,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
639 "gen_len": gen_len, "tot_reads": tot_reads,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
640 "P_seqcoh": P_seqcoh,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
641 "phage_plus_norm": phage_plus_norm,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
642 "phage_minus_norm": phage_minus_norm,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
643 "ArtPackmode": ArtPackmode, "termini": termini,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
644 "forward": forward, "reverse": reverse,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
645 "ArtOrient": ArtOrient,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
646 "ArtcohesiveSeq": ArtcohesiveSeq,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
647 "termini_coverage_close": termini_coverage_close,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
648 "picMaxPlus_close": picMaxPlus_close,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
649 "picMaxMinus_close": picMaxMinus_close,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
650 "picOUT_norm_forw": picOUT_norm_forw,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
651 "picOUT_norm_rev": picOUT_norm_rev,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
652 "picOUT_forw": picOUT_forw,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
653 "picOUT_rev": picOUT_rev, "lost_perc": lost_perc,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
654 "ave_whole_cov": ave_whole_cov, "R1": R1, "R2": R2,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
655 "R3": R3, "host": host, "host_len": host_len,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
656 "host_whole_coverage": host_whole_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
657 "picMaxPlus_host": picMaxPlus_host,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
658 "picMaxMinus_host": picMaxMinus_host,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
659 "surrounding": surrounding, "drop_cov": drop_cov,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
660 "paired": paired, "insert": insert,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
661 "phage_hybrid_coverage": phage_hybrid_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
662 "host_hybrid_coverage": host_hybrid_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
663 "added_paired_whole_coverage": added_paired_whole_coverage,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
664 "Mu_like": Mu_like, "test_run": test_run,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
665 "P_class": P_class, "P_type": P_type,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
666 "P_concat": P_concat,
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
667 "idx_refseq_in_list": results_pos}
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
668
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
669 if DR_path!=None: # multi machine cluster mode.
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
670 strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
671 P_class_dir=os.path.join(DR_path,P_class)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
672 if os.path.exists(P_class_dir):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
673 if not os.path.isdir(P_class_dir):
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
674 raise RuntimeError("P_class_dir is not a directory")
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
675 else:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
676 os.mkdir(P_class_dir)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
677 import pickle
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
678 fic_name=os.path.join(P_class_dir,checkReportTitle(refseq_name[results_pos]))
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
679 items_to_save=(analysis_name,seed,added_whole_coverage,Redundant,P_left,P_right,Permuted, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
680 P_orient,termini_coverage_norm_close,picMaxPlus_norm_close,picMaxMinus_norm_close, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
681 gen_len,tot_reads,P_seqcoh,phage_plus_norm,phage_minus_norm,ArtPackmode,termini, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
682 forward,reverse,ArtOrient,ArtcohesiveSeq,termini_coverage_close,picMaxPlus_close, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
683 picMaxMinus_close,picOUT_norm_forw,picOUT_norm_rev,picOUT_forw,picOUT_rev, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
684 lost_perc,ave_whole_cov,R1,R2,R3,host,host_len,host_whole_coverage,picMaxPlus_host, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
685 picMaxMinus_host,surrounding,drop_cov,paired, insert,phage_hybrid_coverage, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
686 host_hybrid_coverage,added_paired_whole_coverage,Mu_like,test_run,P_class,P_type,\
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
687 P_concat,results_pos)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
688 with open(fic_name,'wb') as f:
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
689 pickle.dump(items_to_save,f)
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
690 f.close()
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
691
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
692 return SeqStats(P_class, P_left, P_right, P_type, P_orient, ave_whole_cov, phage_plus_norm, phage_minus_norm, ArtcohesiveSeq, P_seqcoh, Redundant, Mu_like, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
693 added_whole_coverage, Permuted, termini_coverage_norm_close, picMaxPlus_norm_close, picMaxMinus_norm_close, gen_len, termini_coverage_close, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
694 ArtPackmode, termini, forward, reverse, ArtOrient, picMaxPlus_close, picMaxMinus_close, picOUT_norm_forw, picOUT_norm_rev, picOUT_forw, picOUT_rev, \
69e8f12c8b31 "planemo upload"
bioit_sciensano
parents:
diff changeset
695 lost_perc, R1, R2, R3, picMaxPlus_host, picMaxMinus_host, drop_cov, added_paired_whole_coverage, P_concat)