comparison curve_fitting.py @ 0:8bf2c507af3a draft

"planemo upload for repository https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/curve_fitting/ commit ef82d0882741042922349499cafa35d20d70ce70"
author imgteam
date Thu, 22 Jul 2021 19:34:36 +0000
parents
children e0af18405e37
comparison
equal deleted inserted replaced
-1:000000000000 0:8bf2c507af3a
1 """
2 Copyright 2021 Biomedical Computer Vision Group, Heidelberg University.
3 Author: Qi Gao (qi.gao@bioquant.uni-heidelberg.de)
4
5 Distributed under the MIT license.
6 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
7
8 """
9
10 import argparse
11
12 import numpy as np
13 import pandas as pd
14 from scipy.optimize import least_squares
15
16
17 def compute_curve(x, par):
18 assert len(par) in [2, 3], 'The degree of curve must be 1 or 2!'
19 if len(par) == 3:
20 return par[0] * x + par[1] + par[2] * x ** 2
21 elif len(par) == 2:
22 return par[0] * x + par[1]
23
24
25 def fitting_err(par, xx, seq, penalty):
26 assert penalty in ['abs', 'quadratic', 'student-t'], 'Unknown penalty function!'
27 curve = compute_curve(xx, par)
28 if penalty == 'abs':
29 err = np.sqrt(np.abs(curve - seq))
30 elif penalty == 'quadratic':
31 err = (curve - seq)
32 elif penalty == 'student-t':
33 a = 1000
34 b = 0.001
35 err = np.sqrt(a * np.log(1 + (b * (curve - seq)) ** 2))
36 return err
37
38
39 def curve_fitting(seq, degree=2, penalty='abs'):
40 assert len(seq) > 5, 'Data is too short for curve fitting!'
41 assert degree in [1, 2], 'The degree of curve must be 1 or 2!'
42
43 par0 = [(seq[-1] - seq[0]) / len(seq), np.mean(seq[0:3])]
44 if degree == 2:
45 par0.append(-0.001)
46
47 xx = np.array([i for i in range(len(seq))]) + 1
48 x = np.array(par0, dtype='float64')
49 result = least_squares(fitting_err, x, args=(xx, seq, penalty))
50
51 return compute_curve(xx, result.x)
52
53
54 def curve_fitting_io(fn_in, fn_out, degree=2, penalty='abs', alpha=0.01):
55 # read all sheets
56 xl = pd.ExcelFile(fn_in)
57 nSpots = len(xl.sheet_names)
58 data_all = []
59 for i in range(nSpots):
60 df = pd.read_excel(xl, xl.sheet_names[i])
61 data_all.append(np.array(df))
62 col_names = df.columns.tolist()
63 ncols_ori = len(col_names)
64
65 # curve_fitting
66 diff = np.array([], dtype=('float64'))
67 for i in range(nSpots):
68 seq = data_all[i][:, -1]
69 seq_fit = seq.copy()
70 idx_valid = ~np.isnan(seq)
71 seq_fit[idx_valid] = curve_fitting(seq[idx_valid], degree=2, penalty='abs')
72 data_all[i] = np.concatenate((data_all[i], seq_fit.reshape(-1, 1)), axis=1)
73 if alpha > 0:
74 diff = np.concatenate((diff, seq_fit[idx_valid] - seq[idx_valid]))
75
76 # add assistive curve
77 if alpha > 0:
78 sorted_diff = np.sort(diff)
79 fac = 1 - alpha / 2
80 sig3 = sorted_diff[int(diff.size * fac)]
81 for i in range(nSpots):
82 seq_assist = data_all[i][:, -1] + sig3
83 data_all[i] = np.concatenate((data_all[i], seq_assist.reshape(-1, 1)), axis=1)
84
85 # write to file
86 with pd.ExcelWriter(fn_out) as writer:
87 for i in range(nSpots):
88 df = pd.DataFrame()
89 for c in range(ncols_ori):
90 df[col_names[c]] = data_all[i][:, c]
91 df['CURVE'] = data_all[i][:, ncols_ori]
92 if alpha > 0:
93 df['CURVE_A'] = data_all[i][:, ncols_ori + 1]
94 df.to_excel(writer, sheet_name=xl.sheet_names[i], index=False, float_format='%.2f')
95 writer.save()
96
97
98 if __name__ == "__main__":
99 parser = argparse.ArgumentParser(description="Fit (1st- or 2nd-degree) polynomial curves to data points")
100 parser.add_argument("fn_in", help="File name of input data points (xlsx)")
101 parser.add_argument("fn_out", help="File name of output fitted curves (xlsx)")
102 parser.add_argument("degree", type=int, help="Degree of the polynomial function")
103 parser.add_argument("penalty", help="Optimization objective for fitting")
104 parser.add_argument("alpha", type=float, help="Significance level for generating assistive curves")
105 args = parser.parse_args()
106 curve_fitting_io(args.fn_in, args.fn_out, args.degree, args.penalty, args.alpha)