Mercurial > repos > galaxyp > appendfdr
comparison append_fdr.py @ 0:ef7cc296f063 draft default tip
Initial commit.
| author | galaxyp |
|---|---|
| date | Fri, 10 May 2013 16:42:08 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ef7cc296f063 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import optparse | |
| 3 import sys | |
| 4 | |
| 5 try: | |
| 6 # Ubuntu deps: gfortan libblas-dev liblapack-dev | |
| 7 # pip deps: numpy scipy | |
| 8 from math import sqrt | |
| 9 from scipy.optimize import root | |
| 10 from numpy import arange, exp, concatenate, sum, log, array, seterr | |
| 11 except ImportError: | |
| 12 # Allow this script to be used for global FDR even | |
| 13 # if these dependencies are not present. | |
| 14 pass | |
| 15 | |
| 16 | |
| 17 SEPARATORS = {"TAB": "\t", | |
| 18 "SPACE": " ", | |
| 19 "COMMA": "," | |
| 20 } | |
| 21 | |
| 22 | |
| 23 def __main__(): | |
| 24 run_script() | |
| 25 | |
| 26 | |
| 27 def append_fdr(input_file, output, settings): | |
| 28 sorted_scores, accum_hits, accum_decoys = _accum_decoys(input_file, settings) | |
| 29 fdr_array = compute_fdr(sorted_scores, accum_hits, accum_decoys, settings) | |
| 30 index = 0 | |
| 31 for line in __read_lines(input_file): | |
| 32 if not line or line.startswith('#'): | |
| 33 continue | |
| 34 entry = Entry(line, settings, index) | |
| 35 this_fdr = fdr_array[entry.score] | |
| 36 new_line = "%s%s%f" % (line, settings["separator"], this_fdr) | |
| 37 print >> output, new_line | |
| 38 index += 1 | |
| 39 | |
| 40 | |
| 41 def compute_fdr(sorted_scores, accum_hits, accum_decoys, settings): | |
| 42 fdr_type = settings["fdr_type"] | |
| 43 compute_functions = {"global_conservative": _compute_fdr_global_conservative, | |
| 44 "global_permissive": _compute_fdr_global_permissive, | |
| 45 #"pspep": _compute_pspep | |
| 46 } | |
| 47 return compute_functions[fdr_type](sorted_scores, accum_hits, accum_decoys, settings) | |
| 48 #return compute_functions[fdr_type](all_hits_array, decoy_hits_array, settings) | |
| 49 | |
| 50 | |
| 51 def _compute_pspep(all_hits, decoy_hits, settings): | |
| 52 scaling = _get_scaling(settings) | |
| 53 seterr(all="ignore") | |
| 54 sigma = array([sqrt(x) if x > 0 else 0.2 for x in decoy_hits]) | |
| 55 if isinstance(all_hits, list): | |
| 56 all_hits = array(all_hits) | |
| 57 if isinstance(decoy_hits, list): | |
| 58 decoy_hits = array(decoy_hits) | |
| 59 searchSeg = concatenate((exp(arange(-8, 9, 2)), -1 * exp(arange(-8, 9, 2)))) | |
| 60 bestResids = sys.float_info.max | |
| 61 bestResidsComb = [0.0, 0.0, 0.0] | |
| 62 for aEst in searchSeg: | |
| 63 for bEst in searchSeg: | |
| 64 for cEst in searchSeg: | |
| 65 try: | |
| 66 sol = _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma) | |
| 67 if sol[3] and sol[3] < bestResids: | |
| 68 bestResids = sol[3] | |
| 69 bestResidsComb = sol[0:3] | |
| 70 except: | |
| 71 pass | |
| 72 (a, b, c) = bestResidsComb[0:3] | |
| 73 fdr_local = scaling * (exp(b * (all_hits - a)) / (exp(b * (all_hits - a)) + 1)) * c | |
| 74 return fdr_local | |
| 75 | |
| 76 | |
| 77 def _get_scaling(settings): | |
| 78 scaling = float(settings.get("scaling", "2.0")) | |
| 79 return scaling | |
| 80 | |
| 81 | |
| 82 def _non_linear_fit(aEst, bEst, cEst, all_hits, decoy_hits, sigma, scaling=2): | |
| 83 guess = [aEst, bEst, cEst] | |
| 84 | |
| 85 def f(a, b, c): | |
| 86 return c * (log(exp(b * (all_hits - a)) + 1) - log(exp(-b * a) + 1)) / b | |
| 87 | |
| 88 def fcn(p): | |
| 89 a = p[0] | |
| 90 b = p[1] | |
| 91 c = p[2] | |
| 92 return (decoy_hits - f(a, b, c)) / sigma | |
| 93 | |
| 94 solution = root(fcn, guess, method='lm') | |
| 95 a = solution.x[0] | |
| 96 b = solution.x[1] | |
| 97 c = solution.x[2] | |
| 98 resids = sum((decoy_hits - f(a, b, c)) ** 2) / len(all_hits) | |
| 99 return (a, b, c, resids) | |
| 100 | |
| 101 | |
| 102 def _compute_fdr_global_conservative(sorted_scores, accum_hits, accum_decoys, settings): | |
| 103 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings) | |
| 104 fdrs = {} | |
| 105 max_fdr = -1 | |
| 106 for score in sorted_scores: | |
| 107 raw_fdr = raw_fdrs[score] | |
| 108 if raw_fdr > max_fdr: | |
| 109 max_fdr = raw_fdr | |
| 110 fdrs[score] = max_fdr | |
| 111 return fdrs | |
| 112 | |
| 113 | |
| 114 def _compute_fdr_global_permissive(sorted_scores, accum_hits, accum_decoys, settings): | |
| 115 raw_fdrs = build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings) | |
| 116 fdrs = {} | |
| 117 index = len(sorted_scores) - 1 | |
| 118 min_fdr = 1 | |
| 119 while index >= 0: | |
| 120 score = sorted_scores[index] | |
| 121 raw_fdr = raw_fdrs[score] | |
| 122 if raw_fdr < min_fdr: | |
| 123 min_fdr = raw_fdr | |
| 124 fdrs[score] = min_fdr | |
| 125 index -= 1 | |
| 126 return fdrs | |
| 127 | |
| 128 | |
| 129 def build_raw_fdr_dict(sorted_scores, accum_hits, accum_decoys, settings): | |
| 130 scaling = _get_scaling(settings) | |
| 131 fdrs = {} | |
| 132 for score in sorted_scores: | |
| 133 fdrs[score] = (scaling * accum_decoys[score]) / accum_hits[score] | |
| 134 return fdrs | |
| 135 | |
| 136 | |
| 137 def __read_lines(input_file): | |
| 138 with open(input_file, 'r') as input: | |
| 139 for i, line in enumerate(input): | |
| 140 line = line.rstrip('\r\n') | |
| 141 yield line | |
| 142 | |
| 143 | |
| 144 def __read_entries(input_file, settings): | |
| 145 total_hits = 0 | |
| 146 for line in __read_lines(input_file): | |
| 147 if not line or line.startswith('#'): | |
| 148 continue | |
| 149 entry = Entry(line, settings, total_hits) | |
| 150 total_hits = total_hits + 1 | |
| 151 yield entry | |
| 152 | |
| 153 | |
| 154 class Entry(object): | |
| 155 | |
| 156 def __init__(self, line, settings, index): | |
| 157 self.settings = settings | |
| 158 line_parts = line.split(settings["separator"]) | |
| 159 self.identifier = line_parts[settings["identifiers_index"]] | |
| 160 if settings["score_column"]: | |
| 161 self.score = float(line_parts[settings["score_column"]]) | |
| 162 else: | |
| 163 self.score = index | |
| 164 | |
| 165 @property | |
| 166 def is_decoy(self): | |
| 167 return self.identifier.startswith(self.settings["decoy_prefix"]) | |
| 168 | |
| 169 | |
| 170 def _accum_decoys(input_file, settings): | |
| 171 hits_at_score = {} | |
| 172 decoys_at_score = {} | |
| 173 for entry in __read_entries(input_file, settings): | |
| 174 score = entry.score | |
| 175 score_total = hits_at_score.get(score, 0) + 1 | |
| 176 score_decoys = decoys_at_score.get(score, 0) + (1 if entry.is_decoy else 0) | |
| 177 hits_at_score[score] = score_total | |
| 178 decoys_at_score[score] = score_decoys | |
| 179 sorted_scores = sorted(hits_at_score, reverse=not settings["invert_score"]) | |
| 180 accum_hits = {} | |
| 181 accum_decoys = {} | |
| 182 accum_hit_count = 0 | |
| 183 accum_decoy_count = 0 | |
| 184 for score in sorted_scores: | |
| 185 accum_decoy_count += decoys_at_score[score] | |
| 186 accum_hit_count += hits_at_score[score] | |
| 187 accum_hits[score] = accum_hit_count | |
| 188 accum_decoys[score] = accum_decoy_count | |
| 189 return (sorted_scores, accum_hits, accum_decoys) | |
| 190 | |
| 191 | |
| 192 def _build_arrays(input_file, settings, sorted_scores, accum_hits, accum_decoys): | |
| 193 all_hits = [] | |
| 194 decoy_hits = [] | |
| 195 for entry in __read_entries(input_file, settings): | |
| 196 score = entry.score | |
| 197 all_hits.append(accum_hits[score]) | |
| 198 decoy_hits.append(accum_decoys[score]) | |
| 199 | |
| 200 return (all_hits, decoy_hits) | |
| 201 | |
| 202 | |
| 203 def run_script(): | |
| 204 parser = optparse.OptionParser() | |
| 205 parser.add_option("--input") | |
| 206 parser.add_option("--output") | |
| 207 parser.add_option("--decoy_prefix") | |
| 208 parser.add_option("--identifiers_column") | |
| 209 parser.add_option("--separator", default="TAB") | |
| 210 parser.add_option("--fdr_type", default="global_conservative") | |
| 211 parser.add_option("--scaling") | |
| 212 parser.add_option("--score_column", default=None) | |
| 213 # By default higher score is better. | |
| 214 parser.add_option("--invert_score", default=False, action="store_true") | |
| 215 | |
| 216 (options, args) = parser.parse_args() | |
| 217 decoy_prefix = options.decoy_prefix | |
| 218 identifiers_column = options.identifiers_column | |
| 219 score_column = options.score_column | |
| 220 separator = SEPARATORS[options.separator] | |
| 221 settings = {"decoy_prefix": decoy_prefix, | |
| 222 "identifiers_index": int(identifiers_column) - 1, | |
| 223 "fdr_type": options.fdr_type, | |
| 224 "separator": separator, | |
| 225 "scaling": options.scaling, | |
| 226 "invert_score": options.invert_score | |
| 227 } | |
| 228 if score_column: | |
| 229 settings["score_column"] = int(score_column) - 1 | |
| 230 else: | |
| 231 settings["score_column"] = None | |
| 232 # Assume data is descending, use index as score and invert. | |
| 233 settings["invert_score"] = True | |
| 234 with open(options.output, 'w') as output: | |
| 235 append_fdr(options.input, output, settings) | |
| 236 | |
| 237 | |
| 238 if __name__ == '__main__': | |
| 239 __main__() |
