Mercurial > repos > guerler > springsuite
comparison spring_roc.py @ 34:b300ddbbf9d0 draft
"planemo upload commit 0410e2fadc4e9fc1df6010de7b3678154cbdfe62-dirty"
author | guerler |
---|---|
date | Tue, 24 Nov 2020 17:55:07 +0000 |
parents | 3071750405c9 |
children | 0bcc0a269916 |
comparison
equal
deleted
inserted
replaced
33:f115fbf3ac63 | 34:b300ddbbf9d0 |
---|---|
1 #! /usr/bin/env python | 1 #! /usr/bin/env python |
2 import argparse | 2 import argparse |
3 import math | 3 import math |
4 import random | 4 import random |
5 from os.path import isfile | 5 from os.path import isfile |
6 from datetime import datetime | |
7 | 6 |
8 from matplotlib import pyplot as plt | 7 from matplotlib import pyplot as plt |
9 | 8 |
10 | 9 |
11 def getIds(rawIds): | 10 def getIds(rawIds): |
205 | 204 |
206 # process prediction file | 205 # process prediction file |
207 print("Loading prediction file...") | 206 print("Loading prediction file...") |
208 prediction, _ = getReference(args.input, scoreCol=2) | 207 prediction, _ = getReference(args.input, scoreCol=2) |
209 | 208 |
210 # get subcellular locations from UniProt export | 209 # determine negative set |
211 locations = dict() | 210 print("Identifying non-interacting pairs...") |
212 if isfile(args.locations): | |
213 regions = list() | |
214 if args.regions: | |
215 regions = args.regions.split(",") | |
216 with open(args.locations) as locFile: | |
217 for line in locFile: | |
218 searchKey = "SUBCELLULAR LOCATION" | |
219 searchPos = line.find(searchKey) | |
220 if searchPos != -1: | |
221 uniId = line.split()[0] | |
222 locStart = searchPos + len(searchKey) + 1 | |
223 locId = line[locStart:].split()[0] | |
224 if regions: | |
225 if locId not in regions: | |
226 continue | |
227 if uniId in filterA or uniId in filterB: | |
228 locations[uniId] = locId | |
229 print("Found %d subcellular locations." % (len(list(locations.keys())))) | |
230 | |
231 # estimate background noise | |
232 print("Estimating background noise...") | |
233 negative = set() | 211 negative = set() |
234 filterAList = sorted(list(filterA)) | 212 if isfile(args.negative): |
235 filterBList = sorted(list(filterB)) | 213 # load from explicit file |
236 negativeRequired = positiveCount | 214 with open(args.negative) as file: |
237 random.seed(0) | 215 for line in file: |
238 totalAttempts = int(len(filterAList) * len(filterBList) / 2) | 216 cols = line.split() |
239 while totalAttempts > 0: | 217 nameA = cols[0] |
240 totalAttempts = totalAttempts - 1 | 218 nameB = cols[1] |
241 nameA = random.choice(filterAList) | 219 key = getKey(nameA, nameB) |
242 nameB = random.choice(filterBList) | 220 if key not in putative and key not in negative: |
243 if locations: | 221 negative.add(key) |
244 if nameA not in locations or nameB not in locations: | 222 else: |
245 continue | 223 # get subcellular locations from UniProt export |
246 if locations[nameA] == locations[nameB]: | 224 locations = dict() |
247 continue | 225 if isfile(args.locations): |
248 key = getKey(nameA, nameB) | 226 regions = list() |
249 if key not in putative and key not in negative: | 227 if args.regions: |
250 negative.add(key) | 228 regions = args.regions.split(",") |
251 negativeRequired = negativeRequired - 1 | 229 with open(args.locations) as locFile: |
252 if negativeRequired == 0: | 230 for line in locFile: |
253 break | 231 searchKey = "SUBCELLULAR LOCATION" |
232 searchPos = line.find(searchKey) | |
233 if searchPos != -1: | |
234 uniId = line.split()[0] | |
235 locStart = searchPos + len(searchKey) + 1 | |
236 locId = line[locStart:].split()[0] | |
237 if regions: | |
238 if locId not in regions: | |
239 continue | |
240 if uniId in filterA or uniId in filterB: | |
241 locations[uniId] = locId | |
242 print("Found %d subcellular locations." % (len(list(locations.keys())))) | |
243 | |
244 # randomly sample non-interacting pairs | |
245 filterAList = sorted(list(filterA)) | |
246 filterBList = sorted(list(filterB)) | |
247 negativeRequired = positiveCount | |
248 random.seed(0) | |
249 totalAttempts = int(len(filterAList) * len(filterBList) / 2) | |
250 while totalAttempts > 0: | |
251 totalAttempts = totalAttempts - 1 | |
252 nameA = random.choice(filterAList) | |
253 nameB = random.choice(filterBList) | |
254 if locations: | |
255 if nameA not in locations or nameB not in locations: | |
256 continue | |
257 if locations[nameA] == locations[nameB]: | |
258 continue | |
259 key = getKey(nameA, nameB) | |
260 if key not in putative and key not in negative: | |
261 negative.add(key) | |
262 negativeRequired = negativeRequired - 1 | |
263 if negativeRequired == 0: | |
264 break | |
254 | 265 |
255 # create plot | 266 # create plot |
256 print("Producing plot data...") | 267 print("Producing plot data...") |
257 print("Total count in prediction file: %d." % len(prediction)) | 268 print("Total count in prediction file: %d." % len(prediction)) |
258 print("Total count in positive file: %d." % len(positive)) | 269 print("Total count in positive file: %d." % len(positive)) |
269 plt.savefig(args.output, format="png") | 280 plt.savefig(args.output, format="png") |
270 | 281 |
271 | 282 |
272 if __name__ == "__main__": | 283 if __name__ == "__main__": |
273 parser = argparse.ArgumentParser(description='Create ROC plot.') | 284 parser = argparse.ArgumentParser(description='Create ROC plot.') |
274 parser.add_argument('-i', '--input', help='Input prediction file.', required=True) | 285 parser.add_argument('-i', '--input', help='Input prediction file (2-columns).', required=True) |
275 parser.add_argument('-b', '--biogrid', help='BioGRID interaction database file', required=True) | 286 parser.add_argument('-b', '--biogrid', help='BioGRID interaction database file', required=True) |
276 parser.add_argument('-l', '--locations', help='UniProt export table with subcellular locations', required=False) | 287 parser.add_argument('-l', '--locations', help='UniProt export table with subcellular locations', default="", required=False) |
277 parser.add_argument('-r', '--regions', help='Comma-separated regions', required=False) | 288 parser.add_argument('-r', '--regions', help='Comma-separated regions', required=False) |
289 parser.add_argument('-n', '--negative', help='Negative set (2-columns)', default="", required=False) | |
278 parser.add_argument('-e', '--experiment', help='Type (physical/genetic)', default="", required=False) | 290 parser.add_argument('-e', '--experiment', help='Type (physical/genetic)', default="", required=False) |
279 parser.add_argument('-t', '--throughput', help='Throughput (low/high)', default="", required=False) | 291 parser.add_argument('-t', '--throughput', help='Throughput (low/high)', default="", required=False) |
280 parser.add_argument('-m', '--method', help='Method e.g. Two-hybrid', default="", required=False) | 292 parser.add_argument('-m', '--method', help='Method e.g. Two-hybrid', default="", required=False) |
281 parser.add_argument('-o', '--output', help='Output (png)', required=True) | 293 parser.add_argument('-o', '--output', help='Output (png)', required=True) |
282 args = parser.parse_args() | 294 args = parser.parse_args() |