annotate spring_minz.py @ 26:124f82fbd986 draft

"planemo upload commit 7fe105e5874a3979f2c5ba7570f4f1ad0ec2559b-dirty"
author guerler
date Sat, 31 Oct 2020 22:21:11 +0000
parents 5d1ae615e4ec
children e34da554d415
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
1 #! /usr/bin/env python3
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
2 import argparse
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
3 import os
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
4
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
def _readNames(listFile):
    """Return the stripped, non-empty lines of `listFile` in file order."""
    with open(listFile) as handle:
        stripped = (line.strip() for line in handle)
        # Blank lines would otherwise become empty names that resolve to the
        # bare directory path instead of a result file.
        return [name for name in stripped if name]


def _readCrossReference(crossReferenceFile):
    """Map the first column of each line to the list of its last-column partners."""
    crossReference = dict()
    with open(crossReferenceFile) as handle:
        for line in handle:
            columns = line.split()
            if not columns:
                # A blank line has no columns; indexing it would raise IndexError.
                continue
            crossReference.setdefault(columns[0], []).append(columns[-1])
    return crossReference


def main(args):
    """Score target/input pairs and write them ranked by min-Z.

    Reads the following attributes of `args`:
      log            -- log file, opened in append mode
      targetlist     -- text file with one target name per line
      targetpath     -- directory holding one result file per target name
      inputlist      -- optional text file with one input name per line
      inputpath      -- directory for the input result files (read only when
                        `inputlist` is given)
      crossreference -- whitespace-separated index file; first column is the
                        core template, last column its partner
      minscore       -- score threshold forwarded to `matchScores`
      output         -- file receiving one tab-separated line per interaction:
                        target name, input name, min-Z and match details

    Without `inputlist` the targets are matched against themselves. With it,
    matching runs in both directions (targets vs. inputs, then inputs vs.
    targets); both passes share one `interactions` dictionary.
    """
    # The context manager closes the log even if a later step raises.
    with open(args.log, 'a+') as logFile:
        targetPath = args.targetpath.rstrip("/")
        targets = _readNames(args.targetlist)
        print ("Loaded %s target names from `%s`." % (len(targets), args.targetlist))
        if args.inputlist:
            inputPath = args.inputpath.rstrip("/")
            inputs = _readNames(args.inputlist)
            print ("Loaded %s input names from `%s`." % (len(inputs), args.inputlist))
        else:
            inputs = targets
            inputPath = targetPath
        crossReference = _readCrossReference(args.crossreference)
        print ("Loaded cross reference from `%s`." % args.crossreference)
        interactions = dict()
        for targetName in targets:
            matchScores(targetFile="%s/%s" % (targetPath, targetName),
                        targetName=targetName,
                        inputs=inputs,
                        inputPath=inputPath,
                        crossReference=crossReference,
                        minScore=args.minscore,
                        logFile=logFile,
                        interactions=interactions)
        if args.inputlist:
            # Second pass with the roles swapped: every input acts as the target.
            for inputName in inputs:
                matchScores(targetFile="%s/%s" % (inputPath, inputName),
                            targetName=inputName,
                            inputs=targets,
                            inputPath=targetPath,
                            crossReference=crossReference,
                            minScore=args.minscore,
                            logFile=logFile,
                            interactions=interactions)
        # Highest min-Z first.
        ranked = sorted(interactions.values(), key=lambda item: item["minZ"], reverse=True)
        with open(args.output, 'w') as output_file:
            for entry in ranked:
                output_file.write("%s\t%s\t%s\t%s\n" % (entry["targetName"], entry["inputName"], entry["minZ"], entry["minInfo"]))
17
c790d25086dc "planemo upload commit b0ede77caf410ab69043d33a44e190054024d340-dirty"
guerler
parents: 16
diff changeset
63
25
5d1ae615e4ec "planemo upload commit ff7d7e512b79436c3538078552983762330a920d-dirty"
guerler
parents: 23
diff changeset
def matchScores(targetFile, targetName, inputs, inputPath, crossReference, minScore, logFile, interactions):
    """Record the best cross-referenced score between one target and each input.

    Parameters:
      targetFile     -- path of the target's result file
      targetName     -- name stored for the target in recorded interactions
      inputs         -- iterable of input names; each file is `inputPath/name`
      inputPath      -- directory containing the input result files
      crossReference -- dict mapping a template id to a list of partner ids
      minScore       -- threshold handed to `getTemplateScores`
      logFile        -- writable file object; one line is written per
                        recorded interaction
      interactions   -- dict updated in place; values are dicts with the keys
                        `targetName`, `inputName`, `minZ` and `minInfo`

    For every input, each target hit `t` is paired with each partner `p` of
    `t` that is also an input hit. A pair scores `min(targetHits[t],
    inputHits[p])`, and the highest-scoring pair is kept. An existing entry
    for the same unordered name pair is replaced only by a strictly higher
    score.

    Returns None.
    """
    targetTop, targetHits = getTemplateScores(targetFile, minScore)
    if not targetHits:
        print("No targets found `%s`" % targetFile)
        return
    print ("Loaded target scores from `%s`." % targetFile)
    for inputName in inputs:
        inputFile = "%s/%s" % (inputPath, inputName)
        inputTop, inputHits = getTemplateScores(inputFile, minScore)
        # None means "no pair matched". Seeding with 0 recorded a bogus
        # interaction with empty details whenever minScore was negative, and
        # ignored valid scores between minScore and 0.
        minZ = None
        minInfo = ""
        for t, targetScore in targetHits.items():
            for p in crossReference.get(t, ()):
                if p not in inputHits:
                    continue
                score = min(targetScore, inputHits[p])
                if minZ is None or score > minZ:
                    minZ = score
                    minInfo = "%s\t%s\t%s\t%s" % (targetTop, inputTop, t, p)
        if minZ is None or not minZ > minScore:
            continue
        # Order-independent tuple key. The former "%s_%s" string collided for
        # names containing underscores: ("A_B", "C") and ("C_A", "B") both
        # produced "C_A_B".
        if targetName > inputName:
            interactionKey = (targetName, inputName)
        else:
            interactionKey = (inputName, targetName)
        known = interactions.get(interactionKey)
        if known is not None and known["minZ"] >= minZ:
            continue
        interactions[interactionKey] = dict(targetName=targetName, inputName=inputName, minZ=minZ, minInfo=minInfo)
        logFile.write("Interaction between %s and %s [min-Z: %s].\n" % (targetName, inputName, minZ))
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
94
25
5d1ae615e4ec "planemo upload commit ff7d7e512b79436c3538078552983762330a920d-dirty"
guerler
parents: 23
diff changeset
def getTemplateScores(hhrFile, minScore, identifierLength=6):
    """Collect the template hits of a result file that score above `minScore`.

    The first nine lines of the file are skipped. Every following line up to
    the first blank line is read as one hit: the template id is the
    `identifierLength` characters starting at column 4 (taken verbatim, not
    stripped), and the score is the number in columns 57-62.

    Parameters:
      hhrFile          -- path of the result file
      minScore         -- hits must score strictly above this value
      identifierLength -- width of the template id field (default 6)

    Returns:
      (topTemplate, result): `topTemplate` is the id of the first hit above
      the threshold, or None if there is none. `result` maps template id to
      score; a later line with the same id overwrites the earlier score.
      A path that is not an existing file yields `(None, {})`.

    Raises:
      ValueError -- if the score columns of a hit line are not a number; the
      message names the file and the 1-based line number.
    """
    result = dict()
    topTemplate = None
    if not os.path.isfile(hhrFile):
        return topTemplate, result
    idEnd = 4 + identifierLength
    with open(hhrFile) as file:
        for index, line in enumerate(file):
            if index <= 8:
                continue
            if not line.strip():
                # The first blank line ends the hit table.
                break
            templateId = line[4:idEnd]
            try:
                templateScore = float(line[57:63])
            except ValueError as error:
                # Same exception type as before, now saying where the bad
                # line is.
                raise ValueError("Malformed hit line %s in `%s`: %r" % (index + 1, hhrFile, line.rstrip("\n"))) from error
            if templateScore > minScore:
                if topTemplate is None:
                    topTemplate = templateId
                result[templateId] = templateScore
    return topTemplate, result
0
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
111
d30785e31577 "planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
guerler
parents:
diff changeset
# Command-line entry point: parse the options and hand them to main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='This script identifies interactions by detecting matching HH-search results.')
    parser.add_argument('-tl', '--targetlist', help='Text file containing identifiers.', required=True)
    parser.add_argument('-tp', '--targetpath', help='Directory containing `hhr` files', required=True)
    parser.add_argument('-il', '--inputlist', help='Text file containing identifiers.', required=False)
    parser.add_argument('-ip', '--inputpath', help='Directory containing `hhr` files', required=False)
    parser.add_argument('-c', '--crossreference', help='Cross Reference index file', required=True)
    parser.add_argument('-o', '--output', help='Output file containing min-Z scores', required=True)
    parser.add_argument('-l', '--log', help='Log file', required=True)
    # The threshold is compared against float scores, so fractional values
    # are accepted; integer arguments such as "10" still parse.
    parser.add_argument('-m', '--minscore', help='min-Z score threshold', type=float, default=10)
    args = parser.parse_args()
    # main() reads --inputpath whenever --inputlist is given. Report the
    # missing option as a usage error instead of failing on None.
    if args.inputlist and args.inputpath is None:
        parser.error("--inputpath is required when --inputlist is given")
    main(args)