annotate dbkit_merge.py @ 3:03e124ff7e26 draft

"planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
author guerler
date Wed, 16 Dec 2020 13:11:35 +0000
parents 81c7d4668a7e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
1 #! /usr/bin/env python3
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
2 import argparse
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
3 from os.path import getsize
3
03e124ff7e26 "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents: 2
diff changeset
4 from shutil import copyfile
2
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
5
3
03e124ff7e26 "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents: 2
diff changeset
6 from dbkit_package.DBKit import DBKit, writeEntry
2
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
7
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
8
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
def _read_entry_names(index_path):
    """Return the first whitespace-separated token of each non-blank line.

    Args:
        index_path: Path of an index file, read as text.

    Returns:
        A list of names in file order (duplicates are kept).
    """
    names = []
    with open(index_path, "r") as handle:
        for line in handle:
            fields = line.split()
            # A blank (or whitespace-only) line has no name; indexing it
            # would raise IndexError, so it is skipped instead.
            if fields:
                names.append(fields[0])
    return names


def main(args):
    """Merge two index/data file pairs into one output index/data pair.

    The pair whose index file is larger in bytes is copied verbatim to the
    output paths. Every entry of the other pair whose name is not already
    present in the copied index is then appended through
    ``DBKit.createFile`` and ``writeEntry``. Progress messages are written
    to the log file.

    Args:
        args: Namespace providing ``firstindex``, ``firstdata``,
            ``secondindex``, ``seconddata``, ``outputindex``,
            ``outputdatabase`` and ``log`` (all file paths).
    """
    # The context manager guarantees the log is flushed and closed even if
    # a copy, read or DBKit step below raises.
    with open(args.log, "w") as logFile:
        outputIndex = args.outputindex
        outputDatabase = args.outputdatabase

        # Use the larger index as the base so fewer entries need appending.
        if getsize(args.firstindex) > getsize(args.secondindex):
            firstIndex = args.firstindex
            firstData = args.firstdata
            secondIndex = args.secondindex
            secondData = args.seconddata
        else:
            firstIndex = args.secondindex
            firstData = args.seconddata
            secondIndex = args.firstindex
            secondData = args.firstdata

        # Seed the output with the base pair before appending anything.
        copyfile(firstIndex, outputIndex)
        copyfile(firstData, outputDatabase)

        # A set gives constant-time membership checks in the merge loop.
        firstEntries = set(_read_entry_names(firstIndex))
        logFile.write("Detected %s entries.\n" % len(firstEntries))
        secondEntries = _read_entry_names(secondIndex)

        # Scratch file handed from createFile to writeEntry; it lives in
        # the current working directory and is reused for every entry.
        fileName = "temp.dat"
        count = 0
        dbkit = DBKit(secondIndex, secondData)
        for secondKey in secondEntries:
            if secondKey not in firstEntries:
                # NOTE(review): presumably createFile extracts the entry
                # into fileName and writeEntry appends it to the output
                # pair -- confirm against dbkit_package.DBKit.
                dbkit.createFile(secondKey, fileName)
                writeEntry(secondKey, fileName, outputIndex, outputDatabase)
                count += 1
            else:
                logFile.write("Skipping existing entry %s.\n" % secondKey)
        logFile.write("Added %s entries.\n" % count)
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
48
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
49
81c7d4668a7e "planemo upload commit bd03b7888eab0b010acfc3affd38bf4d4e2bb1ef-dirty"
guerler
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: every option is a required file path and is
    # forwarded to main() as an argparse namespace.
    parser = argparse.ArgumentParser(description='DBKit - Merge database pair.')
    # (short flag, long flag, help text) for each required option, in the
    # order they are registered with the parser.
    required_options = (
        ('-fi', '--firstindex', 'First Index file'),
        ('-fd', '--firstdata', 'First Data file'),
        ('-si', '--secondindex', 'Second Index file'),
        ('-sd', '--seconddata', 'Second Data file'),
        ('-oi', '--outputindex', 'Output Index file'),
        ('-od', '--outputdatabase', 'Output Data file'),
        ('-log', '--log', 'Log file'),
    )
    for short_flag, long_flag, help_text in required_options:
        parser.add_argument(short_flag, long_flag, help=help_text, required=True)
    args = parser.parse_args()
    main(args)