Mercurial > repos > guerler > dbkit
annotate dbkit_create.py @ 1:987e55ea29b8 draft
"planemo upload commit ce9026535c3c6da5e97366a4f0b347b0ec572dbc-dirty"
author | guerler |
---|---|
date | Thu, 26 Nov 2020 11:05:51 +0000 |
parents | 1914107cc967 |
children | 81c7d4668a7e |
rev | line source |
---|---|
0
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
1 #! /usr/bin/env python3 |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
2 import argparse |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
3 from os import system |
1
987e55ea29b8
"planemo upload commit ce9026535c3c6da5e97366a4f0b347b0ec572dbc-dirty"
guerler
parents:
0
diff
changeset
|
4 from os.path import isfile, getsize |
0
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
5 |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
6 |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
def getIdentifiers(args):
    """Read the entry list and return the normalized, unique identifiers.

    The first whitespace-separated token of every non-blank line of
    ``args.list`` is taken as an identifier and then, in this order:

    1. truncated to ``args.idlength`` characters when that length is > 0,
    2. converted to the case named by ``args.idcase`` ("lower" or "upper";
       any other value leaves the case untouched),
    3. suffixed with ``.<args.idextension>`` when it is not ``None``.

    Args:
        args: Namespace providing ``list`` (path of the list file),
            ``idlength``, ``idcase`` and ``idextension``.

    Returns:
        A sorted list of the distinct identifiers.

    Raises:
        ValueError: if ``args.idlength`` cannot be converted to an integer.
    """
    # The length is the same for every line, so convert it only once.
    idLength = int(args.idlength)
    entries = set()
    with open(args.list) as listFile:
        for line in listFile:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line: there is no identifier,
                # and indexing tokens[0] would raise IndexError.
                continue
            entry = tokens[0]
            if idLength > 0:
                entry = entry[:idLength]
            if args.idcase == "lower":
                entry = entry.lower()
            elif args.idcase == "upper":
                entry = entry.upper()
            if args.idextension is not None:
                entry = "%s.%s" % (entry, args.idextension)
            entries.add(entry)
    return sorted(entries)
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
23 |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
24 |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
def main(args):
    """Merge all listed entries into one database file plus an offset index.

    For every identifier returned by ``getIdentifiers(args)`` the entry is
    either downloaded with ``wget`` from ``args.url + identifier`` into
    ``temp.dat`` (when ``args.url`` is set) or read from
    ``args.path/identifier``. Non-empty entries are appended to
    ``args.database`` and a line ``identifier<TAB>start<TAB>size`` is
    written to ``args.index``, where ``start`` is the byte offset of the
    entry inside the database. Progress and missing entries are reported
    in ``args.log``.

    Args:
        args: Namespace providing ``list``, ``idlength``, ``idcase``,
            ``idextension`` (consumed by ``getIdentifiers``) as well as
            ``url``, ``path``, ``index``, ``database`` and ``log``.

    Raises:
        ValueError: if neither ``args.url`` nor ``args.path`` is given.
    """
    # Function-scope imports: the module header does not provide these names.
    import shutil
    import subprocess
    from os import remove

    if not args.url and args.path is None:
        raise ValueError("Either a source url or a path to the files is required.")

    entries = getIdentifiers(args)
    outputDatabase = args.database

    # Entries are appended below, so a database left over from an earlier
    # run has to be removed first. A missing file is fine.
    try:
        remove(outputDatabase)
    except FileNotFoundError:
        pass

    # The context manager closes both files even when an entry fails.
    with open(args.log, "w") as logFile, open(args.index, "w") as indexFile:
        logFile.write("Found %s entries.\n" % len(entries))
        start = 0
        for entryId in entries:
            logFile.write("Loading %s.\n" % entryId)
            if args.url:
                fileName = "temp.dat"
                # Argument list instead of a shell string: identifiers and
                # urls are never interpreted by a shell. The exit status is
                # ignored; a failed download is detected below through the
                # missing or empty output file.
                subprocess.call(["wget", "-q", "-O", fileName, "%s%s" % (args.url, entryId)])
            else:
                fileName = "%s/%s" % (args.path.rstrip("/"), entryId)

            # A missing file and an empty file both mean "no data".
            size = getsize(fileName) if isfile(fileName) else 0
            if size == 0:
                logFile.write("Entry `%s` not found.\n" % entryId)
            else:
                indexFile.write("%s\t%d\t%d\n" % (entryId, start, size))
                start = start + size
                # Binary append keeps the byte offsets of the index exact.
                with open(fileName, "rb") as source, open(outputDatabase, "ab") as database:
                    shutil.copyfileobj(source, database)
            # Flush per entry so the log can be followed during long runs.
            logFile.flush()
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
52 |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
53 |
1914107cc967
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
guerler
parents:
diff
changeset
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, validate them and
    # build the database and its index.
    parser = argparse.ArgumentParser(description='DBKit - Download and Merge files into a single file.')
    parser.add_argument('-l', '--list', help='List of entries', required=True)
    parser.add_argument('-u', '--url', help='Source Url', required=False)
    parser.add_argument('-p', '--path', help='Path to files', required=False)
    # type=int rejects a non-numeric length with a usage message at parse
    # time instead of a traceback while the list is being read.
    parser.add_argument('-il', '--idlength', help='Format Identifier Length (integer)', required=False, type=int, default=0)
    parser.add_argument('-ic', '--idcase', help='Format Identifier Case (lower, upper)', required=False, default=None)
    parser.add_argument('-ie', '--idextension', help='Format Identifier Extension', required=False, default=None)
    parser.add_argument('-o', '--index', help='Output Database Index', required=True)
    parser.add_argument('-d', '--database', help='Output Database', required=True)
    parser.add_argument('-g', '--log', help="Log file", required=True)
    args = parser.parse_args()
    # The entries must come from somewhere: without a url the path is used,
    # so at least one of the two options has to be present.
    if not args.url and args.path is None:
        parser.error("one of the arguments -u/--url or -p/--path is required")
    main(args)