Mercurial > repos > guerler > dbkit
diff dbkit_create.py @ 0:1914107cc967 draft
"planemo upload commit 3632646bec5edbe47e06c894e32bfd215b895555-dirty"
author | guerler |
---|---|
date | Wed, 25 Nov 2020 17:22:48 +0000 |
parents | |
children | 987e55ea29b8 |
line wrap: on
line diff
#! /usr/bin/env python3
"""DBKit - download a list of entries and merge them into one indexed file.

Every identifier from the list file is fetched from ``<url><identifier>``
into a temporary file and appended to a single database file.  A
tab-separated index file records ``identifier``, ``start offset`` and
``size in bytes`` for each entry that was stored.
"""
import argparse
import os
import shutil
import subprocess
# `system` is no longer called in this file; the import is retained so the
# module keeps exposing the same names as before.
from os import system  # noqa: F401
from os.path import getsize


def getIdentifiers(args):
    """Return the sorted, de-duplicated identifiers read from ``args.list``.

    The first whitespace-separated token of each non-blank line is taken as
    the identifier and then formatted in this order:

    * truncated to ``args.idlength`` characters when that value is > 0,
    * converted to lower/upper case when ``args.idcase`` is ``"lower"`` or
      ``"upper"`` (any other value leaves the case untouched),
    * suffixed with ``.<args.idextension>`` when an extension is given.

    Raises:
        ValueError: if ``args.idlength`` cannot be converted to an integer.
        OSError: if the list file cannot be opened.
    """
    # Convert once up front instead of once per line.
    idLength = int(args.idlength)
    entries = set()
    with open(args.list) as file:
        for line in file:
            columns = line.split()
            if not columns:
                # Blank or whitespace-only line: there is no identifier to
                # take, and indexing [0] would raise IndexError.
                continue
            entry = columns[0]
            if idLength > 0:
                entry = entry[:idLength]
            if args.idcase == "lower":
                entry = entry.lower()
            elif args.idcase == "upper":
                entry = entry.upper()
            if args.idextension is not None:
                entry = "%s.%s" % (entry, args.idextension)
            entries.add(entry)
    return sorted(entries)


def downloadEntry(url, destination):
    """Fetch ``url`` into ``destination`` with wget and return its size in bytes.

    A size of 0 is what the caller treats as "entry not found".  wget's exit
    status is deliberately not checked, so a failed download is reported
    through the resulting file size only.

    Raises:
        OSError: if the wget executable cannot be started or ``destination``
            does not exist after wget ran.
    """
    # An argument list (no shell) keeps identifiers, URLs and paths that
    # contain spaces or shell metacharacters from being re-interpreted.
    subprocess.run(["wget", "-q", "-O", destination, url], check=False)
    return getsize(destination)


def appendEntry(entryId, sourcePath, start, indexFile, databaseFile):
    """Append one downloaded file to the database and record it in the index.

    Args:
        entryId: identifier written to the first index column.
        sourcePath: path of the file whose bytes are appended.
        start: offset in the database at which this entry begins.
        indexFile: open text-mode handle of the index file.
        databaseFile: open binary-mode handle of the database file.

    Returns:
        The offset at which the next entry will begin.
    """
    size = getsize(sourcePath)
    indexFile.write("%s\t%d\t%d\n" % (entryId, start, size))
    with open(sourcePath, "rb") as source:
        shutil.copyfileobj(source, databaseFile)
    # Entries are stored back to back with no separator byte, so the next
    # entry starts exactly `size` bytes later (not `size + 1`).
    return start + size


def main(args):
    """Download every listed entry and build the database, index and log.

    Reads ``args.list``, ``args.url``, ``args.temp``, ``args.index``,
    ``args.database``, ``args.log`` and the identifier formatting options
    used by ``getIdentifiers``.  An existing database file is replaced.
    Entries whose download is empty are reported in the log and left out of
    the index and the database.
    """
    entries = getIdentifiers(args)
    tempFile = os.path.join(args.temp, "temp.pdb")
    if args.temp:
        os.makedirs(args.temp, exist_ok=True)
    # The context managers guarantee that all three outputs are flushed and
    # closed, also when a download or write raises.  Opening the database
    # with "wb" discards any previous content before entries are appended.
    with open(args.log, "w") as logFile, \
            open(args.index, "w") as indexFile, \
            open(args.database, "wb") as databaseFile:
        logFile.write("Found %s entries.\n" % len(entries))
        start = 0
        for entryId in entries:
            logFile.write("Loading %s.\n" % entryId)
            tempSize = downloadEntry("%s%s" % (args.url, entryId), tempFile)
            if tempSize == 0:
                logFile.write("Entry `%s` not found.\n" % entryId)
            else:
                start = appendEntry(entryId, tempFile, start, indexFile, databaseFile)
            # Flush per entry so progress is visible while the run is active.
            logFile.flush()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='DBKit - Download and Merge files into a single file.')
    parser.add_argument('-l', '--list', help='List of entries', required=True)
    parser.add_argument('-u', '--url', help='Source Url', required=True)
    parser.add_argument('-t', '--temp', help='temp', required=True)
    parser.add_argument('-il', '--idlength', help='Format Identifier Length (integer)', required=False, default="0")
    parser.add_argument('-ic', '--idcase', help='Format Identifier Case (lower, upper)', required=False, default=None)
    parser.add_argument('-ie', '--idextension', help='Format Identifier Extension', required=False, default=None)
    parser.add_argument('-o', '--index', help='Output Database Index', required=True)
    parser.add_argument('-d', '--database', help='Output Database', required=True)
    parser.add_argument('-g', '--log', help="Log file", required=True)
    args = parser.parse_args()
    main(args)