Mercurial > repos > shellac > guppy_basecaller
annotate env/lib/python3.7/site-packages/cwltool/provenance.py @ 3:758bc20232e8 draft
"planemo upload commit 2a0fe2cc28b09e101d37293e53e82f61762262ec"
author | shellac |
---|---|
date | Thu, 14 May 2020 16:20:52 -0400 |
parents | 26e78fe6e8c4 |
children |
rev | line source |
---|---|
0
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1 """Stores Research Object including provenance.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
2 from __future__ import absolute_import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
3 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
4 import copy |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
5 import datetime |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
6 import hashlib |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
7 import logging |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
8 import os |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
9 import re |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
10 import shutil |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
11 import tempfile |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
12 import uuid |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
13 from collections import OrderedDict |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
14 from getpass import getuser |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
15 from io import BytesIO, FileIO, TextIOWrapper, open |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
16 from socket import getfqdn |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
17 from typing import (IO, Any, Callable, Dict, List, Generator, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
18 MutableMapping, Optional, Set, Tuple, Union, cast) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
19 from types import ModuleType |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
20 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
21 import prov.model as provM |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
22 import six |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
23 from prov.identifier import Identifier, Namespace |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
24 from prov.model import (PROV, ProvActivity, # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
25 ProvDocument, ProvEntity) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
26 from pathlib2 import Path, PurePosixPath, PurePath |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
27 from ruamel import yaml |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
28 from schema_salad.sourceline import SourceLine |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
29 from six.moves import urllib |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
30 from typing_extensions import (TYPE_CHECKING, # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
31 Text) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
32 # move to a regular typing import when Python 3.3-3.6 is no longer supported |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
33 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
34 from .context import RuntimeContext # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
35 from .errors import WorkflowException |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
36 from .loghandler import _logger |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
37 from .pathmapper import get_listing |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
38 from .process import Process, shortname # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
39 from .stdfsaccess import StdFsAccess # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
40 from .utils import json_dumps, versionstring, onWindows |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
41 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
42 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
43 # imports needed for retrieving user data |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
44 if onWindows(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
45 import ctypes # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
46 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
47 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
48 import pwd # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
49 except ImportError: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
50 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
51 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
52 if TYPE_CHECKING: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
53 from .command_line_tool import CommandLineTool, ExpressionTool # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
54 from .workflow import Workflow # pylint: disable=unused-import |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
55 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
56 if six.PY2: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
57 class PermissionError(OSError): # pylint: disable=redefined-builtin |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
58 """Needed for Python2.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
59 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
60 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
61 __citation__ = "https://doi.org/10.5281/zenodo.1208477" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
62 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
63 # NOTE: Semantic versioning of the CWLProv Research Object |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
64 # **and** the cwlprov files |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
65 # |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
66 # Rough guide (major.minor.patch): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
67 # 1. Bump major number if removing/"breaking" resources or PROV statements |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
68 # 2. Bump minor number if adding resources or PROV statements |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
69 # 3. Bump patch number for non-breaking non-adding changes, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
70 # e.g. fixing broken relative paths |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
71 CWLPROV_VERSION = "https://w3id.org/cwl/prov/0.6.0" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
72 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
73 # Research Object folders |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
74 METADATA = "metadata" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
75 DATA = "data" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
76 WORKFLOW = "workflow" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
77 SNAPSHOT = "snapshot" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
78 # sub-folders |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
79 MAIN = os.path.join(WORKFLOW, "main") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
80 PROVENANCE = os.path.join(METADATA, "provenance") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
81 LOGS = os.path.join(METADATA, "logs") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
82 WFDESC = Namespace("wfdesc", 'http://purl.org/wf4ever/wfdesc#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
83 WFPROV = Namespace("wfprov", 'http://purl.org/wf4ever/wfprov#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
84 WF4EVER = Namespace("wf4ever", 'http://purl.org/wf4ever/wf4ever#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
85 RO = Namespace("ro", 'http://purl.org/wf4ever/ro#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
86 ORE = Namespace("ore", 'http://www.openarchives.org/ore/terms/') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
87 FOAF = Namespace("foaf", 'http://xmlns.com/foaf/0.1/') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
88 SCHEMA = Namespace("schema", 'http://schema.org/') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
89 CWLPROV = Namespace('cwlprov', 'https://w3id.org/cwl/prov#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
90 ORCID = Namespace("orcid", "https://orcid.org/") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
91 UUID = Namespace("id", "urn:uuid:") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
92 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
93 # BagIt and YAML always use UTF-8 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
94 ENCODING = "UTF-8" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
95 TEXT_PLAIN = 'text/plain; charset="%s"' % ENCODING |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
96 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
97 # sha1, compatible with the File type's "checksum" field |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
98 # e.g. "checksum" = "sha1$47a013e660d408619d894b20806b1d5086aab03b" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
99 # See ./cwltool/schemas/v1.0/Process.yml |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
100 Hasher = hashlib.sha1 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
101 SHA1 = "sha1" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
102 SHA256 = "sha256" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
103 SHA512 = "sha512" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
104 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
105 # TODO: Better identifiers for user, at least |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
106 # these should be preserved in ~/.config/cwl for every execution |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
107 # on this host |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
108 USER_UUID = uuid.uuid4().urn |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
109 ACCOUNT_UUID = uuid.uuid4().urn |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
110 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
111 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
112 def _posix_path(local_path): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
113 # type: (Text) -> Text |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
114 return str(PurePosixPath(Path(local_path))) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
115 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
116 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
117 def _local_path(posix_path): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
118 # type: (Text) -> Text |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
119 return str(Path(posix_path)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
120 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
121 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
122 def _whoami(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
123 # type: () -> Tuple[Text,Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
124 """Return the current operating system account as (username, fullname).""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
125 username = getuser() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
126 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
127 if onWindows(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
128 get_user_name = ctypes.windll.secur32.GetUserNameExW # type: ignore |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
129 size = ctypes.pointer(ctypes.c_ulong(0)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
130 get_user_name(3, None, size) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
131 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
132 name_buffer = ctypes.create_unicode_buffer(size.contents.value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
133 get_user_name(3, name_buffer, size) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
134 fullname = str(name_buffer.value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
135 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
136 fullname = pwd.getpwuid(os.getuid())[4].split(',')[0] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
137 except (KeyError, IndexError): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
138 fullname = username |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
139 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
140 return (username, fullname) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
141 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
142 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
143 class WritableBagFile(FileIO): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
144 """Writes files in research object.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
145 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
146 def __init__(self, research_object, rel_path): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
147 # type: (ResearchObject, Text) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
148 """Initialize an ROBagIt.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
149 self.research_object = research_object |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
150 if Path(rel_path).is_absolute(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
151 raise ValueError("rel_path must be relative: %s" % rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
152 self.rel_path = rel_path |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
153 self.hashes = {SHA1: hashlib.sha1(), # nosec |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
154 SHA256: hashlib.sha256(), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
155 SHA512: hashlib.sha512()} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
156 # Open file in Research Object folder |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
157 path = os.path.abspath(os.path.join(research_object.folder, _local_path(rel_path))) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
158 if not path.startswith(os.path.abspath(research_object.folder)): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
159 raise ValueError("Path is outside Research Object: %s" % path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
160 super(WritableBagFile, self).__init__(str(path), mode="w") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
161 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
162 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
163 def write(self, b): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
164 # type: (Union[bytes, Text]) -> int |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
165 if isinstance(b, bytes): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
166 real_b = b |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
167 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
168 real_b = b.encode('utf-8') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
169 total = 0 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
170 length = len(real_b) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
171 while total < length: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
172 ret = super(WritableBagFile, self).write(real_b) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
173 if ret: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
174 total += ret |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
175 for _ in self.hashes.values(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
176 _.update(real_b) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
177 return total |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
178 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
179 def close(self): # type: () -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
180 # FIXME: Convert below block to a ResearchObject method? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
181 if self.rel_path.startswith("data/"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
182 self.research_object.bagged_size[self.rel_path] = self.tell() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
183 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
184 self.research_object.tagfiles.add(self.rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
185 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
186 super(WritableBagFile, self).close() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
187 # { "sha1": "f572d396fae9206628714fb2ce00f72e94f2258f" } |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
188 checksums = {} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
189 for name in self.hashes: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
190 checksums[name] = self.hashes[name].hexdigest().lower() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
191 self.research_object.add_to_manifest(self.rel_path, checksums) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
192 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
193 # To simplify our hash calculation we won't support |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
194 # seeking, reading or truncating, as we can't do |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
195 # similar seeks in the current hash. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
196 # TODO: Support these? At the expense of invalidating |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
197 # the current hash, then having to recalculate at close() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
198 def seekable(self): # type: () -> bool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
199 return False |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
200 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
201 def readable(self): # type: () -> bool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
202 return False |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
203 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
204 def truncate(self, size=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
205 # type: (Optional[int]) -> int |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
206 # FIXME: This breaks contract IOBase, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
207 # as it means we would have to recalculate the hash |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
208 if size is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
209 raise IOError("WritableBagFile can't truncate") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
210 return self.tell() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
211 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
212 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
213 def _check_mod_11_2(numeric_string): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
214 # type: (Text) -> bool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
215 """ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
216 Validate numeric_string for its MOD-11-2 checksum. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
217 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
218 Any "-" in the numeric_string are ignored. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
219 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
220 The last digit of numeric_string is assumed to be the checksum, 0-9 or X. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
221 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
222 See ISO/IEC 7064:2003 and |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
223 https://support.orcid.org/knowledgebase/articles/116780-structure-of-the-orcid-identifier |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
224 """ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
225 # Strip - |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
226 nums = numeric_string.replace("-", "") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
227 total = 0 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
228 # skip last (check)digit |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
229 for num in nums[:-1]: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
230 digit = int(num) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
231 total = (total+digit)*2 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
232 remainder = total % 11 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
233 result = (12-remainder) % 11 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
234 if result == 10: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
235 checkdigit = "X" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
236 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
237 checkdigit = str(result) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
238 # Compare against last digit or X |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
239 return nums[-1].upper() == checkdigit |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
240 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
241 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
242 def _valid_orcid(orcid): # type: (Optional[Text]) -> Text |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
243 """ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
244 Ensure orcid is a valid ORCID identifier. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
245 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
246 The string must be equivalent to one of these forms: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
247 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
248 0000-0002-1825-0097 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
249 orcid.org/0000-0002-1825-0097 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
250 http://orcid.org/0000-0002-1825-0097 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
251 https://orcid.org/0000-0002-1825-0097 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
252 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
253 If the ORCID number or prefix is invalid, a ValueError is raised. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
254 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
255 The returned ORCID string is always in the form of: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
256 https://orcid.org/0000-0002-1825-0097 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
257 """ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
258 if orcid is None or not orcid: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
259 raise ValueError(u'ORCID cannot be unspecified') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
260 # Liberal in what we consume, e.g. ORCID.org/0000-0002-1825-009x |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
261 orcid = orcid.lower() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
262 match = re.match( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
263 # Note: concatinated r"" r"" below so we can add comments to pattern |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
264 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
265 # Optional hostname, with or without protocol |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
266 r"(http://orcid\.org/|https://orcid\.org/|orcid\.org/)?" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
267 # alternative pattern, but probably messier |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
268 # r"^((https?://)?orcid.org/)?" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
269 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
270 # ORCID number is always 4x4 numerical digits, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
271 # but last digit (modulus 11 checksum) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
272 # can also be X (but we made it lowercase above). |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
273 # e.g. 0000-0002-1825-0097 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
274 # or 0000-0002-1694-233x |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
275 r"(?P<orcid>(\d{4}-\d{4}-\d{4}-\d{3}[0-9x]))$", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
276 orcid) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
277 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
278 help_url = u"https://support.orcid.org/knowledgebase/articles/"\ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
279 "116780-structure-of-the-orcid-identifier" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
280 if not match: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
281 raise ValueError(u"Invalid ORCID: %s\n%s" % (orcid, help_url)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
282 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
283 # Conservative in what we produce: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
284 # a) Ensure any checksum digit is uppercase |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
285 orcid_num = match.group("orcid").upper() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
286 # b) ..and correct |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
287 if not _check_mod_11_2(orcid_num): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
288 raise ValueError( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
289 u"Invalid ORCID checksum: %s\n%s" % (orcid_num, help_url)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
290 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
291 # c) Re-add the official prefix https://orcid.org/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
292 return u"https://orcid.org/%s" % orcid_num |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
293 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
294 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
295 class ProvenanceProfile(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
296 """ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
297 Provenance profile. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
298 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
299 Populated as the workflow runs. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
300 """ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
301 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
302 def __init__(self, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
303 research_object, # type: ResearchObject |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
304 full_name, # type: str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
305 host_provenance, # type: bool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
306 user_provenance, # type: bool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
307 orcid, # type: str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
308 fsaccess, # type: StdFsAccess |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
309 run_uuid=None # type: Optional[uuid.UUID] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
310 ): # type: (...) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
311 """Initialize the provenance profile.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
312 self.fsaccess = fsaccess |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
313 self.orcid = orcid |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
314 self.research_object = research_object |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
315 self.folder = self.research_object.folder |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
316 self.document = ProvDocument() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
317 self.host_provenance = host_provenance |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
318 self.user_provenance = user_provenance |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
319 self.engine_uuid = research_object.engine_uuid |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
320 self.add_to_manifest = self.research_object.add_to_manifest |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
321 if self.orcid: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
322 _logger.debug(u"[provenance] Creator ORCID: %s", self.orcid) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
323 self.full_name = full_name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
324 if self.full_name: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
325 _logger.debug(u"[provenance] Creator Full name: %s", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
326 self.full_name) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
327 if run_uuid is None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
328 run_uuid = uuid.uuid4() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
329 self.workflow_run_uuid = run_uuid |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
330 self.workflow_run_uri = run_uuid.urn |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
331 self.generate_prov_doc() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
332 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
333 def __str__(self): # type: () -> str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
334 """Represent this Provenvance profile as a string.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
335 return "ProvenanceProfile <%s> in <%s>" % ( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
336 self.workflow_run_uri, self.research_object) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
337 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
338 def generate_prov_doc(self): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
339 # type: () -> Tuple[str, ProvDocument] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
340 """Add basic namespaces.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
341 def host_provenance(document): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
342 # type: (ProvDocument) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
343 """Record host provenance.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
344 document.add_namespace(CWLPROV) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
345 document.add_namespace(UUID) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
346 document.add_namespace(FOAF) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
347 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
348 hostname = getfqdn() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
349 # won't have a foaf:accountServiceHomepage for unix hosts, but |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
350 # we can at least provide hostname |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
351 document.agent( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
352 ACCOUNT_UUID, {provM.PROV_TYPE: FOAF["OnlineAccount"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
353 "prov:location": hostname, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
354 CWLPROV["hostname"]: hostname}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
355 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
356 self.cwltool_version = "cwltool %s" % versionstring().split()[-1] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
357 self.document.add_namespace( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
358 'wfprov', 'http://purl.org/wf4ever/wfprov#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
359 # document.add_namespace('prov', 'http://www.w3.org/ns/prov#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
360 self.document.add_namespace( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
361 'wfdesc', 'http://purl.org/wf4ever/wfdesc#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
362 # TODO: Make this ontology. For now only has cwlprov:image |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
363 self.document.add_namespace('cwlprov', 'https://w3id.org/cwl/prov#') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
364 self.document.add_namespace('foaf', 'http://xmlns.com/foaf/0.1/') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
365 self.document.add_namespace('schema', 'http://schema.org/') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
366 self.document.add_namespace('orcid', 'https://orcid.org/') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
367 self.document.add_namespace('id', 'urn:uuid:') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
368 # NOTE: Internet draft expired 2004-03-04 (!) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
369 # https://tools.ietf.org/html/draft-thiemann-hash-urn-01 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
370 # TODO: Change to nih:sha-256; hashes |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
371 # https://tools.ietf.org/html/rfc6920#section-7 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
372 self.document.add_namespace('data', 'urn:hash::sha1:') |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
373 # Also needed for docker images |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
374 self.document.add_namespace(SHA256, "nih:sha-256;") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
375 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
376 # info only, won't really be used by prov as sub-resources use / |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
377 self.document.add_namespace( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
378 'researchobject', self.research_object.base_uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
379 # annotations |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
380 self.metadata_ns = self.document.add_namespace( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
381 'metadata', self.research_object.base_uri + METADATA + "/") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
382 # Pre-register provenance directory so we can refer to its files |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
383 self.provenance_ns = self.document.add_namespace( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
384 'provenance', self.research_object.base_uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
385 + _posix_path(PROVENANCE) + "/") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
386 ro_identifier_workflow = self.research_object.base_uri \ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
387 + "workflow/packed.cwl#" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
388 self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
389 ro_identifier_input = self.research_object.base_uri \ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
390 + "workflow/primary-job.json#" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
391 self.document.add_namespace("input", ro_identifier_input) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
392 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
393 # More info about the account (e.g. username, fullname) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
394 # may or may not have been previously logged by user_provenance() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
395 # .. but we always know cwltool was launched (directly or indirectly) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
396 # by a user account, as cwltool is a command line tool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
397 account = self.document.agent(ACCOUNT_UUID) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
398 if self.orcid or self.full_name: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
399 person = {provM.PROV_TYPE: PROV["Person"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
400 "prov:type": SCHEMA["Person"]} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
401 if self.full_name: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
402 person["prov:label"] = self.full_name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
403 person["foaf:name"] = self.full_name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
404 person["schema:name"] = self.full_name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
405 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
406 # TODO: Look up name from ORCID API? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
407 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
408 agent = self.document.agent(self.orcid or uuid.uuid4().urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
409 person) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
410 self.document.actedOnBehalfOf(account, agent) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
411 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
412 if self.host_provenance: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
413 host_provenance(self.document) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
414 if self.user_provenance: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
415 self.research_object.user_provenance(self.document) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
416 # The execution of cwltool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
417 wfengine = self.document.agent( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
418 self.engine_uuid, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
419 {provM.PROV_TYPE: PROV["SoftwareAgent"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
420 "prov:type": WFPROV["WorkflowEngine"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
421 "prov:label": self.cwltool_version}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
422 # FIXME: This datetime will be a bit too delayed, we should |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
423 # capture when cwltool.py earliest started? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
424 self.document.wasStartedBy( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
425 wfengine, None, account, datetime.datetime.now()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
426 # define workflow run level activity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
427 self.document.activity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
428 self.workflow_run_uri, datetime.datetime.now(), None, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
429 {provM.PROV_TYPE: WFPROV["WorkflowRun"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
430 "prov:label": "Run of workflow/packed.cwl#main"}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
431 # association between SoftwareAgent and WorkflowRun |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
432 main_workflow = "wf:main" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
433 self.document.wasAssociatedWith( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
434 self.workflow_run_uri, self.engine_uuid, main_workflow) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
435 self.document.wasStartedBy( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
436 self.workflow_run_uri, None, self.engine_uuid, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
437 datetime.datetime.now()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
438 return (self.workflow_run_uri, self.document) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
439 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
440 def evaluate(self, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
441 process, # type: Process |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
442 job, # type: Any |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
443 job_order_object, # type: Dict[Text, Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
444 research_obj # type: ResearchObject |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
445 ): # type: (...) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
446 """Evaluate the nature of job.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
447 if not hasattr(process, "steps"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
448 # record provenance of independent commandline tool executions |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
449 self.prospective_prov(job) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
450 customised_job = copy_job_order(job, job_order_object) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
451 self.used_artefacts(customised_job, self.workflow_run_uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
452 research_obj.create_job(customised_job, job) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
453 elif hasattr(job, "workflow"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
454 # record provenance of workflow executions |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
455 self.prospective_prov(job) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
456 customised_job = copy_job_order(job, job_order_object) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
457 self.used_artefacts(customised_job, self.workflow_run_uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
458 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
459 def record_process_start(self, process, job, process_run_id=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
460 # type: (Process, Any, Optional[str]) -> Optional[str] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
461 if not hasattr(process, 'steps'): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
462 process_run_id = self.workflow_run_uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
463 elif not hasattr(job, 'workflow'): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
464 # commandline tool execution as part of workflow |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
465 name = str(job.name) if hasattr(job, 'name') else '' |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
466 process_name = urllib.parse.quote(name, safe=":/,#") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
467 process_run_id = self.start_process(process_name, datetime.datetime.now()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
468 return process_run_id |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
469 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
470 def start_process(self, process_name, when, process_run_id=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
471 # type: (Text, datetime.datetime, Optional[str]) -> str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
472 """Record the start of each Process.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
473 if process_run_id is None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
474 process_run_id = uuid.uuid4().urn |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
475 prov_label = "Run of workflow/packed.cwl#main/" + process_name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
476 self.document.activity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
477 process_run_id, None, None, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
478 {provM.PROV_TYPE: WFPROV["ProcessRun"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
479 provM.PROV_LABEL: prov_label}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
480 self.document.wasAssociatedWith( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
481 process_run_id, self.engine_uuid, str("wf:main/" + process_name)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
482 self.document.wasStartedBy( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
483 process_run_id, None, self.workflow_run_uri, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
484 when, None, None) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
485 return process_run_id |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
486 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
487 def record_process_end(self, process_name, process_run_id, outputs, when): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
488 # type: (Text, str, Any, datetime.datetime) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
489 self.generate_output_prov(outputs, process_run_id, process_name) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
490 self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
491 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
492 def declare_file(self, value): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
493 # type: (MutableMapping[Text, Any]) -> Tuple[ProvEntity, ProvEntity, str] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
494 if value["class"] != "File": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
495 raise ValueError("Must have class:File: %s" % value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
496 # Need to determine file hash aka RO filename |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
497 entity = None # type: Optional[ProvEntity] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
498 checksum = None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
499 if 'checksum' in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
500 csum = value['checksum'] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
501 (method, checksum) = csum.split("$", 1) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
502 if method == SHA1 and \ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
503 self.research_object.has_data_file(checksum): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
504 entity = self.document.entity("data:" + checksum) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
505 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
506 if not entity and 'location' in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
507 location = str(value['location']) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
508 # If we made it here, we'll have to add it to the RO |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
509 with self.fsaccess.open(location, "rb") as fhandle: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
510 relative_path = self.research_object.add_data_file(fhandle) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
511 # FIXME: This naively relies on add_data_file setting hash as filename |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
512 checksum = PurePath(relative_path).name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
513 entity = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
514 "data:" + checksum, {provM.PROV_TYPE: WFPROV["Artifact"]}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
515 if "checksum" not in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
516 value["checksum"] = "%s$%s" % (SHA1, checksum) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
517 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
518 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
519 if not entity and 'contents' in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
520 # Anonymous file, add content as string |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
521 entity, checksum = self.declare_string(value["contents"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
522 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
523 # By here one of them should have worked! |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
524 if not entity or not checksum: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
525 raise ValueError("class:File but missing checksum/location/content: %r" % value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
526 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
527 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
528 # Track filename and extension, this is generally useful only for |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
529 # secondaryFiles. Note that multiple uses of a file might thus record |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
530 # different names for the same entity, so we'll |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
531 # make/track a specialized entity by UUID |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
532 file_id = value.setdefault("@id", uuid.uuid4().urn) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
533 # A specialized entity that has just these names |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
534 file_entity = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
535 file_id, [(provM.PROV_TYPE, WFPROV["Artifact"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
536 (provM.PROV_TYPE, WF4EVER["File"])]) # type: ProvEntity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
537 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
538 if "basename" in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
539 file_entity.add_attributes({CWLPROV["basename"]: value["basename"]}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
540 if "nameroot" in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
541 file_entity.add_attributes({CWLPROV["nameroot"]: value["nameroot"]}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
542 if "nameext" in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
543 file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
544 self.document.specializationOf(file_entity, entity) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
545 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
546 # Check for secondaries |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
547 for sec in value.get("secondaryFiles", ()): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
548 # TODO: Record these in a specializationOf entity with UUID? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
549 if sec['class'] == "File": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
550 (sec_entity, _, _) = self.declare_file(sec) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
551 elif sec['class'] == "Directory": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
552 sec_entity = self.declare_directory(sec) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
553 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
554 raise ValueError("Got unexpected secondaryFiles value: {}".format(sec)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
555 # We don't know how/when/where the secondary file was generated, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
556 # but CWL convention is a kind of summary/index derived |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
557 # from the original file. As its generally in a different format |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
558 # then prov:Quotation is not appropriate. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
559 self.document.derivation( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
560 sec_entity, file_entity, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
561 other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
562 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
563 return file_entity, entity, checksum |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
564 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
565 def declare_directory(self, value): # type: (MutableMapping[Text, Any]) -> ProvEntity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
566 """Register any nested files/directories.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
567 # FIXME: Calculate a hash-like identifier for directory |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
568 # so we get same value if it's the same filenames/hashes |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
569 # in a different location. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
570 # For now, mint a new UUID to identify this directory, but |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
571 # attempt to keep it inside the value dictionary |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
572 dir_id = value.setdefault("@id", uuid.uuid4().urn) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
573 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
574 # New annotation file to keep the ORE Folder listing |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
575 ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
576 dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
577 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
578 coll = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
579 dir_id, [(provM.PROV_TYPE, WFPROV["Artifact"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
580 (provM.PROV_TYPE, PROV["Collection"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
581 (provM.PROV_TYPE, PROV["Dictionary"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
582 (provM.PROV_TYPE, RO["Folder"])]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
583 # ORE description of ro:Folder, saved separately |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
584 coll_b = dir_bundle.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
585 dir_id, [(provM.PROV_TYPE, RO["Folder"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
586 (provM.PROV_TYPE, ORE["Aggregation"])]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
587 self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
588 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
589 # dir_manifest = dir_bundle.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
590 # dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
591 # ORE["describes"]: coll_b.identifier}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
592 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
593 coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
594 coll_b_attribs = [] # type: List[Tuple[Identifier, ProvEntity]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
595 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
596 # FIXME: .listing might not be populated yet - hopefully |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
597 # a later call to this method will sort that |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
598 is_empty = True |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
599 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
600 if "listing" not in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
601 get_listing(self.fsaccess, value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
602 for entry in value.get("listing", []): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
603 is_empty = False |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
604 # Declare child-artifacts |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
605 entity = self.declare_artefact(entry) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
606 self.document.membership(coll, entity) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
607 # Membership relation aka our ORE Proxy |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
608 m_id = uuid.uuid4().urn |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
609 m_entity = self.document.entity(m_id) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
610 m_b = dir_bundle.entity(m_id) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
611 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
612 # PROV-O style Dictionary |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
613 # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
614 # ..as prov.py do not currently allow PROV-N extensions |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
615 # like hadDictionaryMember(..) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
616 m_entity.add_asserted_type(PROV["KeyEntityPair"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
617 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
618 m_entity.add_attributes({ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
619 PROV["pairKey"]: entry["basename"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
620 PROV["pairEntity"]: entity, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
621 }) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
622 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
623 # As well as a being a |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
624 # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
625 m_b.add_asserted_type(RO["FolderEntry"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
626 m_b.add_asserted_type(ORE["Proxy"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
627 m_b.add_attributes({ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
628 RO["entryName"]: entry["basename"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
629 ORE["proxyIn"]: coll, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
630 ORE["proxyFor"]: entity, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
631 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
632 }) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
633 coll_attribs.append((PROV["hadDictionaryMember"], m_entity)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
634 coll_b_attribs.append((ORE["aggregates"], m_b)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
635 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
636 coll.add_attributes(coll_attribs) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
637 coll_b.add_attributes(coll_b_attribs) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
638 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
639 # Also Save ORE Folder as annotation metadata |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
640 ore_doc = ProvDocument() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
641 ore_doc.add_namespace(ORE) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
642 ore_doc.add_namespace(RO) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
643 ore_doc.add_namespace(UUID) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
644 ore_doc.add_bundle(dir_bundle) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
645 ore_doc = ore_doc.flattened() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
646 ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
647 with self.research_object.write_bag_file(ore_doc_path) as provenance_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
648 ore_doc.serialize(provenance_file, format="rdf", rdf_format="turtle") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
649 self.research_object.add_annotation(dir_id, [ore_doc_fn], ORE["isDescribedBy"].uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
650 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
651 if is_empty: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
652 # Empty directory |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
653 coll.add_asserted_type(PROV["EmptyCollection"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
654 coll.add_asserted_type(PROV["EmptyDictionary"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
655 self.research_object.add_uri(coll.identifier.uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
656 return coll |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
657 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
658 def declare_string(self, value): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
659 # type: (Union[Text, str]) -> Tuple[ProvEntity,Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
660 """Save as string in UTF-8.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
661 byte_s = BytesIO(str(value).encode(ENCODING)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
662 data_file = self.research_object.add_data_file(byte_s, content_type=TEXT_PLAIN) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
663 checksum = PurePosixPath(data_file).name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
664 # FIXME: Don't naively assume add_data_file uses hash in filename! |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
665 data_id = "data:%s" % PurePosixPath(data_file).stem |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
666 entity = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
667 data_id, {provM.PROV_TYPE: WFPROV["Artifact"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
668 provM.PROV_VALUE: str(value)}) # type: ProvEntity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
669 return entity, checksum |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
670 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
671 def declare_artefact(self, value): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
672 # type: (Any) -> ProvEntity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
673 """Create data artefact entities for all file objects.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
674 if value is None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
675 # FIXME: If this can happen in CWL, we'll |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
676 # need a better way to represent this in PROV |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
677 return self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
678 CWLPROV["None"], {provM.PROV_LABEL: "None"}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
679 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
680 if isinstance(value, (bool, int, float)): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
681 # Typically used in job documents for flags |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
682 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
683 # FIXME: Make consistent hash URIs for these |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
684 # that somehow include the type |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
685 # (so "1" != 1 != "1.0" != true) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
686 entity = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
687 uuid.uuid4().urn, {provM.PROV_VALUE: value}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
688 self.research_object.add_uri(entity.identifier.uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
689 return entity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
690 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
691 if isinstance(value, (Text, str)): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
692 (entity, _) = self.declare_string(value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
693 return entity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
694 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
695 if isinstance(value, bytes): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
696 # If we got here then we must be in Python 3 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
697 byte_s = BytesIO(value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
698 data_file = self.research_object.add_data_file(byte_s) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
699 # FIXME: Don't naively assume add_data_file uses hash in filename! |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
700 data_id = "data:%s" % PurePosixPath(data_file).stem |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
701 return self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
702 data_id, {provM.PROV_TYPE: WFPROV["Artifact"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
703 provM.PROV_VALUE: str(value)}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
704 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
705 if isinstance(value, MutableMapping): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
706 if "@id" in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
707 # Already processed this value, but it might not be in this PROV |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
708 entities = self.document.get_record(value["@id"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
709 if entities: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
710 return entities[0] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
711 # else, unknown in PROV, re-add below as if it's fresh |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
712 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
713 # Base case - we found a File we need to update |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
714 if value.get("class") == "File": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
715 (entity, _, _) = self.declare_file(value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
716 value["@id"] = entity.identifier.uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
717 return entity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
718 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
719 if value.get("class") == "Directory": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
720 entity = self.declare_directory(value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
721 value["@id"] = entity.identifier.uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
722 return entity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
723 coll_id = value.setdefault("@id", uuid.uuid4().urn) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
724 # some other kind of dictionary? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
725 # TODO: also Save as JSON |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
726 coll = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
727 coll_id, [(provM.PROV_TYPE, WFPROV["Artifact"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
728 (provM.PROV_TYPE, PROV["Collection"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
729 (provM.PROV_TYPE, PROV["Dictionary"])]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
730 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
731 if value.get("class"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
732 _logger.warning("Unknown data class %s.", value["class"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
733 # FIXME: The class might be "http://example.com/somethingelse" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
734 coll.add_asserted_type(CWLPROV[value["class"]]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
735 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
736 # Let's iterate and recurse |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
737 coll_attribs = [] # type: List[Tuple[Identifier, ProvEntity]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
738 for (key, val) in value.items(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
739 v_ent = self.declare_artefact(val) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
740 self.document.membership(coll, v_ent) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
741 m_entity = self.document.entity(uuid.uuid4().urn) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
742 # Note: only support PROV-O style dictionary |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
743 # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
744 # as prov.py do not easily allow PROV-N extensions |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
745 m_entity.add_asserted_type(PROV["KeyEntityPair"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
746 m_entity.add_attributes({ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
747 PROV["pairKey"]: str(key), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
748 PROV["pairEntity"]: v_ent |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
749 }) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
750 coll_attribs.append((PROV["hadDictionaryMember"], m_entity)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
751 coll.add_attributes(coll_attribs) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
752 self.research_object.add_uri(coll.identifier.uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
753 return coll |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
754 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
755 # some other kind of Collection? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
756 # TODO: also save as JSON |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
757 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
758 members = [] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
759 for each_input_obj in iter(value): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
760 # Recurse and register any nested objects |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
761 e = self.declare_artefact(each_input_obj) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
762 members.append(e) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
763 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
764 # If we reached this, then we were allowed to iterate |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
765 coll = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
766 uuid.uuid4().urn, [(provM.PROV_TYPE, WFPROV["Artifact"]), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
767 (provM.PROV_TYPE, PROV["Collection"])]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
768 if not members: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
769 coll.add_asserted_type(PROV["EmptyCollection"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
770 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
771 for member in members: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
772 # FIXME: This won't preserve order, for that |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
773 # we would need to use PROV.Dictionary |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
774 # with numeric keys |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
775 self.document.membership(coll, member) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
776 self.research_object.add_uri(coll.identifier.uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
777 # FIXME: list value does not support adding "@id" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
778 return coll |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
779 except TypeError: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
780 _logger.warning("Unrecognized type %s of %r", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
781 type(value), value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
782 # Let's just fall back to Python repr() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
783 entity = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
784 uuid.uuid4().urn, {provM.PROV_LABEL: repr(value)}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
785 self.research_object.add_uri(entity.identifier.uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
786 return entity |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
787 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
788 def used_artefacts(self, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
789 job_order, # type: Union[Dict[Any, Any], List[Dict[Any, Any]]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
790 process_run_id, # type: str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
791 name=None # type: Optional[str] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
792 ): # type: (...) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
793 """Add used() for each data artefact.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
794 if isinstance(job_order, list): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
795 for entry in job_order: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
796 self.used_artefacts(entry, process_run_id, name) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
797 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
798 # FIXME: Use workflow name in packed.cwl, "main" is wrong for nested workflows |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
799 base = "main" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
800 if name is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
801 base += "/" + name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
802 for key, value in job_order.items(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
803 prov_role = self.wf_ns["%s/%s" % (base, key)] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
804 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
805 entity = self.declare_artefact(value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
806 self.document.used( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
807 process_run_id, entity, datetime.datetime.now(), None, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
808 {"prov:role": prov_role}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
809 except OSError: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
810 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
811 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
812 def generate_output_prov(self, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
813 final_output, # type: Union[Dict[Text, Any], List[Dict[Text, Any]]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
814 process_run_id, # type: Optional[str] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
815 name # type: Optional[Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
816 ): # type: (...) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
817 """Call wasGeneratedBy() for each output,copy the files into the RO.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
818 if isinstance(final_output, list): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
819 for entry in final_output: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
820 self.generate_output_prov(entry, process_run_id, name) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
821 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
822 # Timestamp should be created at the earliest |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
823 timestamp = datetime.datetime.now() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
824 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
825 # For each output, find/register the corresponding |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
826 # entity (UUID) and document it as generated in |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
827 # a role corresponding to the output |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
828 for output, value in final_output.items(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
829 entity = self.declare_artefact(value) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
830 if name is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
831 name = urllib.parse.quote(str(name), safe=":/,#") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
832 # FIXME: Probably not "main" in nested workflows |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
833 role = self.wf_ns["main/%s/%s" % (name, output)] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
834 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
835 role = self.wf_ns["main/%s" % output] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
836 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
837 if not process_run_id: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
838 process_run_id = self.workflow_run_uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
839 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
840 self.document.wasGeneratedBy( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
841 entity, process_run_id, timestamp, None, {"prov:role": role}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
842 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
843 def prospective_prov(self, job): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
844 # type: (Any) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
845 """Create prospective prov recording as wfdesc prov:Plan.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
846 if not hasattr(job, "steps"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
847 # direct command line tool execution |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
848 self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
849 "wf:main", {provM.PROV_TYPE: WFDESC["Process"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
850 "prov:type": PROV["Plan"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
851 "prov:label":"Prospective provenance"}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
852 return |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
853 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
854 self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
855 "wf:main", {provM.PROV_TYPE: WFDESC["Workflow"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
856 "prov:type": PROV["Plan"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
857 "prov:label":"Prospective provenance"}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
858 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
859 for step in job.steps: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
860 stepnametemp = "wf:main/" + str(step.name)[5:] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
861 stepname = urllib.parse.quote(stepnametemp, safe=":/,#") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
862 step = self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
863 stepname, {provM.PROV_TYPE: WFDESC["Process"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
864 "prov:type": PROV["Plan"]}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
865 self.document.entity( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
866 "wf:main", {"wfdesc:hasSubProcess": step, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
867 "prov:label": "Prospective provenance"}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
868 # TODO: Declare roles/parameters as well |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
869 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
870 def activity_has_provenance(self, activity, prov_ids): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
871 # type: (str, List[Identifier]) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
872 """Add http://www.w3.org/TR/prov-aq/ relations to nested PROV files.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
873 # NOTE: The below will only work if the corresponding metadata/provenance arcp URI |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
874 # is a pre-registered namespace in the PROV Document |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
875 attribs = [(PROV["has_provenance"], prov_id) for prov_id in prov_ids] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
876 self.document.activity(activity, other_attributes=attribs) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
877 # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
878 # as prov:mentionOf() is only for entities, not activities |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
879 uris = [i.uri for i in prov_ids] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
880 self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
881 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
882 def finalize_prov_profile(self, name): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
883 # type: (Optional[Text]) -> List[Identifier] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
884 """Transfer the provenance related files to the RO.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
885 # NOTE: Relative posix path |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
886 if name is None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
887 # master workflow, fixed filenames |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
888 filename = "primary.cwlprov" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
889 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
890 # ASCII-friendly filename, avoiding % as we don't want %2520 in manifest.json |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
891 wf_name = urllib.parse.quote(str(name), safe="").replace("%", "_") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
892 # Note that the above could cause overlaps for similarly named |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
893 # workflows, but that's OK as we'll also include run uuid |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
894 # which also covers thhe case of this step being run in |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
895 # multiple places or iterations |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
896 filename = "%s.%s.cwlprov" % (wf_name, self.workflow_run_uuid) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
897 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
898 basename = str(PurePosixPath(PROVENANCE)/filename) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
899 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
900 # TODO: Also support other profiles than CWLProv, e.g. ProvOne |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
901 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
902 # list of prov identifiers of provenance files |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
903 prov_ids = [] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
904 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
905 # https://www.w3.org/TR/prov-xml/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
906 with self.research_object.write_bag_file(basename + ".xml") as provenance_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
907 self.document.serialize(provenance_file, format="xml", indent=4) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
908 prov_ids.append(self.provenance_ns[filename + ".xml"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
909 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
910 # https://www.w3.org/TR/prov-n/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
911 with self.research_object.write_bag_file(basename + ".provn") as provenance_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
912 self.document.serialize(provenance_file, format="provn", indent=2) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
913 prov_ids.append(self.provenance_ns[filename + ".provn"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
914 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
915 # https://www.w3.org/Submission/prov-json/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
916 with self.research_object.write_bag_file(basename + ".json") as provenance_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
917 self.document.serialize(provenance_file, format="json", indent=2) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
918 prov_ids.append(self.provenance_ns[filename + ".json"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
919 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
920 # "rdf" aka https://www.w3.org/TR/prov-o/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
921 # which can be serialized to ttl/nt/jsonld (and more!) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
922 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
923 # https://www.w3.org/TR/turtle/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
924 with self.research_object.write_bag_file(basename + ".ttl") as provenance_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
925 self.document.serialize(provenance_file, format="rdf", rdf_format="turtle") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
926 prov_ids.append(self.provenance_ns[filename + ".ttl"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
927 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
928 # https://www.w3.org/TR/n-triples/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
929 with self.research_object.write_bag_file(basename + ".nt") as provenance_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
930 self.document.serialize(provenance_file, format="rdf", rdf_format="ntriples") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
931 prov_ids.append(self.provenance_ns[filename + ".nt"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
932 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
933 # https://www.w3.org/TR/json-ld/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
934 # TODO: Use a nice JSON-LD context |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
935 # see also https://eprints.soton.ac.uk/395985/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
936 # 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
937 with self.research_object.write_bag_file(basename + ".jsonld") as provenance_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
938 self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
939 prov_ids.append(self.provenance_ns[filename + ".jsonld"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
940 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
941 _logger.debug(u"[provenance] added provenance: %s", prov_ids) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
942 return prov_ids |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
943 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
944 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
945 class ResearchObject(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
946 """CWLProv Research Object.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
947 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
948 def __init__(self, fsaccess, temp_prefix_ro="tmp", orcid='', full_name=''): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
949 # type: (StdFsAccess, str, Text, Text) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
950 """Initialize the ResearchObject.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
951 self.temp_prefix = temp_prefix_ro |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
952 self.orcid = '' if not orcid else _valid_orcid(orcid) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
953 self.full_name = full_name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
954 tmp_dir, tmp_prefix = os.path.split(temp_prefix_ro) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
955 self.folder = os.path.abspath(tempfile.mkdtemp(prefix=tmp_prefix, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
956 dir=tmp_dir)) # type: Text |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
957 self.closed = False |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
958 # map of filename "data/de/alsdklkas": 12398123 bytes |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
959 self.bagged_size = {} # type: Dict[Text, int] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
960 self.tagfiles = set() # type: Set[Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
961 self._file_provenance = {} # type: Dict[Text, Dict[Text, Text]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
962 self._external_aggregates = [] # type: List[Dict[Text, Text]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
963 self.annotations = [] # type: List[Dict[Text, Any]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
964 self._content_types = {} # type: Dict[Text,str] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
965 self.fsaccess = fsaccess |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
966 # These should be replaced by generate_prov_doc when workflow/run IDs are known: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
967 self.engine_uuid = "urn:uuid:%s" % uuid.uuid4() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
968 self.ro_uuid = uuid.uuid4() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
969 self.base_uri = "arcp://uuid,%s/" % self.ro_uuid |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
970 self.cwltool_version = "cwltool %s" % versionstring().split()[-1] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
971 ## |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
972 self.relativised_input_object = {} # type: Dict[Any, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
973 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
974 self._initialize() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
975 _logger.debug(u"[provenance] Temporary research object: %s", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
976 self.folder) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
977 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
978 def self_check(self): # type: () -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
979 """Raise ValueError if this RO is closed.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
980 if self.closed: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
981 raise ValueError( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
982 "This ResearchObject has already been closed and is not " |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
983 "available for futher manipulation.") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
984 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
985 def __str__(self): # type: () -> str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
986 """Represent this RO as a string.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
987 return "ResearchObject <{}> in <{}>".format(self.ro_uuid, self.folder) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
988 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
989 def _initialize(self): # type: () -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
990 for research_obj_folder in (METADATA, DATA, WORKFLOW, SNAPSHOT, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
991 PROVENANCE, LOGS): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
992 os.makedirs(os.path.join(self.folder, research_obj_folder)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
993 self._initialize_bagit() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
994 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
995 def _initialize_bagit(self): # type: () -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
996 """Write fixed bagit header.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
997 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
998 bagit = os.path.join(self.folder, "bagit.txt") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
999 # encoding: always UTF-8 (although ASCII would suffice here) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1000 # newline: ensure LF also on Windows |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1001 with open(bagit, "w", encoding=ENCODING, newline='\n') as bag_it_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1002 # TODO: \n or \r\n ? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1003 bag_it_file.write(u"BagIt-Version: 0.97\n") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1004 bag_it_file.write(u"Tag-File-Character-Encoding: %s\n" % ENCODING) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1005 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1006 def open_log_file_for_activity(self, uuid_uri): # type: (Text) -> WritableBagFile |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1007 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1008 # Ensure valid UUID for safe filenames |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1009 activity_uuid = uuid.UUID(uuid_uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1010 if activity_uuid.urn == self.engine_uuid: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1011 # It's the engine aka cwltool! |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1012 name = "engine" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1013 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1014 name = "activity" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1015 p = os.path.join(LOGS, "{}.{}.txt".format(name, activity_uuid)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1016 _logger.debug("[provenance] Opening log file for %s: %s" % (name, p)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1017 self.add_annotation(activity_uuid.urn, [p], CWLPROV["log"].uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1018 return self.write_bag_file(p) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1019 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1020 def _finalize(self): # type: () -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1021 self._write_ro_manifest() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1022 self._write_bag_info() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1023 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1024 def user_provenance(self, document): # type: (ProvDocument) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1025 """Add the user provenance.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1026 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1027 (username, fullname) = _whoami() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1028 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1029 if not self.full_name: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1030 self.full_name = fullname |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1031 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1032 document.add_namespace(UUID) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1033 document.add_namespace(ORCID) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1034 document.add_namespace(FOAF) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1035 account = document.agent( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1036 ACCOUNT_UUID, {provM.PROV_TYPE: FOAF["OnlineAccount"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1037 "prov:label": username, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1038 FOAF["accountName"]: username}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1039 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1040 user = document.agent( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1041 self.orcid or USER_UUID, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1042 {provM.PROV_TYPE: PROV["Person"], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1043 "prov:label": self.full_name, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1044 FOAF["name"]: self.full_name, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1045 FOAF["account"]: account}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1046 # cwltool may be started on the shell (directly by user), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1047 # by shell script (indirectly by user) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1048 # or from a different program |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1049 # (which again is launched by any of the above) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1050 # |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1051 # We can't tell in which way, but ultimately we're still |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1052 # acting in behalf of that user (even if we might |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1053 # get their name wrong!) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1054 document.actedOnBehalfOf(account, user) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1055 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1056 def write_bag_file(self, path, encoding=ENCODING): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1057 # type: (Text, Optional[str]) -> WritableBagFile |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1058 """Write the bag file into our research object.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1059 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1060 # For some reason below throws BlockingIOError |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1061 #fp = BufferedWriter(WritableBagFile(self, path)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1062 bag_file = WritableBagFile(self, path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1063 if encoding is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1064 # encoding: match Tag-File-Character-Encoding: UTF-8 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1065 # newline: ensure LF also on Windows |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1066 return cast(WritableBagFile, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1067 TextIOWrapper(cast(IO[bytes], bag_file), encoding=encoding, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1068 newline="\n")) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1069 return bag_file |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1070 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1071 def add_tagfile(self, path, timestamp=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1072 # type: (Text, Optional[datetime.datetime]) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1073 """Add tag files to our research object.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1074 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1075 checksums = {} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1076 # Read file to calculate its checksum |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1077 if os.path.isdir(path): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1078 return |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1079 # FIXME: do the right thing for directories |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1080 with open(path, "rb") as tag_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1081 # FIXME: Should have more efficient open_tagfile() that |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1082 # does all checksums in one go while writing through, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1083 # adding checksums after closing. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1084 # Below probably OK for now as metadata files |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1085 # are not too large..? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1086 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1087 checksums[SHA1] = checksum_copy(tag_file, hasher=hashlib.sha1) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1088 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1089 tag_file.seek(0) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1090 checksums[SHA256] = checksum_copy(tag_file, hasher=hashlib.sha256) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1091 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1092 tag_file.seek(0) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1093 checksums[SHA512] = checksum_copy(tag_file, hasher=hashlib.sha512) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1094 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1095 rel_path = _posix_path(os.path.relpath(path, self.folder)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1096 self.tagfiles.add(rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1097 self.add_to_manifest(rel_path, checksums) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1098 if timestamp is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1099 self._file_provenance[rel_path] = {"createdOn": timestamp.isoformat()} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1100 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1101 def _ro_aggregates(self): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1102 # type: () -> List[Dict[Text, Any]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1103 """Gather dictionary of files to be added to the manifest.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1104 def guess_mediatype(rel_path): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1105 # type: (Text) -> Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1106 """Return the mediatypes.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1107 media_types = { |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1108 # Adapted from |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1109 # https://w3id.org/bundle/2014-11-05/#media-types |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1110 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1111 "txt": TEXT_PLAIN, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1112 "ttl": 'text/turtle; charset="UTF-8"', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1113 "rdf": 'application/rdf+xml', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1114 "json": 'application/json', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1115 "jsonld": 'application/ld+json', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1116 "xml": 'application/xml', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1117 ## |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1118 "cwl": 'text/x+yaml; charset="UTF-8"', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1119 "provn": 'text/provenance-notation; charset="UTF-8"', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1120 "nt": 'application/n-triples', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1121 } # type: Dict[Text, Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1122 conforms_to = { |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1123 "provn": 'http://www.w3.org/TR/2013/REC-prov-n-20130430/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1124 "cwl": 'https://w3id.org/cwl/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1125 } # type: Dict[Text, Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1126 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1127 prov_conforms_to = { |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1128 "provn": 'http://www.w3.org/TR/2013/REC-prov-n-20130430/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1129 "rdf": 'http://www.w3.org/TR/2013/REC-prov-o-20130430/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1130 "ttl": 'http://www.w3.org/TR/2013/REC-prov-o-20130430/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1131 "nt": 'http://www.w3.org/TR/2013/REC-prov-o-20130430/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1132 "jsonld": 'http://www.w3.org/TR/2013/REC-prov-o-20130430/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1133 "xml": 'http://www.w3.org/TR/2013/NOTE-prov-xml-20130430/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1134 "json": 'http://www.w3.org/Submission/2013/SUBM-prov-json-20130424/', |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1135 } # type: Dict[Text, Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1136 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1137 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1138 extension = rel_path.rsplit(".", 1)[-1].lower() # type: Optional[Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1139 if extension == rel_path: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1140 # No ".", no extension |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1141 extension = None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1142 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1143 local_aggregate = {} # type: Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1144 if extension in media_types: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1145 local_aggregate["mediatype"] = media_types[extension] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1146 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1147 if extension in conforms_to: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1148 # TODO: Open CWL file to read its declared "cwlVersion", e.g. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1149 # cwlVersion = "v1.0" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1150 local_aggregate["conformsTo"] = conforms_to[extension] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1151 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1152 if (rel_path.startswith(_posix_path(PROVENANCE)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1153 and extension in prov_conforms_to): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1154 if ".cwlprov" in rel_path: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1155 # Our own! |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1156 local_aggregate["conformsTo"] = [prov_conforms_to[extension], CWLPROV_VERSION] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1157 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1158 # Some other PROV |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1159 # TODO: Recognize ProvOne etc. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1160 local_aggregate["conformsTo"] = prov_conforms_to[extension] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1161 return local_aggregate |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1162 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1163 aggregates = [] # type: List[Dict[Text, Any]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1164 for path in self.bagged_size.keys(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1165 aggregate_dict = {} # type: Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1166 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1167 temp_path = PurePosixPath(path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1168 folder = temp_path.parent |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1169 filename = temp_path.name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1170 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1171 # NOTE: Here we end up aggregating the abstract |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1172 # data items by their sha1 hash, so that it matches |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1173 # the entity() in the prov files. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1174 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1175 # TODO: Change to nih:sha-256; hashes |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1176 # https://tools.ietf.org/html/rfc6920#section-7 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1177 aggregate_dict["uri"] = 'urn:hash::sha1:' + filename |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1178 aggregate_dict["bundledAs"] = { |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1179 # The arcp URI is suitable ORE proxy; local to this Research Object. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1180 # (as long as we don't also aggregate it by relative path!) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1181 "uri": self.base_uri + path, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1182 # relate it to the data/ path |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1183 "folder": "/%s/" % folder, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1184 "filename": filename, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1185 } |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1186 if path in self._file_provenance: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1187 # Made by workflow run, merge captured provenance |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1188 aggregate_dict["bundledAs"].update(self._file_provenance[path]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1189 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1190 # Probably made outside wf run, part of job object? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1191 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1192 if path in self._content_types: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1193 aggregate_dict["mediatype"] = self._content_types[path] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1194 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1195 aggregates.append(aggregate_dict) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1196 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1197 for path in self.tagfiles: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1198 if (not (path.startswith(METADATA) or path.startswith(WORKFLOW) or |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1199 path.startswith(SNAPSHOT))): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1200 # probably a bagit file |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1201 continue |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1202 if path == PurePosixPath(METADATA)/"manifest.json": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1203 # Should not really be there yet! But anyway, we won't |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1204 # aggregate it. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1205 continue |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1206 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1207 rel_aggregates = {} # type: Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1208 # These are local paths like metadata/provenance - but |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1209 # we need to relativize them for our current directory for |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1210 # as we are saved in metadata/manifest.json |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1211 uri = str(Path(os.pardir)/path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1212 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1213 rel_aggregates["uri"] = uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1214 rel_aggregates.update(guess_mediatype(path)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1215 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1216 if path in self._file_provenance: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1217 # Propagate file provenance (e.g. timestamp) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1218 rel_aggregates.update(self._file_provenance[path]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1219 elif not path.startswith(SNAPSHOT): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1220 # make new timestamp? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1221 rel_aggregates.update(self._self_made()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1222 aggregates.append(rel_aggregates) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1223 aggregates.extend(self._external_aggregates) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1224 return aggregates |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1225 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1226 def add_uri(self, uri, timestamp=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1227 # type: (str, Optional[datetime.datetime]) -> Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1228 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1229 aggr = self._self_made(timestamp=timestamp) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1230 aggr["uri"] = uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1231 self._external_aggregates.append(aggr) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1232 return aggr |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1233 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1234 def add_annotation(self, about, content, motivated_by="oa:describing"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1235 # type: (str, List[str], str) -> str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1236 """Cheap URI relativize for current directory and /.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1237 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1238 curr = self.base_uri + METADATA + "/" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1239 content = [c.replace(curr, "").replace(self.base_uri, "../") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1240 for c in content] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1241 uri = uuid.uuid4().urn |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1242 ann = { |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1243 u"uri": uri, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1244 u"about": about, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1245 u"content": content, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1246 u"oa:motivatedBy": {"@id": motivated_by} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1247 } |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1248 self.annotations.append(ann) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1249 return uri |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1250 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1251 def _ro_annotations(self): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1252 # type: () -> List[Dict[Text, Any]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1253 annotations = [] # type: List[Dict[Text, Any]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1254 annotations.append({ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1255 "uri": uuid.uuid4().urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1256 "about": self.ro_uuid.urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1257 "content": "/", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1258 # https://www.w3.org/TR/annotation-vocab/#named-individuals |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1259 "oa:motivatedBy": {"@id": "oa:describing"} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1260 }) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1261 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1262 # How was it run? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1263 # FIXME: Only primary* |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1264 prov_files = [str(PurePosixPath(p).relative_to(METADATA)) for p in self.tagfiles |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1265 if p.startswith(_posix_path(PROVENANCE)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1266 and "/primary." in p] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1267 annotations.append({ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1268 "uri": uuid.uuid4().urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1269 "about": self.ro_uuid.urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1270 "content": prov_files, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1271 # Modulation of https://www.w3.org/TR/prov-aq/ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1272 "oa:motivatedBy": {"@id": "http://www.w3.org/ns/prov#has_provenance"} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1273 }) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1274 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1275 # Where is the main workflow? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1276 annotations.append({ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1277 "uri": uuid.uuid4().urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1278 "about": str(PurePosixPath("..")/WORKFLOW/"packed.cwl"), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1279 "oa:motivatedBy": {"@id": "oa:highlighting"} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1280 }) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1281 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1282 annotations.append({ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1283 "uri": uuid.uuid4().urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1284 "about": self.ro_uuid.urn, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1285 "content": [str(PurePosixPath("..")/WORKFLOW/"packed.cwl"), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1286 str(PurePosixPath("..")/WORKFLOW/"primary-job.json")], |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1287 "oa:motivatedBy": {"@id": "oa:linking"} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1288 }) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1289 # Add user-added annotations at end |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1290 annotations.extend(self.annotations) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1291 return annotations |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1292 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1293 def _authored_by(self): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1294 # type: () -> Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1295 authored_by = {} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1296 if self.orcid: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1297 authored_by["orcid"] = self.orcid |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1298 if self.full_name: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1299 authored_by["name"] = self.full_name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1300 if not self.orcid: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1301 authored_by["uri"] = USER_UUID |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1302 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1303 if authored_by: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1304 return {"authoredBy": authored_by} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1305 return {} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1306 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1307 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1308 def _write_ro_manifest(self): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1309 # type: () -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1310 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1311 # Does not have to be this order, but it's nice to be consistent |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1312 manifest = OrderedDict() # type: Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1313 manifest["@context"] = [ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1314 {"@base": "%s%s/" % (self.base_uri, _posix_path(METADATA))}, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1315 "https://w3id.org/bundle/context" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1316 ] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1317 manifest["id"] = "/" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1318 manifest["conformsTo"] = CWLPROV_VERSION |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1319 filename = "manifest.json" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1320 manifest["manifest"] = filename |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1321 manifest.update(self._self_made()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1322 manifest.update(self._authored_by()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1323 manifest["aggregates"] = self._ro_aggregates() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1324 manifest["annotations"] = self._ro_annotations() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1325 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1326 json_manifest = json_dumps(manifest, indent=4, ensure_ascii=False) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1327 rel_path = str(PurePosixPath(METADATA)/filename) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1328 json_manifest += "\n" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1329 with self.write_bag_file(rel_path) as manifest_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1330 manifest_file.write(json_manifest) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1331 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1332 def _write_bag_info(self): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1333 # type: () -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1334 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1335 with self.write_bag_file("bag-info.txt") as info_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1336 info_file.write(u"Bag-Software-Agent: %s\n" % self.cwltool_version) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1337 # FIXME: require sha-512 of payload to comply with profile? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1338 # FIXME: Update profile |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1339 info_file.write(u"BagIt-Profile-Identifier: https://w3id.org/ro/bagit/profile\n") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1340 info_file.write(u"Bagging-Date: %s\n" % datetime.date.today().isoformat()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1341 info_file.write(u"External-Description: Research Object of CWL workflow run\n") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1342 if self.full_name: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1343 info_file.write(u"Contact-Name: %s\n" % self.full_name) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1344 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1345 # NOTE: We can't use the urn:uuid:{UUID} of the workflow run (a prov:Activity) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1346 # as identifier for the RO/bagit (a prov:Entity). However the arcp base URI is good. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1347 info_file.write(u"External-Identifier: %s\n" % self.base_uri) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1348 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1349 # Calculate size of data/ (assuming no external fetch.txt files) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1350 total_size = sum(self.bagged_size.values()) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1351 num_files = len(self.bagged_size) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1352 info_file.write(u"Payload-Oxum: %d.%d\n" % (total_size, num_files)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1353 _logger.debug(u"[provenance] Generated bagit metadata: %s", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1354 self.folder) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1355 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1356 def generate_snapshot(self, prov_dep): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1357 # type: (MutableMapping[Text, Any]) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1358 """Copy all of the CWL files to the snapshot/ directory.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1359 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1360 for key, value in prov_dep.items(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1361 if key == "location" and value.split("/")[-1]: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1362 filename = value.split("/")[-1] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1363 path = os.path.join(self.folder, SNAPSHOT, filename) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1364 filepath = '' |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1365 if "file://" in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1366 filepath = value[7:] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1367 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1368 filepath = value |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1369 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1370 # FIXME: What if destination path already exists? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1371 if os.path.exists(filepath): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1372 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1373 if os.path.isdir(filepath): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1374 shutil.copytree(filepath, path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1375 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1376 shutil.copy(filepath, path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1377 timestamp = datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1378 self.add_tagfile(path, timestamp) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1379 except PermissionError: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1380 pass # FIXME: avoids duplicate snapshotting; need better solution |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1381 elif key in ("secondaryFiles", "listing"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1382 for files in value: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1383 if isinstance(files, MutableMapping): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1384 self.generate_snapshot(files) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1385 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1386 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1387 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1388 def packed_workflow(self, packed): # type: (Text) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1389 """Pack CWL description to generate re-runnable CWL object in RO.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1390 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1391 rel_path = str(PurePosixPath(WORKFLOW)/"packed.cwl") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1392 # Write as binary |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1393 with self.write_bag_file(rel_path, encoding=None) as write_pack: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1394 # YAML is always UTF8, but json.dumps gives us str in py2 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1395 write_pack.write(packed.encode(ENCODING)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1396 _logger.debug(u"[provenance] Added packed workflow: %s", rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1397 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1398 def has_data_file(self, sha1hash): # type: (str) -> bool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1399 """Confirm the presence of the given file in the RO.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1400 folder = os.path.join(self.folder, DATA, sha1hash[0:2]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1401 hash_path = os.path.join(folder, sha1hash) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1402 return os.path.isfile(hash_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1403 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1404 def add_data_file(self, from_fp, timestamp=None, content_type=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1405 # type: (IO[Any], Optional[datetime.datetime], Optional[str]) -> Text |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1406 """Copy inputs to data/ folder.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1407 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1408 tmp_dir, tmp_prefix = os.path.split(self.temp_prefix) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1409 with tempfile.NamedTemporaryFile( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1410 prefix=tmp_prefix, dir=tmp_dir, delete=False) as tmp: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1411 checksum = checksum_copy(from_fp, tmp) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1412 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1413 # Calculate hash-based file path |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1414 folder = os.path.join(self.folder, DATA, checksum[0:2]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1415 path = os.path.join(folder, checksum) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1416 # os.rename assumed safe, as our temp file should |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1417 # be in same file system as our temp folder |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1418 if not os.path.isdir(folder): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1419 os.makedirs(folder) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1420 os.rename(tmp.name, path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1421 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1422 # Relative posix path |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1423 # (to avoid \ on Windows) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1424 rel_path = _posix_path(os.path.relpath(path, self.folder)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1425 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1426 # Register in bagit checksum |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1427 if Hasher == hashlib.sha1: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1428 self._add_to_bagit(rel_path, sha1=checksum) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1429 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1430 _logger.warning( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1431 u"[provenance] Unknown hash method %s for bagit manifest", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1432 Hasher) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1433 # Inefficient, bagit support need to checksum again |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1434 self._add_to_bagit(rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1435 _logger.debug(u"[provenance] Added data file %s", path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1436 if timestamp is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1437 self._file_provenance[rel_path] = self._self_made(timestamp) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1438 _logger.debug(u"[provenance] Relative path for data file %s", rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1439 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1440 if content_type is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1441 self._content_types[rel_path] = content_type |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1442 return rel_path |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1443 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1444 def _self_made(self, timestamp=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1445 # type: (Optional[datetime.datetime]) -> Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1446 if timestamp is None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1447 timestamp = datetime.datetime.now() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1448 return { |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1449 "createdOn": timestamp.isoformat(), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1450 "createdBy": {"uri": self.engine_uuid, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1451 "name": self.cwltool_version} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1452 } |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1453 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1454 def add_to_manifest(self, rel_path, checksums): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1455 # type: (Text, Dict[str,str]) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1456 """Add files to the research object manifest.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1457 self.self_check() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1458 if PurePosixPath(rel_path).is_absolute(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1459 raise ValueError("rel_path must be relative: %s" % rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1460 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1461 if os.path.commonprefix(["data/", rel_path]) == "data/": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1462 # payload file, go to manifest |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1463 manifest = "manifest" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1464 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1465 # metadata file, go to tag manifest |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1466 manifest = "tagmanifest" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1467 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1468 # Add checksums to corresponding manifest files |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1469 for (method, hash_value) in checksums.items(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1470 # File not in manifest because we bailed out on |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1471 # existence in bagged_size above |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1472 manifestpath = os.path.join( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1473 self.folder, "%s-%s.txt" % (manifest, method.lower())) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1474 # encoding: match Tag-File-Character-Encoding: UTF-8 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1475 # newline: ensure LF also on Windows |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1476 with open(manifestpath, "a", encoding=ENCODING, newline='\n') \ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1477 as checksum_file: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1478 line = u"%s %s\n" % (hash_value, rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1479 _logger.debug(u"[provenance] Added to %s: %s", manifestpath, line) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1480 checksum_file.write(line) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1481 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1482 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1483 def _add_to_bagit(self, rel_path, **checksums): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1484 # type: (Text, Any) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1485 if PurePosixPath(rel_path).is_absolute(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1486 raise ValueError("rel_path must be relative: %s" % rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1487 local_path = os.path.join(self.folder, _local_path(rel_path)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1488 if not os.path.exists(local_path): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1489 raise IOError("File %s does not exist within RO: %s" % (rel_path, local_path)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1490 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1491 if rel_path in self.bagged_size: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1492 # Already added, assume checksum OK |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1493 return |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1494 self.bagged_size[rel_path] = os.path.getsize(local_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1495 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1496 if SHA1 not in checksums: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1497 # ensure we always have sha1 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1498 checksums = dict(checksums) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1499 with open(local_path, "rb") as file_path: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1500 # FIXME: Need sha-256 / sha-512 as well for Research Object BagIt profile? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1501 checksums[SHA1] = checksum_copy(file_path, hasher=hashlib.sha1) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1502 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1503 self.add_to_manifest(rel_path, checksums) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1504 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1505 def create_job(self, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1506 builder_job, # type: Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1507 wf_job=None, # type: Optional[Callable[[Dict[Text, Text], Callable[[Any, Any], Any], RuntimeContext], Generator[Any, None, None]]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1508 is_output=False # type: bool |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1509 ): # type: (...) -> Dict[Text, Text] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1510 #TODO customise the file |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1511 """Generate the new job object with RO specific relative paths.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1512 copied = copy.deepcopy(builder_job) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1513 relativised_input_objecttemp = {} # type: Dict[Text, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1514 self._relativise_files(copied) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1515 def jdefault(o): # type: (Any) -> Dict[Any, Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1516 return dict(o) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1517 if is_output: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1518 rel_path = PurePosixPath(WORKFLOW)/"primary-output.json" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1519 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1520 rel_path = PurePosixPath(WORKFLOW)/"primary-job.json" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1521 j = json_dumps(copied, indent=4, ensure_ascii=False, default=jdefault) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1522 with self.write_bag_file(str(rel_path)) as file_path: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1523 file_path.write(j + u"\n") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1524 _logger.debug(u"[provenance] Generated customised job file: %s", |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1525 rel_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1526 # Generate dictionary with keys as workflow level input IDs and values |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1527 # as |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1528 # 1) for files the relativised location containing hash |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1529 # 2) for other attributes, the actual value. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1530 relativised_input_objecttemp = {} |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1531 for key, value in copied.items(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1532 if isinstance(value, MutableMapping): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1533 if value.get("class") in ("File", "Directory"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1534 relativised_input_objecttemp[key] = value |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1535 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1536 relativised_input_objecttemp[key] = value |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1537 self.relativised_input_object.update( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1538 {k: v for k, v in relativised_input_objecttemp.items() if v}) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1539 return self.relativised_input_object |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1540 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1541 def _relativise_files(self, structure): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1542 # type: (Dict[Any, Any]) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1543 """Save any file objects into the RO and update the local paths.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1544 # Base case - we found a File we need to update |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1545 _logger.debug(u"[provenance] Relativising: %s", structure) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1546 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1547 if isinstance(structure, MutableMapping): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1548 if structure.get("class") == "File": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1549 relative_path = None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1550 if "checksum" in structure: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1551 alg, checksum = structure["checksum"].split("$") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1552 if alg != SHA1: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1553 raise TypeError( |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1554 "Only SHA1 CWL checksums are currently supported: " |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1555 "{}".format(structure)) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1556 if self.has_data_file(checksum): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1557 prefix = checksum[0:2] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1558 relative_path = PurePosixPath("data")/prefix/checksum |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1559 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1560 if not relative_path is not None and "location" in structure: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1561 # Register in RO; but why was this not picked |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1562 # up by used_artefacts? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1563 _logger.info("[provenance] Adding to RO %s", structure["location"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1564 with self.fsaccess.open(structure["location"], "rb") as fp: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1565 relative_path = self.add_data_file(fp) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1566 checksum = PurePosixPath(relative_path).name |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1567 structure["checksum"] = "%s$%s" % (SHA1, checksum) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1568 if relative_path is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1569 # RO-relative path as new location |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1570 structure["location"] = str(PurePosixPath("..")/relative_path) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1571 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1572 _logger.warning("Could not determine RO path for file %s", structure) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1573 if "path" in structure: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1574 del structure["path"] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1575 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1576 if structure.get("class") == "Directory": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1577 # TODO: Generate anonymoys Directory with a "listing" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1578 # pointing to the hashed files |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1579 del structure["location"] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1580 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1581 for val in structure.values(): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1582 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1583 self._relativise_files(val) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1584 except OSError: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1585 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1586 return |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1587 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1588 if isinstance(structure, (str, Text)): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1589 # Just a string value, no need to iterate further |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1590 return |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1591 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1592 for obj in iter(structure): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1593 # Recurse and rewrite any nested File objects |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1594 self._relativise_files(obj) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1595 except TypeError: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1596 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1597 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1598 def close(self, save_to=None): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1599 # type: (Optional[str]) -> None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1600 """Close the Research Object, optionally saving to specified folder. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1601 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1602 Closing will remove any temporary files used by this research object. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1603 After calling this method, this ResearchObject instance can no longer |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1604 be used, except for no-op calls to .close(). |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1605 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1606 The 'saveTo' folder should not exist - if it does, it will be deleted. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1607 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1608 It is safe to call this function multiple times without the |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1609 'saveTo' argument, e.g. within a try..finally block to |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1610 ensure the temporary files of this Research Object are removed. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1611 """ |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1612 if save_to is None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1613 if not self.closed: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1614 _logger.debug(u"[provenance] Deleting temporary %s", self.folder) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1615 shutil.rmtree(self.folder, ignore_errors=True) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1616 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1617 save_to = os.path.abspath(save_to) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1618 _logger.info(u"[provenance] Finalizing Research Object") |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1619 self._finalize() # write manifest etc. |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1620 # TODO: Write as archive (.zip or .tar) based on extension? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1621 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1622 if os.path.isdir(save_to): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1623 _logger.info(u"[provenance] Deleting existing %s", save_to) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1624 shutil.rmtree(save_to) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1625 shutil.move(self.folder, save_to) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1626 _logger.info(u"[provenance] Research Object saved to %s", save_to) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1627 self.folder = save_to |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1628 self.closed = True |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1629 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1630 def checksum_copy(src_file, # type: IO[Any] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1631 dst_file=None, # type: Optional[IO[Any]] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1632 hasher=Hasher, # type: Callable[[], hashlib._Hash] |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1633 buffersize=1024*1024 # type: int |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1634 ): # type: (...) -> str |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1635 """Compute checksums while copying a file.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1636 # TODO: Use hashlib.new(Hasher_str) instead? |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1637 checksum = hasher() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1638 contents = src_file.read(buffersize) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1639 if dst_file and hasattr(dst_file, "name") and hasattr(src_file, "name"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1640 temp_location = os.path.join(os.path.dirname(dst_file.name), |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1641 str(uuid.uuid4())) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1642 try: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1643 os.rename(dst_file.name, temp_location) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1644 os.link(src_file.name, dst_file.name) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1645 dst_file = None |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1646 os.unlink(temp_location) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1647 except OSError: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1648 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1649 if os.path.exists(temp_location): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1650 os.rename(temp_location, dst_file.name) # type: ignore |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1651 while contents != b"": |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1652 if dst_file is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1653 dst_file.write(contents) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1654 checksum.update(contents) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1655 contents = src_file.read(buffersize) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1656 if dst_file is not None: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1657 dst_file.flush() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1658 return checksum.hexdigest().lower() |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1659 |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1660 def copy_job_order(job, job_order_object): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1661 # type: (Any, Any) -> Any |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1662 """Create copy of job object for provenance.""" |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1663 if not hasattr(job, "tool"): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1664 # direct command line tool execution |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1665 return job_order_object |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1666 customised_job = {} # new job object for RO |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1667 for each, i in enumerate(job.tool["inputs"]): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1668 with SourceLine(job.tool["inputs"], each, WorkflowException, |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1669 _logger.isEnabledFor(logging.DEBUG)): |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1670 iid = shortname(i["id"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1671 if iid in job_order_object: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1672 customised_job[iid] = copy.deepcopy(job_order_object[iid]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1673 # add the input element in dictionary for provenance |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1674 elif "default" in i: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1675 customised_job[iid] = copy.deepcopy(i["default"]) |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1676 # add the default elements in the dictionary for provenance |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1677 else: |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1678 pass |
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
shellac
parents:
diff
changeset
|
1679 return customised_job |