Source code for deriva.transfer.restore.deriva_restore

import io
import os
import sys
import copy
import json
import time
import logging
import datetime
import platform
from collections import OrderedDict
from bdbag import bdbag_api as bdb
from deriva.core import (get_credential, format_credential, urlquote, format_exception, read_config,
                         DEFAULT_SESSION_CONFIG, __version__ as VERSION)
from deriva.core.utils.version_utils import get_installed_version
from deriva.core.ermrest_model import Model
from deriva.core.deriva_server import DerivaServer
from deriva.core.ermrest_catalog import ErmrestCatalog, _clone_state_url as CLONE_STATE_URL
from deriva.core.hatrac_store import HatracStore
from deriva.transfer import DerivaUpload, DerivaUploadError, DerivaUploadConfigurationError, GenericUploader
from deriva.transfer.restore import DerivaRestoreError, DerivaRestoreConfigurationError, \
    DerivaRestoreAuthenticationError, DerivaRestoreAuthorizationError


class DerivaRestore:
    """
    Restore a DERIVA catalog from a bag archive or directory.
    Core restore logic re-purposed from ErmrestCatalog.clone_catalog().
    """

    RESTORE_STATE_URL = "tag:isrd.isi.edu,2019:restore-status"
    BASE_DATA_INPUT_PATH = os.path.join("records", "{}", "{}.json")
    BASE_ASSETS_INPUT_PATH = "assets"

    def __init__(self, *args, **kwargs):
        self.server_args = args[0]
        self.hostname = None
        self.dst_catalog = None
        self.cancelled = False
        self.input_path = kwargs.get("input_path")
        self.exclude_schemas = kwargs.get("exclude_schemas", list())
        self.exclude_data = kwargs.get("exclude_data", list())
        self.restore_data = not kwargs.get("no_data", False)
        self.data_chunk_size = kwargs.get("data_chunk_size", 10000)
        self.restore_annotations = not kwargs.get("no_annotations", False)
        self.restore_policy = not kwargs.get("no_policy", False)
        self.restore_assets = not kwargs.get("no_assets", False)
        self.strict_bag_validation = not kwargs.get("weak_bag_validation", True)
        self.no_bag_materialize = kwargs.get("no_bag_materialize", False)
        self.upload_config = kwargs.get("asset_config")
        self.truncate_after = True
        self.envars = kwargs.get("envars", dict())
        self.config = kwargs.get("config")
        self.credentials = kwargs.get("credentials", dict())
        config_file = kwargs.get("config_file")
        credential_file = kwargs.get("credential_file")

        info = "%s v%s [Python %s, %s]" % (
            self.__class__.__name__, get_installed_version(VERSION),
            platform.python_version(), platform.platform(aliased=True))
        logging.info("Initializing: %s" % info)

        if not self.server_args:
            raise DerivaRestoreConfigurationError("Target server not specified!")

        # server variable initialization
        self.hostname = self.server_args.get('host', '')
        if not self.hostname:
            raise DerivaRestoreConfigurationError("Host not specified!")
        protocol = self.server_args.get('protocol', 'https')
        self.server_url = protocol + "://" + self.hostname
        self.catalog_id = self.server_args.get("catalog_id")
        self.session_config = self.server_args.get('session', DEFAULT_SESSION_CONFIG.copy())
        self.session_config["allow_retry_on_all_methods"] = True

        # credential initialization
        token = kwargs.get("token")
        oauth2_token = kwargs.get("oauth2_token")
        username = kwargs.get("username")
        password = kwargs.get("password")
        if token or oauth2_token or (username and password):
            self.credentials = format_credential(token=token,
                                                 oauth2_token=oauth2_token,
                                                 username=username,
                                                 password=password)
        else:
            self.credentials = get_credential(self.hostname, credential_file)

        # destination catalog initialization
        self.server = DerivaServer(protocol,
                                   self.hostname,
                                   self.credentials,
                                   caching=True,
                                   session_config=self.session_config)
        self.server.dcctx["cid"] = kwargs.get("dcctx_cid", "api/" + self.__class__.__name__)

        # process config file
        if config_file:
            try:
                self.config = read_config(config_file)
            except Exception as e:
                raise DerivaRestoreConfigurationError(e)
    def set_config(self, config):
        self.config = config
    def set_credentials(self, credentials):
        self.dst_catalog.set_credentials(credentials, self.hostname)
        self.credentials = credentials
    def prune_parts(self, dest, *extra_victims):
        victims = set(extra_victims)
        if not self.restore_annotations and 'annotations' in dest:
            victims |= {'annotations', }
        if not self.restore_policy:
            victims |= {'acls', 'acl_bindings'}
        for k in victims:
            dest.pop(k, None)
        return dest
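    # For illustration: with no_annotations and no_policy in effect, prune_parts()
    # strips the 'annotations', 'acls', and 'acl_bindings' keys (plus any extra
    # victims such as 'tables' or 'foreign_keys' passed by the copy_* helpers below)
    # from a prejson() document before it is replayed against the destination catalog.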
    def copy_sdef(self, schema):
        """Copy schema definition structure with conditional parts for cloning."""
        dest = self.prune_parts(schema.prejson(), 'tables')
        return dest
    def copy_tdef_core(self, table):
        """Copy table definition structure with conditional parts, excluding fkeys."""
        dest = self.prune_parts(table.prejson(), 'foreign_keys')
        dest['column_definitions'] = [self.prune_parts(column) for column in dest['column_definitions']]
        dest['keys'] = [self.prune_parts(key) for key in dest.get('keys', [])]
        dest.setdefault('annotations', {})[self.RESTORE_STATE_URL] = 1 if self.restore_data else None
        return dest
    def copy_tdef_fkeys(self, table):
        """Copy table fkeys structure."""
        def check(fkdef):
            for fkc in fkdef['referenced_columns']:
                if fkc['schema_name'] == 'public' \
                        and fkc['table_name'] in {'ERMrest_Client', 'ERMrest_Group', 'ERMrest_RID_Lease'} \
                        and fkc['column_name'] == 'RID':
                    raise DerivaRestoreError(
                        "Cannot restore catalog with foreign key reference to "
                        "%(schema_name)s:%(table_name)s:%(column_name)s" % fkc)
            return fkdef

        return [self.prune_parts(check(dest)) for dest in table.prejson().get('foreign_keys', [])]
    def copy_cdef(self, column):
        """Copy column definition with conditional parts."""
        return column.table.schema.name, column.table.name, self.prune_parts(column.prejson())
    @staticmethod
    def check_column_compatibility(src, dst):
        """Check compatibility of source and destination column definitions."""
        def error(fieldname, sv, dv):
            return DerivaRestoreError("Source/dest column %s mismatch %s != %s for %s:%s:%s" % (
                fieldname, sv, dv,
                src.table.schema.name, src.table.name, src.name
            ))

        if src.type.typename != dst.type.typename:
            raise error("type", src.type.typename, dst.type.typename)
        if src.nullok != dst.nullok:
            raise error("nullok", src.nullok, dst.nullok)
        if src.default != dst.default:
            raise error("default", src.default, dst.default)
    def copy_kdef(self, key):
        return key.table.schema.name, key.table.name, self.prune_parts(key.prejson())
    def get_table_path(self, sname, tname, is_bag):
        return os.path.abspath(
            os.path.join(self.input_path,
                         "data" if is_bag else "",
                         self.BASE_DATA_INPUT_PATH.format(urlquote(sname), urlquote(tname))))
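    # For example (hypothetical names): get_table_path("My Schema", "My Table", True)
    # resolves to <input_path>/data/records/My%20Schema/My%20Table.json, since bag
    # payloads live under the bag's "data" directory and both name components are
    # URL-quoted to form the on-disk path.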
    def load_json_file(self, file_path):
        with io.open(file_path, 'r', encoding='UTF-8') as file_data:
            return json.load(file_data, object_pairs_hook=OrderedDict)
    def open_json_stream_file(self, table_path):
        """
        Open a JSON-stream file for reading; the caller is responsible for closing it.
        """
        table_data = io.open(table_path, 'r', encoding='UTF-8')
        line = table_data.readline().strip()
        table_data.seek(0)
        if line.startswith('{') and line.endswith('}'):
            return table_data
        else:
            table_data.close()
            raise DerivaRestoreError(
                "Input file %s does not appear to be in the required json-stream format." % table_path)
    def get_json_recordset(self, data, chunk_size, after=None, after_column='RID'):
        chunk = list()
        found = False
        for line in data:
            if isinstance(line, dict):
                row = line
            else:
                row = json.loads(line, object_pairs_hook=OrderedDict)
            if after and not found:
                if after == row[after_column]:
                    found = True
                continue
            chunk.append(row)
            if len(chunk) == chunk_size:
                yield chunk
                chunk = list()
        if chunk:
            yield chunk
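    # A minimal illustration (hypothetical data) of the json-stream input expected
    # by open_json_stream_file() and chunked by get_json_recordset(): each line of
    # the input file is a single JSON object, e.g.
    #
    #   {"RID": "1-0001", "Name": "first"}
    #   {"RID": "1-0002", "Name": "second"}
    #
    # get_json_recordset(stream, 1000, after="1-0001") then yields lists of up to
    # 1000 row dicts, skipping rows up to and including RID "1-0001" so that an
    # interrupted data load can resume where it left off.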
    def restore(self, **kwargs):
        """
        Perform the catalog restore operation. The restore process is broken up into six phases:

        1. Pre-process the input path.
           - If the input path is a file, it is assumed to be a compressed archive that can be extracted into an
             input directory via a supported codec: `tar`, `tgz`, `bz2`, or `zip`.
           - If the input directory is a valid _bag_ directory structure, the bag will be materialized.
        2. The catalog schema will be restored first. The schema is restored from an ERMrest JSON schema document
           file. The schema document file must be named `catalog-schema.json` and must appear at the root of the
           input directory. The restore process can be configured to exclude the restoration of an enumerated set
           of both schemas and tables.
        3. The catalog table data will be restored, if present. The table data restoration process is resilient
           to interruption and may be restarted. However, if the catalog schema or data is mutated outside the
           scope of the restore function in between such restarts, the restored catalog's consistency cannot be
           guaranteed. The restore process can be configured to exclude the restoration of table data for a set
           of tables.
        4. The catalog foreign keys will be restored.
        5. The catalog assets will be restored, if present.
        6. On success, the restore state marker annotations will be deleted and the catalog history will be
           truncated.

        :param kwargs:
        :return:
        """
        success = True
        start = datetime.datetime.now()

        # pre-process input
        logging.info("Processing input path: %s" % self.input_path)
        is_file, is_dir, is_uri = bdb.inspect_path(self.input_path)
        if not (is_file or is_dir or is_uri):
            raise DerivaRestoreError("Invalid input path [%s]. If the specified input path refers to a locally "
                                     "mounted file or directory, it does not exist or cannot be accessed. If the "
                                     "specified path is a URI, the scheme component of the URI could not be "
                                     "determined." % self.input_path)
        if is_file or is_dir:
            self.input_path = os.path.abspath(self.input_path)
        if is_file:
            logging.info("The input path [%s] is a file. Assuming input file is a directory archive and "
                         "extracting..." % self.input_path)
            self.input_path = bdb.extract_bag(self.input_path)

        try:
            if not self.no_bag_materialize:
                self.input_path = bdb.materialize(self.input_path)
        except bdb.bdbagit.BagValidationError as e:
            if self.strict_bag_validation:
                raise DerivaRestoreError(format_exception(e))
            else:
                logging.warning("Input bag validation failed and strict validation mode is disabled. %s" %
                                format_exception(e))
        is_bag = bdb.is_bag(self.input_path)

        src_schema_file = os.path.abspath(
            os.path.join(self.input_path, "data" if is_bag else "", "catalog-schema.json"))
        # the src_catalog_stub created below will never be "connected" in any kind of network sense,
        # but we need an instance of ErmrestCatalog in order to get a working Model from the schema file.
        src_catalog_stub = ErmrestCatalog("file", src_schema_file, "1")
        src_model = Model.fromfile(src_catalog_stub, src_schema_file)

        # initialize/connect to destination catalog
        if not self.catalog_id:
            self.catalog_id = self.server.create_ermrest_catalog().catalog_id
            self.server_args["catalog_id"] = self.catalog_id
            logging.info("Created new target catalog with ID: %s" % self.catalog_id)
        self.dst_catalog = self.server.connect_ermrest(self.catalog_id)

        # init dcctx cid to a default
        self.dst_catalog.dcctx['cid'] = self.__class__.__name__

        # build up the model content we will copy to destination
        dst_model = self.dst_catalog.getCatalogModel()

        logging.info("Restoring %s to catalog: %s" % (self.input_path, self.dst_catalog.get_server_uri()))

        # set top-level config right away and find fatal usage errors...
        if self.restore_policy:
            logging.info("Restoring top-level catalog ACLs...")
            if not src_model.acls:
                logging.info("Source schema does not contain any ACLs.")
            else:
                src_model.acls.owner.extend(dst_model.acls.owner)
                self.dst_catalog.put('/acl', json=src_model.acls)

        if self.restore_annotations:
            logging.info("Restoring top-level catalog annotations...")
            self.dst_catalog.put('/annotation', json=src_model.annotations)

        # build up the model content we will copy to destination
        dst_model = self.dst_catalog.getCatalogModel()

        new_model = []
        new_columns = []  # ERMrest does not currently allow bulk column creation
        new_keys = []  # ERMrest does not currently allow bulk key creation
        restore_states = {}
        fkeys_deferred = {}
        exclude_schemas = [] if self.exclude_schemas is None else self.exclude_schemas
        exclude_data = [] if self.exclude_data is None else self.exclude_data

        try:
            for sname, schema in src_model.schemas.items():
                if sname in exclude_schemas:
                    continue
                if sname not in dst_model.schemas:
                    new_model.append(self.copy_sdef(schema))

                for tname, table in schema.tables.items():
                    if table.kind != 'table':
                        logging.warning('Skipping restore of %s %s:%s' % (table.kind, sname, tname))
                        continue

                    if 'RID' not in table.column_definitions.elements:
                        raise DerivaRestoreError(
                            "Source table %s.%s lacks system-columns and cannot be restored." % (sname, tname))

                    # make sure the source table is pruned of any existing restore state markers
                    if table.annotations.get(CLONE_STATE_URL) is not None:
                        del table.annotations[CLONE_STATE_URL]
                    if table.annotations.get(self.RESTORE_STATE_URL) is not None:
                        del table.annotations[self.RESTORE_STATE_URL]

                    if sname not in dst_model.schemas or tname not in dst_model.schemas[sname].tables:
                        new_model.append(self.copy_tdef_core(table))
                        restore_states[(sname, tname)] = 1 if self.restore_data else None
                        fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)
                    else:
                        if dst_model.schemas[sname].tables[tname].foreign_keys:
                            # assume presence of any destination foreign keys means we already loaded deferred_fkeys
                            self.restore_data = False
                        else:
                            fkeys_deferred[(sname, tname)] = self.copy_tdef_fkeys(table)

                        src_columns = {c.name: c for c in table.column_definitions}
                        dst_columns = {c.name: c for c in dst_model.schemas[sname].tables[tname].column_definitions}

                        for cname in src_columns:
                            if cname not in dst_columns:
                                new_columns.append(self.copy_cdef(src_columns[cname]))
                            else:
                                self.check_column_compatibility(src_columns[cname], dst_columns[cname])

                        for cname in dst_columns:
                            if cname not in src_columns:
                                raise DerivaRestoreError(
                                    "Destination column %s.%s.%s does not exist in source catalog." %
                                    (sname, tname, cname))

                        src_keys = {tuple(sorted(c.name for c in key.unique_columns)): key
                                    for key in table.keys}
                        dst_keys = {tuple(sorted(c.name for c in key.unique_columns)): key
                                    for key in dst_model.schemas[sname].tables[tname].keys}

                        for utuple in src_keys:
                            if utuple not in dst_keys:
                                new_keys.append(self.copy_kdef(src_keys[utuple]))

                        for utuple in dst_keys:
                            if utuple not in src_keys:
                                raise DerivaRestoreError("Destination key %s.%s(%s) does not exist in source "
                                                         "catalog." % (sname, tname, ', '.join(utuple)))

                        restore_states[(sname, tname)] = \
                            dst_model.schemas[sname].tables[tname].annotations.get(self.RESTORE_STATE_URL)

            restore_states[('public', 'ERMrest_RID_Lease')] = None  # never try to sync leases

            # apply the stage 1 model to the destination in bulk
            logging.info("Restoring catalog schema...")
            if new_model:
                self.dst_catalog.post("/schema", json=new_model).raise_for_status()

            for sname, tname, cdef in new_columns:
                self.dst_catalog.post("/schema/%s/table/%s/column" % (urlquote(sname), urlquote(tname)),
                                      json=cdef).raise_for_status()

            for sname, tname, kdef in new_keys:
                self.dst_catalog.post("/schema/%s/table/%s/key" % (urlquote(sname), urlquote(tname)),
                                      json=kdef).raise_for_status()

            # copy data in stage 2
            if self.restore_data:
                logging.info("Restoring catalog data...")
                for sname, tname in restore_states.keys():
                    object_name = "%s:%s" % (sname, tname)
                    if object_name in exclude_data:
                        continue
                    tname_uri = "%s:%s" % (urlquote(sname), urlquote(tname))
                    if restore_states[(sname, tname)] == 1:
                        # determine current position in (partial?) copy
                        row = self.dst_catalog.get("/entity/%s@sort(RID::desc::)?limit=1" % tname_uri).json()
                        if row:
                            last = row[0]['RID']
                            logging.info("Existing data detected in table [%s] -- will attempt partial restore of "
                                         "remaining records following last known RID: %s" % (tname_uri, last))
                        else:
                            last = None

                        table_path = self.get_table_path(sname, tname, is_bag)
                        if not os.path.isfile(table_path):
                            logging.warning("Restoration of table data [%s] incomplete. File not found: %s" %
                                            (object_name, table_path))
                            continue

                        table = self.get_json_recordset(self.open_json_stream_file(table_path),
                                                        self.data_chunk_size, after=last)
                        total = 0
                        table_success = True
                        try:
                            for chunk in table:
                                if chunk:
                                    self.dst_catalog.post("/entity/%s?nondefaults=RID,RCT,RCB" % tname_uri,
                                                          json=chunk)
                                    total += len(chunk)
                                else:
                                    break
                        except Exception as e:
                            table_success = False
                            logging.error(format_exception(e))
                        finally:
                            table.close()
                            if table_success:
                                logging.info("Restoration of table data [%s] successful. %s rows restored." %
                                             (tname_uri, total))
                            else:
                                success = False
                                logging.warning("Restoration of table data [%s] failed. %s rows restored." %
                                                (tname_uri, total))

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2
                        )
                    elif restore_states[(sname, tname)] is None and (sname, tname) in {
                        ('public', 'ERMrest_Client'),
                        ('public', 'ERMrest_Group'),
                    }:
                        # special sync behavior for magic ermrest tables
                        # HACK: these are assumed small enough to join via local merge of arrays
                        page = self.load_json_file(self.get_table_path(sname, tname, is_bag))
                        self.dst_catalog.post("/entity/%s?onconflict=skip" % tname_uri, json=page)

                        # record our progress on catalog in case we fail part way through
                        self.dst_catalog.put(
                            "/schema/%s/table/%s/annotation/%s" % (
                                urlquote(sname),
                                urlquote(tname),
                                urlquote(self.RESTORE_STATE_URL),
                            ),
                            json=2
                        )

            # apply stage 2 model in bulk only... we won't get here unless the preceding phases succeeded
            logging.info("Restoring foreign keys...")
            new_fkeys = []
            for fkeys in fkeys_deferred.values():
                new_fkeys.extend(fkeys)
            if new_fkeys:
                self.dst_catalog.post("/schema", json=new_fkeys)

            # copy over configuration in stage 3
            # we need to do this after deferred_fkeys to handle acl_bindings projections with joins
            logging.info("Restoring catalog configuration...")
            dst_model = self.dst_catalog.getCatalogModel()
            for sname, src_schema in src_model.schemas.items():
                if sname in exclude_schemas:
                    continue
                dst_schema = dst_model.schemas[sname]
                if self.restore_annotations:
                    dst_schema.annotations.clear()
                    dst_schema.annotations.update(src_schema.annotations)
                if self.restore_policy:
                    dst_schema.acls.clear()
                    dst_schema.acls.update(src_schema.acls)

                for tname, src_table in src_schema.tables.items():
                    dst_table = dst_schema.tables[tname]
                    if self.restore_annotations:
                        merged = dict(src_table.annotations)
                        if self.RESTORE_STATE_URL in dst_table.annotations:
                            merged[self.RESTORE_STATE_URL] = dst_table.annotations[self.RESTORE_STATE_URL]
                        dst_table.annotations.clear()
                        dst_table.annotations.update(merged)
                    if self.restore_policy:
                        dst_table.acls.clear()
                        dst_table.acls.update(src_table.acls)
                        dst_table.acl_bindings.clear()
                        dst_table.acl_bindings.update(src_table.acl_bindings)

                    for cname, src_col in src_table.columns.elements.items():
                        dst_col = dst_table.columns[cname]
                        if self.restore_annotations:
                            dst_col.annotations.clear()
                            dst_col.annotations.update(src_col.annotations)
                        if self.restore_policy:
                            dst_col.acls.clear()
                            dst_col.acls.update(src_col.acls)
                            dst_col.acl_bindings.clear()
                            dst_col.acl_bindings.update(src_col.acl_bindings)

                    for src_key in src_table.keys:
                        dst_key = dst_table.key_by_columns([col.name for col in src_key.unique_columns])
                        if self.restore_annotations:
                            dst_key.annotations.clear()
                            dst_key.annotations.update(src_key.annotations)

                    def xlate_column_map(fkey):
                        dst_from_table = dst_table
                        dst_to_schema = dst_model.schemas[fkey.pk_table.schema.name]
                        dst_to_table = dst_to_schema.tables[fkey.pk_table.name]
                        return {
                            dst_from_table._own_column(from_col.name): dst_to_table._own_column(to_col.name)
                            for from_col, to_col in fkey.column_map.items()
                        }

                    for src_fkey in src_table.foreign_keys:
                        dst_fkey = dst_table.fkey_by_column_map(xlate_column_map(src_fkey))
                        if self.restore_annotations:
                            dst_fkey.annotations.clear()
                            dst_fkey.annotations.update(src_fkey.annotations)
                        if self.restore_policy:
                            dst_fkey.acls.clear()
                            dst_fkey.acls.update(src_fkey.acls)
                            dst_fkey.acl_bindings.clear()
                            dst_fkey.acl_bindings.update(src_fkey.acl_bindings)

            # send all the config changes to the server
            dst_model.apply()

            # restore assets
            if self.restore_assets:
                self.upload_assets()

            # cleanup
            if success:
                self.cleanup_restored_catalog()
        except:
            success = False
            raise
        finally:
            elapsed_time = datetime.datetime.now() - start
            total_secs = elapsed_time.total_seconds()
            elapsed = time.strftime('%H:%M:%S', time.gmtime(total_secs))
            logging.info("Restore of catalog %s %s. %s" %
                         (self.dst_catalog.get_server_uri(),
                          "completed successfully" if success else "failed",
                          ("Elapsed time: %s" % elapsed) if (total_secs > 0) else ""))
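    # Restore-state bookkeeping, summarized: copy_tdef_core() stamps each newly
    # created table with RESTORE_STATE_URL = 1 ("data load pending") when data
    # restore is enabled; restore() advances the marker to 2 once the table's rows
    # have been loaded, so a restarted restore skips completed tables and resumes
    # partial ones from the last restored RID; cleanup_restored_catalog() deletes
    # the markers once everything has succeeded.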
    def cleanup_restored_catalog(self):
        # cleanup restore state markers
        logging.info("Cleaning up restore state...")
        dst_model = self.dst_catalog.getCatalogModel()
        for sname, schema in dst_model.schemas.items():
            for tname, table in schema.tables.items():
                if sname == "public" and tname == "ERMrest_RID_Lease":
                    continue
                annotation_uri = "/schema/%s/table/%s/annotation/%s" % (
                    urlquote(sname),
                    urlquote(tname),
                    urlquote(self.RESTORE_STATE_URL)
                )
                try:
                    self.dst_catalog.delete(annotation_uri)
                except Exception as e:
                    logging.warning("Unable to cleanup restore state marker annotation %s: %s" %
                                    (annotation_uri, format_exception(e)))
                    continue

        # truncate restore history
        if self.truncate_after:
            logging.info("Truncating restore history...")
            snaptime = self.dst_catalog.get("/").json()["snaptime"]
            self.dst_catalog.delete("/history/,%s" % urlquote(snaptime))
    def upload_assets(self):
        asset_dir = os.path.join(self.input_path, self.BASE_ASSETS_INPUT_PATH)
        if not os.path.isdir(asset_dir):
            logging.debug("No asset directory found. Will not attempt to upload file assets.")
            return
        logging.info("Restoring file assets...")
        uploader = GenericUploader(config_file=self.upload_config, server=self.server_args)
        uploader.setCredentials(self.credentials)
        uploader.setConfig(self.upload_config)
        uploader.scanDirectory(asset_dir, abort_on_invalid_input=False, purge_state=False)
        uploader.uploadFiles(file_callback=uploader.defaultFileCallback)
        uploader.cleanup()
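# A minimal usage sketch (not part of this module; the host and input values below
# are hypothetical). DerivaRestore is typically driven by a command-line wrapper,
# which assembles the same positional server-args dict and keyword options:
#
#   from deriva.transfer.restore.deriva_restore import DerivaRestore
#
#   restorer = DerivaRestore(
#       {"host": "demo.derivacloud.org", "protocol": "https", "catalog_id": None},
#       input_path="/path/to/catalog-backup.tgz",
#       no_assets=True)
#   restorer.restore()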