Source code for cognite.extractorutils.metrics

#  Copyright 2020 Cognite AS
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""
Module containing pushers and other tools for metric reporting.

The classes in this module scrape the default Prometheus registry and periodically upload it to either a Prometheus
push gateway or to CDF as time series.

The ``BaseMetrics`` class forms the basis for a metrics collection for an extractor, containing some general metrics
that all extractors should report. To create your own set of metrics, subclass this class and populate it with
extractor-specific metrics, for example:

.. code-block:: python

    class MyMetrics(BaseMetrics):
        def __init__(self):
            super().__init__(extractor_name="my_extractor", extractor_version=__version__)

            self.a_counter = Counter("my_extractor_example_counter", "An example counter")
            ...

The metrics module also contains some Pusher classes that routinely send metrics to a remote server.
These can be created automatically with the ``start_pushers`` method described in ``configtools``.

"""

import logging
import os
import threading
from abc import ABC, abstractmethod
from time import sleep
from types import TracebackType
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union

import arrow
import psutil
from prometheus_client import Gauge, Info, Metric
from prometheus_client.core import REGISTRY
from prometheus_client.exposition import basic_auth_handler, delete_from_gateway, pushadd_to_gateway

from cognite.client import CogniteClient
from cognite.client.data_classes import Asset, Datapoints, DatapointsArray, TimeSeries
from cognite.client.exceptions import CogniteDuplicatedError
from cognite.extractorutils.threading import CancellationToken
from cognite.extractorutils.util import EitherId

from .util import ensure_time_series

_metrics_singularities: Dict[type, Any] = {}


T = TypeVar("T")


def safe_get(cls: Type[T], *args: Any, **kwargs: Any) -> T:
    """
    A factory for instances of metrics collections.

    Since Prometheus doesn't allow multiple metrics with the same name, any subclass of BaseMetrics must never be
    created more than once. This function creates an instance of the given class on the first call and stores it;
    any subsequent call with the same class as argument will return the same instance.

    .. code-block:: python

        >>> a = safe_get(MyMetrics)  # This will create a new instance of MyMetrics
        >>> b = safe_get(MyMetrics)  # This will return the same instance
        >>> a is b
        True

    Args:
        cls: Metrics class to either create or get a cached version of

    Returns:
        An instance of the given class
    """
    global _metrics_singularities

    if cls not in _metrics_singularities:
        _metrics_singularities[cls] = cls(*args, **kwargs)

    return _metrics_singularities[cls]

class BaseMetrics:
    """
    Base collection of extractor metrics.

    The class also spawns a collector thread on init that regularly fetches process information and updates the
    ``process_*`` gauges.

    To create a set of metrics for an extractor, create a subclass of this class.

    **Note that only one instance of this class (or any subclass) can exist simultaneously**

    The collection includes the following metrics:

     * startup:                  Startup time (unix epoch)
     * finish:                   Finish time (unix epoch)
     * process_num_threads:      Number of active threads. Set automatically.
     * process_memory_bytes:     Memory usage of the extractor. Set automatically.
     * process_cpu_percent:      CPU usage of the extractor. Set automatically.

    Args:
        extractor_name: Name of the extractor, used to prefix metric names
        extractor_version: Version of the extractor, reported in the info metric
        process_scrape_interval: Interval (in seconds) between each fetch of data for the ``process_*`` gauges
    """

    def __init__(self, extractor_name: str, extractor_version: str, process_scrape_interval: float = 15):
        extractor_name = extractor_name.strip().replace(" ", "_")

        self.startup = Gauge(f"{extractor_name}_start_time", "Timestamp (seconds) of when the extractor last started")
        self.finish = Gauge(
            f"{extractor_name}_finish_time", "Timestamp (seconds) of when the extractor last finished cleanly"
        )

        self._process = psutil.Process(os.getpid())

        self.process_num_threads = Gauge(f"{extractor_name}_num_threads", "Number of threads")
        self.process_memory_bytes = Gauge(f"{extractor_name}_memory_bytes", "Memory usage in bytes")
        self.process_memory_bytes_available = Gauge(
            f"{extractor_name}_memory_bytes_available", "Memory available in bytes"
        )
        self.process_cpu_percent = Gauge(f"{extractor_name}_cpu_percent", "CPU usage percent")

        self.info = Info(f"{extractor_name}_info", "Information about running extractor")
        self.info.info({"extractor_version": extractor_version, "extractor_type": extractor_name})

        self.process_scrape_interval = process_scrape_interval

        self._start_proc_collector()

        self.startup.set_to_current_time()

    def _proc_collect(self) -> None:
        """
        Collect values for process metrics.
        """
        total_memory_available = psutil.virtual_memory().total

        while True:
            self.process_num_threads.set(self._process.num_threads())
            self.process_memory_bytes.set(self._process.memory_info().rss)
            self.process_memory_bytes_available.set(total_memory_available)
            self.process_cpu_percent.set(self._process.cpu_percent())

            sleep(self.process_scrape_interval)

    def _start_proc_collector(self) -> None:
        """
        Start a thread that collects process metrics at a regular interval.
        """
        thread = threading.Thread(target=self._proc_collect, name="ProcessMetricsCollector", daemon=True)
        thread.start()

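# Illustrative sketch (not part of the original module): combining a BaseMetrics
# subclass with ``safe_get`` so the metrics are only registered once. The
# MyMetrics class, metric name and version string below are assumptions made for
# the example.
#
#     from prometheus_client import Counter
#
#     class MyMetrics(BaseMetrics):
#         def __init__(self) -> None:
#             super().__init__(extractor_name="my_extractor", extractor_version="1.0.0")
#             self.rows_read = Counter("my_extractor_rows_read", "Rows read from the source")
#
#     metrics = safe_get(MyMetrics)  # first call creates and caches the instance
#     metrics.rows_read.inc()        # later safe_get(MyMetrics) calls return the same object
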
class AbstractMetricsPusher(ABC):
    """
    Base class for metric pushers. Metric pushers spawn a thread that routinely pushes metrics to a configured
    destination.

    Contains all the logic for starting and running threads.

    Args:
        push_interval: Seconds between each upload call
        thread_name: Name of the thread to start. If omitted, a standard name such as Thread-4 will be generated.
        cancellation_token: Token to be used as a thread cancellation event
    """

    def __init__(
        self,
        push_interval: Optional[int] = None,
        thread_name: Optional[str] = None,
        cancellation_token: Optional[CancellationToken] = None,
    ):
        self.push_interval = push_interval
        self.thread: Optional[threading.Thread] = None
        self.thread_name = thread_name
        self.cancellation_token = (
            cancellation_token.create_child_token() if cancellation_token else CancellationToken()
        )

        self.logger = logging.getLogger(__name__)

    @abstractmethod
    def _push_to_server(self) -> None:
        """
        Push metrics to a remote server, to be overridden in subclasses.
        """
        pass

    def _run(self) -> None:
        """
        Run the push loop.
        """
        while not self.cancellation_token.is_cancelled:
            self._push_to_server()
            self.cancellation_token.wait(self.push_interval)

    def start(self) -> None:
        """
        Start a thread that pushes the default registry to the configured gateway at certain intervals.
        """
        self.thread = threading.Thread(target=self._run, daemon=True, name=self.thread_name)
        self.thread.start()

    def stop(self) -> None:
        """
        Stop the push loop.
        """
        # Make sure everything is pushed
        self._push_to_server()
        self.cancellation_token.cancel()

    def __enter__(self) -> "AbstractMetricsPusher":
        """
        Wraps around the start method, for use as a context manager.

        Returns:
            self
        """
        self.start()
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """
        Wraps around the stop method, for use as a context manager.

        Args:
            exc_type: Exception type
            exc_val: Exception value
            exc_tb: Traceback
        """
        self.stop()

class PrometheusPusher(AbstractMetricsPusher):
    """
    Pusher to a Prometheus push gateway.

    Args:
        job_name: Prometheus job name
        url: URL (with port number) of the push gateway
        push_interval: Seconds between each upload call
        username: Push gateway credentials
        password: Push gateway credentials
        thread_name: Name of the thread to start. If omitted, a standard name such as Thread-4 will be generated.
        cancellation_token: Token to be used as a thread cancellation event
    """

    def __init__(
        self,
        job_name: str,
        url: str,
        push_interval: int,
        username: Optional[str] = None,
        password: Optional[str] = None,
        thread_name: Optional[str] = None,
        cancellation_token: Optional[CancellationToken] = None,
    ):
        super(PrometheusPusher, self).__init__(push_interval, thread_name, cancellation_token)

        self.username = username
        self.job_name = job_name
        self.password = password

        self.url = url

    def _auth_handler(
        self, url: str, method: str, timeout: int, headers: List[Tuple[str, str]], data: Any
    ) -> Callable:
        """
        Return an authentication handler against the Prometheus Pushgateway to use in the pushadd_to_gateway method.

        Args:
            url: Push gateway
            method: HTTP method
            timeout: Request timeout (seconds)
            headers: HTTP headers
            data: Data to send

        Returns:
            prometheus_client.exposition.basic_auth_handler: An authentication handler based on this client.
        """
        return basic_auth_handler(url, method, timeout, headers, data, self.username, self.password)

    def _push_to_server(self) -> None:
        """
        Push the default metrics registry to the configured Prometheus Pushgateway.
        """
        if not self.url or not self.job_name:
            return

        try:
            pushadd_to_gateway(self.url, job=self.job_name, registry=REGISTRY, handler=self._auth_handler)

        except OSError as exp:
            self.logger.warning("Failed to push metrics to %s: %s", self.url, str(exp))
        except Exception:
            self.logger.exception("Failed to push metrics to %s", self.url)

        self.logger.debug("Pushed metrics to %s", self.url)

    def clear_gateway(self) -> None:
        """
        Delete metrics stored at the gateway (reset the gateway).
        """
        delete_from_gateway(self.url, job=self.job_name, handler=self._auth_handler)
        self.logger.debug("Deleted metrics from push gateway %s", self.url)

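# Illustrative sketch (not part of the original module): running a PrometheusPusher
# as a context manager. The gateway URL, job name and push interval are assumed
# example values.
#
#     pusher = PrometheusPusher(
#         job_name="my_extractor",
#         url="http://localhost:9091",
#         push_interval=30,
#     )
#
#     with pusher:
#         ...  # run the extractor; the default registry is pushed every 30 seconds
#     # leaving the block calls stop(), which performs a final push before cancelling the loop
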
class CognitePusher(AbstractMetricsPusher):
    """
    Pusher to CDF. Creates time series in CDF for all Gauges and Counters in the default Prometheus registry.

    Optional contextualization with an Asset to make the time series observable in Asset Data Insight. The given
    asset will be created at the root level of the tenant if it doesn't already exist.

    Args:
        cdf_client: The CDF tenant to upload time series to
        external_id_prefix: Unique external ID prefix for this pusher.
        push_interval: Seconds between each upload call
        asset: Optional contextualization.
        data_set: Data set to create the metric time series under.
        thread_name: Name of the thread to start. If omitted, a standard name such as Thread-4 will be generated.
        cancellation_token: Token to be used as a thread cancellation event
    """

    def __init__(
        self,
        cdf_client: CogniteClient,
        external_id_prefix: str,
        push_interval: int,
        asset: Optional[Asset] = None,
        data_set: Optional[EitherId] = None,
        thread_name: Optional[str] = None,
        cancellation_token: Optional[CancellationToken] = None,
    ):
        super(CognitePusher, self).__init__(push_interval, thread_name, cancellation_token)

        self.cdf_client = cdf_client
        self.asset = asset
        self.external_id_prefix = external_id_prefix
        self.data_set = data_set

        self._init_cdf()

        self._cdf_project = cdf_client.config.project

    def _init_cdf(self) -> None:
        """
        Initialize the CDF tenant with the necessary time series and asset.
        """
        time_series: List[TimeSeries] = []

        if self.asset is not None:
            # Ensure that the asset exists, and retrieve its internal ID
            asset: Optional[Asset]
            try:
                asset = self.cdf_client.assets.create(self.asset)
            except CogniteDuplicatedError:
                asset = self.cdf_client.assets.retrieve(external_id=self.asset.external_id)

            asset_id = asset.id if asset is not None else None

        else:
            asset_id = None

        data_set_id = None
        if self.data_set:
            dataset = self.cdf_client.data_sets.retrieve(
                id=self.data_set.internal_id, external_id=self.data_set.external_id
            )
            if dataset:
                data_set_id = dataset.id

        for metric in REGISTRY.collect():
            if type(metric) is Metric and metric.type in ["gauge", "counter"]:
                external_id = self.external_id_prefix + metric.name

                time_series.append(
                    TimeSeries(
                        external_id=external_id,
                        name=metric.name,
                        legacy_name=external_id,
                        description=metric.documentation,
                        asset_id=asset_id,
                        data_set_id=data_set_id,
                    )
                )

        ensure_time_series(self.cdf_client, time_series)

    def _push_to_server(self) -> None:
        """
        Create datapoints and push them to their respective time series.
        """
        timestamp = int(arrow.get().float_timestamp * 1000)

        datapoints: List[Dict[str, Union[str, int, List[Any], Datapoints, DatapointsArray]]] = []

        for metric in REGISTRY.collect():
            if type(metric) is Metric and metric.type in ["gauge", "counter"]:
                if len(metric.samples) == 0:
                    continue

                external_id = self.external_id_prefix + metric.name
                datapoints.append({"externalId": external_id, "datapoints": [(timestamp, metric.samples[0].value)]})

        self.cdf_client.time_series.data.insert_multiple(datapoints)

        self.logger.debug("Pushed metrics to CDF tenant '%s'", self._cdf_project)

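# Illustrative sketch (not part of the original module): pushing metrics to CDF.
# The CogniteClient is assumed to be configured elsewhere; the external ID prefix,
# push interval and data set external ID are example values.
#
#     from cognite.client import CogniteClient
#     from cognite.extractorutils.util import EitherId
#
#     client = CogniteClient()  # assumes credentials/config are set up beforehand
#     cdf_pusher = CognitePusher(
#         cdf_client=client,
#         external_id_prefix="my_extractor:",
#         push_interval=60,
#         data_set=EitherId(external_id="extractor_metrics"),
#     )
#
#     cdf_pusher.start()  # gauges and counters are written as datapoints every 60 seconds
#     ...
#     cdf_pusher.stop()   # final push, then the push loop is cancelled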