From 3de29f6d5a3017b57c553c5e2fb63a50994df840 Mon Sep 17 00:00:00 2001
From: Mangirdas
Date: Sat, 27 Jan 2018 08:05:31 +0000
Subject: Rebase Prometheus example for new scrape endpoints and expose alert manager

---
 roles/openshift_prometheus/templates/prometheus.j2 |  92 ++++++++---
 .../templates/prometheus.yml.j2                    | 175 ++++++++++++++-------
 2 files changed, 192 insertions(+), 75 deletions(-)

diff --git a/roles/openshift_prometheus/templates/prometheus.j2 b/roles/openshift_prometheus/templates/prometheus.j2
index d780550b8..c0abd483b 100644
--- a/roles/openshift_prometheus/templates/prometheus.j2
+++ b/roles/openshift_prometheus/templates/prometheus.j2
@@ -19,7 +19,7 @@ spec:
       labels:
         app: prometheus
     spec:
-      serviceAccountName: prometheus
+      serviceAccountName: "{{ openshift_prometheus_service_name }}"
 {% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
       nodeSelector:
 {% for key, value in openshift_prometheus_node_selector.items() %}
@@ -47,15 +47,15 @@ spec:
             cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
 {% endif %}
         ports:
-        - containerPort: 8443
+        - containerPort: {{ openshift_prometheus_service_targetport }}
           name: web
         args:
         - -provider=openshift
-        - -https-address=:8443
+        - -https-address=:{{ openshift_prometheus_service_targetport }}
         - -http-address=
         - -email-domain=*
         - -upstream=http://localhost:9090
-        - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+        - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
         - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
         - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
         - -tls-cert=/etc/tls/private/tls.crt
@@ -67,9 +67,9 @@ spec:
         - -skip-auth-regex=^/metrics
         volumeMounts:
         - mountPath: /etc/tls/private
-          name: prometheus-tls
+          name: prometheus-tls-secret
         - mountPath: /etc/proxy/secrets
-          name: prometheus-secrets
+          name: prometheus-proxy-secret
         - mountPath: /prometheus
           name: prometheus-data
 
@@ -104,7 +104,7 @@ spec:
         - mountPath: /prometheus
           name: prometheus-data
 
-      # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy
+      # Deploy alert-buffer behind oauth alerts-proxy
       - name: alerts-proxy
         image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
         imagePullPolicy: IfNotPresent
@@ -124,15 +124,15 @@ spec:
             cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
 {% endif %}
         ports:
-        - containerPort: 9443
+        - containerPort: {{ openshift_prometheus_alerts_service_targetport }}
           name: web
         args:
        - -provider=openshift
-        - -https-address=:9443
+        - -https-address=:{{ openshift_prometheus_alerts_service_targetport }}
         - -http-address=
         - -email-domain=*
         - -upstream=http://localhost:9099
-        - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+        - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
         - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
         - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
         - -tls-cert=/etc/tls/private/tls.crt
@@ -143,9 +143,9 @@ spec:
         - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
         volumeMounts:
         - mountPath: /etc/tls/private
-          name: alerts-tls
+          name: alerts-tls-secret
         - mountPath: /etc/proxy/secrets
-          name: alerts-secrets
+          name: alerts-proxy-secret
 
       - name: alert-buffer
         args:
@@ -169,11 +169,54 @@ spec:
 {% endif %}
         volumeMounts:
         - mountPath: /alert-buffer
-          name: alert-buffer-data
+          name: alerts-data
         ports:
         - containerPort: 9099
           name: alert-buf
 
+      # Deploy alertmanager behind oauth alertmanager-proxy
+      - name: alertmanager-proxy
+        image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
+        imagePullPolicy: IfNotPresent
+        requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+          memory: "{{ openshift_prometheus_oauth_proxy_memory_requests }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+          cpu: "{{ openshift_prometheus_oauth_proxy_cpu_requests }}"
+{% endif %}
+        limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+          memory: "{{ openshift_prometheus_oauth_proxy_memory_limit }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+          cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
+{% endif %}
+        ports:
+        - containerPort: {{ openshift_prometheus_alertmanager_service_targetport }}
+          name: web
+        args:
+        - -provider=openshift
+        - -https-address=:{{ openshift_prometheus_alertmanager_service_targetport }}
+        - -http-address=
+        - -email-domain=*
+        - -upstream=http://localhost:9093
+        - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
+        - -openshift-ca=/etc/pki/tls/cert.pem
+        - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+        - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+        - -tls-cert=/etc/tls/private/tls.crt
+        - -tls-key=/etc/tls/private/tls.key
+        - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+        - -cookie-secret-file=/etc/proxy/secrets/session_secret
+        - -skip-auth-regex=^/metrics
+        volumeMounts:
+        - mountPath: /etc/tls/private
+          name: alertmanager-tls-secret
+        - mountPath: /etc/proxy/secrets
+          name: alertmanager-proxy-secret
+
       - name: alertmanager
         args:
         - -config.file=/etc/alertmanager/alertmanager.yml
@@ -205,14 +248,15 @@ spec:
 
       restartPolicy: Always
       volumes:
+
       - name: prometheus-config
         configMap:
           defaultMode: 420
           name: prometheus
-      - name: prometheus-secrets
+      - name: prometheus-proxy-secret
         secret:
           secretName: prometheus-proxy
-      - name: prometheus-tls
+      - name: prometheus-tls-secret
         secret:
           secretName: prometheus-tls
       - name: prometheus-data
@@ -225,13 +269,19 @@ spec:
       - name: alertmanager-config
         configMap:
           defaultMode: 420
-          name: prometheus-alerts
-      - name: alerts-secrets
+          name: alertmanager
+      - name: alertmanager-proxy-secret
         secret:
-          secretName: alerts-proxy
-      - name: alerts-tls
+          secretName: alertmanager-proxy
+      - name: alertmanager-tls-secret
+        secret:
+          secretName: alertmanager-tls
+      - name: alerts-tls-secret
         secret:
-          secretName: prometheus-alerts-tls
+          secretName: alerts-tls
+      - name: alerts-proxy-secret
+        secret:
+          secretName: alerts-proxy
       - name: alertmanager-data
 {% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
         persistentVolumeClaim:
@@ -239,7 +289,7 @@ spec:
 {% else %}
         emptydir: {}
 {% endif %}
-      - name: alert-buffer-data
+      - name: alerts-data
 {% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
         persistentVolumeClaim:
           claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}
diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2
index 63430f834..005c2c564 100644
--- a/roles/openshift_prometheus/templates/prometheus.yml.j2
+++ b/roles/openshift_prometheus/templates/prometheus.yml.j2
@@ -1,10 +1,5 @@
 rule_files:
-  - 'prometheus.rules'
-{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
-  - 'prometheus.additional.rules'
-{% endif %}
-
-
+  - '*.rules'
 
 # A scrape configuration for running Prometheus on a Kubernetes cluster.
 # This uses separate scrape configs for cluster components (i.e. API server, node)
@@ -39,31 +34,11 @@ scrape_configs:
     action: keep
     regex: default;kubernetes;https
 
-# Scrape config for nodes.
-#
-# Each node exposes a /metrics endpoint that contains operational metrics for
-# the Kubelet and other components.
-- job_name: 'kubernetes-nodes'
-
-  scheme: https
-  tls_config:
-    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-  kubernetes_sd_configs:
-  - role: node
-
-  relabel_configs:
-  - action: labelmap
-    regex: __meta_kubernetes_node_label_(.+)
-
 # Scrape config for controllers.
 #
 # Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
 # the controllers.
 #
-# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
-# endpoints.
 - job_name: 'kubernetes-controllers'
 
   scheme: https
@@ -87,6 +62,27 @@ scrape_configs:
     regex: (.+)(?::\d+)
     replacement: $1:8444
 
+# Scrape config for nodes.
+#
+# Each node exposes a /metrics endpoint that contains operational metrics for
+# the Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+  kubernetes_sd_configs:
+  - role: node
+  # Drop a very high cardinality metric that is incorrect in 3.7. It will be
+  # fixed in 3.9.
+  metric_relabel_configs:
+  - source_labels: [__name__]
+    action: drop
+    regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
+  relabel_configs:
+  - action: labelmap
+    regex: __meta_kubernetes_node_label_(.+)
+
 # Scrape config for cAdvisor.
 #
 # Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
@@ -107,6 +103,14 @@ scrape_configs:
   kubernetes_sd_configs:
   - role: node
 
+  # Exclude a set of high cardinality metrics that can contribute to significant
+  # memory use in large clusters. These can be selectively enabled as necessary
+  # for medium or small clusters.
+  metric_relabel_configs:
+  - source_labels: [__name__]
+    action: drop
+    regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
+
   relabel_configs:
   - action: labelmap
     regex: __meta_kubernetes_node_label_(.+)
@@ -133,38 +137,101 @@ scrape_configs:
   - role: endpoints
 
   relabel_configs:
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
-    action: keep
-    regex: true
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
-    action: replace
-    target_label: __scheme__
-    regex: (https?)
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+    # only scrape infrastructure components
+    - source_labels: [__meta_kubernetes_namespace]
+      action: keep
+      regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
+    # drop infrastructure components managed by other scrape targets
+    - source_labels: [__meta_kubernetes_service_name]
+      action: drop
+      regex: 'prometheus-node-exporter'
+    # only those that have requested scraping
+    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+      action: keep
+      regex: true
+    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+      action: replace
+      target_label: __scheme__
+      regex: (https?)
+    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+      action: replace
+      target_label: __metrics_path__
+      regex: (.+)
+    - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+      action: replace
+      target_label: __address__
+      regex: (.+)(?::\d+);(\d+)
+      replacement: $1:$2
+    - action: labelmap
+      regex: __meta_kubernetes_service_label_(.+)
+    - source_labels: [__meta_kubernetes_namespace]
+      action: replace
+      target_label: kubernetes_namespace
+    - source_labels: [__meta_kubernetes_service_name]
+      action: replace
+      target_label: kubernetes_name
+
+# Scrape config for node-exporter, which is expected to be running on port 9100.
+- job_name: 'kubernetes-nodes-exporter'
+
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+
+  kubernetes_sd_configs:
+  - role: node
+
+  metric_relabel_configs:
+  - source_labels: [__name__]
+    action: drop
+    regex: 'node_cpu|node_(disk|scrape_collector)_.+'
+  # preserve a subset of the network, netstat, vmstat, and filesystem series
+  - source_labels: [__name__]
     action: replace
-    target_label: __metrics_path__
-    regex: (.+)
-  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+    regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
+    target_label: __name__
+    replacement: renamed_$1
+  - source_labels: [__name__]
+    action: drop
+    regex: 'node_(netstat|vmstat|filesystem|network)_.+'
+  - source_labels: [__name__]
     action: replace
+    regex: 'renamed_(.+)'
+    target_label: __name__
+    replacement: $1
+  # drop any partial expensive series
+  - source_labels: [__name__, device]
+    action: drop
+    regex: 'node_network_.+;veth.+'
+  - source_labels: [__name__, mountpoint]
+    action: drop
+    regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'
+
+  relabel_configs:
+  - source_labels: [__address__]
+    regex: '(.*):10250'
+    replacement: '${1}:9100'
     target_label: __address__
-    regex: (.+)(?::\d+);(\d+)
-    replacement: $1:$2
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
-    action: replace
-    target_label: __basic_auth_username__
-    regex: (.+)
-  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
-    action: replace
-    target_label: __basic_auth_password__
-    regex: (.+)
+  - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
+    target_label: __instance__
   - action: labelmap
-    regex: __meta_kubernetes_service_label_(.+)
-  - source_labels: [__meta_kubernetes_namespace]
-    action: replace
-    target_label: kubernetes_namespace
-  - source_labels: [__meta_kubernetes_service_name]
-    action: replace
-    target_label: kubernetes_name
+    regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for the template service broker
+- job_name: 'openshift-template-service-broker'
+  scheme: https
+  tls_config:
+    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
+    server_name: apiserver.openshift-template-service-broker.svc
+  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+  kubernetes_sd_configs:
+  - role: endpoints
+
+  relabel_configs:
+  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+    action: keep
+    regex: openshift-template-service-broker;apiserver;https
+
 alerting:
   alertmanagers:
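For reference, a minimal sketch of a Service that the rebased 'kubernetes-service-endpoints' job above would pick up. The service name, namespace, and port are illustrative assumptions, not part of the commit: the namespace only has to match the infrastructure keep-regex, and the service opts in through the prometheus.io annotations referenced in the relabel_configs.

# Illustrative example only: a Service scraped by 'kubernetes-service-endpoints'.
apiVersion: v1
kind: Service
metadata:
  name: example-exporter               # hypothetical name
  namespace: openshift-monitoring      # matches 'default|logging|metrics|kube-.+|openshift|openshift-.+'
  annotations:
    prometheus.io/scrape: "true"       # required by the keep rule on ..._prometheus_io_scrape
    prometheus.io/scheme: "https"      # optional; sets __scheme__
    prometheus.io/path: "/metrics"     # optional; sets __metrics_path__
    prometheus.io/port: "8443"         # optional; rewrites the target port in __address__
  labels:
    app: example-exporter
spec:
  selector:
    app: example-exporter
  ports:
  - name: metrics
    port: 8443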