summary | refs | log | tree | commit | diff | stats
path: root/roles/openshift_prometheus/templates
diff options
context:
space:
mode:
author Mangirdas <m.judeikis@gmail.com> 2018-01-27 08:05:31 +0000
committer Mangirdas <m.judeikis@gmail.com> 2018-01-28 08:38:36 +0000
commit 3de29f6d5a3017b57c553c5e2fb63a50994df840 (patch)
tree 8d19dd62d0375ce989751e612f32486b72e82e9c /roles/openshift_prometheus/templates
parent a24ccff0423ca25bfcb1a3d9f79470aae5948d66 (diff)
download openshift-3de29f6d5a3017b57c553c5e2fb63a50994df840.tar.gz
openshift-3de29f6d5a3017b57c553c5e2fb63a50994df840.tar.bz2
openshift-3de29f6d5a3017b57c553c5e2fb63a50994df840.tar.xz
openshift-3de29f6d5a3017b57c553c5e2fb63a50994df840.zip
Rebase Prometheus example for new scrape endpoints and expose alert manager
Diffstat (limited to 'roles/openshift_prometheus/templates')
-rw-r--r-- roles/openshift_prometheus/templates/prometheus.j2 92
-rw-r--r-- roles/openshift_prometheus/templates/prometheus.yml.j2 175
2 files changed, 192 insertions, 75 deletions
diff --git a/roles/openshift_prometheus/templates/prometheus.j2 b/roles/openshift_prometheus/templates/prometheus.j2
index d780550b8..c0abd483b 100644
--- a/roles/openshift_prometheus/templates/prometheus.j2
+++ b/roles/openshift_prometheus/templates/prometheus.j2
@@ -19,7 +19,7 @@ spec:
labels:
app: prometheus
spec:
- serviceAccountName: prometheus
+ serviceAccountName: "{{ openshift_prometheus_service_name }}"
{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
nodeSelector:
{% for key, value in openshift_prometheus_node_selector.items() %}
@@ -47,15 +47,15 @@ spec:
cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
{% endif %}
ports:
- - containerPort: 8443
+ - containerPort: {{ openshift_prometheus_service_targetport }}
name: web
args:
- -provider=openshift
- - -https-address=:8443
+ - -https-address=:{{ openshift_prometheus_service_targetport }}
- -http-address=
- -email-domain=*
- -upstream=http://localhost:9090
- - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
- '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
- '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
- -tls-cert=/etc/tls/private/tls.crt
@@ -67,9 +67,9 @@ spec:
- -skip-auth-regex=^/metrics
volumeMounts:
- mountPath: /etc/tls/private
- name: prometheus-tls
+ name: prometheus-tls-secret
- mountPath: /etc/proxy/secrets
- name: prometheus-secrets
+ name: prometheus-proxy-secret
- mountPath: /prometheus
name: prometheus-data
@@ -104,7 +104,7 @@ spec:
- mountPath: /prometheus
name: prometheus-data
- # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy
+ # Deploy alert-buffer behind oauth alerts-proxy
- name: alerts-proxy
image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
imagePullPolicy: IfNotPresent
@@ -124,15 +124,15 @@ spec:
cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
{% endif %}
ports:
- - containerPort: 9443
+ - containerPort: {{ openshift_prometheus_alerts_service_targetport }}
name: web
args:
- -provider=openshift
- - -https-address=:9443
+ - -https-address=:{{ openshift_prometheus_alerts_service_targetport }}
- -http-address=
- -email-domain=*
- -upstream=http://localhost:9099
- - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
- '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
- '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
- -tls-cert=/etc/tls/private/tls.crt
@@ -143,9 +143,9 @@ spec:
- -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
volumeMounts:
- mountPath: /etc/tls/private
- name: alerts-tls
+ name: alerts-tls-secret
- mountPath: /etc/proxy/secrets
- name: alerts-secrets
+ name: alerts-proxy-secret
- name: alert-buffer
args:
@@ -169,11 +169,54 @@ spec:
{% endif %}
volumeMounts:
- mountPath: /alert-buffer
- name: alert-buffer-data
+ name: alerts-data
ports:
- containerPort: 9099
name: alert-buf
+ # Deploy alertmanager behind oauth alertmanager-proxy
+ - name: alertmanager-proxy
+ image: "{{ l_openshift_prometheus_proxy_image_prefix }}oauth-proxy:{{ l_openshift_prometheus_proxy_image_version }}"
+ imagePullPolicy: IfNotPresent
+ requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+ memory: "{{ openshift_prometheus_oauth_proxy_memory_requests }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+ cpu: "{{ openshift_prometheus_oauth_proxy_cpu_requests }}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+ memory: "{{ openshift_prometheus_oauth_proxy_memory_limit }}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+ cpu: "{{ openshift_prometheus_oauth_proxy_cpu_limit }}"
+{% endif %}
+ ports:
+ - containerPort: {{ openshift_prometheus_alertmanager_service_targetport }}
+ name: web
+ args:
+ - -provider=openshift
+ - -https-address=:{{ openshift_prometheus_alertmanager_service_targetport }}
+ - -http-address=
+ - -email-domain=*
+ - -upstream=http://localhost:9093
+ - -client-id=system:serviceaccount:{{ namespace }}:{{ openshift_prometheus_service_name }}
+ - -openshift-ca=/etc/pki/tls/cert.pem
+ - -openshift-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+ - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+ - -tls-cert=/etc/tls/private/tls.crt
+ - -tls-key=/etc/tls/private/tls.key
+ - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+ - -cookie-secret-file=/etc/proxy/secrets/session_secret
+ - -skip-auth-regex=^/metrics
+ volumeMounts:
+ - mountPath: /etc/tls/private
+ name: alertmanager-tls-secret
+ - mountPath: /etc/proxy/secrets
+ name: alertmanager-proxy-secret
+
- name: alertmanager
args:
- -config.file=/etc/alertmanager/alertmanager.yml
@@ -205,14 +248,15 @@ spec:
restartPolicy: Always
volumes:
+
- name: prometheus-config
configMap:
defaultMode: 420
name: prometheus
- - name: prometheus-secrets
+ - name: prometheus-proxy-secret
secret:
secretName: prometheus-proxy
- - name: prometheus-tls
+ - name: prometheus-tls-secret
secret:
secretName: prometheus-tls
- name: prometheus-data
@@ -225,13 +269,19 @@ spec:
- name: alertmanager-config
configMap:
defaultMode: 420
- name: prometheus-alerts
- - name: alerts-secrets
+ name: alertmanager
+ - name: alertmanager-proxy-secret
secret:
- secretName: alerts-proxy
- - name: alerts-tls
+ secretName: alertmanager-proxy
+ - name: alertmanager-tls-secret
+ secret:
+ secretName: alertmanager-tls
+ - name: alerts-tls-secret
secret:
- secretName: prometheus-alerts-tls
+ secretName: alerts-tls
+ - name: alerts-proxy-secret
+ secret:
+ secretName: alerts-proxy
- name: alertmanager-data
{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
persistentVolumeClaim:
@@ -239,7 +289,7 @@ spec:
{% else %}
emptydir: {}
{% endif %}
- - name: alert-buffer-data
+ - name: alerts-data
{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
persistentVolumeClaim:
claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}
diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2
index 63430f834..005c2c564 100644
--- a/roles/openshift_prometheus/templates/prometheus.yml.j2
+++ b/roles/openshift_prometheus/templates/prometheus.yml.j2
@@ -1,10 +1,5 @@
rule_files:
- - 'prometheus.rules'
-{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
- - 'prometheus.additional.rules'
-{% endif %}
-
-
+ - '*.rules'
# A scrape configuration for running Prometheus on a Kubernetes cluster.
# This uses separate scrape configs for cluster components (i.e. API server, node)
@@ -39,31 +34,11 @@ scrape_configs:
action: keep
regex: default;kubernetes;https
-# Scrape config for nodes.
-#
-# Each node exposes a /metrics endpoint that contains operational metrics for
-# the Kubelet and other components.
-- job_name: 'kubernetes-nodes'
-
- scheme: https
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
- kubernetes_sd_configs:
- - role: node
-
- relabel_configs:
- - action: labelmap
- regex: __meta_kubernetes_node_label_(.+)
-
# Scrape config for controllers.
#
# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
# the controllers.
#
-# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
-# endpoints.
- job_name: 'kubernetes-controllers'
scheme: https
@@ -87,6 +62,27 @@ scrape_configs:
regex: (.+)(?::\d+)
replacement: $1:8444
+# Scrape config for nodes.
+#
+# Each node exposes a /metrics endpoint that contains operational metrics for
+# the Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+ kubernetes_sd_configs:
+ - role: node
+ # Drop a very high cardinality metric that is incorrect in 3.7. It will be
+ # fixed in 3.9.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+
# Scrape config for cAdvisor.
#
# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
@@ -107,6 +103,14 @@ scrape_configs:
kubernetes_sd_configs:
- role: node
+ # Exclude a set of high cardinality metrics that can contribute to significant
+ # memory use in large clusters. These can be selectively enabled as necessary
+ # for medium or small clusters.
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
+
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
@@ -133,38 +137,101 @@ scrape_configs:
- role: endpoints
relabel_configs:
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
- action: keep
- regex: true
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
- action: replace
- target_label: __scheme__
- regex: (https?)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ # only scrape infrastructure components
+ - source_labels: [__meta_kubernetes_namespace]
+ action: keep
+ regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
+ # drop infrastructure components managed by other scrape targets
+ - source_labels: [__meta_kubernetes_service_name]
+ action: drop
+ regex: 'prometheus-node-exporter'
+ # only those that have requested scraping
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ action: replace
+ target_label: __address__
+ regex: (.+)(?::\d+);(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels: [__meta_kubernetes_service_name]
+ action: replace
+ target_label: kubernetes_name
+
+# Scrape config for node-exporter, which is expected to be running on port 9100.
+- job_name: 'kubernetes-nodes-exporter'
+
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+
+ kubernetes_sd_configs:
+ - role: node
+
+ metric_relabel_configs:
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_cpu|node_(disk|scrape_collector)_.+'
+ # preserve a subset of the network, netstat, vmstat, and filesystem series
+ - source_labels: [__name__]
action: replace
- target_label: __metrics_path__
- regex: (.+)
- - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
+ target_label: __name__
+ replacement: renamed_$1
+ - source_labels: [__name__]
+ action: drop
+ regex: 'node_(netstat|vmstat|filesystem|network)_.+'
+ - source_labels: [__name__]
action: replace
+ regex: 'renamed_(.+)'
+ target_label: __name__
+ replacement: $1
+ # drop any partial expensive series
+ - source_labels: [__name__, device]
+ action: drop
+ regex: 'node_network_.+;veth.+'
+ - source_labels: [__name__, mountpoint]
+ action: drop
+ regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'
+
+ relabel_configs:
+ - source_labels: [__address__]
+ regex: '(.*):10250'
+ replacement: '${1}:9100'
target_label: __address__
- regex: (.+)(?::\d+);(\d+)
- replacement: $1:$2
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
- action: replace
- target_label: __basic_auth_username__
- regex: (.+)
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
- action: replace
- target_label: __basic_auth_password__
- regex: (.+)
+ - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
+ target_label: __instance__
- action: labelmap
- regex: __meta_kubernetes_service_label_(.+)
- - source_labels: [__meta_kubernetes_namespace]
- action: replace
- target_label: kubernetes_namespace
- - source_labels: [__meta_kubernetes_service_name]
- action: replace
- target_label: kubernetes_name
+ regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for the template service broker
+- job_name: 'openshift-template-service-broker'
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
+ server_name: apiserver.openshift-template-service-broker.svc
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+ action: keep
+ regex: openshift-template-service-broker;apiserver;https
+
alerting:
alertmanagers: