From ec75d0ac888f3fab87f8d335224596df045e260a Mon Sep 17 00:00:00 2001 From: Zohar Galor Date: Tue, 20 Jun 2017 17:28:07 +0300 Subject: Create ansible role for deploying prometheus on openshift A new role for installing prometheus on openshift. Depends on `openshift_hosted_prometheus_deploy` flag role creates: - prometheus namespace - prometheus clusterrolebinding and service account - pvs for prometheus, alertmanager and alertbuffer for internal nfs - prometheus pod with prometheus behind oauth-proxy, alertmanager and alert-buffer behind oauth-proxy - prometheus and alertmanager configmaps - prometheus and alerts services and direct routes - prometheus, alertmanager and alert-buffer pvcs --- .../templates/alertmanager.yml.j2 | 20 ++ .../templates/prom-pv-alertbuffer.yml.j2 | 15 ++ .../templates/prom-pv-alertmanager.yml.j2 | 15 ++ .../templates/prom-pv-server.yml.j2 | 15 ++ .../templates/prometheus.rules.j2 | 4 + .../templates/prometheus.yml.j2 | 174 +++++++++++++++ .../templates/prometheus_deployment.j2 | 240 +++++++++++++++++++++ 7 files changed, 483 insertions(+) create mode 100644 roles/openshift_prometheus/templates/alertmanager.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prom-pv-server.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prometheus.rules.j2 create mode 100644 roles/openshift_prometheus/templates/prometheus.yml.j2 create mode 100644 roles/openshift_prometheus/templates/prometheus_deployment.j2 (limited to 'roles/openshift_prometheus/templates') diff --git a/roles/openshift_prometheus/templates/alertmanager.yml.j2 b/roles/openshift_prometheus/templates/alertmanager.yml.j2 new file mode 100644 index 000000000..6c432a3d0 --- /dev/null +++ b/roles/openshift_prometheus/templates/alertmanager.yml.j2 @@ -0,0 +1,20 @@ +global: + +# The root route on which each incoming alert enters. +route: + # default route if none match + receiver: alert-buffer-wh + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # TODO: + group_by: [] + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + +receivers: +- name: alert-buffer-wh + webhook_configs: + - url: http://localhost:9099/topics/alerts diff --git a/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 new file mode 100644 index 000000000..55a5e19c3 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-alertbuffer + labels: + storage: prometheus-alertbuffer +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus-alertbuffer + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 new file mode 100644 index 000000000..4ee518735 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-alertmanager + labels: + storage: prometheus-alertmanager +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus-alertmanager + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 new file mode 100644 index 000000000..933bf0f60 --- /dev/null +++ b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus + labels: + storage: prometheus +spec: + capacity: + storage: 15Gi + accessModes: + - ReadWriteOnce + nfs: + path: /exports/prometheus + server: {{ openshift_prometheus_nfs_server }} + persistentVolumeReclaimPolicy: Retain diff --git a/roles/openshift_prometheus/templates/prometheus.rules.j2 b/roles/openshift_prometheus/templates/prometheus.rules.j2 new file mode 100644 index 000000000..e861dc127 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.rules.j2 @@ -0,0 +1,4 @@ +groups: +- name: example-rules + interval: 30s # defaults to global interval + rules: diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2 new file mode 100644 index 000000000..63430f834 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus.yml.j2 @@ -0,0 +1,174 @@ +rule_files: + - 'prometheus.rules' +{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %} + - 'prometheus.additional.rules' +{% endif %} + + + +# A scrape configuration for running Prometheus on a Kubernetes cluster. +# This uses separate scrape configs for cluster components (i.e. API server, node) +# and services to allow each to use different authentication configs. +# +# Kubernetes labels will be added as Prometheus labels on metrics via the +# `labelmap` relabeling action. + +# Scrape config for API servers. +# +# Kubernetes exposes API servers as endpoints to the default/kubernetes +# service so this uses `endpoints` role and uses relabelling to only keep +# the endpoints associated with the default/kubernetes service using the +# default named port `https`. This works for single API server deployments as +# well as HA API server deployments. +scrape_configs: +- job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + +# Scrape config for nodes. +# +# Each node exposes a /metrics endpoint that contains operational metrics for +# the Kubelet and other components. +- job_name: 'kubernetes-nodes' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for controllers. +# +# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for +# the controllers. +# +# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via +# endpoints. +- job_name: 'kubernetes-controllers' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: endpoints + + # Keep only the default/kubernetes service endpoints for the https port, and then + # set the port to 8444. This is the default configuration for the controllers on OpenShift + # masters. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - source_labels: [__address__] + action: replace + target_label: __address__ + regex: (.+)(?::\d+) + replacement: $1:8444 + +# Scrape config for cAdvisor. +# +# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that +# reports container metrics for each running pod. Scrape those by default. +- job_name: 'kubernetes-cadvisor' + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + +{% if kubernetes_version | float() >= 1.7 | float() %} + metrics_path: /metrics/cadvisor +{% else %} + metrics_path: /metrics +{% endif %} + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + +# Scrape config for service endpoints. +# +# The relabeling allows the actual service scrape endpoint to be configured +# via the following annotations: +# +# * `prometheus.io/scrape`: Only scrape services that have a value of `true` +# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need +# to set this to `https` & most likely set the `tls_config` of the scrape config. +# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. +# * `prometheus.io/port`: If the metrics are exposed on a different port to the +# service then set this appropriately. +- job_name: 'kubernetes-service-endpoints' + + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # TODO: this should be per target + insecure_skip_verify: true + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username] + action: replace + target_label: __basic_auth_username__ + regex: (.+) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password] + action: replace + target_label: __basic_auth_password__ + regex: (.+) + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - "localhost:9093" diff --git a/roles/openshift_prometheus/templates/prometheus_deployment.j2 b/roles/openshift_prometheus/templates/prometheus_deployment.j2 new file mode 100644 index 000000000..98c117f19 --- /dev/null +++ b/roles/openshift_prometheus/templates/prometheus_deployment.j2 @@ -0,0 +1,240 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: prometheus + namespace: {{ namespace }} + labels: + app: prometheus +spec: + replicas: {{ prom_replicas|default(1) }} + selector: + provider: openshift + matchLabels: + app: prometheus + template: + metadata: + name: prometheus + labels: + app: prometheus + spec: + serviceAccountName: prometheus +{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %} + nodeSelector: +{% for key, value in openshift_prometheus_node_selector.iteritems() %} + {{key}}: "{{value}}" +{% endfor %} +{% endif %} + containers: + # Deploy Prometheus behind an oauth proxy + - name: prom-proxy + image: "{{ openshift_prometheus_image_proxy }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_memory_requests_limit_proxy is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} + ports: + - containerPort: 8443 + name: web + args: + - -provider=openshift + - -https-address=:8443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:9090 + - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret-file=/etc/proxy/secrets/session_secret + - -skip-auth-regex=^/metrics + volumeMounts: + - mountPath: /etc/tls/private + name: prometheus-tls + - mountPath: /etc/proxy/secrets + name: prometheus-secrets + - mountPath: /prometheus + name: prometheus-data + + - name: prometheus + args: + - --storage.tsdb.retention=6h + - --config.file=/etc/prometheus/prometheus.yml + - --web.listen-address=localhost:9090 + image: "{{ openshift_prometheus_image_prometheus }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_memory_requests is defined and openshift_prometheus_memory_requests is not none %} + memory: "{{openshift_prometheus_memory_requests}}" +{% endif %} +{% if openshift_prometheus_cpu_requests is defined and openshift_prometheus_cpu_requests is not none %} + cpu: "{{openshift_prometheus_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_memory_limit is defined and openshift_prometheus_memory_limit is not none %} + memory: "{{ openshift_prometheus_memory_limit }}" +{% endif %} +{% if openshift_prometheus_cpu_limit is defined and openshift_prometheus_cpu_limit is not none %} + cpu: "{{openshift_prometheus_cpu_limit}}" +{% endif %} + + volumeMounts: + - mountPath: /etc/prometheus + name: prometheus-config + - mountPath: /prometheus + name: prometheus-data + + # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy + - name: alerts-proxy + image: "{{ openshift_prometheus_image_proxy }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %} + memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}" +{% endif %} +{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %} + cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}" +{% endif %} + ports: + - containerPort: 9443 + name: web + args: + - -provider=openshift + - -https-address=:9443 + - -http-address= + - -email-domain=* + - -upstream=http://localhost:9099 + - -client-id=system:serviceaccount:{{ namespace }}:prometheus + - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}' + - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}' + - -tls-cert=/etc/tls/private/tls.crt + - -tls-key=/etc/tls/private/tls.key + - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token + - -cookie-secret-file=/etc/proxy/secrets/session_secret + volumeMounts: + - mountPath: /etc/tls/private + name: alerts-tls + - mountPath: /etc/proxy/secrets + name: alerts-secrets + + - name: alert-buffer + args: + - --storage-path=/alert-buffer/messages.db + image: "{{ openshift_prometheus_image_alertbuffer }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_alertbuffer_memory_requests is defined and openshift_prometheus_alertbuffer_memory_requests is not none %} + memory: "{{openshift_prometheus_alertbuffer_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_requests is defined and openshift_prometheus_alertbuffer_cpu_requests is not none %} + cpu: "{{openshift_prometheus_alertbuffer_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_alertbuffer_memory_limit is defined and openshift_prometheus_alertbuffer_memory_limit is not none %} + memory: "{{openshift_prometheus_alertbuffer_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertbuffer_cpu_limit is defined and openshift_prometheus_alertbuffer_cpu_limit is not none %} + cpu: "{{openshift_prometheus_alertbuffer_cpu_limit}}" +{% endif %} + volumeMounts: + - mountPath: /alert-buffer + name: alert-buffer-data + ports: + - containerPort: 9099 + name: alert-buf + + - name: alertmanager + args: + - -config.file=/etc/alertmanager/alertmanager.yml + image: "{{ openshift_prometheus_image_alertmanager }}" + imagePullPolicy: IfNotPresent + resources: + requests: +{% if openshift_prometheus_alertmanager_memory_requests is defined and openshift_prometheus_alertmanager_memory_requests is not none %} + memory: "{{openshift_prometheus_alertmanager_memory_requests}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_requests is defined and openshift_prometheus_alertmanager_cpu_requests is not none %} + cpu: "{{openshift_prometheus_alertmanager_cpu_requests}}" +{% endif %} + limits: +{% if openshift_prometheus_alertmanager_memory_limit is defined and openshift_prometheus_alertmanager_memory_limit is not none %} + memory: "{{openshift_prometheus_alertmanager_memory_limit}}" +{% endif %} +{% if openshift_prometheus_alertmanager_cpu_limit is defined and openshift_prometheus_alertmanager_cpu_limit is not none %} + cpu: "{{openshift_prometheus_alertmanager_cpu_limit}}" +{% endif %} + ports: + - containerPort: 9093 + name: web + volumeMounts: + - mountPath: /etc/alertmanager + name: alertmanager-config + - mountPath: /alertmanager + name: alertmanager-data + + restartPolicy: Always + volumes: + - name: prometheus-config + configMap: + defaultMode: 420 + name: prometheus + - name: prometheus-secrets + secret: + secretName: prometheus-proxy + - name: prometheus-tls + secret: + secretName: prometheus-tls + - name: prometheus-data +{% if openshift_prometheus_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_pvc_name }} +{% else %} + emptydir: {} +{% endif %} + - name: alertmanager-config + configMap: + defaultMode: 420 + name: prometheus-alerts + - name: alerts-secrets + secret: + secretName: alerts-proxy + - name: alerts-tls + secret: + secretName: prometheus-alerts-tls + - name: alertmanager-data +{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_alertmanager_pvc_name }} +{% else %} + emptydir: {} +{% endif %} + - name: alert-buffer-data +{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %} + persistentVolumeClaim: + claimName: {{ openshift_prometheus_alertbuffer_pvc_name }} +{% else %} + emptydir: {} +{% endif %} -- cgit v1.2.1