summaryrefslogtreecommitdiffstats
path: root/roles
diff options
context:
space:
mode:
authorScott Dodson <sdodson@redhat.com>2017-09-06 10:32:39 -0400
committerGitHub <noreply@github.com>2017-09-06 10:32:39 -0400
commitbb59f5617bce05d61f594efe6e602d307bc6f200 (patch)
tree9b31d02dc5a8d8986e08901d6ea4ffd5ef07ca98 /roles
parent5aaa24b25653cca479955eb39d353caee0fcf373 (diff)
parentec75d0ac888f3fab87f8d335224596df045e260a (diff)
downloadopenshift-bb59f5617bce05d61f594efe6e602d307bc6f200.tar.gz
openshift-bb59f5617bce05d61f594efe6e602d307bc6f200.tar.bz2
openshift-bb59f5617bce05d61f594efe6e602d307bc6f200.tar.xz
openshift-bb59f5617bce05d61f594efe6e602d307bc6f200.zip
Merge pull request #4509 from zgalor/prometheus-role
Create ansible role for deploying prometheus on openshift
Diffstat (limited to 'roles')
-rw-r--r--roles/openshift_prometheus/README.md95
-rw-r--r--roles/openshift_prometheus/defaults/main.yaml74
-rw-r--r--roles/openshift_prometheus/files/openshift_prometheus.exports3
-rw-r--r--roles/openshift_prometheus/meta/main.yaml19
-rw-r--r--roles/openshift_prometheus/tasks/create_pvs.yaml36
-rw-r--r--roles/openshift_prometheus/tasks/install_prometheus.yaml241
-rw-r--r--roles/openshift_prometheus/tasks/main.yaml26
-rw-r--r--roles/openshift_prometheus/tasks/nfs.yaml44
-rw-r--r--roles/openshift_prometheus/templates/alertmanager.yml.j220
-rw-r--r--roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j215
-rw-r--r--roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j215
-rw-r--r--roles/openshift_prometheus/templates/prom-pv-server.yml.j215
-rw-r--r--roles/openshift_prometheus/templates/prometheus.rules.j24
-rw-r--r--roles/openshift_prometheus/templates/prometheus.yml.j2174
-rw-r--r--roles/openshift_prometheus/templates/prometheus_deployment.j2240
-rw-r--r--roles/openshift_prometheus/tests/inventory2
-rw-r--r--roles/openshift_prometheus/tests/test.yaml5
17 files changed, 1028 insertions, 0 deletions
diff --git a/roles/openshift_prometheus/README.md b/roles/openshift_prometheus/README.md
new file mode 100644
index 000000000..c5a44bffb
--- /dev/null
+++ b/roles/openshift_prometheus/README.md
@@ -0,0 +1,95 @@
+OpenShift Prometheus
+====================
+
+OpenShift Prometheus Installation
+
+Requirements
+------------
+
+
+Role Variables
+--------------
+
+For default values, see [`defaults/main.yaml`](defaults/main.yaml).
+
+- `openshift_prometheus_state`: present - install/update. absent - uninstall.
+
+- `openshift_prometheus_namespace`: project (i.e. namespace) where the components will be
+ deployed.
+
+- `openshift_prometheus_replicas`: The number of replicas for prometheus deployment.
+
+- `openshift_prometheus_node_selector`: Selector for the nodes prometheus will be deployed on.
+
+- `openshift_prometheus_image_<COMPONENT>`: specify image for the component
+
+## Storage related variables
+Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can set pv claim by setting corresponding role variable:
+```
+openshift_prometheus_<COMPONENT>_storage_type: <VALUE>
+openshift_prometheus_<COMPONENT>_pvc_(name|size|access_modes|pv_selector): <VALUE>
+```
+e.g
+```
+openshift_prometheus_storage_type: pvc
+openshift_prometheus_alertmanager_pvc_name: alertmanager
+openshift_prometheus_alertbuffer_pvc_size: 10G
+openshift_prometheus_pvc_access_modes: [ReadWriteOnce]
+```
+
+## Additional Alert Rules file variable
+An external file with alert rules can be added by setting path to additional rules variable:
+```
+openshift_prometheus_additional_rules_file: <PATH>
+```
+
+File content should be in prometheus alert rules format.
+Following example sets rule to fire an alert when one of the cluster nodes is down:
+
+```
+groups:
+- name: example-rules
+ interval: 30s # defaults to global interval
+ rules:
+ - alert: Node Down
+ expr: up{job="kubernetes-nodes"} == 0
+ annotations:
+ miqTarget: "ContainerNode"
+ severity: "HIGH"
+ message: "{{ '{{' }}{{ '$labels.instance' }}{{ '}}' }} is down"
+```
+
+
+## Additional variables to control resource limits
+Each prometheus component (prometheus, alertmanager, alert-buffer, oauth-proxy) can specify a cpu and memory limits and requests by setting
+the corresponding role variable:
+```
+openshift_prometheus_<COMPONENT>_(limits|requests)_(memory|cpu): <VALUE>
+```
+e.g
+```
+openshift_prometheus_alertmanager_limits_memory: 1Gi
+openshift_prometheus_oath_proxy_requests_cpu: 100
+```
+
+Dependencies
+------------
+
+openshift_facts
+
+
+Example Playbook
+----------------
+
+```
+- name: Configure openshift-prometheus
+ hosts: oo_first_master
+ roles:
+ - role: openshift_prometheus
+```
+
+License
+-------
+
+Apache License, Version 2.0
+
diff --git a/roles/openshift_prometheus/defaults/main.yaml b/roles/openshift_prometheus/defaults/main.yaml
new file mode 100644
index 000000000..18d6a1645
--- /dev/null
+++ b/roles/openshift_prometheus/defaults/main.yaml
@@ -0,0 +1,74 @@
+---
+# defaults file for openshift_prometheus
+openshift_prometheus_state: present
+
+openshift_prometheus_namespace: prometheus
+
+openshift_prometheus_replicas: 1
+openshift_prometheus_node_selector: {"region":"infra"}
+
+# images
+openshift_prometheus_image_proxy: "openshift/oauth-proxy:v1.0.0"
+openshift_prometheus_image_prometheus: "openshift/prometheus:v2.0.0-dev"
+openshift_prometheus_image_alertmanager: "openshift/prometheus-alertmanager:dev"
+openshift_prometheus_image_alertbuffer: "ilackarms/message-buffer"
+
+# additional prometheus rules file
+openshift_prometheus_additional_rules_file: null
+
+# All the required exports
+openshift_prometheus_pv_exports:
+ - prometheus
+ - prometheus-alertmanager
+ - prometheus-alertbuffer
+# PV template files and their created object names
+openshift_prometheus_pv_data:
+ - pv_name: prometheus
+ pv_template: prom-pv-server.yml
+ pv_label: Prometheus Server PV
+ - pv_name: prometheus-alertmanager
+ pv_template: prom-pv-alertmanager.yml
+ pv_label: Prometheus Alertmanager PV
+ - pv_name: prometheus-alertbuffer
+ pv_template: prom-pv-alertbuffer.yml
+ pv_label: Prometheus Alert Buffer PV
+
+# Hostname/IP of the NFS server. Currently defaults to first master
+openshift_prometheus_nfs_server: "{{ groups.nfs.0 }}"
+
+# storage
+openshift_prometheus_storage_type: pvc
+openshift_prometheus_pvc_name: prometheus
+openshift_prometheus_pvc_size: 10G
+openshift_prometheus_pvc_access_modes: [ReadWriteOnce]
+openshift_prometheus_pvc_pv_selector: {}
+
+openshift_prometheus_alertmanager_storage_type: pvc
+openshift_prometheus_alertmanager_pvc_name: prometheus-alertmanager
+openshift_prometheus_alertmanager_pvc_size: 10G
+openshift_prometheus_alertmanager_pvc_access_modes: [ReadWriteOnce]
+openshift_prometheus_alertmanager_pvc_pv_selector: {}
+
+openshift_prometheus_alertbuffer_storage_type: pvc
+openshift_prometheus_alertbuffer_pvc_name: prometheus-alertbuffer
+openshift_prometheus_alertbuffer_pvc_size: 10G
+openshift_prometheus_alertbuffer_pvc_access_modes: [ReadWriteOnce]
+openshift_prometheus_alertbuffer_pvc_pv_selector: {}
+
+# container resources
+openshift_prometheus_cpu_limit: null
+openshift_prometheus_memory_limit: null
+openshift_prometheus_cpu_requests: null
+openshift_prometheus_memory_requests: null
+openshift_prometheus_alertmanager_cpu_limit: null
+openshift_prometheus_alertmanager_memory_limit: null
+openshift_prometheus_alertmanager_cpu_requests: null
+openshift_prometheus_alertmanager_memory_requests: null
+openshift_prometheus_alertbuffer_cpu_limit: null
+openshift_prometheus_alertbuffer_memory_limit: null
+openshift_prometheus_alertbuffer_cpu_requests: null
+openshift_prometheus_alertbuffer_memory_requests: null
+openshift_prometheus_oauth_proxy_cpu_limit: null
+openshift_prometheus_oauth_proxy_memory_limit: null
+openshift_prometheus_oauth_proxy_cpu_requests: null
+openshift_prometheus_oauth_proxy_memory_requests: null
diff --git a/roles/openshift_prometheus/files/openshift_prometheus.exports b/roles/openshift_prometheus/files/openshift_prometheus.exports
new file mode 100644
index 000000000..3ccedb1fd
--- /dev/null
+++ b/roles/openshift_prometheus/files/openshift_prometheus.exports
@@ -0,0 +1,3 @@
+/exports/prometheus *(rw,no_root_squash,no_wdelay)
+/exports/prometheus-alertmanager *(rw,no_root_squash,no_wdelay)
+/exports/prometheus-alertbuffer *(rw,no_root_squash,no_wdelay)
diff --git a/roles/openshift_prometheus/meta/main.yaml b/roles/openshift_prometheus/meta/main.yaml
new file mode 100644
index 000000000..33188bb7e
--- /dev/null
+++ b/roles/openshift_prometheus/meta/main.yaml
@@ -0,0 +1,19 @@
+---
+galaxy_info:
+ author: OpenShift Development <dev@lists.openshift.redhat.com>
+ description: Deploy OpenShift prometheus integration for the cluster
+ company: Red Hat, Inc.
+ license: license (Apache)
+ min_ansible_version: 2.2
+ platforms:
+ - name: EL
+ versions:
+ - 7
+ - name: Fedora
+ versions:
+ - all
+ categories:
+ - openshift
+dependencies:
+- { role: lib_openshift }
+- { role: openshift_facts }
diff --git a/roles/openshift_prometheus/tasks/create_pvs.yaml b/roles/openshift_prometheus/tasks/create_pvs.yaml
new file mode 100644
index 000000000..4e79da05f
--- /dev/null
+++ b/roles/openshift_prometheus/tasks/create_pvs.yaml
@@ -0,0 +1,36 @@
+---
+# Check for existance and then conditionally:
+# - evaluate templates
+# - PVs
+#
+# These tasks idempotently create required Prometheus PV objects. Do not
+# call this file directly. This file is intended to be ran as an
+# include that has a 'with_items' attached to it. Hence the use below
+# of variables like "{{ item.pv_label }}"
+
+- name: "Check if the {{ item.pv_label }} template has been created already"
+ oc_obj:
+ namespace: "{{ openshift_prometheus_namespace }}"
+ state: list
+ kind: pv
+ name: "{{ item.pv_name }}"
+ register: prom_pv_check
+
+# Skip all of this if the PV already exists
+- block:
+ - name: "Ensure the {{ item.pv_label }} template is evaluated"
+ template:
+ src: "{{ item.pv_template }}.j2"
+ dest: "{{ tempdir }}/templates/{{ item.pv_template }}"
+
+ - name: "Ensure {{ item.pv_label }} is created"
+ oc_obj:
+ namespace: "{{ openshift_prometheus_namespace }}"
+ kind: pv
+ name: "{{ item.pv_name }}"
+ state: present
+ delete_after: True
+ files:
+ - "{{ tempdir }}/templates/{{ item.pv_template }}"
+ when:
+ - not prom_pv_check.results.results.0
diff --git a/roles/openshift_prometheus/tasks/install_prometheus.yaml b/roles/openshift_prometheus/tasks/install_prometheus.yaml
new file mode 100644
index 000000000..93bdda3e8
--- /dev/null
+++ b/roles/openshift_prometheus/tasks/install_prometheus.yaml
@@ -0,0 +1,241 @@
+---
+
+# namespace
+- name: Add prometheus project
+ oc_project:
+ state: "{{ state }}"
+ name: "{{ openshift_prometheus_namespace }}"
+ node_selector: "{{ openshift_prometheus_node_selector | oo_selector_to_string_list() }}"
+ description: Prometheus
+
+# secrets
+- name: Set alert and prometheus secrets
+ oc_secret:
+ state: "{{ state }}"
+ name: "{{ item }}-proxy"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ contents:
+ - path: session_secret
+ data: "{{ 43 | oo_random_word }}="
+ with_items:
+ - prometheus
+ - alerts
+
+# serviceaccount
+- name: create prometheus serviceaccount
+ oc_serviceaccount:
+ state: "{{ state }}"
+ name: prometheus
+ namespace: "{{ openshift_prometheus_namespace }}"
+ # TODO add annotations when supproted
+ # annotations:
+ # serviceaccounts.openshift.io/oauth-redirectreference.prom: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
+ # serviceaccounts.openshift.io/oauth-redirectreference.alerts: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
+
+ secrets:
+ - prometheus-secrets
+ changed_when: no
+
+# TODO remove this when annotations are supported by oc_serviceaccount
+- name: annotate serviceaccount
+ command: >
+ {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
+ serviceaccount prometheus
+ serviceaccounts.openshift.io/oauth-redirectreference.prom='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"prometheus"}}'
+ serviceaccounts.openshift.io/oauth-redirectreference.alerts='{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"alerts"}}'
+
+
+# create clusterrolebinding for prometheus serviceaccount
+- name: Set cluster-reader permissions for prometheus
+ oc_adm_policy_user:
+ state: "{{ state }}"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ resource_kind: cluster-role
+ resource_name: cluster-reader
+ user: "system:serviceaccount:{{ openshift_prometheus_namespace }}:prometheus"
+
+
+######################################################################
+# NFS
+# In the case that we are not running on a cloud provider, volumes must be statically provisioned
+
+- include: nfs.yaml
+ when: not (openshift_cloudprovider_kind is defined and (openshift_cloudprovider_kind == 'aws' or openshift_cloudprovider_kind == 'gce'))
+
+
+# create prometheus and alerts services
+# TODO join into 1 task with loop
+- name: Create prometheus service
+ oc_service:
+ state: "{{ state }}"
+ name: "{{ item.name }}"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ selector:
+ app: prometheus
+ labels:
+ name: "{{ item.name }}"
+ # TODO add annotations when supported
+ # annotations:
+ # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+ ports:
+ - port: 443
+ targetPort: 8443
+ with_items:
+ - name: prometheus
+
+- name: Create alerts service
+ oc_service:
+ state: "{{ state }}"
+ name: "{{ item.name }}"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ selector:
+ app: prometheus
+ labels:
+ name: "{{ item.name }}"
+ # TODO add annotations when supported
+ # annotations:
+ # service.alpha.openshift.io/serving-cert-secret-name: "{{item.name}}-tls"
+ ports:
+ - port: 443
+ targetPort: 9443
+ with_items:
+ - name: alerts
+
+
+# Annotate services with secret name
+# TODO remove this when annotations are supported by oc_service
+- name: annotate prometheus service
+ command: >
+ {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
+ service prometheus 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-tls'
+
+- name: annotate alerts service
+ command: >
+ {{ openshift.common.client_binary }} annotate --overwrite -n {{ openshift_prometheus_namespace }}
+ service alerts 'service.alpha.openshift.io/serving-cert-secret-name=prometheus-alerts-tls'
+
+# create prometheus and alerts routes
+- name: create prometheus and alerts routes
+ oc_route:
+ state: "{{ state }}"
+ name: "{{ item.name }}"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ service_name: "{{ item.name }}"
+ tls_termination: reencrypt
+ with_items:
+ - name: prometheus
+ - name: alerts
+
+# Storage
+- name: create prometheus pvc
+ oc_pvc:
+ namespace: "{{ openshift_prometheus_namespace }}"
+ name: "{{ openshift_prometheus_pvc_name }}"
+ access_modes: "{{ openshift_prometheus_pvc_access_modes }}"
+ volume_capacity: "{{ openshift_prometheus_pvc_size }}"
+ selector: "{{ openshift_prometheus_pvc_pv_selector }}"
+
+- name: create alertmanager pvc
+ oc_pvc:
+ namespace: "{{ openshift_prometheus_namespace }}"
+ name: "{{ openshift_prometheus_alertmanager_pvc_name }}"
+ access_modes: "{{ openshift_prometheus_alertmanager_pvc_access_modes }}"
+ volume_capacity: "{{ openshift_prometheus_alertmanager_pvc_size }}"
+ selector: "{{ openshift_prometheus_alertmanager_pvc_pv_selector }}"
+
+- name: create alertbuffer pvc
+ oc_pvc:
+ namespace: "{{ openshift_prometheus_namespace }}"
+ name: "{{ openshift_prometheus_alertbuffer_pvc_name }}"
+ access_modes: "{{ openshift_prometheus_alertbuffer_pvc_access_modes }}"
+ volume_capacity: "{{ openshift_prometheus_alertbuffer_pvc_size }}"
+ selector: "{{ openshift_prometheus_alertbuffer_pvc_pv_selector }}"
+
+# create prometheus deployment
+- name: Set prometheus deployment template
+ template:
+ src: prometheus_deployment.j2
+ dest: "{{ tempdir }}/templates/prometheus.yaml"
+ vars:
+ namespace: "{{ openshift_prometheus_namespace }}"
+ prom_replicas: "{{ openshift_prometheus_replicas }}"
+
+- name: Set prometheus deployment
+ oc_obj:
+ state: "{{ state }}"
+ name: "prometheus"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ kind: deployment
+ files:
+ - "{{ tempdir }}/templates/prometheus.yaml"
+ delete_after: true
+
+# prometheus configmap
+# Copy the additional rules file if it is defined
+- name: Copy additional rules file to host
+ copy:
+ src: "{{ openshift_prometheus_additional_rules_file }}"
+ dest: "{{ tempdir }}/prometheus.additional.rules"
+ when:
+ - openshift_prometheus_additional_rules_file is defined
+ - openshift_prometheus_additional_rules_file is not none
+ - openshift_prometheus_additional_rules_file | trim | length > 0
+
+- stat:
+ path: "{{ tempdir }}/prometheus.additional.rules"
+ register: additional_rules_stat
+
+# The kubernetes version impacts the prometheus scraping endpoint
+# so gathering it before constructing the configmap
+- name: get oc version
+ oc_version:
+ register: oc_version
+
+- set_fact:
+ kubernetes_version: "{{ oc_version.results.kubernetes_short | float }}"
+
+- template:
+ src: prometheus.yml.j2
+ dest: "{{ tempdir }}/prometheus.yml"
+ changed_when: no
+
+- template:
+ src: prometheus.rules.j2
+ dest: "{{ tempdir }}/prometheus.rules"
+ changed_when: no
+
+# In prometheus configmap create "additional.rules" section if file exists
+- name: Set prometheus configmap
+ oc_configmap:
+ state: "{{ state }}"
+ name: "prometheus"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ from_file:
+ prometheus.rules: "{{ tempdir }}/prometheus.rules"
+ prometheus.additional.rules: "{{ tempdir }}/prometheus.additional.rules"
+ prometheus.yml: "{{ tempdir }}/prometheus.yml"
+ when: additional_rules_stat.stat.exists == True
+
+- name: Set prometheus configmap
+ oc_configmap:
+ state: "{{ state }}"
+ name: "prometheus"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ from_file:
+ prometheus.rules: "{{ tempdir }}/prometheus.rules"
+ prometheus.yml: "{{ tempdir }}/prometheus.yml"
+ when: additional_rules_stat.stat.exists == False
+
+# alertmanager configmap
+- template:
+ src: alertmanager.yml.j2
+ dest: "{{ tempdir }}/alertmanager.yml"
+ changed_when: no
+
+- name: Set alertmanager configmap
+ oc_configmap:
+ state: "{{ state }}"
+ name: "prometheus-alerts"
+ namespace: "{{ openshift_prometheus_namespace }}"
+ from_file:
+ alertmanager.yml: "{{ tempdir }}/alertmanager.yml"
diff --git a/roles/openshift_prometheus/tasks/main.yaml b/roles/openshift_prometheus/tasks/main.yaml
new file mode 100644
index 000000000..523a64334
--- /dev/null
+++ b/roles/openshift_prometheus/tasks/main.yaml
@@ -0,0 +1,26 @@
+---
+
+- name: Create temp directory for doing work in on target
+ command: mktemp -td openshift-prometheus-ansible-XXXXXX
+ register: mktemp
+ changed_when: False
+
+- set_fact:
+ tempdir: "{{ mktemp.stdout }}"
+
+- name: Create templates subdirectory
+ file:
+ state: directory
+ path: "{{ tempdir }}/templates"
+ mode: 0755
+ changed_when: False
+
+- include: install_prometheus.yaml
+ vars:
+ state: "{{ openshift_prometheus_state }}"
+
+- name: Delete temp directory
+ file:
+ name: "{{ tempdir }}"
+ state: absent
+ changed_when: False
diff --git a/roles/openshift_prometheus/tasks/nfs.yaml b/roles/openshift_prometheus/tasks/nfs.yaml
new file mode 100644
index 000000000..0b45f2cee
--- /dev/null
+++ b/roles/openshift_prometheus/tasks/nfs.yaml
@@ -0,0 +1,44 @@
+---
+# Tasks to statically provision NFS volumes
+# Include if not using dynamic volume provisioning
+- name: Ensure the /exports/ directory exists
+ file:
+ path: /exports/
+ state: directory
+ mode: 0755
+ owner: root
+ group: root
+
+- name: Ensure the prom-pv0X export directories exist
+ file:
+ path: "/exports/{{ item }}"
+ state: directory
+ mode: 0777
+ owner: nfsnobody
+ group: nfsnobody
+ with_items: "{{ openshift_prometheus_pv_exports }}"
+
+- name: Ensure the NFS exports for Prometheus PVs exist
+ copy:
+ src: openshift_prometheus.exports
+ dest: /etc/exports.d/openshift_prometheus.exports
+ register: nfs_exports_updated
+
+- name: Ensure the NFS export table is refreshed if exports were added
+ command: exportfs -ar
+ when:
+ - nfs_exports_updated.changed
+
+
+######################################################################
+# Create the required Prometheus PVs. Check out these online docs if you
+# need a refresher on includes looping with items:
+# * http://docs.ansible.com/ansible/playbooks_loops.html#loops-and-includes-in-2-0
+# * http://stackoverflow.com/a/35128533
+#
+# TODO: Handle the case where a PV template is updated in
+# openshift-ansible and the change needs to be landed on the managed
+# cluster.
+
+- include: create_pvs.yaml
+ with_items: "{{ openshift_prometheus_pv_data }}"
diff --git a/roles/openshift_prometheus/templates/alertmanager.yml.j2 b/roles/openshift_prometheus/templates/alertmanager.yml.j2
new file mode 100644
index 000000000..6c432a3d0
--- /dev/null
+++ b/roles/openshift_prometheus/templates/alertmanager.yml.j2
@@ -0,0 +1,20 @@
+global:
+
+# The root route on which each incoming alert enters.
+route:
+ # default route if none match
+ receiver: alert-buffer-wh
+
+ # The labels by which incoming alerts are grouped together. For example,
+ # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
+ # be batched into a single group.
+ # TODO:
+ group_by: []
+
+ # All the above attributes are inherited by all child routes and can
+ # overwritten on each.
+
+receivers:
+- name: alert-buffer-wh
+ webhook_configs:
+ - url: http://localhost:9099/topics/alerts
diff --git a/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2
new file mode 100644
index 000000000..55a5e19c3
--- /dev/null
+++ b/roles/openshift_prometheus/templates/prom-pv-alertbuffer.yml.j2
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: prometheus-alertbuffer
+ labels:
+ storage: prometheus-alertbuffer
+spec:
+ capacity:
+ storage: 15Gi
+ accessModes:
+ - ReadWriteOnce
+ nfs:
+ path: /exports/prometheus-alertbuffer
+ server: {{ openshift_prometheus_nfs_server }}
+ persistentVolumeReclaimPolicy: Retain
diff --git a/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2
new file mode 100644
index 000000000..4ee518735
--- /dev/null
+++ b/roles/openshift_prometheus/templates/prom-pv-alertmanager.yml.j2
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: prometheus-alertmanager
+ labels:
+ storage: prometheus-alertmanager
+spec:
+ capacity:
+ storage: 15Gi
+ accessModes:
+ - ReadWriteOnce
+ nfs:
+ path: /exports/prometheus-alertmanager
+ server: {{ openshift_prometheus_nfs_server }}
+ persistentVolumeReclaimPolicy: Retain
diff --git a/roles/openshift_prometheus/templates/prom-pv-server.yml.j2 b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2
new file mode 100644
index 000000000..933bf0f60
--- /dev/null
+++ b/roles/openshift_prometheus/templates/prom-pv-server.yml.j2
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: prometheus
+ labels:
+ storage: prometheus
+spec:
+ capacity:
+ storage: 15Gi
+ accessModes:
+ - ReadWriteOnce
+ nfs:
+ path: /exports/prometheus
+ server: {{ openshift_prometheus_nfs_server }}
+ persistentVolumeReclaimPolicy: Retain
diff --git a/roles/openshift_prometheus/templates/prometheus.rules.j2 b/roles/openshift_prometheus/templates/prometheus.rules.j2
new file mode 100644
index 000000000..e861dc127
--- /dev/null
+++ b/roles/openshift_prometheus/templates/prometheus.rules.j2
@@ -0,0 +1,4 @@
+groups:
+- name: example-rules
+ interval: 30s # defaults to global interval
+ rules:
diff --git a/roles/openshift_prometheus/templates/prometheus.yml.j2 b/roles/openshift_prometheus/templates/prometheus.yml.j2
new file mode 100644
index 000000000..63430f834
--- /dev/null
+++ b/roles/openshift_prometheus/templates/prometheus.yml.j2
@@ -0,0 +1,174 @@
+rule_files:
+ - 'prometheus.rules'
+{% if openshift_prometheus_additional_rules_file is defined and openshift_prometheus_additional_rules_file is not none %}
+ - 'prometheus.additional.rules'
+{% endif %}
+
+
+
+# A scrape configuration for running Prometheus on a Kubernetes cluster.
+# This uses separate scrape configs for cluster components (i.e. API server, node)
+# and services to allow each to use different authentication configs.
+#
+# Kubernetes labels will be added as Prometheus labels on metrics via the
+# `labelmap` relabeling action.
+
+# Scrape config for API servers.
+#
+# Kubernetes exposes API servers as endpoints to the default/kubernetes
+# service so this uses `endpoints` role and uses relabelling to only keep
+# the endpoints associated with the default/kubernetes service using the
+# default named port `https`. This works for single API server deployments as
+# well as HA API server deployments.
+scrape_configs:
+- job_name: 'kubernetes-apiservers'
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ # Keep only the default/kubernetes service endpoints for the https port. This
+ # will add targets for each API server which Kubernetes adds an endpoint to
+ # the default/kubernetes service.
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+ action: keep
+ regex: default;kubernetes;https
+
+# Scrape config for nodes.
+#
+# Each node exposes a /metrics endpoint that contains operational metrics for
+# the Kubelet and other components.
+- job_name: 'kubernetes-nodes'
+
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ kubernetes_sd_configs:
+ - role: node
+
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for controllers.
+#
+# Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
+# the controllers.
+#
+# TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
+# endpoints.
+- job_name: 'kubernetes-controllers'
+
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ # Keep only the default/kubernetes service endpoints for the https port, and then
+ # set the port to 8444. This is the default configuration for the controllers on OpenShift
+ # masters.
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+ action: keep
+ regex: default;kubernetes;https
+ - source_labels: [__address__]
+ action: replace
+ target_label: __address__
+ regex: (.+)(?::\d+)
+ replacement: $1:8444
+
+# Scrape config for cAdvisor.
+#
+# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
+# reports container metrics for each running pod. Scrape those by default.
+- job_name: 'kubernetes-cadvisor'
+
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+{% if kubernetes_version | float() >= 1.7 | float() %}
+ metrics_path: /metrics/cadvisor
+{% else %}
+ metrics_path: /metrics
+{% endif %}
+
+ kubernetes_sd_configs:
+ - role: node
+
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+
+# Scrape config for service endpoints.
+#
+# The relabeling allows the actual service scrape endpoint to be configured
+# via the following annotations:
+#
+# * `prometheus.io/scrape`: Only scrape services that have a value of `true`
+# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+# to set this to `https` & most likely set the `tls_config` of the scrape config.
+# * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+# * `prometheus.io/port`: If the metrics are exposed on a different port to the
+# service then set this appropriately.
+- job_name: 'kubernetes-service-endpoints'
+
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ # TODO: this should be per target
+ insecure_skip_verify: true
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ action: replace
+ target_label: __address__
+ regex: (.+)(?::\d+);(\d+)
+ replacement: $1:$2
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_username]
+ action: replace
+ target_label: __basic_auth_username__
+ regex: (.+)
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_password]
+ action: replace
+ target_label: __basic_auth_password__
+ regex: (.+)
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels: [__meta_kubernetes_service_name]
+ action: replace
+ target_label: kubernetes_name
+
+alerting:
+ alertmanagers:
+ - scheme: http
+ static_configs:
+ - targets:
+ - "localhost:9093"
diff --git a/roles/openshift_prometheus/templates/prometheus_deployment.j2 b/roles/openshift_prometheus/templates/prometheus_deployment.j2
new file mode 100644
index 000000000..98c117f19
--- /dev/null
+++ b/roles/openshift_prometheus/templates/prometheus_deployment.j2
@@ -0,0 +1,240 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+ name: prometheus
+ namespace: {{ namespace }}
+ labels:
+ app: prometheus
+spec:
+ replicas: {{ prom_replicas|default(1) }}
+ selector:
+ provider: openshift
+ matchLabels:
+ app: prometheus
+ template:
+ metadata:
+ name: prometheus
+ labels:
+ app: prometheus
+ spec:
+ serviceAccountName: prometheus
+{% if openshift_prometheus_node_selector is iterable and openshift_prometheus_node_selector | length > 0 %}
+ nodeSelector:
+{% for key, value in openshift_prometheus_node_selector.iteritems() %}
+ {{key}}: "{{value}}"
+{% endfor %}
+{% endif %}
+ containers:
+ # Deploy Prometheus behind an oauth proxy
+ - name: prom-proxy
+ image: "{{ openshift_prometheus_image_proxy }}"
+ imagePullPolicy: IfNotPresent
+ resources:
+ requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+ memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+ cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_memory_requests_limit_proxy is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+ memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+ cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}"
+{% endif %}
+ ports:
+ - containerPort: 8443
+ name: web
+ args:
+ - -provider=openshift
+ - -https-address=:8443
+ - -http-address=
+ - -email-domain=*
+ - -upstream=http://localhost:9090
+ - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+ - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+ - -tls-cert=/etc/tls/private/tls.crt
+ - -tls-key=/etc/tls/private/tls.key
+ - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+ - -cookie-secret-file=/etc/proxy/secrets/session_secret
+ - -skip-auth-regex=^/metrics
+ volumeMounts:
+ - mountPath: /etc/tls/private
+ name: prometheus-tls
+ - mountPath: /etc/proxy/secrets
+ name: prometheus-secrets
+ - mountPath: /prometheus
+ name: prometheus-data
+
+ - name: prometheus
+ args:
+ - --storage.tsdb.retention=6h
+ - --config.file=/etc/prometheus/prometheus.yml
+ - --web.listen-address=localhost:9090
+ image: "{{ openshift_prometheus_image_prometheus }}"
+ imagePullPolicy: IfNotPresent
+ resources:
+ requests:
+{% if openshift_prometheus_memory_requests is defined and openshift_prometheus_memory_requests is not none %}
+ memory: "{{openshift_prometheus_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_cpu_requests is defined and openshift_prometheus_cpu_requests is not none %}
+ cpu: "{{openshift_prometheus_cpu_requests}}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_memory_limit is defined and openshift_prometheus_memory_limit is not none %}
+ memory: "{{ openshift_prometheus_memory_limit }}"
+{% endif %}
+{% if openshift_prometheus_cpu_limit is defined and openshift_prometheus_cpu_limit is not none %}
+ cpu: "{{openshift_prometheus_cpu_limit}}"
+{% endif %}
+
+ volumeMounts:
+ - mountPath: /etc/prometheus
+ name: prometheus-config
+ - mountPath: /prometheus
+ name: prometheus-data
+
+ # Deploy alertmanager behind prometheus-alert-buffer behind an oauth proxy
+ - name: alerts-proxy
+ image: "{{ openshift_prometheus_image_proxy }}"
+ imagePullPolicy: IfNotPresent
+ resources:
+ requests:
+{% if openshift_prometheus_oauth_proxy_memory_requests is defined and openshift_prometheus_oauth_proxy_memory_requests is not none %}
+ memory: "{{openshift_prometheus_oauth_proxy_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_requests is defined and openshift_prometheus_oauth_proxy_cpu_requests is not none %}
+ cpu: "{{openshift_prometheus_oauth_proxy_cpu_requests}}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_oauth_proxy_memory_limit is defined and openshift_prometheus_oauth_proxy_memory_limit is not none %}
+ memory: "{{openshift_prometheus_oauth_proxy_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_oauth_proxy_cpu_limit is defined and openshift_prometheus_oauth_proxy_cpu_limit is not none %}
+ cpu: "{{openshift_prometheus_oauth_proxy_cpu_limit}}"
+{% endif %}
+ ports:
+ - containerPort: 9443
+ name: web
+ args:
+ - -provider=openshift
+ - -https-address=:9443
+ - -http-address=
+ - -email-domain=*
+ - -upstream=http://localhost:9099
+ - -client-id=system:serviceaccount:{{ namespace }}:prometheus
+ - '-openshift-sar={"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}'
+ - '-openshift-delegate-urls={"/": {"resource": "namespaces", "verb": "get", "resourceName": "{{ namespace }}", "namespace": "{{ namespace }}"}}'
+ - -tls-cert=/etc/tls/private/tls.crt
+ - -tls-key=/etc/tls/private/tls.key
+ - -client-secret-file=/var/run/secrets/kubernetes.io/serviceaccount/token
+ - -cookie-secret-file=/etc/proxy/secrets/session_secret
+ volumeMounts:
+ - mountPath: /etc/tls/private
+ name: alerts-tls
+ - mountPath: /etc/proxy/secrets
+ name: alerts-secrets
+
+ - name: alert-buffer
+ args:
+ - --storage-path=/alert-buffer/messages.db
+ image: "{{ openshift_prometheus_image_alertbuffer }}"
+ imagePullPolicy: IfNotPresent
+ resources:
+ requests:
+{% if openshift_prometheus_alertbuffer_memory_requests is defined and openshift_prometheus_alertbuffer_memory_requests is not none %}
+ memory: "{{openshift_prometheus_alertbuffer_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_alertbuffer_cpu_requests is defined and openshift_prometheus_alertbuffer_cpu_requests is not none %}
+ cpu: "{{openshift_prometheus_alertbuffer_cpu_requests}}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_alertbuffer_memory_limit is defined and openshift_prometheus_alertbuffer_memory_limit is not none %}
+ memory: "{{openshift_prometheus_alertbuffer_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_alertbuffer_cpu_limit is defined and openshift_prometheus_alertbuffer_cpu_limit is not none %}
+ cpu: "{{openshift_prometheus_alertbuffer_cpu_limit}}"
+{% endif %}
+ volumeMounts:
+ - mountPath: /alert-buffer
+ name: alert-buffer-data
+ ports:
+ - containerPort: 9099
+ name: alert-buf
+
+ - name: alertmanager
+ args:
+ - -config.file=/etc/alertmanager/alertmanager.yml
+ image: "{{ openshift_prometheus_image_alertmanager }}"
+ imagePullPolicy: IfNotPresent
+ resources:
+ requests:
+{% if openshift_prometheus_alertmanager_memory_requests is defined and openshift_prometheus_alertmanager_memory_requests is not none %}
+ memory: "{{openshift_prometheus_alertmanager_memory_requests}}"
+{% endif %}
+{% if openshift_prometheus_alertmanager_cpu_requests is defined and openshift_prometheus_alertmanager_cpu_requests is not none %}
+ cpu: "{{openshift_prometheus_alertmanager_cpu_requests}}"
+{% endif %}
+ limits:
+{% if openshift_prometheus_alertmanager_memory_limit is defined and openshift_prometheus_alertmanager_memory_limit is not none %}
+ memory: "{{openshift_prometheus_alertmanager_memory_limit}}"
+{% endif %}
+{% if openshift_prometheus_alertmanager_cpu_limit is defined and openshift_prometheus_alertmanager_cpu_limit is not none %}
+ cpu: "{{openshift_prometheus_alertmanager_cpu_limit}}"
+{% endif %}
+ ports:
+ - containerPort: 9093
+ name: web
+ volumeMounts:
+ - mountPath: /etc/alertmanager
+ name: alertmanager-config
+ - mountPath: /alertmanager
+ name: alertmanager-data
+
+ restartPolicy: Always
+ volumes:
+ - name: prometheus-config
+ configMap:
+ defaultMode: 420
+ name: prometheus
+ - name: prometheus-secrets
+ secret:
+ secretName: prometheus-proxy
+ - name: prometheus-tls
+ secret:
+ secretName: prometheus-tls
+ - name: prometheus-data
+{% if openshift_prometheus_storage_type == 'pvc' %}
+ persistentVolumeClaim:
+ claimName: {{ openshift_prometheus_pvc_name }}
+{% else %}
+ emptydir: {}
+{% endif %}
+ - name: alertmanager-config
+ configMap:
+ defaultMode: 420
+ name: prometheus-alerts
+ - name: alerts-secrets
+ secret:
+ secretName: alerts-proxy
+ - name: alerts-tls
+ secret:
+ secretName: prometheus-alerts-tls
+ - name: alertmanager-data
+{% if openshift_prometheus_alertmanager_storage_type == 'pvc' %}
+ persistentVolumeClaim:
+ claimName: {{ openshift_prometheus_alertmanager_pvc_name }}
+{% else %}
+ emptydir: {}
+{% endif %}
+ - name: alert-buffer-data
+{% if openshift_prometheus_alertbuffer_storage_type == 'pvc' %}
+ persistentVolumeClaim:
+ claimName: {{ openshift_prometheus_alertbuffer_pvc_name }}
+{% else %}
+ emptydir: {}
+{% endif %}
diff --git a/roles/openshift_prometheus/tests/inventory b/roles/openshift_prometheus/tests/inventory
new file mode 100644
index 000000000..878877b07
--- /dev/null
+++ b/roles/openshift_prometheus/tests/inventory
@@ -0,0 +1,2 @@
+localhost
+
diff --git a/roles/openshift_prometheus/tests/test.yaml b/roles/openshift_prometheus/tests/test.yaml
new file mode 100644
index 000000000..37baf573c
--- /dev/null
+++ b/roles/openshift_prometheus/tests/test.yaml
@@ -0,0 +1,5 @@
+---
+- hosts: localhost
+ remote_user: root
+ roles:
+ - openshift_prometheus