summaryrefslogtreecommitdiffstats
path: root/roles/openshift_prometheus/templates/prometheus.yml.j2
blob: 005c2c564985effb37813e8cf5d777ca7dd3301f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# Rule files to load: every file whose name matches this glob.
rule_files:
- '*.rules'

# A scrape configuration for running Prometheus on a Kubernetes cluster.
# Cluster components (e.g. API server, node) and services each get their own
# scrape config so that each can use a different authentication config.
#
# Kubernetes labels are added as Prometheus labels on metrics via the
# `labelmap` relabeling action.

# List of scrape jobs; every `- job_name:` entry that follows is one job.
scrape_configs:

# Job: API servers.
#
# Kubernetes publishes the API servers as endpoints of the default/kubernetes
# service, so this job discovers targets with the `endpoints` role and then
# relabels to retain only the endpoints of that service on its default named
# port, `https`. The same rule covers single API server deployments and HA
# API server deployments alike.
- job_name: 'kubernetes-apiservers'
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt

  kubernetes_sd_configs:
  - role: endpoints

  relabel_configs:
  # Retain only targets whose namespace, service name and endpoint port name
  # are default, kubernetes and https respectively. One target is kept for
  # each API server that Kubernetes registers as an endpoint of the
  # default/kubernetes service.
  - action: keep
    source_labels:
    - __meta_kubernetes_namespace
    - __meta_kubernetes_service_name
    - __meta_kubernetes_endpoint_port_name
    regex: 'default;kubernetes;https'

# Job: controllers.
#
# The controllers on each master node serve their operational metrics from a
# /metrics endpoint on port 8444.
- job_name: 'kubernetes-controllers'
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt

  kubernetes_sd_configs:
  - role: endpoints

  relabel_configs:
  # Step 1: retain only the default/kubernetes service endpoints on the
  # `https` port.
  - action: keep
    source_labels:
    - __meta_kubernetes_namespace
    - __meta_kubernetes_service_name
    - __meta_kubernetes_endpoint_port_name
    regex: 'default;kubernetes;https'
  # Step 2: swap the port of the discovered address for 8444, the default
  # controllers port on OpenShift masters. The pattern requires the address
  # to already carry a `:port` suffix.
  - action: replace
    source_labels: [__address__]
    regex: '(.+)(?::\d+)'
    target_label: __address__
    replacement: '$1:8444'

# Job: nodes.
#
# Every node serves a /metrics endpoint carrying operational metrics for the
# Kubelet and other components.
- job_name: 'kubernetes-nodes'
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt

  kubernetes_sd_configs:
  - role: node

  relabel_configs:
  # Copy each Kubernetes node label onto the target as a Prometheus label.
  - regex: '__meta_kubernetes_node_label_(.+)'
    action: labelmap

  metric_relabel_configs:
  # Discard a very high cardinality metric that is reported incorrectly in
  # 3.7; the fix is expected in 3.9.
  - action: drop
    source_labels: [__name__]
    regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'

# Scrape config for cAdvisor.
#
# Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
# reports container metrics for each running pod. Scrape those by default;
# for older versions fall back to /metrics.
- job_name: 'kubernetes-cadvisor'

  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token

  # The path is chosen at template-render time from `kubernetes_version`.
  # The check is a version comparison rather than a float comparison: as a
  # float, "1.10" becomes 1.1 (which sorts below 1.7) and a three-part value
  # such as "1.7.6" cannot be converted at all, so both would wrongly select
  # /metrics.
{% if kubernetes_version is version_compare('1.7', '>=') %}
  metrics_path: /metrics/cadvisor
{% else %}
  metrics_path: /metrics
{% endif %}

  kubernetes_sd_configs:
  - role: node

  # Exclude a set of high cardinality metrics that can contribute to significant
  # memory use in large clusters. These can be selectively enabled as necessary
  # for medium or small clusters.
  metric_relabel_configs:
  - source_labels: [__name__]
    action: drop
    regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_working_set_bytes|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'

  relabel_configs:
  # Copy each Kubernetes node label onto the target as a Prometheus label.
  - action: labelmap
    regex: '__meta_kubernetes_node_label_(.+)'

# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
# via the following annotations:
#
# * `prometheus.io/scrape`: Only scrape services that have a value of `true`.
# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will
#   need to set this to `https` and most likely set the `tls_config` of the
#   scrape config.
# * `prometheus.io/path`: If the metrics path is not `/metrics`, override this.
# * `prometheus.io/port`: If the metrics are exposed on a different port to
#   the service, then set this appropriately.
- job_name: 'kubernetes-service-endpoints'

  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # TODO: this should be per target
    insecure_skip_verify: true

  kubernetes_sd_configs:
  - role: endpoints

  # The rules below run in the order listed.
  relabel_configs:
  # only scrape infrastructure components
  - source_labels: [__meta_kubernetes_namespace]
    action: keep
    regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
  # drop infrastructure components managed by other scrape targets
  - source_labels: [__meta_kubernetes_service_name]
    action: drop
    regex: 'prometheus-node-exporter'
  # only those that have requested scraping
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
    action: keep
    # Quoted so that YAML reads the pattern as the string "true" rather than
    # as a boolean.
    regex: 'true'
  # use the annotated scheme when it is `http` or `https`
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
    action: replace
    target_label: __scheme__
    regex: '(https?)'
  # use the annotated metrics path when one is given
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: '(.+)'
  # swap the port of the discovered address for the annotated port; the
  # pattern only matches when the address already carries a `:port` suffix
  # and the port annotation is all digits
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    target_label: __address__
    regex: '(.+)(?::\d+);(\d+)'
    replacement: '$1:$2'
  # copy each Kubernetes service label onto the target as a Prometheus label
  - action: labelmap
    regex: '__meta_kubernetes_service_label_(.+)'
  # record the namespace and service name as regular labels
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: kubernetes_name

# Job: node-exporter, which is expected to be listening on port 9100 of
# every node.
- job_name: 'kubernetes-nodes-exporter'

  kubernetes_sd_configs:
  - role: node

  # NOTE(review): this job sets no `scheme`, so Prometheus's default applies;
  # if that default is plain http this CA file is presumably never used.
  # Confirm.
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt

  relabel_configs:
  # Point targets discovered on port 10250 at port 9100 instead.
  - target_label: __address__
    source_labels: [__address__]
    regex: '(.*):10250'
    replacement: '${1}:9100'
  # NOTE(review): the target label is `__instance__`, not `instance`.
  # Prometheus discards labels that begin with `__` once relabeling is done,
  # so this rule may have no visible effect. Confirm the intent before
  # changing it, since altering `instance` would rename existing series.
  - target_label: __instance__
    source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
  # Copy each Kubernetes node label onto the target as a Prometheus label.
  - regex: '__meta_kubernetes_node_label_(.+)'
    action: labelmap

  # These rules are order-dependent: the series worth keeping are renamed
  # out of the way, the rest of their families are dropped, and then the
  # kept series get their original names back.
  metric_relabel_configs:
  # Drop the node_cpu, node_disk_* and node_scrape_collector_* series.
  - action: drop
    source_labels: [__name__]
    regex: 'node_cpu|node_(disk|scrape_collector)_.+'
  # Preserve a subset of the network, netstat, vmstat, and filesystem series
  # by giving them a temporary `renamed_` prefix.
  - action: replace
    source_labels: [__name__]
    target_label: __name__
    regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
    replacement: 'renamed_$1'
  # Drop whatever is left of those four families.
  - action: drop
    source_labels: [__name__]
    regex: 'node_(netstat|vmstat|filesystem|network)_.+'
  # Strip the temporary prefix from the preserved series.
  - action: replace
    source_labels: [__name__]
    target_label: __name__
    regex: 'renamed_(.+)'
    replacement: '$1'
  # Drop any partial expensive series: network series for `veth*` devices,
  # and filesystem series for every mountpoint other than exactly `/`.
  - action: drop
    source_labels: [__name__, device]
    regex: 'node_network_.+;veth.+'
  - action: drop
    source_labels: [__name__, mountpoint]
    regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'

# Job: template service broker.
- job_name: 'openshift-template-service-broker'
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    # Verify the serving certificate against the service CA bundle, using
    # the broker's in-cluster service name as the expected server name.
    server_name: apiserver.openshift-template-service-broker.svc
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt

  kubernetes_sd_configs:
  - role: endpoints

  relabel_configs:
  # Retain only the `https` port endpoints of the `apiserver` service in the
  # `openshift-template-service-broker` namespace.
  - action: keep
    source_labels:
    - __meta_kubernetes_namespace
    - __meta_kubernetes_service_name
    - __meta_kubernetes_endpoint_port_name
    regex: 'openshift-template-service-broker;apiserver;https'


# Alertmanager to which alerts are sent: a single statically configured
# instance at localhost:9093, reached over plain http.
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 'localhost:9093'
    scheme: http