From 16161bafb7253d00c161c98a4c61820cb0d71e9e Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Sun, 5 Jan 2025 16:26:46 +0100 Subject: [PATCH] monitoring switch back to prometheus-operator --- apps/grafana/grafana.values.yaml | 2 +- .../argocd/argocd-cmd-params.configmap.yaml | 4 +- infrastructure/argocd/argocd.configmap.yaml | 1 + infrastructure/argocd/ingress.yaml | 11 +- infrastructure/argocd/kustomization.yaml | 7 +- infrastructure/argocd/servicemonitor.yaml | 77 +++ infrastructure/monitoring/kustomization.yaml | 15 +- .../prometheus-node-exporter.values.yaml | 14 + .../monitoring/prometheus.values.yaml | 574 ------------------ infrastructure/monitoring/prometheus.yaml | 78 +++ .../monitoring/thanos-query.deployment.yaml | 55 ++ .../monitoring/thanos-store.statefulset.yaml | 71 +++ .../monitoring/application.yaml | 3 + 13 files changed, 319 insertions(+), 593 deletions(-) create mode 100644 infrastructure/argocd/servicemonitor.yaml create mode 100644 infrastructure/monitoring/prometheus-node-exporter.values.yaml delete mode 100644 infrastructure/monitoring/prometheus.values.yaml create mode 100644 infrastructure/monitoring/prometheus.yaml create mode 100644 infrastructure/monitoring/thanos-query.deployment.yaml create mode 100644 infrastructure/monitoring/thanos-store.statefulset.yaml diff --git a/apps/grafana/grafana.values.yaml b/apps/grafana/grafana.values.yaml index 78144d4..89e2694 100644 --- a/apps/grafana/grafana.values.yaml +++ b/apps/grafana/grafana.values.yaml @@ -37,7 +37,7 @@ datasources: datasources: - name: Prometheus type: prometheus - url: http://prometheus-server.monitoring.svc:80 + url: http://prometheus.monitoring.svc:9090 isDefault: true - name: Thanos type: prometheus diff --git a/infrastructure/argocd/argocd-cmd-params.configmap.yaml b/infrastructure/argocd/argocd-cmd-params.configmap.yaml index c0d45be..268371f 100644 --- a/infrastructure/argocd/argocd-cmd-params.configmap.yaml +++ b/infrastructure/argocd/argocd-cmd-params.configmap.yaml @@ 
-3,4 +3,6 @@ kind: ConfigMap metadata: name: argocd-cmd-params-cm data: - server.insecure: "true" \ No newline at end of file + # server.insecure: "true" + # DID NOT FIX RELOAD LOOPS + # application.namespaces: "*" \ No newline at end of file diff --git a/infrastructure/argocd/argocd.configmap.yaml b/infrastructure/argocd/argocd.configmap.yaml index 5b2345f..c7ca20f 100644 --- a/infrastructure/argocd/argocd.configmap.yaml +++ b/infrastructure/argocd/argocd.configmap.yaml @@ -7,3 +7,4 @@ data: # switch to annotation based resource tracking as per # https://argo-cd.readthedocs.io/en/stable/user-guide/resource_tracking/ application.resourceTrackingMethod: annotation+label + admin.enabled: "false" diff --git a/infrastructure/argocd/ingress.yaml b/infrastructure/argocd/ingress.yaml index 3a99d2f..a030741 100644 --- a/infrastructure/argocd/ingress.yaml +++ b/infrastructure/argocd/ingress.yaml @@ -9,16 +9,9 @@ spec: routes: - kind: Rule match: Host(`argocd.kluster.moll.re`) - priority: 10 services: - name: argocd-server - port: 80 - - kind: Rule - match: Host(`argocd.kluster.moll.re`) && Header(`Content-Type`, `application/grpc`) - priority: 11 - services: - - name: argocd-server - port: 80 - scheme: h2c + port: 443 + scheme: https tls: certResolver: default-tls \ No newline at end of file diff --git a/infrastructure/argocd/kustomization.yaml b/infrastructure/argocd/kustomization.yaml index 39d33db..98c136b 100644 --- a/infrastructure/argocd/kustomization.yaml +++ b/infrastructure/argocd/kustomization.yaml @@ -4,14 +4,15 @@ kind: Kustomization namespace: argocd resources: - namespace.yaml - - https://raw.githubusercontent.com/argoproj/argo-cd/v2.13.1/manifests/install.yaml + - https://raw.githubusercontent.com/argoproj/argo-cd/v2.13.3/manifests/install.yaml - ingress.yaml - argo-apps.application.yaml - bootstrap-repo.sealedsecret.yaml - argocd-oauth.sealedsecret.yaml + - servicemonitor.yaml + # DID NOT FIX RELOAD LOOPS + # - 
github.com/argoproj/argo-cd/examples/k8s-rbac/argocd-server-applications?ref=master -components: - - https://github.com/argoproj-labs/argocd-extensions/manifests patches: - path: argocd.configmap.yaml diff --git a/infrastructure/argocd/servicemonitor.yaml b/infrastructure/argocd/servicemonitor.yaml new file mode 100644 index 0000000..49bacb7 --- /dev/null +++ b/infrastructure/argocd/servicemonitor.yaml @@ -0,0 +1,77 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: argocd-metrics + labels: + release: prometheus-operator +spec: + selector: + matchLabels: + app.kubernetes.io/name: argocd-metrics + endpoints: + - port: metrics +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: argocd-server-metrics + labels: + release: prometheus-operator +spec: + selector: + matchLabels: + app.kubernetes.io/name: argocd-server-metrics + endpoints: + - port: metrics +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: argocd-repo-server-metrics + labels: + release: prometheus-operator +spec: + selector: + matchLabels: + app.kubernetes.io/name: argocd-repo-server + endpoints: + - port: metrics +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: argocd-applicationset-controller-metrics + labels: + release: prometheus-operator +spec: + selector: + matchLabels: + app.kubernetes.io/name: argocd-applicationset-controller + endpoints: + - port: metrics +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: argocd-dex-server + labels: + release: prometheus-operator +spec: + selector: + matchLabels: + app.kubernetes.io/name: argocd-dex-server + endpoints: + - port: metrics +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: argocd-redis-haproxy-metrics + labels: + release: prometheus-operator +spec: + selector: + matchLabels: + app.kubernetes.io/name: argocd-redis-ha-haproxy + endpoints: + - port: 
http-exporter-port \ No newline at end of file diff --git a/infrastructure/monitoring/kustomization.yaml b/infrastructure/monitoring/kustomization.yaml index dc55e83..9cd3953 100644 --- a/infrastructure/monitoring/kustomization.yaml +++ b/infrastructure/monitoring/kustomization.yaml @@ -6,8 +6,13 @@ namespace: monitoring resources: - namespace.yaml # prometheus-operator crds + - https://github.com/prometheus-operator/prometheus-operator?ref=v0.79.2 + # single prometheus instance with a thanos sidecar + - prometheus.yaml + - thanos-store.statefulset.yaml + - thanos-query.deployment.yaml - thanos-objstore-config.sealedsecret.yaml - # - loki-objstore-config.sealedsecret.yaml + images: - name: thanos @@ -21,8 +26,8 @@ helmCharts: repo: https://grafana.github.io/helm-charts version: 6.24.0 valuesFile: loki.values.yaml - - name: prometheus - releaseName: prometheus + - name: prometheus-node-exporter + releaseName: prometheus-node-exporter repo: https://prometheus-community.github.io/helm-charts - version: 26.0.1 - valuesFile: prometheus.values.yaml + version: 4.43.1 + valuesFile: prometheus-node-exporter.values.yaml diff --git a/infrastructure/monitoring/prometheus-node-exporter.values.yaml b/infrastructure/monitoring/prometheus-node-exporter.values.yaml new file mode 100644 index 0000000..8e17d39 --- /dev/null +++ b/infrastructure/monitoring/prometheus-node-exporter.values.yaml @@ -0,0 +1,14 @@ +prometheus: + monitor: + enabled: true + + jobLabel: "node-exporter" + + +resources: + limits: + cpu: 200m + memory: 50Mi + requests: + cpu: 100m + memory: 30Mi diff --git a/infrastructure/monitoring/prometheus.values.yaml b/infrastructure/monitoring/prometheus.values.yaml deleted file mode 100644 index 0d55740..0000000 --- a/infrastructure/monitoring/prometheus.values.yaml +++ /dev/null @@ -1,574 +0,0 @@ -podSecurityPolicy: - enabled: true - -server: - extraArgs: - log.level: debug - storage.tsdb.min-block-duration: 2h # Don't change this, see docs/components/sidecar.md - 
storage.tsdb.max-block-duration: 2h # Don't change this, see docs/components/sidecar.md - retention: 180d - service: - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "9090" - statefulSet: - enabled: true - podAnnotations: - prometheus.io/scrape: "true" - prometheus.io/port: "10902" - # sidecarContainers: - # thanos-sidecar: - # image: thanos - # resources: - # requests: - # memory: "512Mi" - # env: - # - name: GOOGLE_APPLICATION_CREDENTIALS - # value: /etc/secret/sa - # args: - # - "sidecar" - # - "--log.level=debug" - # - "--tsdb.path=/data/" - # - "--prometheus.url=http://127.0.0.1:9090" - # - "--objstore.config={type: GCS, config: {bucket: BUCKET_REPLACE_ME}}" - # - "--reloader.config-file=/etc/prometheus-config/prometheus.yml" - # - "--reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yml" - # - "--reloader.rule-dir=/etc/prometheus-config/rules" - # ports: - # - name: sidecar-http - # containerPort: 10902 - # - name: grpc - # containerPort: 10901 - # - name: cluster - # containerPort: 10900 - # volumeMounts: - # - name: storage-volume - # mountPath: /data - # - name: thanos-storage-secret - # mountPath: /etc/secret - # - name: config-volume - # mountPath: /etc/prometheus-config - # readOnly: false - # - name: prometheus-config-shared - # mountPath: /etc/prometheus-shared/ - # readOnly: false - # # configPath: /etc/prometheus-shared/prometheus.yml - # replicaCount: 1 - # persistentVolume: - # size: 20Gi - # storageClass: nfs-client - # extraVolumes: # spec.template.spec.volumes - # - name: prometheus-config-shared - # emptyDir: {} - # extraVolumeMounts: # spec.template.spec.containers.volumeMounts for prometheus container - # - name: prometheus-config-shared - # mountPath: /etc/prometheus-shared/ - # resources: - # requests: - # memory: 1Gi - # global: - # scrape_interval: 5s - # scrape_timeout: 4s - # external_labels: - # prometheus_group: KLUSTER - # prometheus_replica: '$(HOSTNAME)' - # evaluation_interval: 5s - # 
extraSecretMounts: - # - name: thanos-storage-secret - # mountPath: /etc/secret/ - # subPath: sa - # readOnly: false - # secretName: thanos-objstore-config - -# as thanos sidecar is taking care of the config reload -# we can disable the prometheus configmap reload -configmapReload: - prometheus: - enabled: false - -## Prometheus server ConfigMap entries -## -serverFiles: - ## Alerts configuration - ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - alerting_rules.yml: {} - # groups: - # - name: Instances - # rules: - # - alert: InstanceDown - # expr: up == 0 - # for: 5m - # labels: - # severity: page - # annotations: - # description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.' - # summary: 'Instance {{ $labels.instance }} down' - ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml - alerts: {} - - ## Records configuration - ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ - recording_rules.yml: {} - ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml - rules: {} - - prometheus.yml: - rule_files: - - /etc/config/recording_rules.yml - - /etc/config/alerting_rules.yml - ## Below two files are DEPRECATED will be removed from this default values file - - /etc/config/rules - - /etc/config/alerts - - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - # A scrape configuration for running Prometheus on a Kubernetes cluster. - # This uses separate scrape configs for cluster components (i.e. API server, node) - # and services to allow each to use different authentication configs. - # - # Kubernetes labels will be added as Prometheus labels on metrics via the - # `labelmap` relabeling action. - - # Scrape config for API servers. 
- # - # Kubernetes exposes API servers as endpoints to the default/kubernetes - # service so this uses `endpoints` role and uses relabelling to only keep - # the endpoints associated with the default/kubernetes service using the - # default named port `https`. This works for single API server deployments as - # well as HA API server deployments. - - job_name: 'kubernetes-apiservers' - - kubernetes_sd_configs: - - role: endpoints - - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - # Keep only the default/kubernetes service endpoints for the https port. This - # will add targets for each API server which Kubernetes adds an endpoint to - # the default/kubernetes service. - relabel_configs: - - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: default;kubernetes;https - - - job_name: 'kubernetes-nodes' - - # Default to scraping over https. If required, just disable this or change to - # `http`. 
- scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/$1/proxy/metrics - - - - job_name: 'kubernetes-nodes-cadvisor' - - # Default to scraping over https. If required, just disable this or change to - # `http`. - scheme: https - - # This TLS & bearer token file config is used to connect to the actual scrape - # endpoints for cluster components. This is separate to discovery auth - # configuration because discovery & scraping are two separate concerns in - # Prometheus. The discovery auth config is automatic if Prometheus runs inside - # the cluster. Otherwise, more config options have to be provided within the - # . 
- tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - # If your node certificates are self-signed or use a different CA to the - # master CA, then disable certificate verification below. Note that - # certificate verification is an integral part of a secure infrastructure - # so this should only be disabled in a controlled environment. You can - # disable certificate verification by uncommenting the line below. - # - insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - # This configuration will work only on kubelet 1.7.3+ - # As the scrape endpoints for cAdvisor have changed - # if you are using older version you need to change the replacement to - # replacement: /api/v1/nodes/$1:4194/proxy/metrics - # more info here https://github.com/coreos/prometheus-operator/issues/633 - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - - # Metric relabel configs to apply to samples before ingestion. - # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) - # metric_relabel_configs: - # - action: labeldrop - # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) - - # Scrape config for service endpoints. - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `prometheus.io/scrape`: Only scrape services that have a value of - # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. 
- # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # service then set this appropriately. - # * `prometheus.io/param_`: If the metrics endpoint uses parameters - # then you can set any parameter - - job_name: 'kubernetes-service-endpoints' - honor_labels: true - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow] - action: drop - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: service - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node - - # Scrape config for slow service endpoints; same as above, but with a larger - # timeout and a larger interval - # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: 
- # - # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: If the metrics are exposed on a different port to the - # service then set this appropriately. - # * `prometheus.io/param_`: If the metrics endpoint uses parameters - # then you can set any parameter - - job_name: 'kubernetes-service-endpoints-slow' - honor_labels: true - - scrape_interval: 5m - scrape_timeout: 30s - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) 
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: service - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node - - - job_name: 'prometheus-pushgateway' - honor_labels: true - - kubernetes_sd_configs: - - role: service - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] - action: keep - regex: pushgateway - - # Example scrape config for probing services via the Blackbox Exporter. 
- # - # The relabeling allows the actual service scrape endpoint to be configured - # via the following annotations: - # - # * `prometheus.io/probe`: Only probe services that have a value of `true` - - job_name: 'kubernetes-services' - honor_labels: true - - metrics_path: /probe - params: - module: [http_2xx] - - kubernetes_sd_configs: - - role: service - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] - action: keep - regex: true - - source_labels: [__address__] - target_label: __param_target - - target_label: __address__ - replacement: blackbox - - source_labels: [__param_target] - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - target_label: namespace - - source_labels: [__meta_kubernetes_service_name] - target_label: service - - # Example scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, - # except if `prometheus.io/scrape-slow` is set to `true` as well. - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. - - job_name: 'kubernetes-pods' - honor_labels: true - - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] - action: drop - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] - action: replace - regex: (https?) 
- target_label: __scheme__ - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - target_label: __address__ - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - - source_labels: [__meta_kubernetes_pod_phase] - regex: Pending|Succeeded|Failed|Completed - action: drop - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node - - # Example Scrape config for pods which should be scraped slower. An useful example - # would be stackriver-exporter which queries an API on every scrape of the pod - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` - # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need - # to set this to `https` & most likely set the `tls_config` of the scrape config. - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 
- - job_name: 'kubernetes-pods-slow' - honor_labels: true - - scrape_interval: 5m - scrape_timeout: 30s - - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] - action: replace - regex: (https?) - target_label: __scheme__ - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - target_label: __address__ - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: pod - - source_labels: [__meta_kubernetes_pod_phase] - regex: Pending|Succeeded|Failed|Completed - action: drop - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: node - - - - -# Configuration of subcharts defined in Chart.yaml - -## alertmanager sub-chart configurable values -## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/alertmanager -## -alertmanager: - enabled: false - -## kube-state-metrics sub-chart configurable values -## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics -## -kube-state-metrics: - ## If false, kube-state-metrics 
sub-chart will not be installed - ## - enabled: true - -## prometheus-node-exporter sub-chart configurable values -## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter -## -prometheus-node-exporter: - ## If false, node-exporter will not be installed - ## - enabled: true - - rbac: - pspEnabled: false - - containerSecurityContext: - allowPrivilegeEscalation: false - -## prometheus-pushgateway sub-chart configurable values -## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-pushgateway -## -prometheus-pushgateway: - ## If false, pushgateway will not be installed - ## - enabled: false diff --git a/infrastructure/monitoring/prometheus.yaml b/infrastructure/monitoring/prometheus.yaml new file mode 100644 index 0000000..3820ebe --- /dev/null +++ b/infrastructure/monitoring/prometheus.yaml @@ -0,0 +1,78 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/metrics + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: ["get", "list", "watch"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus + namespace: monitoring # needs to be the same as in the kustomization.yaml +--- +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: prometheus +spec: + securityContext: + runAsUser: 65534 # same as the thanos sidecar + resources: + requests: + memory: 400Mi + retention: 730d + retentionSize: 3GiB + 
serviceAccountName: prometheus + enableAdminAPI: false + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + thanos: + version: v0.34.1 + objectStorageConfig: + # loads the config from a secret named thanos-objstore-config in the same namespace + key: thanos.yaml + name: thanos-objstore-config +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus +spec: + type: ClusterIP + ports: + - port: 9090 + targetPort: 9090 + protocol: TCP + selector: + prometheus: prometheus diff --git a/infrastructure/monitoring/thanos-query.deployment.yaml b/infrastructure/monitoring/thanos-query.deployment.yaml new file mode 100644 index 0000000..3a782e7 --- /dev/null +++ b/infrastructure/monitoring/thanos-query.deployment.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: thanos-querier + labels: + app: thanos-querier +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-querier + template: + metadata: + labels: + app: thanos-querier + spec: + containers: + - name: thanos + image: thanos + args: + - query + - --log.level=debug + - --query.replica-label=replica + - --endpoint=dnssrv+_grpc._tcp.thanos-store:10901 + - --endpoint=dnssrv+_grpc._tcp.prometheus:9090 + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + livenessProbe: + httpGet: + port: http + path: /-/healthy + readinessProbe: + httpGet: + port: http + path: /-/ready +--- +apiVersion: v1 +kind: Service +metadata: + name: thanos-querier +spec: + selector: + app: thanos-querier + ports: + - name: http + protocol: TCP + port: 10902 + targetPort: http + - name: grpc + protocol: TCP + port: 10901 + targetPort: grpc diff --git a/infrastructure/monitoring/thanos-store.statefulset.yaml b/infrastructure/monitoring/thanos-store.statefulset.yaml new file mode 100644 index 0000000..4055317 --- /dev/null +++ b/infrastructure/monitoring/thanos-store.statefulset.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: 
thanos-store + labels: + app: thanos-store +spec: + replicas: 1 + selector: + matchLabels: + app: thanos-store + template: + metadata: + labels: + app: thanos-store + thanos-store-api: "true" + spec: + containers: + - name: thanos + image: thanos + args: + - store + - --log.level=debug + - --data-dir=/data + - --grpc-address=0.0.0.0:10901 + - --http-address=0.0.0.0:10902 + - --objstore.config-file=/etc/secret/thanos.yaml + - --index-cache-size=500MB + - --chunk-pool-size=500MB + ports: + - name: http + containerPort: 10902 + - name: grpc + containerPort: 10901 + livenessProbe: + httpGet: + port: 10902 + path: /-/healthy + readinessProbe: + httpGet: + port: 10902 + path: /-/ready + volumeMounts: + - name: thanos-objstore-config + mountPath: /etc/secret + readOnly: true + - name: thanos-data + mountPath: /data + volumes: + - name: thanos-objstore-config + secret: + secretName: thanos-objstore-config + - name: thanos-data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/name: thanos-store + name: thanos-store +spec: + ports: + - name: grpc + port: 10901 + targetPort: 10901 + - name: http + port: 10902 + targetPort: 10902 + selector: + app: thanos-store diff --git a/kluster-deployments/monitoring/application.yaml b/kluster-deployments/monitoring/application.yaml index 8e79792..b21ab37 100644 --- a/kluster-deployments/monitoring/application.yaml +++ b/kluster-deployments/monitoring/application.yaml @@ -17,3 +17,6 @@ spec: automated: prune: true selfHeal: true + syncOptions: + - Replace=true + # because the prometheus-operator CRDs are too large