From ee202235074c87599904105932ffaf4b9e0717d1 Mon Sep 17 00:00:00 2001 From: Remy Moll Date: Mon, 23 Dec 2024 22:40:35 +0100 Subject: [PATCH] monitoring cleanup --- .../grafana-admin.sealedsecret.yaml | 0 .../grafana-auth.sealedsecret.yaml | 0 .../grafana.ingress.yaml | 0 .../grafana.values.yaml | 4 +- .../kustomization.yaml | 0 apps/{monitoring => grafana}/namespace.yaml | 0 infrastructure/monitoring/kustomization.yaml | 28 + infrastructure/monitoring/loki.values.yaml | 73 +++ .../{prometheus => monitoring}/namespace.yaml | 0 .../monitoring/prometheus.values.yaml | 573 ++++++++++++++++++ .../thanos-objstore-config.sealedsecret.yaml | 16 + infrastructure/prometheus/kustomization.yaml | 20 - infrastructure/prometheus/prometheus.yaml | 78 --- .../thanos-objstore-config.sealedsecret.yaml | 16 - .../prometheus/thanos-query.deployment.yaml | 55 -- .../prometheus/thanos-store.statefulset.yaml | 71 --- kluster-deployments/grafana/application.yaml | 18 + .../kustomization.yaml | 0 kluster-deployments/kustomization.yaml | 4 +- .../monitoring/application.yaml | 9 +- .../prometheus/application.yaml | 22 - 21 files changed, 717 insertions(+), 270 deletions(-) rename apps/{monitoring => grafana}/grafana-admin.sealedsecret.yaml (100%) rename apps/{monitoring => grafana}/grafana-auth.sealedsecret.yaml (100%) rename apps/{monitoring => grafana}/grafana.ingress.yaml (100%) rename apps/{monitoring => grafana}/grafana.values.yaml (95%) rename apps/{monitoring => grafana}/kustomization.yaml (100%) rename apps/{monitoring => grafana}/namespace.yaml (100%) create mode 100644 infrastructure/monitoring/kustomization.yaml create mode 100644 infrastructure/monitoring/loki.values.yaml rename infrastructure/{prometheus => monitoring}/namespace.yaml (100%) create mode 100644 infrastructure/monitoring/prometheus.values.yaml create mode 100644 infrastructure/monitoring/thanos-objstore-config.sealedsecret.yaml delete mode 100644 infrastructure/prometheus/kustomization.yaml delete mode 100644 
infrastructure/prometheus/prometheus.yaml delete mode 100644 infrastructure/prometheus/thanos-objstore-config.sealedsecret.yaml delete mode 100644 infrastructure/prometheus/thanos-query.deployment.yaml delete mode 100644 infrastructure/prometheus/thanos-store.statefulset.yaml create mode 100644 kluster-deployments/grafana/application.yaml rename kluster-deployments/{monitoring => grafana}/kustomization.yaml (100%) delete mode 100644 kluster-deployments/prometheus/application.yaml diff --git a/apps/monitoring/grafana-admin.sealedsecret.yaml b/apps/grafana/grafana-admin.sealedsecret.yaml similarity index 100% rename from apps/monitoring/grafana-admin.sealedsecret.yaml rename to apps/grafana/grafana-admin.sealedsecret.yaml diff --git a/apps/monitoring/grafana-auth.sealedsecret.yaml b/apps/grafana/grafana-auth.sealedsecret.yaml similarity index 100% rename from apps/monitoring/grafana-auth.sealedsecret.yaml rename to apps/grafana/grafana-auth.sealedsecret.yaml diff --git a/apps/monitoring/grafana.ingress.yaml b/apps/grafana/grafana.ingress.yaml similarity index 100% rename from apps/monitoring/grafana.ingress.yaml rename to apps/grafana/grafana.ingress.yaml diff --git a/apps/monitoring/grafana.values.yaml b/apps/grafana/grafana.values.yaml similarity index 95% rename from apps/monitoring/grafana.values.yaml rename to apps/grafana/grafana.values.yaml index 196b923..7148172 100644 --- a/apps/monitoring/grafana.values.yaml +++ b/apps/grafana/grafana.values.yaml @@ -37,11 +37,11 @@ datasources: datasources: - name: Thanos type: prometheus - url: http://thanos-querier.prometheus.svc:10902 + url: http://thanos-querier.monitoring.svc:10902 isDefault: true - name: Prometheus type: prometheus - url: http://prometheus.prometheus.svc:9090 + url: http://prometheus.monitoring.svc:9090 isDefault: false dashboardProviders: diff --git a/apps/monitoring/kustomization.yaml b/apps/grafana/kustomization.yaml similarity index 100% rename from apps/monitoring/kustomization.yaml rename to 
apps/grafana/kustomization.yaml diff --git a/apps/monitoring/namespace.yaml b/apps/grafana/namespace.yaml similarity index 100% rename from apps/monitoring/namespace.yaml rename to apps/grafana/namespace.yaml diff --git a/infrastructure/monitoring/kustomization.yaml b/infrastructure/monitoring/kustomization.yaml new file mode 100644 index 0000000..dc55e83 --- /dev/null +++ b/infrastructure/monitoring/kustomization.yaml @@ -0,0 +1,28 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: + - namespace.yaml + # prometheus-operator crds + - thanos-objstore-config.sealedsecret.yaml + # - loki-objstore-config.sealedsecret.yaml + +images: + - name: thanos + newName: quay.io/thanos/thanos + newTag: v0.37.2 + + +helmCharts: + - name: loki + releaseName: loki + repo: https://grafana.github.io/helm-charts + version: 6.24.0 + valuesFile: loki.values.yaml + - name: prometheus + releaseName: prometheus + repo: https://prometheus-community.github.io/helm-charts + version: 26.0.1 + valuesFile: prometheus.values.yaml diff --git a/infrastructure/monitoring/loki.values.yaml b/infrastructure/monitoring/loki.values.yaml new file mode 100644 index 0000000..9a5e487 --- /dev/null +++ b/infrastructure/monitoring/loki.values.yaml @@ -0,0 +1,73 @@ +loki: + commonConfig: + replication_factor: 1 + schemaConfig: + configs: + - from: "2024-04-01" + store: tsdb + object_store: s3 + schema: v13 + index: + prefix: loki_index_ + period: 24h + pattern_ingester: + enabled: true + limits_config: + allow_structured_metadata: true + volume_enabled: true + retention_period: 672h # 28 days retention + ruler: + enable_api: true + storage: + type: filesystem + filesystem: + chunks_directory: /var/loki/chunks + rules_directory: /var/loki/rules + admin_api_directory: /var/loki/admin + +minio: + enabled: false + +deploymentMode: SingleBinary + +singleBinary: + replicas: 1 + persistence: + # -- Enable StatefulSetAutoDeletePVC feature + 
enableStatefulSetAutoDeletePVC: true + # -- Enable persistent disk + enabled: true + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: <storageClass>. + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: nfs-client + +# Zero out replica counts of other deployment modes +backend: + replicas: 0 +read: + replicas: 0 +write: + replicas: 0 +ingester: + replicas: 0 +querier: + replicas: 0 +queryFrontend: + replicas: 0 +queryScheduler: + replicas: 0 +distributor: + replicas: 0 +compactor: + replicas: 0 +indexGateway: + replicas: 0 +bloomCompactor: + replicas: 0 +bloomGateway: + replicas: 0 \ No newline at end of file diff --git a/infrastructure/prometheus/namespace.yaml b/infrastructure/monitoring/namespace.yaml similarity index 100% rename from infrastructure/prometheus/namespace.yaml rename to infrastructure/monitoring/namespace.yaml diff --git a/infrastructure/monitoring/prometheus.values.yaml b/infrastructure/monitoring/prometheus.values.yaml new file mode 100644 index 0000000..1447d69 --- /dev/null +++ b/infrastructure/monitoring/prometheus.values.yaml @@ -0,0 +1,573 @@ +podSecurityPolicy: + enabled: true + +server: + extraArgs: + log.level: debug + storage.tsdb.min-block-duration: 2h # Don't change this, see docs/components/sidecar.md + storage.tsdb.max-block-duration: 2h # Don't change this, see docs/components/sidecar.md + retention: 4h + service: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + statefulSet: + enabled: true + podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "10902" + sidecarContainers: + thanos-sidecar: + image: thanos + resources: + requests: + memory: "512Mi" + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /etc/secret/sa + args: + - "sidecar" + - 
"--log.level=debug" + - "--tsdb.path=/data/" + - "--prometheus.url=http://127.0.0.1:9090" + - "--objstore.config={type: GCS, config: {bucket: BUCKET_REPLACE_ME}}" + - "--reloader.config-file=/etc/prometheus-config/prometheus.yml" + - "--reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yml" + - "--reloader.rule-dir=/etc/prometheus-config/rules" + ports: + - name: sidecar-http + containerPort: 10902 + - name: grpc + containerPort: 10901 + - name: cluster + containerPort: 10900 + volumeMounts: + - name: storage-volume + mountPath: /data + - name: thanos-storage-secret + mountPath: /etc/secret + - name: config-volume + mountPath: /etc/prometheus-config + readOnly: false + - name: prometheus-config-shared + mountPath: /etc/prometheus-shared/ + readOnly: false + configPath: /etc/prometheus-shared/prometheus.yml + replicaCount: 1 + persistentVolume: + size: 20Gi + extraVolumes: # spec.template.spec.volumes + - name: prometheus-config-shared + emptyDir: {} + extraVolumeMounts: # spec.template.spec.containers.volumeMounts for prometheus container + - name: prometheus-config-shared + mountPath: /etc/prometheus-shared/ + resources: + requests: + memory: 1Gi + global: + scrape_interval: 5s + scrape_timeout: 4s + external_labels: + prometheus_group: KLUSTER + prometheus_replica: '$(HOSTNAME)' + evaluation_interval: 5s + extraSecretMounts: + - name: thanos-objstore-config + mountPath: /etc/secret/ + subPath: sa + readOnly: false + secretName: thanos-storage-secret + +# as thanos sidecar is taking care of the config reload +# we can disable the prometheus configmap reload +configmapReload: + prometheus: + enabled: false + +## Prometheus server ConfigMap entries +## +serverFiles: + ## Alerts configuration + ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ + alerting_rules.yml: {} + # groups: + # - name: Instances + # rules: + # - alert: InstanceDown + # expr: up == 0 + # for: 5m + # labels: + # severity: page + # annotations: + # 
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.' + # summary: 'Instance {{ $labels.instance }} down' + ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml + alerts: {} + + ## Records configuration + ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ + recording_rules.yml: {} + ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml + rules: {} + + prometheus.yml: + rule_files: + - /etc/config/recording_rules.yml + - /etc/config/alerting_rules.yml + ## Below two files are DEPRECATED will be removed from this default values file + - /etc/config/rules + - /etc/config/alerts + + scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - localhost:9090 + + # A scrape configuration for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + + kubernetes_sd_configs: + - role: endpoints + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. 
This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # <kubernetes_sd_config>. + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + + - job_name: 'kubernetes-nodes' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # <kubernetes_sd_config>. + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. 
Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. + # + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics + + + - job_name: 'kubernetes-nodes-cadvisor' + + # Default to scraping over https. If required, just disable this or change to + # `http`. + scheme: https + + # This TLS & bearer token file config is used to connect to the actual scrape + # endpoints for cluster components. This is separate to discovery auth + # configuration because discovery & scraping are two separate concerns in + # Prometheus. The discovery auth config is automatic if Prometheus runs inside + # the cluster. Otherwise, more config options have to be provided within the + # <kubernetes_sd_config>. + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # If your node certificates are self-signed or use a different CA to the + # master CA, then disable certificate verification below. Note that + # certificate verification is an integral part of a secure infrastructure + # so this should only be disabled in a controlled environment. You can + # disable certificate verification by uncommenting the line below. 
+ # + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + kubernetes_sd_configs: + - role: node + + # This configuration will work only on kubelet 1.7.3+ + # As the scrape endpoints for cAdvisor have changed + # if you are using older version you need to change the replacement to + # replacement: /api/v1/nodes/$1:4194/proxy/metrics + # more info here https://github.com/coreos/prometheus-operator/issues/633 + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor + + # Metric relabel configs to apply to samples before ingestion. + # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs) + # metric_relabel_configs: + # - action: labeldrop + # regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone) + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of + # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well. + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. 
+ # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters + # then you can set any parameter + - job_name: 'kubernetes-service-endpoints' + honor_labels: true + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow] + action: drop + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: service + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + + # Scrape config for slow service endpoints; same as above, but with a larger + # timeout and a larger interval + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. 
+ # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters + # then you can set any parameter + - job_name: 'kubernetes-service-endpoints-slow' + honor_labels: true + + scrape_interval: 5m + scrape_timeout: 30s + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+?)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: service + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + + - job_name: 'prometheus-pushgateway' + honor_labels: true + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: pushgateway + + # Example scrape config for probing services via the Blackbox Exporter. 
+ # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + honor_labels: true + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: service + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`, + # except if `prometheus.io/scrape-slow` is set to `true` as well. + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. + - job_name: 'kubernetes-pods' + honor_labels: true + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] + action: drop + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] + action: replace + regex: (https?) 
+ target_label: __scheme__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + target_label: __address__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: [__meta_kubernetes_pod_phase] + regex: Pending|Succeeded|Failed|Completed + action: drop + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + + # Example Scrape config for pods which should be scraped slower. A useful example + # would be stackdriver-exporter which queries an API on every scrape of the pod + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 
+ - job_name: 'kubernetes-pods-slow' + honor_labels: true + + scrape_interval: 5m + scrape_timeout: 30s + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme] + action: replace + regex: (https?) + target_label: __scheme__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$2]:$1' + target_label: __address__ + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip] + action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $2:$1 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: pod + - source_labels: [__meta_kubernetes_pod_phase] + regex: Pending|Succeeded|Failed|Completed + action: drop + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: node + + + + +# Configuration of subcharts defined in Chart.yaml + +## alertmanager sub-chart configurable values +## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/alertmanager +## +alertmanager: + enabled: false + +## kube-state-metrics sub-chart configurable values +## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics +## +kube-state-metrics: + ## If false, kube-state-metrics 
sub-chart will not be installed + ## + enabled: true + +## prometheus-node-exporter sub-chart configurable values +## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter +## +prometheus-node-exporter: + ## If false, node-exporter will not be installed + ## + enabled: true + + rbac: + pspEnabled: false + + containerSecurityContext: + allowPrivilegeEscalation: false + +## prometheus-pushgateway sub-chart configurable values +## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-pushgateway +## +prometheus-pushgateway: + ## If false, pushgateway will not be installed + ## + enabled: false diff --git a/infrastructure/monitoring/thanos-objstore-config.sealedsecret.yaml b/infrastructure/monitoring/thanos-objstore-config.sealedsecret.yaml new file mode 100644 index 0000000..581fbf4 --- /dev/null +++ b/infrastructure/monitoring/thanos-objstore-config.sealedsecret.yaml @@ -0,0 +1,16 @@ +--- +apiVersion: bitnami.com/v1alpha1 +kind: SealedSecret +metadata: + creationTimestamp: null + name: thanos-objstore-config + namespace: monitoring +spec: + encryptedData: + thanos.yaml: 
AgAqlul2V1idfgbWvq/0ljSFlxOOsQmwlGd+jRvDDyi1nlR8woHrp7lW6AxJ/8mBtb5htCuJzLgx+HVrN/EN+fRn5xG3D5+8xs4jWBOQ49MgLSAjJavFPcVY5xiBpGaw/N8aotlbfv6Wa2/+cmiAzVDPwnOj5zCS/EU58Tu2YFeVSbMUlu0NFAeyBW0DVT2enuVLToP4Ge4T0U9F99NHOh2zlVG82iI+4RxCu/WBkOU/urVleGwCYkcr/ItmXiwRXbwnWUtEUf28Q4ArpuZXFkKZUMoIwOjkXgOn/ySBLVvf0yy1+WOcYAIX9ouxu6i4T1GAZO9RnKeMJOIyebI3EOMA2dxQFpQg2/XhhHz2Ds2oDX/yr7vXbZJGyiCvTnnFUvFALKWIjRXXWphdqHDk6iP8tFIKVFsn7UxgMVFRcs6DmcMpBgFOcjpHr4HFZap5G9hI3cscmkNfwU+JOXkDEGRpZkkECza4wlQln8Wptq1qa+I+DSclqLOcvoEvNCJCIIgh5tINJ0KiZcrBvymUZZ9VduH4TFHR/UQK7M7It892TDNUlIp2UDWiuQ2DJysOJXmvSiNo8PGWSyDJwKJPhaWqXz9RUsb4D8gq/a+0qC7DOICrJEUj7WL8dwaKoQa32Cf+wopwrjFWSE7pAfiBJo+Dqa9jHIDv2hVsdU8NXqiFK35XHyUT4i0KWc+UZg4ObotGxYMvRtJuc3S7ZGTJ4YKDP5iThuNSuNd1pd1YjirpvVtL2o5BYh2i55F3DfVREofYpBCjK1e43mHOwEUYZ7Ff6p1+S0PXZnkL53xHMiiW3yr0v1g2ZYk7vzkENb9epzm24fNX/4ZiJdb0glEJmB674bgDSeh9PA5q8nJIKk6vsbrzfaAYWIn5Ai9MPbAVfg9pPkMyy9ydd+SqecujkWm++4dHqB1WJUg= + template: + metadata: + creationTimestamp: null + name: thanos-objstore-config + namespace: monitoring + type: Opaque diff --git a/infrastructure/prometheus/kustomization.yaml b/infrastructure/prometheus/kustomization.yaml deleted file mode 100644 index bb52adf..0000000 --- a/infrastructure/prometheus/kustomization.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: prometheus - -resources: - - namespace.yaml - # prometheus-operator crds - - https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.70.0/bundle.yaml - - prometheus.yaml - - thanos-objstore-config.sealedsecret.yaml - # thanos deployment from kube-thanos project - - thanos-store.statefulset.yaml - - thanos-query.deployment.yaml - - -images: - - name: thanos - newName: quay.io/thanos/thanos - newTag: v0.37.2 diff --git a/infrastructure/prometheus/prometheus.yaml b/infrastructure/prometheus/prometheus.yaml deleted file mode 100644 index f4e8f26..0000000 --- a/infrastructure/prometheus/prometheus.yaml 
+++ /dev/null @@ -1,78 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/metrics - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus - namespace: prometheus # needs to be the same as in the kustomization.yaml ---- -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: prometheus -spec: - securityContext: - runAsUser: 65534 # same as the thanos sidecar - resources: - requests: - memory: 400Mi - retention: 730d - retentionSize: 3GiB - serviceAccountName: prometheus - enableAdminAPI: false - serviceMonitorNamespaceSelector: {} - serviceMonitorSelector: {} - thanos: - version: v0.34.1 - objectStorageConfig: - # loads the config from a secret named thanos-objstore-config in the same namespace - key: thanos.yaml - name: thanos-objstore-config ---- -apiVersion: v1 -kind: Service -metadata: - name: prometheus -spec: - type: ClusterIP - ports: - - port: 9090 - targetPort: 9090 - protocol: TCP - selector: - prometheus: prometheus diff --git a/infrastructure/prometheus/thanos-objstore-config.sealedsecret.yaml b/infrastructure/prometheus/thanos-objstore-config.sealedsecret.yaml deleted file mode 100644 index 64eccd0..0000000 --- a/infrastructure/prometheus/thanos-objstore-config.sealedsecret.yaml +++ /dev/null @@ -1,16 +0,0 @@ ---- -apiVersion: bitnami.com/v1alpha1 -kind: SealedSecret -metadata: - 
creationTimestamp: null - name: thanos-objstore-config - namespace: prometheus -spec: - encryptedData: - thanos.yaml: AgByW/LKzPh0QeNsHR8Us4bJ/0chIQErhfh5plY1tjqiZyNLlxZ+NygYYzVggW02k4gAsKs68trbLBbeTTEhpKYP8hUphNb13lrgp07wYpOQjUF57i6RjPM2QNJpO0qLSk/nOPIOtR3XKn+nXxdJDmh3j5y0zxVz5O7MLh7adwOaHlyWTLMJjI1cda8YljDp2FYs24lHHMw4gXAYUecGDJNQqw5Xy9IiGh8kBbcKe3j6bVCj1yxPbHszmvZ2s+Q+mnndXnoeLMhwjZhMF8/PETxmSZ2bs41k3lHm/2rcPQCJsl9CuJEGKhu6ndKrVhtury4/US/FheEOoGF0YZk/AQMHII/mxy8haPNxtQTDs4rfYz/BA8cMMZll44wxOY9gAOmhm3sG6GI9wcB1Z65p98xSuDaInknO80l07vwMAAvmrZbT53Fmefrxl+jE1pImcGEsL0MfP621nTXlOBW9keF+6aUOubrwjPKKSXdqZU21acNbaIeRQSJyaOBStAKLfnPFmaryGisgNu0hCk/WmszZ0/s/ilvdMdAD6kKoiKL/NWfXtHATh/fnd76bKfSzNQk6e+WWfomToYVU0HRgAaWnIzjB9Q4tjxkbRwteEodU+K1BvD4xQ0sfQB2vHlDjQGC3pjIUFCWG0SzQGb7oe6+X2CJpcNIBHwF661iELJpJkg8dLsPtwb+8Rj6BL+ZtyVKYv18nDNON0WVpwJb/IHHSmxfYD5b/q6fATCFj55IXK5Nr4VO65a2Sv5Iv0/TTUVkwb8dkMmwfs5qcQiZ4oKWx8Ol6GkjDZrFARUtHQ/9KiZ9xDj3tPic2TeQfKr27sgc4lEL8RSxaRKHkkxIAioea3YgFfBm7ZfoxMlzJnQ1vI2vDvJcRXhWKSGdXiKOddwLSVMZFsSRRi9AxH87Sjt7j1wvsA7xgBqc= - template: - metadata: - creationTimestamp: null - name: thanos-objstore-config - namespace: prometheus - type: Opaque diff --git a/infrastructure/prometheus/thanos-query.deployment.yaml b/infrastructure/prometheus/thanos-query.deployment.yaml deleted file mode 100644 index 3a782e7..0000000 --- a/infrastructure/prometheus/thanos-query.deployment.yaml +++ /dev/null @@ -1,55 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: thanos-querier - labels: - app: thanos-querier -spec: - replicas: 1 - selector: - matchLabels: - app: thanos-querier - template: - metadata: - labels: - app: thanos-querier - spec: - containers: - - name: thanos - image: thanos - args: - - query - - --log.level=debug - - --query.replica-label=replica - - --endpoint=dnssrv+_grpc._tcp.thanos-store:10901 - - --endpoint=dnssrv+_grpc._tcp.prometheus:9090 - ports: - - name: http - containerPort: 10902 - - name: grpc - containerPort: 10901 - livenessProbe: 
- httpGet: - port: http - path: /-/healthy - readinessProbe: - httpGet: - port: http - path: /-/ready ---- -apiVersion: v1 -kind: Service -metadata: - name: thanos-querier -spec: - selector: - app: thanos-querier - ports: - - name: http - protocol: TCP - port: 10902 - targetPort: http - - name: grpc - protocol: TCP - port: 10901 - targetPort: grpc diff --git a/infrastructure/prometheus/thanos-store.statefulset.yaml b/infrastructure/prometheus/thanos-store.statefulset.yaml deleted file mode 100644 index 4055317..0000000 --- a/infrastructure/prometheus/thanos-store.statefulset.yaml +++ /dev/null @@ -1,71 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: thanos-store - labels: - app: thanos-store -spec: - replicas: 1 - selector: - matchLabels: - app: thanos-store - template: - metadata: - labels: - app: thanos-store - thanos-store-api: "true" - spec: - containers: - - name: thanos - image: thanos - args: - - store - - --log.level=debug - - --data-dir=/data - - --grpc-address=0.0.0.0:10901 - - --http-address=0.0.0.0:10902 - - --objstore.config-file=/etc/secret/thanos.yaml - - --index-cache-size=500MB - - --chunk-pool-size=500MB - ports: - - name: http - containerPort: 10902 - - name: grpc - containerPort: 10901 - livenessProbe: - httpGet: - port: 10902 - path: /-/healthy - readinessProbe: - httpGet: - port: 10902 - path: /-/ready - volumeMounts: - - name: thanos-objstore-config - mountPath: /etc/secret - readOnly: true - - name: thanos-data - mountPath: /data - volumes: - - name: thanos-objstore-config - secret: - secretName: thanos-objstore-config - - name: thanos-data - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/name: thanos-store - name: thanos-store -spec: - ports: - - name: grpc - port: 10901 - targetPort: 10901 - - name: http - port: 10902 - targetPort: 10902 - selector: - app: thanos-store diff --git a/kluster-deployments/grafana/application.yaml b/kluster-deployments/grafana/application.yaml new 
file mode 100644 index 0000000..d6f93b4 --- /dev/null +++ b/kluster-deployments/grafana/application.yaml @@ -0,0 +1,18 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: grafana-application + namespace: argocd +spec: + project: apps + source: + repoURL: ssh://git@git.kluster.moll.re:2222/remoll/k3s-infra.git + targetRevision: main + path: apps/grafana + destination: + server: https://kubernetes.default.svc + namespace: grafana + syncPolicy: + automated: + prune: true + selfHeal: true diff --git a/kluster-deployments/monitoring/kustomization.yaml b/kluster-deployments/grafana/kustomization.yaml similarity index 100% rename from kluster-deployments/monitoring/kustomization.yaml rename to kluster-deployments/grafana/kustomization.yaml diff --git a/kluster-deployments/kustomization.yaml b/kluster-deployments/kustomization.yaml index 79eb761..085a8e5 100644 --- a/kluster-deployments/kustomization.yaml +++ b/kluster-deployments/kustomization.yaml @@ -20,7 +20,7 @@ resources: - traefik/ - external-dns/ - external-services/ - - prometheus/application.yaml + - monitoring/application.yaml - authelia/ # simple apps @@ -35,7 +35,7 @@ resources: - linkding/ - media/ - minecraft/application.yaml - - monitoring/ + - grafana/ - ntfy/ - paperless/ - recipes/ diff --git a/kluster-deployments/monitoring/application.yaml b/kluster-deployments/monitoring/application.yaml index 36deee1..f6ee898 100644 --- a/kluster-deployments/monitoring/application.yaml +++ b/kluster-deployments/monitoring/application.yaml @@ -1,14 +1,15 @@ apiVersion: argoproj.io/v1alpha1 kind: Application metadata: - name: monitoring-application + name: prometheus-application namespace: argocd + spec: - project: apps + project: infrastructure source: - repoURL: ssh://git@git.kluster.moll.re:2222/remoll/k3s-infra.git + repoURL: git@github.com:moll-re/bootstrap-k3s-infra.git targetRevision: main - path: apps/monitoring + path: infrastructure/monitoring destination: server:
https://kubernetes.default.svc namespace: monitoring diff --git a/kluster-deployments/prometheus/application.yaml b/kluster-deployments/prometheus/application.yaml deleted file mode 100644 index bef03c5..0000000 --- a/kluster-deployments/prometheus/application.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: prometheus-application - namespace: argocd - -spec: - project: infrastructure - source: - repoURL: git@github.com:moll-re/bootstrap-k3s-infra.git - targetRevision: main - path: infrastructure/prometheus - destination: - server: https://kubernetes.default.svc - namespace: monitoring - syncPolicy: - automated: - prune: true - selfHeal: true - syncOptions: - - Replace=true - # because the prom crds exceed the default 256Ki limit \ No newline at end of file