monitoring switch back to prometheus-operator

2025-01-05 16:26:46 +01:00
parent 2a56392af0
commit 16161bafb7
13 changed files with 319 additions and 593 deletions
--- a/apps/grafana/grafana.values.yaml
+++ b/apps/grafana/grafana.values.yaml
@@ -37,7 +37,7 @@ datasources:
    datasources:
      - name: Prometheus
        type: prometheus
-        url: http://prometheus-server.monitoring.svc:80
+        url: http://prometheus.monitoring.svc:9090
        isDefault: true
      - name: Thanos
        type: prometheus
--- a/infrastructure/argocd/argocd-cmd-params.configmap.yaml
+++ b/infrastructure/argocd/argocd-cmd-params.configmap.yaml
@@ -3,4 +3,6 @@ kind: ConfigMap
 metadata:
  name: argocd-cmd-params-cm
 data:
-  server.insecure: "true"
+  # server.insecure: "true"
+  # DID NOT FIX RELOAD LOOPS
+  # application.namespaces: "*"
--- a/infrastructure/argocd/argocd.configmap.yaml
+++ b/infrastructure/argocd/argocd.configmap.yaml
@@ -7,3 +7,4 @@ data:
  # switch to annotation based resource tracking as per
  # https://argo-cd.readthedocs.io/en/stable/user-guide/resource_tracking/
  application.resourceTrackingMethod: annotation+label
+  admin.enabled: "false"
--- a/infrastructure/argocd/ingress.yaml
+++ b/infrastructure/argocd/ingress.yaml
@@ -9,16 +9,9 @@ spec:
  routes:
    - kind: Rule
      match: Host(`argocd.kluster.moll.re`)
-      priority: 10
      services:
        - name: argocd-server
-          port: 80
-    - kind: Rule
-      match: Host(`argocd.kluster.moll.re`) && Header(`Content-Type`, `application/grpc`)
-      priority: 11
-      services:
-        - name: argocd-server
-          port: 80
-          scheme: h2c
+          port: 443
+          scheme: https
  tls:
    certResolver: default-tls
--- a/infrastructure/argocd/kustomization.yaml
+++ b/infrastructure/argocd/kustomization.yaml
@@ -4,14 +4,15 @@ kind: Kustomization
 namespace: argocd
 resources:
  - namespace.yaml
-  - https://raw.githubusercontent.com/argoproj/argo-cd/v2.13.1/manifests/install.yaml
+  - https://raw.githubusercontent.com/argoproj/argo-cd/v2.13.3/manifests/install.yaml
  - ingress.yaml
  - argo-apps.application.yaml
  - bootstrap-repo.sealedsecret.yaml
  - argocd-oauth.sealedsecret.yaml
+  - servicemonitor.yaml
+  # DID NOT FIX RELOAD LOOPS
+  # - github.com/argoproj/argo-cd/examples/k8s-rbac/argocd-server-applications?ref=master

-components:
-  - https://github.com/argoproj-labs/argocd-extensions/manifests

 patches:
  - path: argocd.configmap.yaml
--- a/infrastructure/argocd/servicemonitor.yaml
+++ b/infrastructure/argocd/servicemonitor.yaml
@@ -0,0 +1,77 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape definition for the Services selected below
+metadata:
+  name: argocd-metrics
+  labels:
+    release: prometheus-operator
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: argocd-metrics  # label a Service must carry to be scraped
+  endpoints:
+  - port: metrics  # name of the Service port to scrape
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape definition for the Services selected below
+metadata:
+  name: argocd-server-metrics
+  labels:
+    release: prometheus-operator
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: argocd-server-metrics  # label a Service must carry to be scraped
+  endpoints:
+  - port: metrics  # name of the Service port to scrape
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape definition for the Services selected below
+metadata:
+  name: argocd-repo-server-metrics
+  labels:
+    release: prometheus-operator
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: argocd-repo-server  # label a Service must carry to be scraped
+  endpoints:
+  - port: metrics  # name of the Service port to scrape
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape definition for the Services selected below
+metadata:
+  name: argocd-applicationset-controller-metrics
+  labels:
+    release: prometheus-operator
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: argocd-applicationset-controller  # label a Service must carry to be scraped
+  endpoints:
+  - port: metrics  # name of the Service port to scrape
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape definition for the Services selected below
+metadata:
+  name: argocd-dex-server
+  labels:
+    release: prometheus-operator
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: argocd-dex-server  # label a Service must carry to be scraped
+  endpoints:
+  - port: metrics  # name of the Service port to scrape
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape definition for the Services selected below
+metadata:
+  name: argocd-redis-haproxy-metrics
+  labels:
+    release: prometheus-operator
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: argocd-redis-ha-haproxy  # label a Service must carry to be scraped
+  endpoints:
+  - port: http-exporter-port  # name of the Service port to scrape
--- a/infrastructure/monitoring/kustomization.yaml
+++ b/infrastructure/monitoring/kustomization.yaml
@@ -6,8 +6,13 @@ namespace: monitoring
 resources: 
  - namespace.yaml
  # prometheus-operator crds
+  - https://github.com/prometheus-operator/prometheus-operator?ref=v0.79.2
+  # single prometheus instance with a thanos sidecar
+  - prometheus.yaml
+  - thanos-store.statefulset.yaml
+  - thanos-query.deployment.yaml
  - thanos-objstore-config.sealedsecret.yaml
-  # - loki-objstore-config.sealedsecret.yaml
+

 images:
  - name: thanos
@@ -21,8 +26,8 @@ helmCharts:
    repo: https://grafana.github.io/helm-charts
    version: 6.24.0
    valuesFile: loki.values.yaml
-  - name: prometheus
-    releaseName: prometheus
+  - name: prometheus-node-exporter
+    releaseName: prometheus-node-exporter
    repo: https://prometheus-community.github.io/helm-charts
-    version: 26.0.1
-    valuesFile: prometheus.values.yaml
+    version: 4.43.1
+    valuesFile: prometheus-node-exporter.values.yaml
--- a/infrastructure/monitoring/prometheus-node-exporter.values.yaml
+++ b/infrastructure/monitoring/prometheus-node-exporter.values.yaml
@@ -0,0 +1,14 @@
+prometheus:
+  monitor:
+    enabled: true  # presumably makes the chart render a ServiceMonitor for node-exporter — confirm against the chart
+
+    jobLabel: "node-exporter"  # NOTE(review): the chart may treat this as the NAME of a Service label to derive the job from, not the job name itself — confirm
+
+  
+resources:  # presumably applied to the node-exporter container — confirm against the chart
+  limits:
+    cpu: 200m
+    memory: 50Mi
+  requests:
+    cpu: 100m
+    memory: 30Mi
--- a/infrastructure/monitoring/prometheus.values.yaml
+++ b/infrastructure/monitoring/prometheus.values.yaml
@@ -1,574 +0,0 @@
-podSecurityPolicy:
-  enabled: true
-
-server:
-  extraArgs:
-    log.level: debug
-    storage.tsdb.min-block-duration: 2h # Don't change this, see docs/components/sidecar.md
-    storage.tsdb.max-block-duration: 2h # Don't change this, see docs/components/sidecar.md
-  retention: 180d
-  service:
-    annotations:
-      prometheus.io/scrape: "true"
-      prometheus.io/port: "9090"
-  statefulSet:
-    enabled: true
-  podAnnotations:
-    prometheus.io/scrape: "true"
-    prometheus.io/port: "10902"
-  # sidecarContainers:
-  #   thanos-sidecar:
-  #     image: thanos
-  #     resources:
-  #       requests:
-  #         memory: "512Mi"
-  #     env:
-  #       - name: GOOGLE_APPLICATION_CREDENTIALS
-  #         value: /etc/secret/sa
-  #     args:
-  #       - "sidecar"
-  #       - "--log.level=debug"
-  #       - "--tsdb.path=/data/"
-  #       - "--prometheus.url=http://127.0.0.1:9090"
-  #       - "--objstore.config={type: GCS, config: {bucket: BUCKET_REPLACE_ME}}"
-  #       - "--reloader.config-file=/etc/prometheus-config/prometheus.yml"
-  #       - "--reloader.config-envsubst-file=/etc/prometheus-shared/prometheus.yml"
-  #       - "--reloader.rule-dir=/etc/prometheus-config/rules"
-  #     ports:
-  #       - name: sidecar-http
-  #         containerPort: 10902
-  #       - name: grpc
-  #         containerPort: 10901
-  #       - name: cluster
-  #         containerPort: 10900
-  #     volumeMounts:
-  #       - name: storage-volume
-  #         mountPath: /data
-  #       - name: thanos-storage-secret
-  #         mountPath: /etc/secret
-  #       - name: config-volume
-  #         mountPath: /etc/prometheus-config
-  #         readOnly: false
-  #       - name: prometheus-config-shared
-  #         mountPath: /etc/prometheus-shared/
-  #         readOnly: false
-  # # configPath: /etc/prometheus-shared/prometheus.yml
-  # replicaCount: 1
-  # persistentVolume:
-  #   size: 20Gi
-  #   storageClass: nfs-client
-  # extraVolumes: # spec.template.spec.volumes
-  #   - name: prometheus-config-shared
-  #     emptyDir: {}
-  # extraVolumeMounts: # spec.template.spec.containers.volumeMounts for prometheus container
-  #   - name: prometheus-config-shared
-  #     mountPath: /etc/prometheus-shared/
-  # resources:
-  #   requests:
-  #     memory: 1Gi
-  # global:
-  #   scrape_interval: 5s
-  #   scrape_timeout: 4s
-  #   external_labels:
-  #     prometheus_group: KLUSTER
-  #     prometheus_replica: '$(HOSTNAME)'
-  #   evaluation_interval: 5s
-  # extraSecretMounts:
-  #   - name: thanos-storage-secret
-  #     mountPath: /etc/secret/
-  #     subPath: sa
-  #     readOnly: false
-  #     secretName: thanos-objstore-config
-
-# as thanos sidecar is taking care of the config reload
-# we can disable the prometheus configmap reload
-configmapReload:
-  prometheus:
-    enabled: false
-
-## Prometheus server ConfigMap entries
-##
-serverFiles:
-  ## Alerts configuration
-  ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
-  alerting_rules.yml: {}
-  # groups:
-  #   - name: Instances
-  #     rules:
-  #       - alert: InstanceDown
-  #         expr: up == 0
-  #         for: 5m
-  #         labels:
-  #           severity: page
-  #         annotations:
-  #           description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.'
-  #           summary: 'Instance {{ $labels.instance }} down'
-  ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use alerting_rules.yml
-  alerts: {}
-
-  ## Records configuration
-  ## Ref: https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/
-  recording_rules.yml: {}
-  ## DEPRECATED DEFAULT VALUE, unless explicitly naming your files, please use recording_rules.yml
-  rules: {}
-
-  prometheus.yml:
-    rule_files:
-      - /etc/config/recording_rules.yml
-      - /etc/config/alerting_rules.yml
-    ## Below two files are DEPRECATED will be removed from this default values file
-      - /etc/config/rules
-      - /etc/config/alerts
-
-    scrape_configs:
-      - job_name: prometheus
-        static_configs:
-          - targets:
-            - localhost:9090
-
-      # A scrape configuration for running Prometheus on a Kubernetes cluster.
-      # This uses separate scrape configs for cluster components (i.e. API server, node)
-      # and services to allow each to use different authentication configs.
-      #
-      # Kubernetes labels will be added as Prometheus labels on metrics via the
-      # `labelmap` relabeling action.
-
-      # Scrape config for API servers.
-      #
-      # Kubernetes exposes API servers as endpoints to the default/kubernetes
-      # service so this uses `endpoints` role and uses relabelling to only keep
-      # the endpoints associated with the default/kubernetes service using the
-      # default named port `https`. This works for single API server deployments as
-      # well as HA API server deployments.
-      - job_name: 'kubernetes-apiservers'
-
-        kubernetes_sd_configs:
-          - role: endpoints
-
-        # Default to scraping over https. If required, just disable this or change to
-        # `http`.
-        scheme: https
-
-        # This TLS & bearer token file config is used to connect to the actual scrape
-        # endpoints for cluster components. This is separate to discovery auth
-        # configuration because discovery & scraping are two separate concerns in
-        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-        # the cluster. Otherwise, more config options have to be provided within the
-        # <kubernetes_sd_config>.
-        tls_config:
-          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          # If your node certificates are self-signed or use a different CA to the
-          # master CA, then disable certificate verification below. Note that
-          # certificate verification is an integral part of a secure infrastructure
-          # so this should only be disabled in a controlled environment. You can
-          # disable certificate verification by uncommenting the line below.
-          #
-          insecure_skip_verify: true
-        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-        # Keep only the default/kubernetes service endpoints for the https port. This
-        # will add targets for each API server which Kubernetes adds an endpoint to
-        # the default/kubernetes service.
-        relabel_configs:
-          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
-            action: keep
-            regex: default;kubernetes;https
-
-      - job_name: 'kubernetes-nodes'
-
-        # Default to scraping over https. If required, just disable this or change to
-        # `http`.
-        scheme: https
-
-        # This TLS & bearer token file config is used to connect to the actual scrape
-        # endpoints for cluster components. This is separate to discovery auth
-        # configuration because discovery & scraping are two separate concerns in
-        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-        # the cluster. Otherwise, more config options have to be provided within the
-        # <kubernetes_sd_config>.
-        tls_config:
-          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          # If your node certificates are self-signed or use a different CA to the
-          # master CA, then disable certificate verification below. Note that
-          # certificate verification is an integral part of a secure infrastructure
-          # so this should only be disabled in a controlled environment. You can
-          # disable certificate verification by uncommenting the line below.
-          #
-          insecure_skip_verify: true
-        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-        kubernetes_sd_configs:
-          - role: node
-
-        relabel_configs:
-          - action: labelmap
-            regex: __meta_kubernetes_node_label_(.+)
-          - target_label: __address__
-            replacement: kubernetes.default.svc:443
-          - source_labels: [__meta_kubernetes_node_name]
-            regex: (.+)
-            target_label: __metrics_path__
-            replacement: /api/v1/nodes/$1/proxy/metrics
-
-
-      - job_name: 'kubernetes-nodes-cadvisor'
-
-        # Default to scraping over https. If required, just disable this or change to
-        # `http`.
-        scheme: https
-
-        # This TLS & bearer token file config is used to connect to the actual scrape
-        # endpoints for cluster components. This is separate to discovery auth
-        # configuration because discovery & scraping are two separate concerns in
-        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
-        # the cluster. Otherwise, more config options have to be provided within the
-        # <kubernetes_sd_config>.
-        tls_config:
-          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          # If your node certificates are self-signed or use a different CA to the
-          # master CA, then disable certificate verification below. Note that
-          # certificate verification is an integral part of a secure infrastructure
-          # so this should only be disabled in a controlled environment. You can
-          # disable certificate verification by uncommenting the line below.
-          #
-          insecure_skip_verify: true
-        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-        kubernetes_sd_configs:
-          - role: node
-
-        # This configuration will work only on kubelet 1.7.3+
-        # As the scrape endpoints for cAdvisor have changed
-        # if you are using older version you need to change the replacement to
-        # replacement: /api/v1/nodes/$1:4194/proxy/metrics
-        # more info here https://github.com/coreos/prometheus-operator/issues/633
-        relabel_configs:
-          - action: labelmap
-            regex: __meta_kubernetes_node_label_(.+)
-          - target_label: __address__
-            replacement: kubernetes.default.svc:443
-          - source_labels: [__meta_kubernetes_node_name]
-            regex: (.+)
-            target_label: __metrics_path__
-            replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
-
-        # Metric relabel configs to apply to samples before ingestion.
-        # [Metric Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs)
-        # metric_relabel_configs:
-        # - action: labeldrop
-        #   regex: (kubernetes_io_hostname|failure_domain_beta_kubernetes_io_region|beta_kubernetes_io_os|beta_kubernetes_io_arch|beta_kubernetes_io_instance_type|failure_domain_beta_kubernetes_io_zone)
-
-      # Scrape config for service endpoints.
-      #
-      # The relabeling allows the actual service scrape endpoint to be configured
-      # via the following annotations:
-      #
-      # * `prometheus.io/scrape`: Only scrape services that have a value of
-      # `true`, except if `prometheus.io/scrape-slow` is set to `true` as well.
-      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
-      # service then set this appropriately.
-      # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters
-      # then you can set any parameter
-      - job_name: 'kubernetes-service-endpoints'
-        honor_labels: true
-
-        kubernetes_sd_configs:
-          - role: endpoints
-
-        relabel_configs:
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
-            action: keep
-            regex: true
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow]
-            action: drop
-            regex: true
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
-            action: replace
-            target_label: __scheme__
-            regex: (https?)
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
-            action: replace
-            target_label: __metrics_path__
-            regex: (.+)
-          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
-            action: replace
-            target_label: __address__
-            regex: (.+?)(?::\d+)?;(\d+)
-            replacement: $1:$2
-          - action: labelmap
-            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_service_label_(.+)
-          - source_labels: [__meta_kubernetes_namespace]
-            action: replace
-            target_label: namespace
-          - source_labels: [__meta_kubernetes_service_name]
-            action: replace
-            target_label: service
-          - source_labels: [__meta_kubernetes_pod_node_name]
-            action: replace
-            target_label: node
-
-      # Scrape config for slow service endpoints; same as above, but with a larger
-      # timeout and a larger interval
-      #
-      # The relabeling allows the actual service scrape endpoint to be configured
-      # via the following annotations:
-      #
-      # * `prometheus.io/scrape-slow`: Only scrape services that have a value of `true`
-      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
-      # service then set this appropriately.
-      # * `prometheus.io/param_<parameter>`: If the metrics endpoint uses parameters
-      # then you can set any parameter
-      - job_name: 'kubernetes-service-endpoints-slow'
-        honor_labels: true
-
-        scrape_interval: 5m
-        scrape_timeout: 30s
-
-        kubernetes_sd_configs:
-          - role: endpoints
-
-        relabel_configs:
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow]
-            action: keep
-            regex: true
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
-            action: replace
-            target_label: __scheme__
-            regex: (https?)
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
-            action: replace
-            target_label: __metrics_path__
-            regex: (.+)
-          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
-            action: replace
-            target_label: __address__
-            regex: (.+?)(?::\d+)?;(\d+)
-            replacement: $1:$2
-          - action: labelmap
-            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_service_label_(.+)
-          - source_labels: [__meta_kubernetes_namespace]
-            action: replace
-            target_label: namespace
-          - source_labels: [__meta_kubernetes_service_name]
-            action: replace
-            target_label: service
-          - source_labels: [__meta_kubernetes_pod_node_name]
-            action: replace
-            target_label: node
-
-      - job_name: 'prometheus-pushgateway'
-        honor_labels: true
-
-        kubernetes_sd_configs:
-          - role: service
-
-        relabel_configs:
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
-            action: keep
-            regex: pushgateway
-
-      # Example scrape config for probing services via the Blackbox Exporter.
-      #
-      # The relabeling allows the actual service scrape endpoint to be configured
-      # via the following annotations:
-      #
-      # * `prometheus.io/probe`: Only probe services that have a value of `true`
-      - job_name: 'kubernetes-services'
-        honor_labels: true
-
-        metrics_path: /probe
-        params:
-          module: [http_2xx]
-
-        kubernetes_sd_configs:
-          - role: service
-
-        relabel_configs:
-          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
-            action: keep
-            regex: true
-          - source_labels: [__address__]
-            target_label: __param_target
-          - target_label: __address__
-            replacement: blackbox
-          - source_labels: [__param_target]
-            target_label: instance
-          - action: labelmap
-            regex: __meta_kubernetes_service_label_(.+)
-          - source_labels: [__meta_kubernetes_namespace]
-            target_label: namespace
-          - source_labels: [__meta_kubernetes_service_name]
-            target_label: service
-
-      # Example scrape config for pods
-      #
-      # The relabeling allows the actual pod scrape endpoint to be configured via the
-      # following annotations:
-      #
-      # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`,
-      # except if `prometheus.io/scrape-slow` is set to `true` as well.
-      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
-      - job_name: 'kubernetes-pods'
-        honor_labels: true
-
-        kubernetes_sd_configs:
-          - role: pod
-
-        relabel_configs:
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
-            action: keep
-            regex: true
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow]
-            action: drop
-            regex: true
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
-            action: replace
-            regex: (https?)
-            target_label: __scheme__
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
-            action: replace
-            target_label: __metrics_path__
-            regex: (.+)
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip]
-            action: replace
-            regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-            replacement: '[$2]:$1'
-            target_label: __address__
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip]
-            action: replace
-            regex: (\d+);((([0-9]+?)(\.|$)){4})
-            replacement: $2:$1
-            target_label: __address__
-          - action: labelmap
-            regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_pod_label_(.+)
-          - source_labels: [__meta_kubernetes_namespace]
-            action: replace
-            target_label: namespace
-          - source_labels: [__meta_kubernetes_pod_name]
-            action: replace
-            target_label: pod
-          - source_labels: [__meta_kubernetes_pod_phase]
-            regex: Pending|Succeeded|Failed|Completed
-            action: drop
-          - source_labels: [__meta_kubernetes_pod_node_name]
-            action: replace
-            target_label: node
-
-      # Example Scrape config for pods which should be scraped slower. An useful example
-      # would be stackriver-exporter which queries an API on every scrape of the pod
-      #
-      # The relabeling allows the actual pod scrape endpoint to be configured via the
-      # following annotations:
-      #
-      # * `prometheus.io/scrape-slow`: Only scrape pods that have a value of `true`
-      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
-      # to set this to `https` & most likely set the `tls_config` of the scrape config.
-      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
-      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`.
-      - job_name: 'kubernetes-pods-slow'
-        honor_labels: true
-
-        scrape_interval: 5m
-        scrape_timeout: 30s
-
-        kubernetes_sd_configs:
-          - role: pod
-
-        relabel_configs:
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow]
-            action: keep
-            regex: true
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
-            action: replace
-            regex: (https?)
-            target_label: __scheme__
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
-            action: replace
-            target_label: __metrics_path__
-            regex: (.+)
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip]
-            action: replace
-            regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-            replacement: '[$2]:$1'
-            target_label: __address__
-          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __meta_kubernetes_pod_ip]
-            action: replace
-            regex: (\d+);((([0-9]+?)(\.|$)){4})
-            replacement: $2:$1
-            target_label: __address__
-          - action: labelmap
-            regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_pod_label_(.+)
-          - source_labels: [__meta_kubernetes_namespace]
-            action: replace
-            target_label: namespace
-          - source_labels: [__meta_kubernetes_pod_name]
-            action: replace
-            target_label: pod
-          - source_labels: [__meta_kubernetes_pod_phase]
-            regex: Pending|Succeeded|Failed|Completed
-            action: drop
-          - source_labels: [__meta_kubernetes_pod_node_name]
-            action: replace
-            target_label: node
-
-
-
-
-# Configuration of subcharts defined in Chart.yaml
-
-## alertmanager sub-chart configurable values
-## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/alertmanager
-##
-alertmanager:
-  enabled: false
-
-## kube-state-metrics sub-chart configurable values
-## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics
-##
-kube-state-metrics:
-  ## If false, kube-state-metrics sub-chart will not be installed
-  ##
-  enabled: true
-
-## prometheus-node-exporter sub-chart configurable values
-## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter
-##
-prometheus-node-exporter:
-  ## If false, node-exporter will not be installed
-  ##
-  enabled: true
-
-  rbac:
-    pspEnabled: false
-
-  containerSecurityContext:
-    allowPrivilegeEscalation: false
-
-## prometheus-pushgateway sub-chart configurable values
-## Please see https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-pushgateway
-##
-prometheus-pushgateway:
-  ## If false, pushgateway will not be installed
-  ##
-  enabled: false
--- a/infrastructure/monitoring/prometheus.yaml
+++ b/infrastructure/monitoring/prometheus.yaml
@@ -0,0 +1,78 @@
+apiVersion: v1
+kind: ServiceAccount  # identity referenced by the Prometheus object's serviceAccountName in this file
+metadata:
+  name: prometheus
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # cluster-wide, read-only permissions for the prometheus ServiceAccount
+metadata:
+  name: prometheus
+rules:
+- apiGroups: [""]  # core API group
+  resources:
+  - nodes
+  - nodes/metrics
+  - services
+  - endpoints
+  - pods
+  verbs: ["get", "list", "watch"]
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["get"]  # read-only, no list/watch
+- apiGroups:
+  - networking.k8s.io
+  resources:
+  - ingresses
+  verbs: ["get", "list", "watch"]
+- nonResourceURLs: ["/metrics"]  # HTTP path rather than an API resource
+  verbs: ["get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # grants the prometheus ClusterRole to the prometheus ServiceAccount
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+- kind: ServiceAccount
+  name: prometheus
+  namespace: monitoring # needs to be the same as in the kustomization.yaml
+---
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus  # custom resource from the monitoring.coreos.com API group
+metadata:
+  name: prometheus
+spec:
+  securityContext:
+    runAsUser: 65534 # same as the thanos sidecar
+  resources:
+    requests:
+      memory: 400Mi
+  retention: 730d  # time-based limit; NOTE(review): no spec.storage is set in this object — confirm where TSDB data persists
+  retentionSize: 3GiB  # size-based limit, set alongside the time-based one
+  serviceAccountName: prometheus
+  enableAdminAPI: false
+  serviceMonitorNamespaceSelector: {}  # empty selector: ServiceMonitors from all namespaces
+  serviceMonitorSelector: {}  # empty selector: every ServiceMonitor, regardless of labels
+  thanos:
+    version: v0.34.1
+    objectStorageConfig:
+      # loads the config from a secret named thanos-objstore-config in the same namespace
+      key: thanos.yaml
+      name: thanos-objstore-config
+---
+apiVersion: v1
+kind: Service  # in-cluster address for the Prometheus HTTP port
+metadata:
+  name: prometheus
+spec:
+  type: ClusterIP
+  ports:
+  - port: 9090  # unnamed port; only the HTTP port is exposed by this Service
+    targetPort: 9090
+    protocol: TCP
+  selector:
+    prometheus: prometheus  # presumably a pod label the operator derives from the Prometheus object's name — verify
--- a/infrastructure/monitoring/thanos-query.deployment.yaml
+++ b/infrastructure/monitoring/thanos-query.deployment.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment  # Thanos query component fanning out to the StoreAPI endpoints listed in args
+metadata:
+  name: thanos-querier
+  labels:
+    app: thanos-querier
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: thanos-querier
+  template:
+    metadata:
+      labels:
+        app: thanos-querier
+    spec:
+      containers:
+      - name: thanos
+        image: thanos  # placeholder name; the monitoring kustomization carries an `images` entry for `thanos`
+        args:
+        - query
+        - --log.level=debug
+        - --query.replica-label=replica  # NOTE(review): prometheus-operator's default replica external label is `prometheus_replica` — confirm
+        - --endpoint=dnssrv+_grpc._tcp.thanos-store:10901  # SRV lookup of the `grpc` port on Service thanos-store
+        - --endpoint=dnssrv+_grpc._tcp.prometheus-operated:10901  # sidecar StoreAPI via the operator's governing Service (Service `prometheus` has no grpc port; 9090 is HTTP)
+        ports:
+        - name: http
+          containerPort: 10902
+        - name: grpc
+          containerPort: 10901
+        livenessProbe:
+          httpGet:
+            port: http
+            path: /-/healthy
+        readinessProbe:
+          httpGet:
+            port: http
+            path: /-/ready
+---
+apiVersion: v1
+kind: Service  # exposes the querier's http and grpc container ports inside the cluster
+metadata:
+  name: thanos-querier
+spec:
+  selector:
+    app: thanos-querier
+  ports:
+    - name: http
+      protocol: TCP
+      port: 10902
+      targetPort: http  # resolved by container port name
+    - name: grpc
+      protocol: TCP
+      port: 10901
+      targetPort: grpc  # resolved by container port name
--- a/infrastructure/monitoring/thanos-store.statefulset.yaml
+++ b/infrastructure/monitoring/thanos-store.statefulset.yaml
@@ -0,0 +1,71 @@
+apiVersion: apps/v1
+kind: Deployment  # NOTE(review): the file is named *.statefulset.yaml but declares a Deployment — confirm which is intended
+metadata:
+  name: thanos-store
+  labels:
+    app: thanos-store
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: thanos-store
+  template:
+    metadata:
+      labels:
+        app: thanos-store
+        thanos-store-api: "true"  # no consumer of this label is visible in this commit — verify it is still needed
+    spec:
+      containers:
+        - name: thanos
+          image: thanos  # placeholder name; the monitoring kustomization carries an `images` entry for `thanos`
+          args:
+          - store
+          - --log.level=debug
+          - --data-dir=/data  # backed by the emptyDir volume mounted at /data below
+          - --grpc-address=0.0.0.0:10901
+          - --http-address=0.0.0.0:10902
+          - --objstore.config-file=/etc/secret/thanos.yaml  # file comes from the mounted thanos-objstore-config Secret
+          - --index-cache-size=500MB
+          - --chunk-pool-size=500MB
+          ports:
+          - name: http
+            containerPort: 10902
+          - name: grpc
+            containerPort: 10901
+          livenessProbe:
+            httpGet:
+              port: 10902  # same port as --http-address
+              path: /-/healthy
+          readinessProbe:
+            httpGet:
+              port: 10902  # same port as --http-address
+              path: /-/ready
+          volumeMounts:
+            - name: thanos-objstore-config
+              mountPath: /etc/secret
+              readOnly: true
+            - name: thanos-data
+              mountPath: /data
+      volumes:
+        - name: thanos-objstore-config
+          secret:
+            secretName: thanos-objstore-config
+        - name: thanos-data
+          emptyDir: {}  # not persistent: contents are lost when the pod is removed
+---
+apiVersion: v1
+kind: Service  # exposes the store pod's grpc and http ports inside the cluster
+metadata:
+  labels:
+    app.kubernetes.io/name: thanos-store
+  name: thanos-store
+spec:
+  ports:
+  - name: grpc  # port name used by the querier's `_grpc._tcp.thanos-store` SRV endpoint
+    port: 10901
+    targetPort: 10901
+  - name: http
+    port: 10902
+    targetPort: 10902
+  selector:
+    app: thanos-store
--- a/kluster-deployments/monitoring/application.yaml
+++ b/kluster-deployments/monitoring/application.yaml
@@ -17,3 +17,6 @@ spec:
    automated:
      prune: true
      selfHeal: true
+    syncOptions:
+      # server-side apply: the prometheus-operator CRDs are too large for the client-side last-applied annotation
+      - ServerSideApply=true