Kubernetes-Prometheus监控

Prometheus

Thanos + Prometheus 高可用

创建namespace

kubectl create ns thanos

安装Prometheus

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: thanos
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: thanos
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: thanos
data:
  prometheus.yml: |
    global:
      scrape_interval:     15s 
      evaluation_interval: 15s
      external_labels:
          k8s_cluster: a-cluster
    scrape_configs:
      - job_name: 'prometheus'
        static_configs:
        - targets: ['localhost:9090']
      - job_name: 'node-exporter'
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: [__address__]
          regex: '(.*):10250'
          replacement: '${1}:9100'
          target_label: __address__
          action: replace
        - source_labels: [__address__]
          target_label: instance
        - source_labels: [ __address__]
          regex: (.*):(.*)
          replacement: ${1}
          target_label: node
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometueus-data
  namespace: thanos
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
  storageClassName: nfs-client
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: thanos
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
        - args:
            - sidecar
            - '--tsdb.path=/prometheus'
            - '--prometheus.url=http://localhost:9090'
            - '--http-address=0.0.0.0:19191'
            - '--grpc-address=0.0.0.0:19091'
          image: 'harbor.axzo.cn/library/thanos:latest'
          imagePullPolicy: IfNotPresent
          name: thanos
          ports:
            - containerPort: 19191
              protocol: TCP
            - containerPort: 19091
              protocol: TCP
          resources:
            limits:
              cpu: '1'
              memory: 1Gi
            requests:
              cpu: 500m
              memory: 1Gi
          volumeMounts:
            - mountPath: /prometheus
              name: prometueus-data
        - args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention.time=168h'
            - '--web.enable-lifecycle'
            - '--storage.tsdb.no-lockfile'
            - '--web.route-prefix=/'
            - '--web.listen-address=0.0.0.0:9090'
          image: 'harbor.axzo.cn/library/prometheus:2.53.1'
          imagePullPolicy: IfNotPresent
          name: prometheus
          ports:
            - containerPort: 9090
              protocol: TCP
          volumeMounts:
            - mountPath: /etc/prometheus
              name: prometheus-config
            - mountPath: /prometheus
              name: prometueus-data
      serviceAccount: prometheus
      serviceAccountName: prometheus
      terminationGracePeriodSeconds: 30
      volumes:
        - configMap:
            defaultMode: 420
            name: prometheus-config
          name: prometheus-config
        - name: prometueus-data
          persistentVolumeClaim:
            claimName: prometueus-data
---
apiVersion: v1
kind: Service
metadata:
  labels:
    name: prometheus
  name: prometheus
  namespace: thanos
spec:
  ports:
    - name: prometheus
      port: 9090
      protocol: TCP
      targetPort: 9090
    - name: thanos-19091
      port: 19091
      protocol: TCP
      targetPort: 19091
    - name: thanos-19191
      port: 19191
      protocol: TCP
      targetPort: 19191
  selector:
    app: prometheus

安装Node-Exporter

kind: DaemonSet
apiVersion: apps/v1
metadata:
  name: node-exporter
  namespace: thanos
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: host
          hostPath:
            path: /
      containers:
        - name: node-exporter
          ports:
            - containerPort: 9100
          image: harbor.axzo.cn/library/node-exporter:latest
          args:
            - '--path.procfs=/host/proc'
            - '--path.sysfs=/host/sys'
            - '--path.rootfs=/host'
            - '--collector.arp'
            - '--collector.bcache'
            - '--collector.bonding'
            - '--no-collector.buddyinfo'
            - '--collector.conntrack'
            - '--collector.cpu'
            - '--collector.diskstats'
            - '--no-collector.drbd'
            - '--collector.edac'
            - '--collector.entropy'
            - '--collector.filefd'
            - '--collector.filesystem'
            - '--collector.hwmon'
            - '--collector.infiniband'
            - '--no-collector.interrupts'
            - '--collector.ipvs'
            - '--no-collector.ksmd'
            - '--collector.loadavg'
            - '--no-collector.logind'
            - '--collector.mdadm'
            - '--collector.meminfo'
            - '--no-collector.meminfo_numa'
            - '--no-collector.mountstats'
            - '--collector.netdev'
            - '--collector.netstat'
            - '--collector.nfs'
            - '--collector.nfsd'
            - '--no-collector.ntp'
            - '--no-collector.processes'
            - '--no-collector.qdisc'
            - '--no-collector.runit'
            - '--collector.sockstat'
            - '--collector.stat'
            - '--no-collector.supervisord'
            - '--no-collector.systemd'
            - '--no-collector.tcpstat'
            - '--collector.textfile'
            - '--collector.time'
            - '--collector.timex'
            - '--collector.uname'
            - '--collector.vmstat'
            - '--no-collector.wifi'
            - '--collector.xfs'
            - '--collector.zfs'
          resources:
            limits:
              cpu: 200m
              memory: 200Mi
            requests:
              cpu: 100m
              memory: 30Mi
          volumeMounts:
            - name: proc
              readOnly: true
              mountPath: /host/proc
            - name: sys
              readOnly: true
              mountPath: /host/sys
            - name: host
              readOnly: true
              mountPath: /host
      nodeSelector:
        kubernetes.io/os: linux
      hostNetwork: true
      hostPID: true
      securityContext:
        runAsUser: 65534
        runAsNonRoot: true
      tolerations:
        - operator: Exists

安装Thanos

apiVersion: apps/v1
kind: Deployment
metadata:
  name: thanos-query
  namespace: thanos
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: thanos-query
  template:
    metadata:
      labels:
        app.kubernetes.io/name: thanos-query
    spec:
      containers:
        - args:
            - query
            - '--grpc-address=0.0.0.0:19091'
            - '--http-address=0.0.0.0:9090'
            - '--log.level=info'
            - '--log.format=logfmt'
            - '--endpoint=prometheus:19091'
            - '--query.auto-downsampling'
          image: 'harbor.axzo.cn/library/thanos:latest'
          imagePullPolicy: IfNotPresent
          livenessProbe:
            failureThreshold: 4
            httpGet:
              path: /-/healthy
              port: 9090
              scheme: HTTP
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 1
          name: thanos-query
          readinessProbe:
            failureThreshold: 20
            httpGet:
              path: /-/ready
              port: 9090
              scheme: HTTP
            periodSeconds: 5
            successThreshold: 1
            timeoutSeconds: 1
          resources:
            limits:
              cpu: '4'
              memory: 4Gi
            requests:
              cpu: '2'
              memory: 4Gi
      imagePullSecrets:
        - name: harbor
      nodeSelector:
        kubernetes.io/os: linux
      securityContext:
        fsGroup: 65534
        runAsUser: 65534
      terminationGracePeriodSeconds: 120