feat: добавить аддоны Loki, Promtail, Tempo, Pushgateway

Loki (grafana/loki 6.7.3):
- Монолитный режим (singleBinary) — оптимально для малых кластеров
- Хранение: filesystem по умолчанию, автоматически S3 если addon_minio=true
  (создаёт бакеты chunks/ruler/admin в MinIO через Job)
- Retention 30 дней, отключён self-monitoring (без Grafana Agent Operator)
- ServiceMonitor + Grafana datasource ConfigMap при addon_prometheus_stack=true
- Datasource включает derivedFields → Tempo если addon_tempo=true

Promtail (grafana/promtail 6.16.4):
- DaemonSet на всех нодах (tolerations: Exists)
- Отправляет логи в Loki по http://loki.loki.svc.cluster.local:3100
- ServiceMonitor при addon_prometheus_stack=true

Tempo (grafana/tempo 1.10.3):
- Монолитный режим с PVC для трейсов
- Receivers: OTLP gRPC/HTTP, Jaeger, опционально Zipkin
- Grafana datasource ConfigMap при addon_prometheus_stack=true
  (tracesToLogsV2 → Loki если addon_loki=true, serviceMap → Prometheus)
- ServiceMonitor при addon_prometheus_stack=true

Pushgateway (prometheus-community/prometheus-pushgateway 2.14.0):
- Устанавливается в monitoring namespace (рядом с Prometheus)
- ServiceMonitor при addon_prometheus_stack=true
- Опциональные persistence и ingress
This commit is contained in:
Sergey Antropoff
2026-04-25 11:37:23 +03:00
parent dbc21150b2
commit b8e15fae45
17 changed files with 717 additions and 0 deletions

7
addons/loki/playbook.yml Normal file
View File

@@ -0,0 +1,7 @@
---
# Entry point for the Loki addon: delegates all work to the bundled role.
# Runs on the first k3s master only — kubectl/helm operate cluster-wide,
# so a single host is sufficient.
- name: Install Loki
  hosts: k3s_master[0]
  become: true
  gather_facts: false
  roles:
    - role: "{{ playbook_dir }}/role"

View File

@@ -0,0 +1,44 @@
---
# Grafana Loki addon — role defaults.

# Helm chart version (grafana/loki).
loki_version: "6.7.3"
loki_namespace: "loki"
loki_chart_repo: "https://grafana.github.io/helm-charts"
# Replica count (singleBinary — monolithic mode, best for small clusters)
loki_replicas: 1
# Storage backend: filesystem | s3
# Automatically switches to s3 when the MinIO addon is installed
loki_storage_type: "{{ 's3' if addon_minio | default(false) | bool else 'filesystem' }}"
loki_storage_size: "10Gi"
loki_storage_class: ""  # "" = default StorageClass
# MinIO/S3 settings (used when loki_storage_type == 's3')
loki_s3_endpoint: "http://minio.minio.svc.cluster.local:9000"
loki_s3_bucket_chunks: "loki-chunks"
loki_s3_bucket_ruler: "loki-ruler"
loki_s3_bucket_admin: "loki-admin"
loki_s3_access_key: "{{ vault_minio_root_user | default('minioadmin') }}"
loki_s3_secret_key: "{{ vault_minio_root_password | default('changeme-minio') }}"
loki_s3_region: "us-east-1"
# Log retention period
loki_retention_period: "720h"  # 30 days
# Ingress (usually unnecessary — Grafana reaches Loki directly in-cluster)
loki_ingress_enabled: false
loki_ingress_host: "loki.local"
loki_ingress_class: "{{ ingress_nginx_class_name | default('nginx') }}"
loki_ingress_tls: false
loki_ingress_cert_issuer: "{{ cert_manager_default_issuer_name | default('letsencrypt-prod') }}"
# Metrics
loki_metrics_enabled: true
# The ServiceMonitor is only created when addon_prometheus_stack: true
loki_resources:
  requests:
    cpu: 100m
    memory: 128Mi
  limits:
    cpu: 500m
    memory: 512Mi

View File

@@ -0,0 +1,100 @@
---
# Install Grafana Loki (single-binary deployment) via Helm.
# When loki_storage_type == 's3' the required MinIO buckets are created
# first through a short-lived Job; when the Prometheus stack addon is
# enabled a Grafana datasource ConfigMap is applied as well.

- name: Add Grafana Helm repo
  kubernetes.core.helm_repository:
    name: grafana
    repo_url: "{{ loki_chart_repo }}"
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig_path }}"

- name: Create Loki namespace
  # shell (not command) is required here: the idempotent create relies on a
  # pipe, which ansible.builtin.command would pass as a literal argument.
  ansible.builtin.shell: >
    k3s kubectl create namespace {{ loki_namespace }}
    --dry-run=client -o yaml | k3s kubectl apply -f -
  register: loki_ns_apply
  changed_when: "'created' in loki_ns_apply.stdout"

- name: Create MinIO buckets for Loki
  # NOTE(review): the S3 credentials end up in clear text in the Job's pod
  # spec — acceptable for in-cluster MinIO, but worth revisiting.
  kubernetes.core.k8s:
    state: present
    definition:
      apiVersion: batch/v1
      kind: Job
      metadata:
        name: loki-create-minio-buckets
        namespace: "{{ loki_namespace }}"
      spec:
        ttlSecondsAfterFinished: 300
        template:
          spec:
            restartPolicy: OnFailure
            containers:
              - name: mc
                image: minio/mc:latest
                command:
                  - /bin/sh
                  - -c
                  - |
                    mc alias set minio {{ loki_s3_endpoint }} {{ loki_s3_access_key }} {{ loki_s3_secret_key }}
                    mc mb --ignore-existing minio/{{ loki_s3_bucket_chunks }}
                    mc mb --ignore-existing minio/{{ loki_s3_bucket_ruler }}
                    mc mb --ignore-existing minio/{{ loki_s3_bucket_admin }}
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig_path }}"
  when: loki_storage_type == 's3'

- name: Wait for MinIO bucket Job to complete
  ansible.builtin.command: >
    k3s kubectl -n {{ loki_namespace }}
    wait job/loki-create-minio-buckets
    --for=condition=complete --timeout=120s
  changed_when: false
  when: loki_storage_type == 's3'

- name: Template Loki values
  ansible.builtin.template:
    src: loki-values.yaml.j2
    dest: /tmp/loki-values.yaml
    mode: '0644'

- name: Install Loki via Helm
  kubernetes.core.helm:
    name: loki
    chart_ref: grafana/loki
    chart_version: "{{ loki_version }}"
    release_namespace: "{{ loki_namespace }}"
    create_namespace: true
    wait: true
    timeout: "10m0s"
    values_files:
      - /tmp/loki-values.yaml
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig_path }}"

- name: Wait for Loki to be ready
  ansible.builtin.command: >
    k3s kubectl -n {{ loki_namespace }}
    rollout status statefulset/loki --timeout=180s
  register: loki_rollout
  changed_when: false
  # `retries` has no effect without `until` on older ansible-core;
  # make the retry condition explicit.
  until: loki_rollout.rc == 0
  retries: 3
  delay: 10

- name: Template Loki Grafana datasource ConfigMap
  ansible.builtin.template:
    src: loki-grafana-datasource.yaml.j2
    dest: /tmp/loki-grafana-datasource.yaml
    mode: '0644'
  when: addon_prometheus_stack | default(false) | bool

- name: Apply Loki Grafana datasource ConfigMap
  ansible.builtin.command: k3s kubectl apply -f /tmp/loki-grafana-datasource.yaml
  changed_when: true
  when: addon_prometheus_stack | default(false) | bool

- name: Show Loki access info
  ansible.builtin.debug:
    msg:
      - "Loki установлен в namespace: {{ loki_namespace }}"
      - "Push URL (для Promtail): http://loki.{{ loki_namespace }}.svc.cluster.local:3100/loki/api/v1/push"
      - "Query URL: http://loki.{{ loki_namespace }}.svc.cluster.local:3100"
      - "Хранилище: {{ loki_storage_type }}, retention: {{ loki_retention_period }}"
      - "{% if addon_prometheus_stack | default(false) | bool %}Datasource 'Loki' добавлен в Grafana автоматически{% else %}Добавь datasource в Grafana: тип Loki, URL http://loki.{{ loki_namespace }}.svc.cluster.local:3100{% endif %}"
      - "{% if addon_promtail | default(false) | bool %}Promtail уже установлен — логи собираются{% else %}Установи Promtail: make addon-promtail{% endif %}"

View File

@@ -0,0 +1,26 @@
{# Grafana datasource ConfigMap for Loki (Jinja2 template, Ansible managed).
   Placed in the Prometheus-stack namespace with the grafana_datasource: "1"
   label so the Grafana datasource sidecar can discover it — presumably the
   kube-prometheus-stack sidecar; confirm the sidecar label matches.
   derivedFields links traceID=<id> occurrences in logs to Tempo when the
   Tempo addon is enabled. #}
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-grafana-datasource
  namespace: {{ prometheus_stack_namespace | default('monitoring') }}
  labels:
    grafana_datasource: "1"
data:
  loki-datasource.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        uid: loki
        access: proxy
        url: http://loki.{{ loki_namespace }}.svc.cluster.local:3100
        isDefault: false
        jsonData:
          maxLines: 1000
{% if addon_tempo | default(false) | bool %}
          derivedFields:
            - datasourceUid: tempo
              matcherRegex: "traceID=(\\w+)"
              name: TraceID
              url: '${__value.raw}'
{% endif %}

View File

@@ -0,0 +1,120 @@
## Loki Helm values — Ansible managed (Jinja2 template)
deploymentMode: SingleBinary
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  storage:
{% if loki_storage_type == 's3' %}
    type: s3
    s3:
      endpoint: "{{ loki_s3_endpoint }}"
      region: "{{ loki_s3_region }}"
      secretAccessKey: "{{ loki_s3_secret_key }}"
      accessKeyId: "{{ loki_s3_access_key }}"
      s3ForcePathStyle: true
      insecure: true
    bucketNames:
      chunks: "{{ loki_s3_bucket_chunks }}"
      ruler: "{{ loki_s3_bucket_ruler }}"
      admin: "{{ loki_s3_bucket_admin }}"
{% else %}
    type: filesystem
{% endif %}
  schemaConfig:
    configs:
      - from: "2024-04-01"
        store: tsdb
        # loki_storage_type is already 's3' or 'filesystem', so it can be
        # used directly. The previous `x == 's3' | ternary(...)` rendered a
        # boolean because Jinja filters bind tighter than `==`.
        object_store: "{{ loki_storage_type }}"
        schema: v13
        index:
          prefix: index_
          period: 24h
  limits_config:
    retention_period: "{{ loki_retention_period }}"
  compactor:
    retention_enabled: true
    # Loki 3.x refuses to start with retention enabled unless an explicit
    # delete-request store is configured.
    delete_request_store: "{{ loki_storage_type }}"
singleBinary:
  replicas: {{ loki_replicas }}
  resources:
    requests:
      cpu: "{{ loki_resources.requests.cpu }}"
      memory: "{{ loki_resources.requests.memory }}"
    limits:
      cpu: "{{ loki_resources.limits.cpu }}"
      memory: "{{ loki_resources.limits.memory }}"
  persistence:
    enabled: {{ (loki_storage_type != 's3') | lower }}
    size: "{{ loki_storage_size }}"
{% if loki_storage_class %}
    storageClass: "{{ loki_storage_class }}"
{% endif %}
  tolerations:
    - key: "node-role.kubernetes.io/control-plane"
      operator: "Exists"
      effect: "NoSchedule"
# Disable the scalable components (singleBinary mode is used)
read:
  replicas: 0
write:
  replicas: 0
backend:
  replicas: 0
# Caches disabled for simplicity (not needed with singleBinary)
chunksCache:
  enabled: false
resultsCache:
  enabled: false
# Disable self-monitoring (it requires the Grafana Agent Operator)
monitoring:
  selfMonitoring:
    enabled: false
    grafanaAgent:
      installOperator: false
  serviceMonitor:
    # Only create the ServiceMonitor when the CRD-providing Prometheus
    # stack addon is installed — matches the other addons and the comment
    # in defaults/main.yml.
    enabled: {{ (loki_metrics_enabled | bool and addon_prometheus_stack | default(false) | bool) | lower }}
    additionalLabels:
      release: kube-prometheus-stack
lokiCanary:
  enabled: false
# Helm tests create extra pods — disabled
test:
  enabled: false
# Gateway (nginx) disabled — the service is reached directly
gateway:
  enabled: false
{% if loki_ingress_enabled %}
ingress:
  enabled: true
  ingressClassName: "{{ loki_ingress_class }}"
  hosts:
    - host: "{{ loki_ingress_host }}"
      paths:
        - path: /
          pathType: Prefix
{% if loki_ingress_tls %}
  tls:
    - secretName: loki-tls
      hosts:
        - "{{ loki_ingress_host }}"
  annotations:
    cert-manager.io/cluster-issuer: "{{ loki_ingress_cert_issuer }}"
{% endif %}
{% endif %}

View File

@@ -0,0 +1,7 @@
---
# Entry point for the Promtail addon: delegates all work to the bundled
# role. Runs on the first k3s master only — the Helm release it creates
# is cluster-wide.
- name: Install Promtail
  hosts: k3s_master[0]
  become: true
  gather_facts: false
  roles:
    - role: "{{ playbook_dir }}/role"

View File

@@ -0,0 +1,22 @@
---
# Promtail addon — role defaults.

# Helm chart version (grafana/promtail).
promtail_version: "6.16.4"
promtail_namespace: "promtail"
promtail_chart_repo: "https://grafana.github.io/helm-charts"
# Loki push endpoint URL
promtail_loki_url: "http://loki.{{ loki_namespace | default('loki') }}.svc.cluster.local:3100/loki/api/v1/push"
# Tenant ID (leave empty when Loki runs with auth_enabled: false)
promtail_tenant_id: ""
# Metrics
promtail_metrics_enabled: true
# The ServiceMonitor is only created when addon_prometheus_stack: true
promtail_resources:
  requests:
    cpu: 50m
    memory: 64Mi
  limits:
    cpu: 200m
    memory: 128Mi

View File

@@ -0,0 +1,61 @@
---
# Install Promtail as a cluster-wide DaemonSet via Helm and point it at
# the Loki push endpoint.

- name: Add Grafana Helm repo
  kubernetes.core.helm_repository:
    name: grafana
    repo_url: "{{ promtail_chart_repo }}"
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig_path }}"

- name: Install Promtail via Helm
  vars:
    # The Loki client entry is built here because a raw {% if %} block is
    # not valid YAML inside a tasks file (Ansible parses the file as YAML
    # before templating). The optional tenant field is `tenant_id` — the
    # name Promtail's client configuration expects (not `tenantID`).
    _promtail_client: >-
      {{ {'url': promtail_loki_url}
         | combine({'tenant_id': promtail_tenant_id} if promtail_tenant_id else {}) }}
  kubernetes.core.helm:
    name: promtail
    chart_ref: grafana/promtail
    chart_version: "{{ promtail_version }}"
    release_namespace: "{{ promtail_namespace }}"
    create_namespace: true
    wait: true
    timeout: "5m0s"
    values:
      config:
        clients:
          - "{{ _promtail_client }}"
        snippets:
          pipelineStages:
            - cri: {}
      tolerations:
        - operator: Exists  # DaemonSet on every node, masters included
      resources:
        requests:
          cpu: "{{ promtail_resources.requests.cpu }}"
          memory: "{{ promtail_resources.requests.memory }}"
        limits:
          cpu: "{{ promtail_resources.limits.cpu }}"
          memory: "{{ promtail_resources.limits.memory }}"
      serviceMonitor:
        enabled: "{{ promtail_metrics_enabled | bool and addon_prometheus_stack | default(false) | bool }}"
        labels:
          release: kube-prometheus-stack
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig_path }}"

- name: Wait for Promtail DaemonSet to be ready
  ansible.builtin.command: >
    k3s kubectl -n {{ promtail_namespace }}
    rollout status daemonset/promtail --timeout=120s
  register: promtail_rollout
  changed_when: false
  # `retries` has no effect without `until` on older ansible-core.
  until: promtail_rollout.rc == 0
  retries: 3
  delay: 10

- name: Show Promtail access info
  ansible.builtin.debug:
    msg:
      - "Promtail установлен в namespace: {{ promtail_namespace }}"
      - "Отправляет логи в: {{ promtail_loki_url }}"
      - "DaemonSet собирает логи со всех нод кластера"
      - "Логи доступны в Grafana → Explore → datasource Loki"

View File

@@ -0,0 +1,7 @@
---
# Entry point for the Pushgateway addon: delegates all work to the bundled
# role. Runs on the first k3s master only — the Helm release it creates is
# cluster-wide.
- name: Install Prometheus Pushgateway
  hosts: k3s_master[0]
  become: true
  gather_facts: false
  roles:
    - role: "{{ playbook_dir }}/role"

View File

@@ -0,0 +1,29 @@
---
# Prometheus Pushgateway addon — role defaults.

# Helm chart version (prometheus-community/prometheus-pushgateway).
pushgateway_version: "2.14.0"
# Installed into the prometheus-stack namespace to sit next to Prometheus
pushgateway_namespace: "{{ prometheus_stack_namespace | default('monitoring') }}"
pushgateway_chart_repo: "https://prometheus-community.github.io/helm-charts"
# Persistence — keeps pushed metrics across pod restarts
pushgateway_persistence_enabled: false
pushgateway_persistence_size: "2Gi"
pushgateway_persistence_class: ""  # "" = default StorageClass
# Ingress
pushgateway_ingress_enabled: false
pushgateway_ingress_host: "pushgateway.local"
pushgateway_ingress_class: "{{ ingress_nginx_class_name | default('nginx') }}"
pushgateway_ingress_tls: false
pushgateway_ingress_cert_issuer: "{{ cert_manager_default_issuer_name | default('letsencrypt-prod') }}"
# Metrics
pushgateway_metrics_enabled: true
# The ServiceMonitor is only created when addon_prometheus_stack: true
pushgateway_resources:
  requests:
    cpu: 10m
    memory: 32Mi
  limits:
    cpu: 100m
    memory: 64Mi

View File

@@ -0,0 +1,74 @@
---
# Install Prometheus Pushgateway via Helm, next to the Prometheus stack.

- name: Add prometheus-community Helm repo
  kubernetes.core.helm_repository:
    name: prometheus-community
    repo_url: "{{ pushgateway_chart_repo }}"
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig_path }}"

- name: Install Prometheus Pushgateway via Helm
  vars:
    # Conditional value fragments are assembled here because raw {% if %}
    # blocks are not valid YAML inside a tasks file (Ansible parses the
    # file as YAML before templating).
    # persistentVolume: storageClass is only set when explicitly configured
    _pushgateway_pv: >-
      {{ {'enabled': pushgateway_persistence_enabled | bool,
          'size': pushgateway_persistence_size}
         | combine({'storageClass': pushgateway_persistence_class}
                   if pushgateway_persistence_class else {}) }}
    # Ingress host/path section, merged in only when the ingress is enabled
    _pushgateway_ingress_hosts:
      ingressClassName: "{{ pushgateway_ingress_class }}"
      hosts:
        - host: "{{ pushgateway_ingress_host }}"
          paths:
            - path: /
              pathType: Prefix
    # TLS section, merged in only when ingress + TLS are both enabled
    _pushgateway_ingress_tls:
      tls:
        - secretName: pushgateway-tls
          hosts:
            - "{{ pushgateway_ingress_host }}"
      annotations:
        cert-manager.io/cluster-issuer: "{{ pushgateway_ingress_cert_issuer }}"
    _pushgateway_ingress: >-
      {{ {'enabled': pushgateway_ingress_enabled | bool}
         | combine(_pushgateway_ingress_hosts
                   if pushgateway_ingress_enabled | bool else {})
         | combine(_pushgateway_ingress_tls
                   if (pushgateway_ingress_enabled | bool and pushgateway_ingress_tls | bool)
                   else {}) }}
  kubernetes.core.helm:
    name: pushgateway
    chart_ref: prometheus-community/prometheus-pushgateway
    chart_version: "{{ pushgateway_version }}"
    release_namespace: "{{ pushgateway_namespace }}"
    create_namespace: true
    wait: true
    timeout: "5m0s"
    values:
      tolerations:
        - key: "node-role.kubernetes.io/control-plane"
          operator: "Exists"
          effect: "NoSchedule"
      resources:
        requests:
          cpu: "{{ pushgateway_resources.requests.cpu }}"
          memory: "{{ pushgateway_resources.requests.memory }}"
        limits:
          cpu: "{{ pushgateway_resources.limits.cpu }}"
          memory: "{{ pushgateway_resources.limits.memory }}"
      persistentVolume: "{{ _pushgateway_pv }}"
      serviceMonitor:
        enabled: "{{ pushgateway_metrics_enabled | bool and addon_prometheus_stack | default(false) | bool }}"
        namespace: "{{ pushgateway_namespace }}"
        additionalLabels:
          release: kube-prometheus-stack
      ingress: "{{ _pushgateway_ingress }}"
  environment:
    KUBECONFIG: "{{ k3s_kubeconfig_path }}"

- name: Show Pushgateway access info
  ansible.builtin.debug:
    msg:
      - "Pushgateway установлен в namespace: {{ pushgateway_namespace }}"
      - "URL внутри кластера: http://pushgateway-prometheus-pushgateway.{{ pushgateway_namespace }}.svc.cluster.local:9091"
      - "Пример отправки метрики:"
      - "  echo 'job_duration_seconds 42' | curl --data-binary @- http://pushgateway-prometheus-pushgateway.{{ pushgateway_namespace }}.svc.cluster.local:9091/metrics/job/my_job"
      - "{% if pushgateway_ingress_enabled %}UI: http{{ 's' if pushgateway_ingress_tls else '' }}://{{ pushgateway_ingress_host }}{% else %}Port-forward: kubectl port-forward svc/pushgateway-prometheus-pushgateway -n {{ pushgateway_namespace }} 9091:9091{% endif %}"
      - "{% if addon_prometheus_stack | default(false) | bool %}Prometheus автоматически скрейпит Pushgateway через ServiceMonitor{% endif %}"

View File

@@ -0,0 +1,7 @@
---
# Entry point for the Tempo addon: delegates all work to the bundled role.
# Runs on the first k3s master only — the Helm release it creates is
# cluster-wide.
- name: Install Tempo
  hosts: k3s_master[0]
  become: true
  gather_facts: false
  roles:
    - role: "{{ playbook_dir }}/role"

View File

@@ -0,0 +1,31 @@
---
# Grafana Tempo addon — role defaults.

# Helm chart version (grafana/tempo).
tempo_version: "1.10.3"
tempo_namespace: "tempo"
tempo_chart_repo: "https://grafana.github.io/helm-charts"
# Trace retention period
tempo_retention_period: "720h"  # 30 days
# PVC for Tempo data
tempo_storage_size: "10Gi"
tempo_storage_class: ""  # "" = default StorageClass
# Receivers (trace ingestion protocols)
# OTLP gRPC:4317, OTLP HTTP:4318
# Jaeger gRPC:14250, Jaeger HTTP:14268
# Zipkin:9411
tempo_receivers_otlp_enabled: true
tempo_receivers_jaeger_enabled: true
tempo_receivers_zipkin_enabled: false
# Metrics
tempo_metrics_enabled: true
# The ServiceMonitor is only created when addon_prometheus_stack: true
tempo_resources:
  requests:
    cpu: 100m
    memory: 128Mi
  limits:
    cpu: 500m
    memory: 512Mi

View File

@@ -0,0 +1,102 @@
---
- name: Add Grafana Helm repo
kubernetes.core.helm_repository:
name: grafana
repo_url: "{{ tempo_chart_repo }}"
environment:
KUBECONFIG: "{{ k3s_kubeconfig_path }}"
- name: Install Tempo via Helm
kubernetes.core.helm:
name: tempo
chart_ref: grafana/tempo
chart_version: "{{ tempo_version }}"
release_namespace: "{{ tempo_namespace }}"
create_namespace: true
wait: true
timeout: "5m0s"
values:
tempo:
retention: "{{ tempo_retention_period }}"
receivers:
{% if tempo_receivers_otlp_enabled %}
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:4317"
http:
endpoint: "0.0.0.0:4318"
{% endif %}
{% if tempo_receivers_jaeger_enabled %}
jaeger:
protocols:
thrift_http:
endpoint: "0.0.0.0:14268"
grpc:
endpoint: "0.0.0.0:14250"
{% endif %}
{% if tempo_receivers_zipkin_enabled %}
zipkin:
endpoint: "0.0.0.0:9411"
{% endif %}
persistence:
enabled: true
accessModes:
- ReadWriteOnce
size: "{{ tempo_storage_size }}"
{% if tempo_storage_class %}
storageClassName: "{{ tempo_storage_class }}"
{% endif %}
tolerations:
- key: "node-role.kubernetes.io/control-plane"
operator: "Exists"
effect: "NoSchedule"
resources:
requests:
cpu: "{{ tempo_resources.requests.cpu }}"
memory: "{{ tempo_resources.requests.memory }}"
limits:
cpu: "{{ tempo_resources.limits.cpu }}"
memory: "{{ tempo_resources.limits.memory }}"
serviceMonitor:
enabled: "{{ tempo_metrics_enabled | bool and addon_prometheus_stack | default(false) | bool }}"
additionalLabels:
release: kube-prometheus-stack
environment:
KUBECONFIG: "{{ k3s_kubeconfig_path }}"
- name: Wait for Tempo to be ready
ansible.builtin.command: >
k3s kubectl -n {{ tempo_namespace }}
rollout status deployment/tempo --timeout=120s
changed_when: false
retries: 3
delay: 10
- name: Template Tempo Grafana datasource ConfigMap
ansible.builtin.template:
src: tempo-grafana-datasource.yaml.j2
dest: /tmp/tempo-grafana-datasource.yaml
mode: '0644'
when: addon_prometheus_stack | default(false) | bool
- name: Apply Tempo Grafana datasource ConfigMap
ansible.builtin.command: k3s kubectl apply -f /tmp/tempo-grafana-datasource.yaml
changed_when: true
when: addon_prometheus_stack | default(false) | bool
- name: Show Tempo access info
ansible.builtin.debug:
msg:
- "Tempo установлен в namespace: {{ tempo_namespace }}"
- "HTTP API (query): http://tempo.{{ tempo_namespace }}.svc.cluster.local:3200"
- "{% if tempo_receivers_otlp_enabled %}OTLP gRPC: tempo.{{ tempo_namespace }}.svc.cluster.local:4317{% endif %}"
- "{% if tempo_receivers_otlp_enabled %}OTLP HTTP: tempo.{{ tempo_namespace }}.svc.cluster.local:4318{% endif %}"
- "{% if tempo_receivers_jaeger_enabled %}Jaeger HTTP: tempo.{{ tempo_namespace }}.svc.cluster.local:14268{% endif %}"
- "{% if addon_prometheus_stack | default(false) | bool %}Datasource 'Tempo' добавлен в Grafana автоматически{% else %}Добавь datasource в Grafana: тип Tempo, URL http://tempo.{{ tempo_namespace }}.svc.cluster.local:3200{% endif %}"
- "Отправляй трейсы из приложений через OTLP на tempo.{{ tempo_namespace }}.svc.cluster.local:4317"

View File

@@ -0,0 +1,33 @@
{# Grafana datasource ConfigMap for Tempo (Jinja2 template, Ansible managed).
   Placed in the Prometheus-stack namespace with the grafana_datasource: "1"
   label so the Grafana datasource sidecar can discover it — presumably the
   kube-prometheus-stack sidecar; confirm the sidecar label matches.
   tracesToLogsV2 links spans back to Loki logs when the Loki addon is
   enabled; serviceMap is driven by the Prometheus datasource. #}
apiVersion: v1
kind: ConfigMap
metadata:
  name: tempo-grafana-datasource
  namespace: {{ prometheus_stack_namespace | default('monitoring') }}
  labels:
    grafana_datasource: "1"
data:
  tempo-datasource.yaml: |
    apiVersion: 1
    datasources:
      - name: Tempo
        type: tempo
        uid: tempo
        access: proxy
        url: http://tempo.{{ tempo_namespace }}.svc.cluster.local:3200
        isDefault: false
        jsonData:
          httpMethod: GET
          nodeGraph:
            enabled: true
          search:
            hide: false
          serviceMap:
            datasourceUid: prometheus
{% if addon_loki | default(false) | bool %}
          tracesToLogsV2:
            datasourceUid: loki
            spanStartTimeShift: '-1h'
            spanEndTimeShift: '1h'
            filterByTraceID: true
            filterBySpanID: false
{% endif %}