first sync
Some checks failed
Deployment Verification / deploy-and-test (push) Failing after 5m16s

This commit is contained in:
Hubert Cornet 2024-12-26 19:11:34 +01:00
parent be8109e7c6
commit b46894e48b
9 changed files with 3324 additions and 24 deletions

10
alertmanager/config.yml Normal file
View File

@ -0,0 +1,10 @@
# Alertmanager routing: the root route hands alerts to the receiver named below.
route:
  receiver: slack

receivers:
  # This receiver has no notifier attached while the slack_configs stanza
  # stays commented out.
  - name: slack
    # slack_configs:
    #   - send_resolved: true
    #     username: '<username>'
    #     channel: '#<channel-name>'
    #     api_url: '<incoming-webhook-url>'

View File

@ -2,41 +2,158 @@
networks: networks:
traefik_front_network: traefik_front_network:
external: true external: true
back_network_: back_network_pg:
driver: bridge driver: bridge
attachable: true attachable: true
#### SERVICES #### SERVICES
services: services:
### hello_world ### prometheus
hello_world: prometheus:
container_name: gitea-app container_name: prometheus-app
hostname: gitea-app hostname: prometheus-app
image: hello-world image: prom/prometheus:latest
environment:
restart: always restart: always
networks:
# - back_network_gitea
- traefik_front_network
volumes: volumes:
- ./prometheus:/etc/prometheus/
- ./prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
# ports:
# - 9090:9090
networks:
- traefik_front_network
- back_network_pg
links:
- cadvisor:cadvisor
- alertmanager:alertmanager
depends_on:
- cadvisor
labels: labels:
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.docker.network=traefik_front_network" - "traefik.docker.network=traefik_front_network"
# HTTP # HTTP
- "traefik.http.routers.hello-world-http.rule=Host(`hello-world.tips-of-mine.com`)" - "traefik.http.routers.prometheus-http.rule=Host(`prometheus.tips-of-mine.com`)"
- "traefik.http.routers.hello-world-http.entrypoints=http" - "traefik.http.routers.prometheus-http.entrypoints=http"
- "traefik.http.routers.hello-world-http.priority=49" - "traefik.http.routers.prometheus-http.priority=39"
# HTTPS # HTTPS
- "traefik.http.routers.hello-world-https.rule=Host(`hello-world.tips-of-mine.com`)" - "traefik.http.routers.prometheus-https.rule=Host(`prometheus.tips-of-mine.com`)"
- "traefik.http.routers.hello-world-https.entrypoints=https" - "traefik.http.routers.prometheus-https.entrypoints=https"
- "traefik.http.routers.hello-world-https.tls=true" - "traefik.http.routers.prometheus-https.tls=true"
- "traefik.http.routers.hello-world-https.priority=50" - "traefik.http.routers.prometheus-https.priority=40"
- "traefik.http.routers.gitea.service=gitea-https-service" - "traefik.http.routers.prometheus.service=prometheus-https-service"
# Middleware # Middleware
# Service # Service
# - "traefik.http.services.gitea-https-service.loadbalancer.server.port=3000" - "traefik.http.services.prometheus-https-service.loadbalancer.server.port=9090"
# - "traefik.http.services.gitea-https-service.loadbalancer.server.scheme=https" # - "traefik.http.services.prometheus-https-service.loadbalancer.server.scheme=https"
# - "traefik.http.services.gitea-https-service.loadbalancer.healthcheck.hostname=gitea.traefik.me" - "traefik.http.services.prometheus-https-service.loadbalancer.healthcheck.hostname=prometheus.tips-of-mine.com"
# - "traefik.http.services.gitea-https-service.loadbalancer.healthcheck.method=foobar" - "traefik.http.services.prometheus-https-service.loadbalancer.healthcheck.method=foobar"
# - "traefik.http.services.gitea-https-service.loadbalancer.healthcheck.timeout=10" - "traefik.http.services.prometheus-https-service.loadbalancer.healthcheck.timeout=10"
# - "traefik.http.services.gitea-https-service.loadbalancer.healthcheck.interval=30" - "traefik.http.services.prometheus-https-service.loadbalancer.healthcheck.interval=30"
### node-exporter
node-exporter:
  container_name: prometheus-node-exporter
  hostname: prometheus-node-exporter
  image: prom/node-exporter:latest
  restart: always
  volumes:
    # Host filesystems are mounted read-only so the exporter can read them.
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - '--path.procfs=/host/proc'
    - '--path.sysfs=/host/sys'
    # `--collector.filesystem.ignored-mount-points` is deprecated; its
    # replacement is `--collector.filesystem.mount-points-exclude`. Passing
    # the regex in the same argument keeps flag and value together.
    # `$$` is Compose escaping for a literal `$` in the regex.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker/containers|rootfs/var/lib/docker/overlay2|rootfs/run/docker/netns|rootfs/var/lib/docker/aufs)($$|/)'
  ports:
    # Quoted so the mapping is always read as a string.
    - '9100:9100'
  networks:
    - back_network_pg
  deploy:
    mode: global
### alertmanager
alertmanager:
  image: prom/alertmanager:latest
  container_name: prometheus-alertmanager
  hostname: prometheus-alertmanager
  restart: always
  command:
    - '--config.file=/etc/alertmanager/config.yml'
    # NOTE(review): no volume in this service is mounted at /alertmanager, so
    # presumably this state does not survive container re-creation — confirm.
    - '--storage.path=/alertmanager'
  volumes:
    # Provides the config.yml referenced by --config.file above.
    - ./alertmanager/:/etc/alertmanager/
  networks:
    - back_network_pg
  ports:
    - '9093:9093'
### cadvisor
cadvisor:
  image: gcr.io/cadvisor/cadvisor:latest
  container_name: prometheus-cadvisor
  hostname: prometheus-cadvisor
  restart: always
  networks:
    - back_network_pg
  ports:
    - '8080:8080'
  volumes:
    # Host paths exposed to cAdvisor; only /var/run is mounted writable.
    - /:/rootfs:ro
    - /var/run:/var/run:rw
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
  deploy:
    mode: global
### grafana
grafana:
  container_name: grafana-app
  hostname: grafana-app
  image: grafana/grafana:latest
  user: '472'
  restart: always
  environment:
    GF_INSTALL_PLUGINS: 'grafana-clock-panel,grafana-simple-json-datasource'
  # Additional GF_* settings (admin credentials, sign-up policy) come from this file.
  env_file:
    - ./grafana/config.monitoring
  volumes:
    - ./grafana_data:/var/lib/grafana
    - ./grafana/provisioning/:/etc/grafana/provisioning/
  # Port 3000 is not published on the host; access goes through Traefik.
  # ports:
  #   - 3000:3000
  networks:
    - traefik_front_network
    - back_network_pg
  depends_on:
    - prometheus
  labels:
    - "traefik.enable=true"
    - "traefik.docker.network=traefik_front_network"
    # HTTP
    - "traefik.http.routers.grafana-http.rule=Host(`grafana.tips-of-mine.com`)"
    - "traefik.http.routers.grafana-http.entrypoints=http"
    - "traefik.http.routers.grafana-http.priority=41"
    # Bind the service to the routers that actually carry a rule. The previous
    # label targeted a router named `grafana`, which declared a third router
    # without any rule of its own.
    - "traefik.http.routers.grafana-http.service=grafana-https-service"
    # HTTPS
    - "traefik.http.routers.grafana-https.rule=Host(`grafana.tips-of-mine.com`)"
    - "traefik.http.routers.grafana-https.entrypoints=https"
    - "traefik.http.routers.grafana-https.tls=true"
    - "traefik.http.routers.grafana-https.priority=42"
    - "traefik.http.routers.grafana-https.service=grafana-https-service"
    # Middleware
    # Service
    - "traefik.http.services.grafana-https-service.loadbalancer.server.port=3000"
    # - "traefik.http.services.grafana-https-service.loadbalancer.server.scheme=https"
    # Health check: `foobar` is not an HTTP method, and a health check needs a
    # path. /api/health is Grafana's health endpoint.
    - "traefik.http.services.grafana-https-service.loadbalancer.healthcheck.path=/api/health"
    - "traefik.http.services.grafana-https-service.loadbalancer.healthcheck.hostname=grafana.tips-of-mine.com"
    - "traefik.http.services.grafana-https-service.loadbalancer.healthcheck.method=GET"
    - "traefik.http.services.grafana-https-service.loadbalancer.healthcheck.timeout=10"
    - "traefik.http.services.grafana-https-service.loadbalancer.healthcheck.interval=30"
#### VOLUMES
#volumes:
# prometheus_data: {}
# grafana_data: {}

View File

@ -0,0 +1,3 @@
# Grafana settings supplied as environment variables.
# NOTE(review): presumably this is the env_file loaded by the grafana service — confirm.
# NOTE(review): the admin password below is a weak value committed to the
# repository; replace it and keep the real value out of version control.
GF_SECURITY_ADMIN_USER=admin
GF_SECURITY_ADMIN_PASSWORD=foobar
GF_USERS_ALLOW_SIGN_UP=false

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,11 @@
# Grafana dashboard provisioning: a single file-based provider.
apiVersion: 1

providers:
  - name: Prometheus
    orgId: 1
    type: file
    # Empty folder name, as in the original configuration.
    folder: ''
    editable: true
    disableDeletion: false
    options:
      # Directory from which the dashboards are read.
      path: /etc/grafana/provisioning/dashboards

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
# config file version
apiVersion: 1
# list of datasources that should be deleted from the database
deleteDatasources:
- name: Prometheus
orgId: 1
# list of datasources to insert/update depending
# on what is available in the database
datasources:
# <string, required> name of the datasource. Required
- name: Prometheus
# <string, required> datasource type. Required
type: prometheus
# <string, required> access mode. direct or proxy. Required
access: proxy
# <int> org id. will default to orgId 1 if not specified
orgId: 1
# <string> url
# NOTE(review): presumably resolves to the compose service named `prometheus` — confirm both containers share a network.
url: http://prometheus:9090
# <string> database password, if used
password:
# <string> database user, if used
user:
# <string> database name, if used
database:
# <bool> enable/disable basic auth
basicAuth: false
# <string> basic auth username, if used
basicAuthUser:
# <string> basic auth password, if used
basicAuthPassword:
# <bool> enable/disable with credentials headers
withCredentials:
# <bool> mark as default datasource. Max one per org
isDefault: true
# <map> fields that will be converted to json and stored in json_data
jsonData:
# NOTE(review): graphiteVersion looks like a leftover from a Graphite sample — confirm it is needed for a prometheus-type datasource.
graphiteVersion: "1.1"
tlsAuth: false
tlsAuthWithCACert: false
# <string> json object of data that will be encrypted.
# NOTE(review): the "..." values below are placeholders and tlsAuth is false above — confirm whether this stanza is needed.
secureJsonData:
tlsCACert: "..."
tlsClientCert: "..."
tlsClientKey: "..."
version: 1
# <bool> allow users to edit datasources from the UI.
editable: true

22
prometheus/alert.rules Normal file
View File

@ -0,0 +1,22 @@
# Alerting rules evaluated by Prometheus.
groups:
  - name: example
    rules:
      # Fires when a target has reported `up == 0` for two minutes.
      - alert: service_down
        expr: up == 0
        for: 2m
        labels:
          severity: page
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.'

      # Fires when `node_load1` has stayed above 0.5 for two minutes.
      - alert: high_load
        expr: node_load1 > 0.5
        for: 2m
        labels:
          severity: page
        annotations:
          summary: 'Instance {{ $labels.instance }} under high load'
          description: '{{ $labels.instance }} of job {{ $labels.job }} is under high load.'

94
prometheus/prometheus.yml Normal file
View File

@ -0,0 +1,94 @@
# Global settings: defaults for every scrape job unless a job overrides them.
global:
scrape_interval: 15s # By default, scrape targets every 15 seconds.
evaluation_interval: 15s # By default, evaluate rules every 15 seconds.
# scrape_timeout is set to the global default (10s).
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: 'my-project'
# Rule files, loaded once and evaluated every 'evaluation_interval'.
rule_files:
  - alert.rules
  # - "first.rules"
  # - "second.rules"
# Alertmanager endpoint that Prometheus sends alerts to.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets: ['alertmanager:9093']
# Scrape configurations: one job per monitored component.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: app
    scrape_interval: 5s
    static_configs:
      - targets: ['host.docker.internal:8000']

  # Prometheus scraping itself.
  - job_name: 'prometheus'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'cadvisor'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    # `tasks.<service>` DNS names only exist under Docker Swarm. With plain
    # Compose the service is reachable by its service name, so a static
    # target is used here.
    static_configs:
      - targets: ['cadvisor:8080']
    # Swarm alternative (one target per task):
    # dns_sd_configs:
    #   - names:
    #       - 'tasks.cadvisor'
    #     type: 'A'
    #     port: 8080

  - job_name: 'node-exporter'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    # Static target for the same reason as the cadvisor job.
    static_configs:
      - targets: ['node-exporter:9100']
    # Swarm alternative (one target per task):
    # dns_sd_configs:
    #   - names:
    #       - 'tasks.node-exporter'
    #     type: 'A'
    #     port: 9100

  # - job_name: 'pushgateway'
  #   scrape_interval: 10s
  #   dns_sd_configs:
  #     - names:
  #         - 'tasks.pushgateway'
  #       type: 'A'
  #       port: 9091

  # NOTE(review): the jobs below use fixed IP addresses — confirm they are
  # reachable from the Prometheus container's networks.
  - job_name: 'traefik-app'
    scrape_interval: 5s
    static_configs:
      - targets: ['10.0.4.29:8181']

  - job_name: 'keycloak-app'
    scrape_interval: 5s
    static_configs:
      - targets: ['10.0.4.29:8282']

  - job_name: 'airflow'
    # Override the global default and scrape targets from this job every 5 seconds.
    scrape_interval: 5s
    static_configs:
      - targets:
          - '10.12.1.14:9102'