Skip to content

Instantly share code, notes, and snippets.

@lostsnow
Last active July 12, 2018 01:48
Show Gist options
  • Save lostsnow/8c7a96933cae686e9f074d66ac0c6178 to your computer and use it in GitHub Desktop.
Save lostsnow/8c7a96933cae686e9f074d66ac0c6178 to your computer and use it in GitHub Desktop.
Grafana & Prometheus

Grafana & Prometheus

Grafana

# Register the Grafana yum repository.
# The stanza goes into its own grafana.repo file: the Prometheus section
# further down writes /etc/yum.repos.d/prometheus.repo with `>` and would
# otherwise overwrite this definition.
# The heredoc delimiter is unquoted, so \$basearch is escaped to reach the
# file as a literal $basearch for yum to expand.
cat > /etc/yum.repos.d/grafana.repo <<-GRAFANA
[grafana]
name=grafana
baseurl=https://packagecloud.io/grafana/stable/el/7/\$basearch
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://packagecloud.io/gpg.key https://grafanarel.s3.amazonaws.com/RPM-GPG-KEY-grafana
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
GRAFANA

# -y keeps the install non-interactive, like the other yum calls in these notes.
yum install -y grafana

plugins

# Install the extra panel plugins, one grafana-cli invocation per plugin id.
for plugin in \
    grafana-piechart-panel \
    yesoreyeram-boomtable-panel \
    neocat-cal-heatmap-panel \
    petrslavotinek-carpetplot-panel
do
    grafana-cli plugins install "${plugin}"
done

service

# Register grafana-server for boot and start it now, then print its status
# with full (non-ellipsized) lines.
for verb in enable start; do
    systemctl "${verb}" grafana-server.service
done
systemctl status grafana-server.service -l

Dashboards

https://github.com/lostsnow/grafana-dashboards

Prometheus

# Register the prometheus-rpm yum repository.
# The heredoc delimiter is unquoted, so \$basearch is escaped to reach the
# file as a literal $basearch for yum to expand. Two GPG key URLs are listed
# on the gpgkey line, separated by a space.
cat > /etc/yum.repos.d/prometheus.repo <<-PROMETHEUS
[prometheus]
name=prometheus
baseurl=https://packagecloud.io/prometheus-rpm/release/el/7/\$basearch
repo_gpgcheck=1
enabled=1
gpgkey=https://packagecloud.io/prometheus-rpm/release/gpgkey https://raw.githubusercontent.com/lest/prometheus-rpm/master/RPM-GPG-KEY-prometheus-rpm
gpgcheck=1
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
metadata_expire=300
PROMETHEUS

# Install the Prometheus 2.x server package and Alertmanager without prompting.
yum install -y prometheus2 alertmanager

service

# Enable at boot, start, and show the status of the daemons installed by
# `yum install -y prometheus2 alertmanager`. This section must manage those
# units, not grafana-server, or Prometheus and Alertmanager never start.
# NOTE(review): unit names are assumed to be prometheus.service and
# alertmanager.service as shipped by the prometheus-rpm packages - confirm
# with `systemctl list-unit-files | grep -E 'prometheus|alertmanager'`.
for unit in prometheus.service alertmanager.service; do
    systemctl enable "${unit}"
    systemctl start "${unit}"
    systemctl status "${unit}" -l
done

exporter

# Address of eth0 that every exporter below binds to / publishes on.
# Only lines whose first field is exactly "inet" are accepted and awk stops
# after the first hit: a plain `grep 'inet'` also matches the inet6 line and
# would leave two values in ${lanip}, breaking every "${lanip}:port" below.
lanip=$(ifconfig eth0 | awk '$1 == "inet" {print $2; exit}')
# Host name, used in the compose files as scrape target and extra_hosts entry.
hname=$(hostname)
src_dir=/opt/soft
mkdir -p "${src_dir}"
# Parent directory holding one docker-compose project per containerised exporter.
docker_compose_dir=/opt/projects/deploy/docker/prometheus


# Exporters that are installed from the yum repository rather than run in Docker.
yum install -y node_exporter mysqld_exporter blackbox_exporter


# node_exporter
# Write the options file so the exporter listens on ${lanip}:13100, then echo
# the file back for a visual check.
# NOTE(review): presumably the packaged systemd unit reads
# /etc/default/node_exporter - confirm against the installed unit file.
printf 'NODE_EXPORTER_OPTS="--web.listen-address=%s:13100"\n' "${lanip}" > /etc/default/node_exporter
cat /etc/default/node_exporter

# Enable at boot and start, then print the full status.
for verb in enable start; do
    systemctl "${verb}" node_exporter.service
done
systemctl status node_exporter.service -l


# mysqld_exporter
# Create the monitoring account. These are SQL statements, so they are fed to
# the mysql client rather than left for the shell to execute. The quoted
# heredoc delimiter stops the shell from expanding `*` or `$` inside the SQL.
# NOTE(review): assumes `mysql` can log in with admin rights without extra
# flags (e.g. credentials in ~/.my.cnf) - add -u/-p as needed.
# Replace 'pass' here AND in DATA_SOURCE_NAME below with a real secret.
mysql <<'SQL'
CREATE USER 'exporter'@'localhost' IDENTIFIED BY 'pass' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'localhost';
SQL

# First line truncates the options file (>), second line appends to it (>>).
# NOTE(review): the single-dash flags match older mysqld_exporter builds; newer
# releases expect --web.listen-address / --config.my-cnf - confirm against the
# installed version.
echo "DATA_SOURCE_NAME=\"exporter:pass@(localhost:3306)/\"" > /etc/default/mysqld_exporter
echo "MYSQLD_EXPORTER_OPTS=\"-web.listen-address=${lanip}:13104 -config.my-cnf=/etc/my.cnf\"" >> /etc/default/mysqld_exporter
cat /etc/default/mysqld_exporter

systemctl enable mysqld_exporter.service
systemctl start mysqld_exporter.service
systemctl status mysqld_exporter.service -l


# nginx-exporter
# Generate a compose project for the fish/nginx-exporter image and start it.
# The heredoc delimiter is unquoted, so ${lanip} and ${hname} are expanded by
# the shell when the file is written. <<- strips leading tabs only, so the
# space indentation of the YAML survives.
# Resulting container: container port 9113 is published on ${lanip}:13113,
# ${hname} resolves to ${lanip} inside the container (extra_hosts), and the
# exporter is told to scrape http://${hname}/status_ngx.
mkdir -p ${docker_compose_dir}/nginx
cd ${docker_compose_dir}/nginx
cat > docker-compose.yml <<-COMPOSE_NGX
exporter:
  image: fish/nginx-exporter
  restart: unless-stopped
  ports:
    - "${lanip}:13113:9113"
  extra_hosts:
    - "${hname}:${lanip}"
  command: ["-nginx.scrape_uri=http://${hname}/status_ngx"]
COMPOSE_NGX
# Start the container detached, then list it to confirm it came up.
docker-compose up -d
docker-compose ps


# nginx-request_exporter
# https://github.com/hnlq715/nginx-prometheus-metrics


# phpfpm_exporter
# Generate a compose project for the barwell/phpfpm_exporter image and start it.
# ${lanip} and ${hname} are expanded by the shell while the heredoc is written.
# Resulting container: container port 9253 is published on ${lanip}:13253, and
# the target is passed through the PHPFPM_HOST / PHPFPM_PATH environment
# variables (host ${hname}, path /status_fpm). extra_hosts makes ${hname}
# resolve to ${lanip} inside the container.
mkdir -p ${docker_compose_dir}/php-fpm
cd ${docker_compose_dir}/php-fpm
cat > docker-compose.yml <<-COMPOSE_PHPFPM
exporter:
  image: barwell/phpfpm_exporter
  restart: unless-stopped
  ports:
    - "${lanip}:13253:9253"
  environment:
    - PHPFPM_HOST=${hname}
    - PHPFPM_PATH=/status_fpm
  extra_hosts:
    - "${hname}:${lanip}"
COMPOSE_PHPFPM
# Start the container detached, then list it to confirm it came up.
docker-compose up -d
docker-compose ps


# ssl-exporter
# Generate a compose project for the ribbybibby/ssl-exporter image and start it.
# ${lanip} and ${hname} are expanded by the shell while the heredoc is written.
# Resulting container: container port 9219 is published on ${lanip}:13219 and
# the exporter is launched with the --tls.insecure flag.
# NOTE(review): --tls.insecure presumably disables certificate verification
# when probing - confirm against the exporter's documentation.
mkdir -p ${docker_compose_dir}/ssl
cd ${docker_compose_dir}/ssl
cat > docker-compose.yml <<-COMPOSE_SSL
exporter:
  image: ribbybibby/ssl-exporter
  restart: unless-stopped
  ports:
    - "${lanip}:13219:9219"
  extra_hosts:
    - "${hname}:${lanip}"
  command: ["--tls.insecure"]
COMPOSE_SSL
# Start the container detached, then list it to confirm it came up.
docker-compose up -d
docker-compose ps


# domain_exporter
# Generate a compose project for the quay.io/shift/domain_exporter image.
mkdir -p ${docker_compose_dir}/domain
cd ${docker_compose_dir}/domain

# List of domains to watch; domain.com is a placeholder to be replaced.
cat > domains.yml <<-DOMAINS
domains:
  - domain.com
DOMAINS

# ${lanip} and ${hname} are expanded by the shell while the heredoc is written.
# Resulting container: container port 9203 is published on ${lanip}:13203, the
# domains.yml written above is bind-mounted at /domains.yml, and the exporter
# is pointed at it with --config=/domains.yml.
cat > docker-compose.yml <<-COMPOSE_DOMAIN
exporter:
  image: quay.io/shift/domain_exporter
  restart: unless-stopped
  ports:
    - "${lanip}:13203:9203"
  extra_hosts:
    - "${hname}:${lanip}"
  volumes:
    - ./domains.yml:/domains.yml
  command: ["--config=/domains.yml"]
COMPOSE_DOMAIN
# Start the container detached, then list it to confirm it came up.
docker-compose up -d
docker-compose ps

statup

# Fetch (and update) the statup sources into the Go workspace.
go get -u -v github.com/hunterlong/statup
# `go get` falls back to a default workspace when GOPATH is not exported, in
# which case a bare $GOPATH expands to nothing and the cd lands in /src/...
# Asking the toolchain for the effective value works in both situations.
cd "$(go env GOPATH)/src/github.com/hunterlong/statup/"

# Manual step before building: change this line in the statup source
# met += fmt.Sprintf("statup_service_latency{id=\"%v\" name=\"%v\"} %f\n", v.Id, v.Name, (v.Latency * 100))
# NOTE(review): the note does not say what the line should become. The
# Prometheus text format requires labels to be comma-separated, so presumably
# a comma is needed between the id and name labels - confirm.

go build -v

# Install the freshly built binary into the deployment directory.
mkdir -p /opt/projects/deploy/statup/
mv statup /opt/projects/deploy/statup/

# Install a systemd unit that runs the statup binary as user/group www from
# its deployment directory and restarts it whenever it exits.
cat > /usr/lib/systemd/system/statup.service <<-STATUP
[Unit]
Description=Statup Server
After=network.target
After=systemd-user-sessions.service
After=network-online.target

[Service]
User=www
Group=www
Type=simple
Restart=always
ExecStart=/opt/projects/deploy/statup/statup
WorkingDirectory=/opt/projects/deploy/statup

[Install]
WantedBy=multi-user.target
STATUP

# Make systemd re-read unit files; without this a re-run that rewrites
# statup.service keeps using the previously loaded definition.
systemctl daemon-reload

systemctl enable statup.service
systemctl start statup.service
systemctl status statup.service -l
# Prometheus alerting rules: TLS certificate and domain registration expiry.
# This is a stand-alone rule file (one top-level `groups:` key).
groups:
  - name: Domain
    rules:
      # Remaining certificate lifetime in days (seconds / 86400) below 20.
      # The two group_left joins copy the dnsnames and subject_cn labels onto
      # the result so they can be used in the description.
      - alert: SSLExpiredSoon
        expr: ((((ssl_cert_not_after - time()) * on (instance,issuer_cn,serial_no) group_left (dnsnames) ssl_cert_subject_alternative_dnsnames) * on (instance,issuer_cn,serial_no) group_left (subject_cn) ssl_cert_subject_common_name) / 86400) < 20
        for: 1m
        labels:
          severity: high
        annotations:
          summary: "SSL certificate {{ $labels.instance }} expired soon"
          description: "SSL certificate {{ $labels.instance }} CN: [{{ $labels.subject_cn }}] DNS: [{{ $labels.dnsnames }}] ISSUER: [{{ $labels.issuer_cn }}] will expired in {{ humanize $value }} days"
      # domain_expiration below 30; the description reports the value as days.
      - alert: DomainExpiredSoon
        expr: domain_expiration < 30
        for: 1m
        labels:
          severity: high
        annotations:
          summary: "Domain {{ $labels.domain }} expired soon"
          description: "Domain {{ $labels.domain }} will expired in {{ $value }} days"
# Prometheus alerting rules: MySQL replication/availability and Nginx.
# This is a stand-alone rule file (one top-level `groups:` key).
groups:
  - name: MySQLAlert
    rules:
      # Replica is more than 200 seconds behind its master.
      - alert: MySQLSlaveLag
        expr: mysql_slave_status_seconds_behind_master > 200
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "MySQL Slave lag is too high"
          description: "MySQL Slave {{ $labels.instance }} of job {{ $labels.job }} lag is too high"
      # Replication SQL thread reported as not running.
      - alert: MySQLReplicationSQLThreadStatus
        expr: mysql_slave_status_slave_sql_running == 0
        for: 1m
        labels:
          severity: high
        annotations:
          summary: "MySQL Slave SQL thread stop"
          description: "MySQL Slave {{ $labels.instance }} of job {{ $labels.job }} SQL thread stop"
      # Replication IO thread reported as not running.
      - alert: MySQLReplicationIOThreadStatus
        expr: mysql_slave_status_slave_io_running == 0
        for: 1m
        labels:
          severity: high
        annotations:
          summary: "MySQL Slave IO thread stop"
          description: "MySQL Slave {{ $labels.instance }} of job {{ $labels.job }} IO thread stop"
      # mysql_up has been 0 for 20 seconds.
      - alert: MySQLStatus
        expr: mysql_up == 0
        for: 20s
        labels:
          severity: critical
        annotations:
          summary: "MySQL Process Down"
          description: "MySQL {{ $labels.instance }} of job {{ $labels.job }} Process Down"
  - name: NginxAlert
    rules:
      # nginx_up has been 0 for 20 seconds.
      - alert: NginxStatus
        expr: nginx_up == 0
        for: 20s
        labels:
          severity: critical
        annotations:
          summary: "Nginx Process Down"
          description: "Nginx {{ $labels.instance }} of job {{ $labels.job }} Process Down"
      # More than 50000 processed connections per instance within one minute.
      - alert: NginxConnectionHigh
        expr: (sum(increase(nginx_connections_processed_total[1m])) by (instance)) > 50000
        for: 20s
        labels:
          severity: high
        annotations:
          summary: "Nginx connection high"
          description: "Nginx connection of {{ $labels.instance }} is above 50000 (current value is: {{ $value }})"
# Prometheus alerting rules: scrape-target health and node (host) resources.
# This is a stand-alone rule file (one top-level `groups:` key).
groups:
  - name: Exporter
    rules:
      # Any scrape target whose `up` series has been 0 for a minute.
      - alert: ExporterDown
        expr: up == 0
        for: 1m
        labels:
          severity: high
        annotations:
          summary: "Instance {{ $labels.instance }} exporter down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
  - name: NodeCPU
    rules:
      # CPU busy percentage: 100 * (1 - average idle rate across CPUs) above 90.
      - alert: NodeCPUUsageHigh
        expr: (100 * (1 - avg by(instance)(irate(node_cpu_seconds_total{mode='idle'}[1m])))) > 90
        for: 20s
        labels:
          severity: high
        annotations:
          summary: "Server is using a LOT of CPU"
          description: "{{ $labels.instance }}: CPU usage is {{ humanize $value }}%"
      # 1-minute load divided by the number of CPUs on the instance above 1.5.
      - alert: NodeCPULoadHigh
        expr: ((sum(node_load1) by (instance)) / (count(count(node_cpu_seconds_total) by (instance, cpu)) by (instance))) > 1.5
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Server under high load"
          description: "{{ $labels.instance }}: The avg load 1m is at {{ $value }}"
  - name: NodeMemory
    rules:
      # Used memory (total - free - buffers - cached) as a percentage of total.
      - alert: NodeMemoryUsageHigh
        expr: ((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / node_memory_MemTotal_bytes * 100) > 90
        for: 30s
        labels:
          severity: high
        annotations:
          summary: "High Memory usage detected"
          description: "{{ $labels.instance }}: Memory usage is above 90% (current value is: {{ $value }})"
  - name: NodeDisk
    rules:
      # Available space below 10% on filesystems backed by a /dev/ device.
      - alert: NodeDiskFreeSpaceLow
        expr: ((node_filesystem_avail_bytes{device=~"/dev/.*"} / node_filesystem_size_bytes{device=~"/dev/.*"}) * 100) < 10
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Free Disk Space"
          description: "{{ $labels.instance }}: Free Disk Space of {{ $labels.mountpoint }} is under 10% (current value is: {{ $value }})"
      # Completed reads per second above 2500; the regex only matches device
      # names made of lowercase letters.
      - alert: NodeDiskIOPSReadHigh
        expr: (irate(node_disk_reads_completed_total{device=~"[a-z]*[a-z]"}[1m])) > 2500
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk IOPS Read high"
          description: "{{ $labels.instance }}: Disk IOPS Read of {{ $labels.device }} is above 2500 (current value is: {{ $value }})"
      # Completed writes per second above 1500, same device filter as above.
      - alert: NodeDiskIOPSWriteHigh
        expr: (irate(node_disk_writes_completed_total{device=~"[a-z]*[a-z]"}[1m])) > 1500
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Disk IOPS Write high"
          description: "{{ $labels.instance }}: Disk IOPS Write of {{ $labels.device }} is above 1500 (current value is: {{ $value }})"
  - name: NodeNetWork
    rules:
      # Receive rate on eth*/tun* devices; bytes/s * 8 / 1024 / 1024 gives the
      # Mbps figure compared against 30.
      - alert: NodeNetWorkTrafficInHigh
        expr: (sum(irate(node_network_receive_bytes_total{device=~"eth.*|tun.*"}[1m])) by (instance, device) * 8 / 1024 / 1024) > 30
        for: 20s
        labels:
          severity: high
        annotations:
          summary: "Network Traffic In high"
          description: "{{ $labels.instance }}: Network Traffic In of {{ $labels.device }} is above 30Mbps (current value is: {{ humanize $value }})"
      # Transmit rate on eth*/tun* devices, same unit conversion and threshold.
      - alert: NodeNetWorkTrafficOutHigh
        expr: (sum(irate(node_network_transmit_bytes_total{device=~"eth.*|tun.*"}[1m])) by (instance, device) * 8 / 1024 / 1024) > 30
        for: 20s
        labels:
          severity: high
        annotations:
          summary: "Network Traffic Out high"
          description: "{{ $labels.instance }}: Network Traffic Out of {{ $labels.device }} is above 30Mbps (current value is: {{ humanize $value }})"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment