Skip to content

Instantly share code, notes, and snippets.

@dobesv
Last active May 25, 2021 07:46
Show Gist options
  • Save dobesv/be5aa3e6e5830e54c0e77b73884333cc to your computer and use it in GitHub Desktop.
Save dobesv/be5aa3e6e5830e54c0e77b73884333cc to your computer and use it in GitHub Desktop.
# ZooKeeper image with JVM DNS caching disabled and the Prometheus JMX
# exporter attached as a Java agent.
FROM zookeeper:3.5.6

# Disable Java's built-in DNS cache by un-commenting the
# networkaddress.cache.ttl setting and forcing it to 0.
# The trailing grep makes the build fail loudly if the sed pattern did not
# match (e.g. after a base-image change) instead of silently leaving the
# cache enabled.
RUN sed -i 's/#networkaddress\.cache\.ttl=-1$/networkaddress.cache.ttl=0/' "${JAVA_HOME}/lib/security/java.security" \
    && grep -q '^networkaddress\.cache\.ttl=0$' "${JAVA_HOME}/lib/security/java.security"

# Enable prometheus stats export: ship the exporter rules and download the
# agent jar next to them.
COPY zookeeper-jmx-exporter-rules.yaml /opt/jmx-exporter/zookeeper-jmx-exporter-rules.yaml
RUN mkdir -p /opt/jmx-exporter && wget -O /opt/jmx-exporter/jmx-exporter.jar "https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.12.0/jmx_prometheus_javaagent-0.12.0.jar"

# Agent argument format is <port>:<rules file>; metrics are served on 9101.
# Uses the key=value form of ENV; the space-separated form is legacy syntax.
ENV SERVER_JVMFLAGS="-javaagent:/opt/jmx-exporter/jmx-exporter.jar=9101:/opt/jmx-exporter/zookeeper-jmx-exporter-rules.yaml"
---
# Kustomize overlay for the zone "e" ZooKeeper node: pulls in the shared node
# base, applies the zone-specific JSON 6902 patches and suffixes resource
# names with "-e".
resources:
  - ../../../base/node

# NOTE(review): the StatefulSet target uses the suffixed name (zk-e) while the
# Service target uses the unsuffixed name (zk) - confirm both targets resolve
# with the kustomize version in use.
patchesJson6902:
  - path: zk-e.statefulset.patch.yaml
    target:
      group: apps
      version: v1
      kind: StatefulSet
      namespace: zookeeper
      name: zk-e
  - path: zk-e.service.patch.yaml
    target:
      version: v1
      kind: Service
      namespace: zookeeper
      name: zk

nameSuffix: '-e'
---
# JSON Patch (RFC 6902) operations for the zone "a" Service.

# Label the Service itself with its zone.
- op: add
  path: /metadata/labels/zone
  value: 'a'

# Narrow the Service's pod selector to pods labelled zone=a.
- op: add
  path: /spec/selector/zone
  value: 'a'
---
# JSON Patch (RFC 6902) operations specialising the StatefulSet for zone "a"
# (ZooKeeper server id 1).

# Only schedule onto nodes in us-east-1a.
# NOTE(review): failure-domain.beta.kubernetes.io/zone is the deprecated
# spelling of topology.kubernetes.io/zone - confirm which label the cluster's
# nodes carry before switching.
- op: add
  path: /spec/template/spec/affinity/nodeAffinity
  value:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: failure-domain.beta.kubernetes.io/zone
              operator: In
              values:
                - us-east-1a

# Ensemble membership; this node's own entry (server.1) binds to 0.0.0.0.
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: ZOO_SERVERS
    value: 'server.1=0.0.0.0:2888:3888;2181 server.2=zk-c.zookeeper.svc:2888:3888;2181 server.3=zk-e.zookeeper.svc:2888:3888;2181'

# Server id matching the server.1 entry above.
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: ZOO_MY_ID
    value: '1'

# Zone label on the StatefulSet, its selector and its pod template.
- op: add
  path: /metadata/labels/zone
  value: 'a'
- op: add
  path: /spec/selector/matchLabels/zone
  value: 'a'
- op: add
  path: /spec/template/metadata/labels/zone
  value: 'a'

# Point both volume claim templates (index 0 and 1) at the gp2-a storage class.
- op: replace
  path: /spec/volumeClaimTemplates/0/spec/storageClassName
  value: gp2-a
- op: replace
  path: /spec/volumeClaimTemplates/1/spec/storageClassName
  value: gp2-a
---
# JSON Patch (RFC 6902) operations for the zone "c" Service.

# Label the Service itself with its zone.
- op: add
  path: /metadata/labels/zone
  value: 'c'

# Narrow the Service's pod selector to pods labelled zone=c.
- op: add
  path: /spec/selector/zone
  value: 'c'
---
# JSON Patch (RFC 6902) operations specialising the StatefulSet for zone "c"
# (ZooKeeper server id 2).

# Only schedule onto nodes in us-east-1c.
# NOTE(review): failure-domain.beta.kubernetes.io/zone is the deprecated
# spelling of topology.kubernetes.io/zone - confirm which label the cluster's
# nodes carry before switching.
- op: add
  path: /spec/template/spec/affinity/nodeAffinity
  value:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: failure-domain.beta.kubernetes.io/zone
              operator: In
              values:
                - us-east-1c

# Ensemble membership; this node's own entry (server.2) binds to 0.0.0.0.
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: ZOO_SERVERS
    value: 'server.1=zk-a.zookeeper.svc:2888:3888;2181 server.2=0.0.0.0:2888:3888;2181 server.3=zk-e.zookeeper.svc:2888:3888;2181'

# Server id matching the server.2 entry above.
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: ZOO_MY_ID
    value: '2'

# Zone label on the StatefulSet, its selector and its pod template.
- op: add
  path: /metadata/labels/zone
  value: 'c'
- op: add
  path: /spec/selector/matchLabels/zone
  value: 'c'
- op: add
  path: /spec/template/metadata/labels/zone
  value: 'c'

# Point both volume claim templates (index 0 and 1) at the gp2-c storage class.
- op: replace
  path: /spec/volumeClaimTemplates/0/spec/storageClassName
  value: gp2-c
- op: replace
  path: /spec/volumeClaimTemplates/1/spec/storageClassName
  value: gp2-c
---
# Service exposing only the ZooKeeper client port (2181) of every pod
# labelled app=zk.
apiVersion: v1
kind: Service
metadata:
  name: zk-cs
  namespace: zookeeper
  labels:
    app: zk
spec:
  selector:
    app: zk
  ports:
    - name: client
      port: 2181
---
# JSON Patch (RFC 6902) operations for the zone "e" Service
# (zk-e.service.patch.yaml in the overlay's kustomization).

# Label the Service itself with its zone.
- op: add
  path: /metadata/labels/zone
  value: 'e'

# Narrow the Service's pod selector to pods labelled zone=e.
- op: add
  path: /spec/selector/zone
  value: 'e'
---
# JSON Patch (RFC 6902) operations specialising the StatefulSet for zone "e"
# (ZooKeeper server id 3).

# Only schedule onto nodes in us-east-1e.
# NOTE(review): failure-domain.beta.kubernetes.io/zone is the deprecated
# spelling of topology.kubernetes.io/zone - confirm which label the cluster's
# nodes carry before switching.
- op: add
  path: /spec/template/spec/affinity/nodeAffinity
  value:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:
            - key: failure-domain.beta.kubernetes.io/zone
              operator: In
              values:
                - us-east-1e

# Ensemble membership; this node's own entry (server.3) binds to 0.0.0.0.
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: ZOO_SERVERS
    value: 'server.1=zk-a.zookeeper.svc:2888:3888;2181 server.2=zk-c.zookeeper.svc:2888:3888;2181 server.3=0.0.0.0:2888:3888;2181'

# Server id matching the server.3 entry above.
- op: add
  path: /spec/template/spec/containers/0/env/-
  value:
    name: ZOO_MY_ID
    value: '3'

# Zone label on the StatefulSet, its selector and its pod template.
- op: add
  path: /metadata/labels/zone
  value: 'e'
- op: add
  path: /spec/selector/matchLabels/zone
  value: 'e'
- op: add
  path: /spec/template/metadata/labels/zone
  value: 'e'

# Point both volume claim templates (index 0 and 1) at the gp2-e storage class.
- op: replace
  path: /spec/volumeClaimTemplates/0/spec/storageClassName
  value: gp2-e
- op: replace
  path: /spec/volumeClaimTemplates/1/spec/storageClassName
  value: gp2-e
---
# Headless Service (clusterIP: None) covering the quorum, leader-election and
# metrics ports of every pod labelled app=zk. The StatefulSet references it
# as its serviceName.
apiVersion: v1
kind: Service
metadata:
  name: zk-hs
  namespace: zookeeper
  labels:
    app: zk
    audience: internal
spec:
  clusterIP: None
  selector:
    app: zk
  ports:
    - name: server
      port: 2888
    - name: leader-election
      port: 3888
    - name: prom-metrics
      port: 9101
---
# Headless Service exposing only the metrics port (9101) of the app=zk pods.
# Its component=prom-metrics label is what the ServiceMonitor selects on.
apiVersion: v1
kind: Service
metadata:
  name: zk-prom-metrics
  namespace: zookeeper
  labels:
    app: zk
    component: prom-metrics
spec:
  clusterIP: None
  selector:
    app: zk
  ports:
    - name: prom-metrics
      port: 9101
---
# Allow at most one app=zk pod to be voluntarily disrupted at a time.
# NOTE(review): policy/v1beta1 is the legacy PodDisruptionBudget API group
# version; move to policy/v1 once the target cluster serves it.
apiVersion: policy/v1beta1
kind: PodDisruptionBudget
metadata:
  name: zk-pdb
  namespace: zookeeper
spec:
  maxUnavailable: 1
  selector:
    matchLabels:
      app: zk
---
# Prometheus Operator alerting rules for the ZooKeeper ensemble.
# All alerts are severity=warning; the heap alert is scoped to the
# zk-prom-metrics scrape job.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: zk-rules
  namespace: prometheus
  labels:
    prometheus: k8s
    role: alert-rules
spec:
  groups:
    - name: zk.rules
      rules:
        # Heap usage above 90% of the JVM maximum for 10 minutes.
        - alert: ZooKeeperMemoryLow
          expr: (jvm_memory_bytes_used{job="zk-prom-metrics", area="heap"} / jvm_memory_bytes_max{job="zk-prom-metrics", area="heap"})*100 > 90
          for: 10m
          labels:
            severity: warning
          annotations:
            description: '{{$labels.pod}} is using {{$value}}% of available heap memory'
            summary: ZooKeeper memory low

        # A pod whose zookeeper_Leader series sum to zero for 5 minutes.
        - alert: ZooKeeperNoLeader
          expr: sum(zookeeper_Leader) by (pod) == 0
          for: 5m
          labels:
            severity: warning
          annotations:
            description: 'ZooKeeper node has no leader selected'
            summary: ZooKeeper node has no leader

        # Worst average request latency across the ensemble.
        # ZooKeeper reports AvgRequestLatency in milliseconds, not ticks.
        - alert: ZooKeeperHighAvgRequestLatency
          expr: max(zookeeper_AvgRequestLatency) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            description: 'ZooKeeper request latency high ({{$value}} ms)'
            summary: ZooKeeper request latency high

        # Total queued requests across the ensemble.
        - alert: ZooKeeperHighOutstandingRequests
          expr: sum(zookeeper_OutstandingRequests) > 10
          for: 1m
          labels:
            severity: warning
          annotations:
            description: 'ZooKeeper request backlog is {{$value}}'
            summary: ZooKeeper request backlog high

        # A node reporting that it is not part of the ensemble.
        # The summary previously repeated the backlog alert's text.
        - alert: ZooKeeperNodeNotInEnsemble
          expr: zookeeper_PartOfEnsemble == 0
          for: 1m
          labels:
            severity: warning
          annotations:
            description: '{{$labels.pod}} not in ZooKeeper ensemble'
            summary: ZooKeeper node not in ensemble
---
# Service exposing the client, quorum, leader-election and metrics ports of
# the app=zk pods.
apiVersion: v1
kind: Service
metadata:
  name: zk
  namespace: zookeeper
  labels:
    app: zk
spec:
  selector:
    app: zk
  ports:
    - name: client
      port: 2181
    - name: server
      port: 2888
    - name: leader-election
      port: 3888
    - name: prom-metrics
      port: 9101
---
# Prometheus Operator ServiceMonitor: scrape /metrics every 30s on the
# prom-metrics port of Services in the zookeeper namespace that are labelled
# app=zk and component=prom-metrics.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: zk
  namespace: prometheus
  labels:
    app: zk
spec:
  namespaceSelector:
    matchNames:
      - zookeeper
  selector:
    matchLabels:
      app: zk
      component: prom-metrics
  endpoints:
    - port: prom-metrics
      path: /metrics
      interval: 30s
---
# Base single-replica ZooKeeper StatefulSet. Zone-specific settings
# (node affinity, ZOO_SERVERS, ZOO_MY_ID, zone labels, storage classes) are
# layered on by per-zone JSON patches.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: zk
  namespace: zookeeper
  labels:
    app: zk
spec:
  serviceName: zk-hs
  replicas: 1
  podManagementPolicy: OrderedReady
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app: zk
  template:
    metadata:
      labels:
        app: zk
    spec:
      subdomain: zk-hs
      # Pod-level security context (fsGroup is only valid at pod level).
      securityContext:
        runAsUser: 1000
        fsGroup: 1000
      affinity:
        # Never co-locate two app=zk pods on the same host.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - topologyKey: kubernetes.io/hostname
              labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - zk
      containers:
        - name: zookeeper
          image: 555337501170.dkr.ecr.us-east-1.amazonaws.com/zookeeper:2020-03-05-79cdac47
          resources:
            requests:
              memory: 1Gi
              cpu: '0.5'
          env:
            - name: ZOO_TICK_TIME
              value: '2000'
            - name: ZOO_INIT_LIMIT
              value: '10'
            - name: ZOO_SYNC_LIMIT
              value: '5'
            - name: ZOO_MAX_CLIENT_CNXNS
              value: '60'
            - name: ZOO_AUTOPURGE_PURGEINTERVAL
              value: '12'
            - name: ZOO_AUTOPURGE_SNAPRETAINCOUNT
              value: '3'
            - name: ZK_SERVER_HEAP
              value: '1000'
            # The exec probes below send "ruok", so it must stay whitelisted.
            - name: ZOO_4LW_COMMANDS_WHITELIST
              value: srvr,ruok
            - name: ZOO_LOG4J_PROP
              value: WARN,CONSOLE
          ports:
            - name: client
              containerPort: 2181
            - name: server
              containerPort: 2888
            - name: leader-election
              containerPort: 3888
            - name: prom-metrics
              containerPort: 9101
          # Both probes send "ruok" to the client port and expect "imok".
          # Liveness tolerates 5 consecutive failures, readiness only 1.
          livenessProbe:
            exec:
              command:
                - /bin/bash
                - '-c'
                - 'echo "ruok" | nc -w 2 -q 2 localhost 2181 | grep imok'
            initialDelaySeconds: 60
            periodSeconds: 60
            timeoutSeconds: 10
            failureThreshold: 5
            successThreshold: 1
          readinessProbe:
            exec:
              command:
                - /bin/bash
                - '-c'
                - 'echo "ruok" | nc -w 2 -q 2 localhost 2181 | grep imok'
            initialDelaySeconds: 60
            periodSeconds: 60
            timeoutSeconds: 10
            failureThreshold: 1
            successThreshold: 1
          volumeMounts:
            - name: datadir
              mountPath: /data
            - name: datalog
              mountPath: /datalog
  # Index 0 = datadir, index 1 = datalog; the zone patches replace the
  # storageClassName of both by index, so keep this order.
  volumeClaimTemplates:
    - metadata:
        name: datadir
      spec:
        storageClassName: gp2
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
    - metadata:
        name: datalog
      spec:
        storageClassName: gp2
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
---
# Prometheus JMX exporter rules mapping ZooKeeper MBean attributes to
# zookeeper_* metrics. Capture groups from each pattern are referenced as $N
# in the metric name and labels.
# Patterns are single-quoted so the regex backslashes are literal.
rules:
  # replicated Zookeeper
  - pattern: 'org.apache.ZooKeeperService<name0=ReplicatedServer_id(\d+)><>(\w+)'
    name: 'zookeeper_$2'
  - pattern: 'org.apache.ZooKeeperService<name0=ReplicatedServer_id(\d+), name1=replica.(\d+)><>(\w+)'
    name: 'zookeeper_$3'
    labels:
      replicaId: '$2'
  - pattern: 'org.apache.ZooKeeperService<name0=ReplicatedServer_id(\d+), name1=replica.(\d+), name2=(\w+)><>(\w+)'
    name: 'zookeeper_$4'
    labels:
      replicaId: '$2'
      memberType: '$3'
  - pattern: 'org.apache.ZooKeeperService<name0=ReplicatedServer_id(\d+), name1=replica.(\d+), name2=(\w+), name3=(\w+)><>(\w+)'
    name: 'zookeeper_$4_$5'
    labels:
      replicaId: '$2'
      memberType: '$3'
  # standalone Zookeeper
  - pattern: 'org.apache.ZooKeeperService<name0=StandaloneServer_port(\d+)><>(\w+)'
    name: 'zookeeper_$2'
  - pattern: 'org.apache.ZooKeeperService<name0=StandaloneServer_port(\d+), name1=InMemoryDataTree><>(\w+)'
    name: 'zookeeper_$2'
---
# Namespace that holds the ZooKeeper Services, StatefulSet and
# PodDisruptionBudget.
apiVersion: v1
kind: Namespace
metadata:
  name: zookeeper
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment