Created
June 17, 2020 01:37
-
-
Save seansu4you87/8941a16bb23ba117456374b233de3675 to your computer and use it in GitHub Desktop.
Chaos example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Chaos scenario: reads against both shards must keep succeeding while one
# replica pod of shard 1 is failed and again after it has been healed.
describe "Replica Reads Resilience" do
  it "can survive replica failure" do
    shard_queries = [User.where(id: 1), User.where(id: 2)]

    # Every shard query is expected to return exactly one row.
    assert_reads_succeed = lambda do
      shard_queries.each { |shard_query| expect(shard_query.count).to eq(1) }
    end

    # Baseline before any fault is injected.
    assert_reads_succeed.call

    # Inject the pod failure and wait for the task to finish.
    pod_failure(:create, "vttablet-shard1-replica2").join
    assert_reads_succeed.call

    # Heal the pod and wait for the task to finish.
    pod_failure(:heal, "vttablet-shard1-replica2").join
    assert_reads_succeed.call
  end
end
# Chaos scenario: writes must keep succeeding while at least one SSR can still
# ack them, and must time out once the master is partitioned from every SSR.
describe "MySQL SSR" do
  context "single region" do
    it "can survive SSR replica failure up to a certain point" do
      # NOTE(yu): Topology
      #
      # - 1 single region
      # - 3 SSR (one of which is master)
      # - requires 1 SSR ack to ack a write

      # Works since both SSRs are in the network
      User.create!(id: 1)

      task = pod_partition :create, "vttablet-shard1-replica1", "vttablet-shard1-replica2"
      task.join

      # Works since vttablet-shard1-replica3 is still in the network
      User.create!(id: 3)

      task = pod_partition :create, "vttablet-shard1-replica1", "vttablet-shard1-replica3"
      task.join

      # Fails since no SSRs are in the network!
      #
      # The write uses an id that has not been inserted yet. Re-inserting id 3
      # would collide with the row created above, so the statement could be
      # rejected as a duplicate key instead of timing out on the missing ack.
      # NOTE(review): assumes id 5 routes to shard1 like ids 1 and 3 — confirm
      # against the sharding scheme.
      # NOTE(review): ActiveRecord::Timeout is presumably a project-defined
      # error class — confirm it exists.
      expect { User.create!(id: 5) }.to raise_error(ActiveRecord::Timeout)
    end
  end

  context "multi region" do
    # No block given: the example has no body yet.
    it "can survive SSR replica failure up to a certain point"
  end
end
# Chaos scenarios around Etcd: one vtgate cut off from every local Etcd node,
# and the global Etcd nodes partitioned from one another.
describe "Etcd" do
  context "local" do
    it "can tolerate local Etcd failure until topology is too stale" do
      healthy_gate = Connection.new("vtgate1")
      isolated_gate = Connection.new("vtgate2")
      user_lookup = User.where(id: 1)

      # Queries work for both vtgates
      expect(user_lookup.using(healthy_gate).count).to eq(1)
      expect(user_lookup.using(isolated_gate).count).to eq(1)

      # Partition vtgate2 from each local Etcd node; start all tasks, then wait.
      1.upto(3)
       .map { |n| pod_partition(:create, "vtgate2", "local_etcd#{n}") }
       .each(&:join)

      # Queries still work for both vtgates
      expect(user_lookup.using(healthy_gate).count).to eq(1)
      expect(user_lookup.using(isolated_gate).count).to eq(1)

      # Change the shard1 topology: add three replicas, kill the original
      # three, then elect one of the new replicas as master.
      %w[replica4 replica5 replica6]
        .map { |replica| Vitess.add_replica("shard1", replica) }
        .each(&:join)

      %w[replica3 replica2 replica1]
        .map { |replica| Vitess.kill_replica("shard1", replica) }
        .each(&:join)

      Vitess.elect_master("shard1", "replica4").join

      # Queries for `isolated_gate` stop working because Topology is too stale now
      expect(user_lookup.using(healthy_gate).count).to eq(1)
      expect { user_lookup.using(isolated_gate).count }.to raise_error(ActiveRecord::Timeout)
    end
  end

  context "global" do
    it "can tolerate global Etcd not having a quorum" do
      user_lookup = User.where(id: 1)
      expect(user_lookup.count).to eq(1)

      # Partition every ordered pair of distinct global Etcd nodes
      # (1->2, 1->3, 2->1, 2->3, 3->1, 3->2); start all tasks, then wait.
      partitions = [1, 2, 3].permutation(2).map do |from, to|
        pod_partition(:create, "global_etcd#{from}", "global_etcd#{to}")
      end
      partitions.each(&:join)

      # Query should still work since we are not dependent on global Etcd for live writes
      expect(user_lookup.count).to eq(1)
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment