seansu4you87 · June 17, 2020 01:37
diff --git a/chaos.rb b/chaos.rb
 describe "Replica Reads Resilience" do
  # Counts one row on each of two queries (ids 1 and 2) before a pod failure is
  # created on "vttablet-shard1-replica2", while it is in place, and after it
  # has been healed. Every check expects a count of exactly 1.
  it "can survive replica failure" do
    queries = [User.where(id: 1), User.where(id: 2)]
    # Same pair of assertions is repeated at each stage of the scenario.
    verify_reads = -> { queries.each { |q| expect(q.count).to eq(1) } }

    verify_reads.call

    # Inject the failure and wait for the returned task to finish.
    pod_failure(:create, "vttablet-shard1-replica2").join
    verify_reads.call

    # Undo the failure and wait again before the final check.
    pod_failure(:heal, "vttablet-shard1-replica2").join
    verify_reads.call
  end
 end

 describe "MySQL SSR" do
  context "single region" do
    # Partitions "vttablet-shard1-replica1" from the other two replica pods, one
    # at a time, and checks that writes keep succeeding until it has been cut off
    # from both of them.
    it "can survive SSR replica failure up to a certain point" do
      # NOTE(yu): Topology
      #
      # - 1 single region
      # - 3 SSR (one of which is master)
      # - requires 1 SSR ack to ack a write

      # Works since both SSRs are in the network
      User.create!(id: 1)

      pod_partition(:create, "vttablet-shard1-replica1", "vttablet-shard1-replica2").join

      # Works since vttablet-shard1-replica3 is still in the network
      User.create!(id: 3)

      pod_partition(:create, "vttablet-shard1-replica1", "vttablet-shard1-replica3").join

      # Fails since no SSRs are in the network!
      # A fresh id is used on purpose: id 3 was already created above, so
      # re-inserting it could fail on the duplicate key instead of on the
      # missing SSR ack, and the example would assert the wrong failure.
      # NOTE(review): id 5 presumably routes to shard1 like ids 1 and 3 -- confirm.
      expect { User.create!(id: 5) }.to raise_error(ActiveRecord::Timeout)
    end
  end

  context "multi region" do
    # Pending example: no body has been written for the multi-region case yet.
    it "can survive SSR replica failure up to a certain point"
  end
 end

 describe "Etcd" do
  context "local" do
    # Partitions "vtgate2" from the three local etcd pods, then changes the
    # shard1 replica set and master; afterwards only the "vtgate1" connection
    # is expected to keep answering.
    it "can tolerate local Etcd failure until topology is too stale" do
      healthy_gate = Connection.new("vtgate1")
      isolated_gate = Connection.new("vtgate2")

      user_query = User.where(id: 1)

      # Queries work for both vtgates
      [healthy_gate, isolated_gate].each do |gate|
        expect(user_query.using(gate).count).to eq(1)
      end

      # Start every partition first, then wait for all of them.
      partitions = 1.upto(3).map do |n|
        pod_partition(:create, "vtgate2", "local_etcd#{n}")
      end
      partitions.each(&:join)

      # Queries still work for both vtgates
      [healthy_gate, isolated_gate].each do |gate|
        expect(user_query.using(gate).count).to eq(1)
      end

      # Add three new replicas to shard1 (all started before any is awaited).
      additions = %w[replica4 replica5 replica6].map do |replica|
        Vitess.add_replica("shard1", replica)
      end
      additions.each(&:join)

      # Kill the three original replicas, in the order 3, 2, 1.
      removals = %w[replica3 replica2 replica1].map do |replica|
        Vitess.kill_replica("shard1", replica)
      end
      removals.each(&:join)

      Vitess.elect_master("shard1", "replica4").join

      # Queries for the isolated vtgate stop working because Topology is too stale now
      expect(user_query.using(healthy_gate).count).to eq(1)
      expect { user_query.using(isolated_gate).count }.to raise_error(ActiveRecord::Timeout)
    end
  end

  context "global" do
    # Partitions every ordered pair of distinct global etcd pods from each other
    # and checks that the read still returns its row.
    it "can tolerate global Etcd not having a quorum" do
      user_query = User.where(id: 1)

      expect(user_query.count).to eq(1)

      # Ordered pairs (i, j) with i != j, generated in the same i-major order
      # as a nested loop over 1..3.
      partitions = (1..3).flat_map do |i|
        (1..3).reject { |j| j == i }.map do |j|
          pod_partition(:create, "global_etcd#{i}", "global_etcd#{j}")
        end
      end
      partitions.each(&:join)

      # Query should still work since we are not dependent on global Etcd for live writes
      expect(user_query.count).to eq(1)
    end
  end
 end
	describe "Replica Reads Resilience" do
	  # Checks that the queries for ids 1 and 2 each count exactly one row
	  # before, during and after a pod failure on "vttablet-shard1-replica2".
	  it "can survive replica failure" do
	    first_shard_rows = User.where(id: 1)
	    second_shard_rows = User.where(id: 2)

	    # Baseline, before any failure is injected.
	    expect(first_shard_rows.count).to eq(1)
	    expect(second_shard_rows.count).to eq(1)

	    # Create the pod failure and wait for the returned task.
	    pod_failure(:create, "vttablet-shard1-replica2").join

	    expect(first_shard_rows.count).to eq(1)
	    expect(second_shard_rows.count).to eq(1)

	    # Heal the pod failure and wait for the returned task.
	    pod_failure(:heal, "vttablet-shard1-replica2").join

	    expect(first_shard_rows.count).to eq(1)
	    expect(second_shard_rows.count).to eq(1)
	  end
	end

	describe "MySQL SSR" do
	  context "single region" do
	    # Cuts "vttablet-shard1-replica1" off from the other two replica pods in
	    # two steps; writes are expected to succeed after the first partition and
	    # to time out after the second.
	    it "can survive SSR replica failure up to a certain point" do
	      # NOTE(yu): Topology
	      #
	      # - 1 single region
	      # - 3 SSR (one of which is master)
	      # - requires 1 SSR ack to ack a write

	      # Works since both SSRs are in the network
	      User.create!(id: 1)

	      task = pod_partition :create, "vttablet-shard1-replica1", "vttablet-shard1-replica2"
	      task.join

	      # Works since vttablet-shard1-replica3 is still in the network
	      User.create!(id: 3)

	      task = pod_partition :create, "vttablet-shard1-replica1", "vttablet-shard1-replica3"
	      task.join

	      # Fails since no SSRs are in the network!
	      # Uses an id that has not been inserted yet: id 3 already exists at this
	      # point, so re-creating it could raise a duplicate-key error rather than
	      # the timeout this example is meant to observe.
	      # NOTE(review): id 5 presumably routes to shard1 like ids 1 and 3 -- confirm.
	      expect { User.create!(id: 5) }.to raise_error(ActiveRecord::Timeout)
	    end
	  end

	  context "multi region" do
	    # Pending example: no body has been written for the multi-region case yet.
	    it "can survive SSR replica failure up to a certain point"
	  end
	end

	describe "Etcd" do
	  context "local" do
	    # Partitions "vtgate2" from the three local etcd pods, then replaces the
	    # shard1 replicas and elects a new master; after that only the "vtgate1"
	    # connection is expected to keep serving the query.
	    it "can tolerate local Etcd failure until topology is too stale" do
	      good_conn = Connection.new("vtgate1")
	      fail_conn = Connection.new("vtgate2")

	      query = User.where(id: 1)

	      # Queries work for both vtgates
	      expect(query.using(good_conn).count).to eq(1)
	      expect(query.using(fail_conn).count).to eq(1)

	      # Block parameters use plain pipes; the escaped `\|i\|` form is not valid Ruby.
	      tasks = (1..3).map { |i| pod_partition :create, "vtgate2", "local_etcd#{i}" }
	      tasks.each(&:join)

	      # Queries still work for both vtgates
	      expect(query.using(good_conn).count).to eq(1)
	      expect(query.using(fail_conn).count).to eq(1)

	      # Add three replicas to shard1; all are started before any is awaited.
	      add_topo_tasks = []
	      add_topo_tasks << Vitess.add_replica("shard1", "replica4")
	      add_topo_tasks << Vitess.add_replica("shard1", "replica5")
	      add_topo_tasks << Vitess.add_replica("shard1", "replica6")
	      add_topo_tasks.each(&:join)

	      # Kill the three original replicas of shard1.
	      kill_topo_tasks = []
	      kill_topo_tasks << Vitess.kill_replica("shard1", "replica3")
	      kill_topo_tasks << Vitess.kill_replica("shard1", "replica2")
	      kill_topo_tasks << Vitess.kill_replica("shard1", "replica1")
	      kill_topo_tasks.each(&:join)

	      election = Vitess.elect_master("shard1", "replica4")
	      election.join

	      # Queries for `fail_conn` stop working because Topology is too stale now
	      expect(query.using(good_conn).count).to eq(1)
	      expect { query.using(fail_conn).count }.to raise_error(ActiveRecord::Timeout)
	    end
	  end

	  context "global" do
	    # Partitions every ordered pair of distinct global etcd pods and checks
	    # that the read still returns its row.
	    it "can tolerate global Etcd not having a quorum" do
	      query = User.where(id: 1)

	      expect(query.count).to eq(1)

	      # Collect one partition task per ordered pair (i, j) with i != j.
	      tasks = (1..3).each_with_object([]) do |i, acc|
	        (1..3).each do |j|
	          next if i == j
	          acc << (pod_partition :create, "global_etcd#{i}", "global_etcd#{j}")
	        end
	      end
	      tasks.each(&:join)

	      # Query should still work since we are not dependent on global Etcd for live writes
	      expect(query.count).to eq(1)
	    end
	  end
	end