Skip to content

Instantly share code, notes, and snippets.

Revisions

  1. @talawahtech talawahtech revised this gist May 21, 2021. No changes.
  2. @talawahtech talawahtech revised this gist May 21, 2021. No changes.
  3. @talawahtech talawahtech revised this gist May 21, 2021. No changes.
  4. @talawahtech talawahtech created this gist May 20, 2021.
    261 changes: 261 additions & 0 deletions extreme-benchmark-environment.yaml
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,261 @@
    # Template format version understood by CloudFormation.
    AWSTemplateFormatVersion: '2010-09-09'

    # Free-text description attached to the stack.
    Description: >-
      Extreme Performance Tuning Benchmark Environment


    # Stack inputs. Only AmiId and InstanceVolumeSize have defaults; the key pair,
    # security group and subnet must be supplied by whoever launches the stack.
    Parameters:
      # Resolved through SSM at deploy time. The user-data script in this template
      # calls amazon-linux-extras, so any override should remain an Amazon Linux 2 AMI.
      AmiId:
        Description: 'SSM parameter path that resolves to the AMI used by both instances'
        Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
        Default: '/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2'

      InstanceKeyPair:
        Description: 'Name of the EC2 key pair installed on the client and server instances'
        Type: AWS::EC2::KeyPair::KeyName

      InstanceSecurityGroup:
        Description: 'Security group attached to the primary network interface of both instances'
        Type: AWS::EC2::SecurityGroup::Id

      InstanceSubnet:
        Description: 'Subnet that both instances are launched into'
        Type: AWS::EC2::Subnet::Id

      # Bounds match the documented gp3 volume size limits so that a bad value is
      # rejected at parameter validation instead of failing during volume creation.
      InstanceVolumeSize:
        Description: 'Size in GiB of the gp3 root volume of each instance'
        Type: Number
        Default: 8
        MinValue: 1
        MaxValue: 16384


    Resources:
    # Load-generating instance. Everything except the instance type and tags
    # comes from the shared launch template.
    Client:
      Type: AWS::EC2::Instance
      Properties:
        LaunchTemplate:
          LaunchTemplateId: !Ref LaunchTemplate
          Version: !GetAtt LaunchTemplate.LatestVersionNumber
        InstanceType: c5n.4xlarge
        Tags:
          - Key: Name
            Value: extreme-client
          # Used by cloud-init script to conditionally apply changes to only the client or server
          - Key: Role
            Value: client

    # Instance under test. Everything except the instance type and tags
    # comes from the shared launch template.
    Server:
      Type: AWS::EC2::Instance
      Properties:
        LaunchTemplate:
          LaunchTemplateId: !Ref LaunchTemplate
          Version: !GetAtt LaunchTemplate.LatestVersionNumber
        InstanceType: c5n.xlarge
        Tags:
          - Key: Name
            Value: extreme-server
          # Used by cloud-init script to conditionally apply changes to only the client or server
          - Key: Role
            Value: server

    # Placement group with the 'cluster' strategy; the launch template places
    # instances into it via Placement.GroupName.
    ClusterPlacementGroup:
      Type: 'AWS::EC2::PlacementGroup'
      Properties:
        Strategy: 'cluster'

    # Allows 'aws ec2 describe-tags' to be called from the cloud-init script so it can differentiate client from server
    Ec2Role:
      Type: AWS::IAM::Role
      Properties:
        Path: /
        # Trust policy: only the EC2 service may assume this role.
        # Version is stated explicitly, matching the permissions policy below.
        AssumeRolePolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Principal:
                Service: ['ec2.amazonaws.com']
              Action: ['sts:AssumeRole']
        Policies:
          # Named after the single permission it grants (ec2:DescribeTags).
          - PolicyName: 'AllowDescribeTags'
            PolicyDocument:
              Version: '2012-10-17'
              Statement:
                - Effect: Allow
                  Action: ['ec2:DescribeTags']
                  Resource: '*'

    # Instance profile carrying Ec2Role; its ARN is referenced by the launch template.
    Ec2InstanceProfile:
      Type: 'AWS::IAM::InstanceProfile'
      Properties:
        Path: '/'
        Roles:
          - !Ref Ec2Role

    # Launch template shared by the Client and Server instances. It fixes the AMI,
    # key pair, instance profile, placement group, primary network interface and
    # root volume, and carries the multipart cloud-init user data.
    LaunchTemplate:
      Type: AWS::EC2::LaunchTemplate
      Properties:
        LaunchTemplateName: !Ref 'AWS::StackName'
        LaunchTemplateData:
          ImageId: !Ref 'AmiId'
          KeyName: !Ref 'InstanceKeyPair'
          IamInstanceProfile:
            Arn: !GetAtt 'Ec2InstanceProfile.Arn'
          Placement:
            GroupName: !Ref 'ClusterPlacementGroup'
          NetworkInterfaces:
            - DeviceIndex: 0
              Ipv6AddressCount: 0  # Ensure that we don't get assigned any IPv6 addresses, even if it is the default for the subnet
              SubnetId: !Ref 'InstanceSubnet'
              Groups:
                - !Ref 'InstanceSecurityGroup'
          BlockDeviceMappings:
            - DeviceName: '/dev/xvda'
              Ebs:
                VolumeSize: !Ref 'InstanceVolumeSize'
                VolumeType: 'gp3'

          # Two-part MIME archive: a cloud-config document (reboot, bootcmd, packages)
          # followed by a shell script (sysctls, docker, benchmark tooling, grub).
          # The whole body goes through !Sub, so shell-style dollar-brace expansions
          # must be written with the CloudFormation '!' escape; only AWS::Region is
          # substituted by CloudFormation itself.
          UserData:
            Fn::Base64: !Sub |
              Content-Type: multipart/mixed; boundary="==BOUNDARY=="
              MIME-Version: 1.0

              --==BOUNDARY==
              Content-Type: text/cloud-config; charset="us-ascii"
              Content-Disposition: attachment; filename="cloud-config.txt"

              # Automatically reboot after cloud-init completes to apply kernel param changes
              power_state:
                mode: reboot
                message: Rebooting to apply new kernel params
                timeout: 10
                condition: True

              bootcmd:
                # These commands run on every boot, not just the first boot

                #### Disable iptables
                - modprobe -rv ip_tables

                ##### ENA driver configuration. Disable generic receive offloading
                - ethtool -K eth0 gro off

                ##### ENA driver configuration. Enable adaptive IRQ coalescing (server only)
                - export INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
                - echo INSTANCE_ID = ${!INSTANCE_ID}
                - export INSTANCE_ROLE=$(aws ec2 describe-tags --region ${AWS::Region} --filters "Name=resource-id,Values=${!INSTANCE_ID}" "Name=key,Values=Role" --output text | cut -f5)
                - echo INSTANCE_ROLE = ${!INSTANCE_ROLE}

                - if [ "${!INSTANCE_ROLE}" == "server" ]; then ethtool -C eth0 adaptive-rx on; fi
                - if [ "${!INSTANCE_ROLE}" == "server" ]; then ethtool -C eth0 tx-usecs 256; fi

                ##### Disable irqbalance and fix IRQs to cpus. Assumes # of irqs/queues = # of cpus!!!
                ## Note ${!} is the CF escape sequence for the bash equivalent and ${!!} is needed to get a literal ${!}
                ## sleep to give irqbalance time to shutdown before manually setting the values
                - systemctl stop irqbalance.service
                - echo sleeping
                - sleep 5
                - export IRQS=($(grep eth0 /proc/interrupts | awk '{print $1}' | tr -d :))
                - for i in ${!!IRQS[@]}; do echo $i > /proc/irq/${!IRQS[i]}/smp_affinity_list; done;
                - echo irq affinity
                - for i in ${!!IRQS[@]}; do cat /proc/irq/${!IRQS[i]}/smp_affinity_list; done;

                ##### Setup Transmit Packet Steering (XPS) to map queue x to cpu x for outgoing packets. Assumes # of queues = # of cpus!!!
                ## A hex bitmap is used in this case, not the cpu id so we raise 2 to the power of i and convert it to hex
                ## Note ${!} is the CF escape sequence for the bash equivalent and ${!!} is needed to get a literal ${!}
                - export TXQUEUES=($(ls -1qdv /sys/class/net/eth0/queues/tx-*))
                - for i in ${!!TXQUEUES[@]}; do printf '%x' $((2**i)) > ${!TXQUEUES[i]}/xps_cpus; done;
                - echo 'xps_cpus'
                - for i in ${!!TXQUEUES[@]}; do cat ${!TXQUEUES[i]}/xps_cpus; done;

                ## Stop dhclient and set address lifetime to "forever"
                - dhclient -x -pf /var/run/dhclient-eth0.pid
                - dhclient -x -pf /var/run/dhclient6-eth0.pid
                - ip addr change $( ip -4 addr show dev eth0 | grep 'inet' | awk '{ print $2 " brd " $4 " scope global"}') dev eth0 valid_lft forever preferred_lft forever

              packages:
                - git
                - gcc
                - make
                - htop
                - iperf3
                - dstat
                - pcp-system-tools
                - perf
                - iproute-tc

              --==BOUNDARY==
              Content-Type: text/x-shellscript; charset="us-ascii"
              Content-Disposition: attachment; filename="user-data-script.txt"

              #!/bin/bash

              # Configure sysctls
              cat > /etc/sysctl.d/90-extreme.conf <<- EOF
              vm.swappiness=0
              vm.dirty_ratio=80

              net.core.somaxconn=2048
              net.ipv4.tcp_max_syn_backlog=10000

              net.core.busy_poll=1
              net.core.default_qdisc=noqueue
              net.ipv4.tcp_congestion_control=reno
              EOF

              # Reload sysctl to pick up new configs.
              # --system reads /etc/sysctl.d/*.conf; a bare "sysctl -p" only reads /etc/sysctl.conf
              sysctl --system

              # Disable ssm agent. It doesn't really affect throughput, but any network activity can affect p99 and stdev for latency
              systemctl stop amazon-ssm-agent
              systemctl disable amazon-ssm-agent

              # Install docker and stress-ng from amazon-linux-extras
              amazon-linux-extras enable -y docker testing
              yum install -y docker stress-ng

              # Add the ec2-user to the docker group so you can execute Docker commands without using sudo
              usermod -a -G docker ec2-user

              # Configure and start docker with iptables support disabled
              mkdir -p /etc/systemd/system/docker.service.d/
              cat > /etc/systemd/system/docker.service.d/startup_options.conf <<- EOF
              [Service]
              ExecStart=
              ExecStart=/usr/bin/dockerd -H fd:// --bridge=none --iptables=false --ip-forward=false --live-restore
              EOF

              systemctl daemon-reload
              systemctl enable docker
              systemctl start docker

              # Build (t)wrk
              # Note that the luajit-devel package comes from the amazon-linux-extras repo for BCC
              amazon-linux-extras enable BCC
              yum clean metadata
              yum install -y openssl11-devel luajit-devel-2.1.0
              cd /home/ec2-user/
              git clone https://github.com/talawahtech/wrk --single-branch --branch twrk twrk
              cd twrk
              make WITH_LUAJIT=/usr WITH_OPENSSL=/usr CFLAGS="-I /usr/include/luajit-2.1"
              mv twrk /usr/local/bin/
              chown -R ec2-user:ec2-user /home/ec2-user/twrk/

              # Build and run the libreactor (round 20) docker container on the server
              cd /home/ec2-user/
              git clone https://github.com/TechEmpower/FrameworkBenchmarks --branch R20 --single-branch
              chown -R ec2-user:ec2-user /home/ec2-user/FrameworkBenchmarks/

              cd FrameworkBenchmarks/frameworks/C/libreactor/
              docker build . -f libreactor.dockerfile --network host -t libreactor
              docker build . -f libreactor-server.dockerfile --network host -t libreactor-server

              # Install Flamegraph tools
              cd /home/ec2-user/
              git clone https://github.com/brendangregg/FlameGraph
              chown -R ec2-user:ec2-user /home/ec2-user/FlameGraph/

              # Download custom palette.map
              wget -q https://gist.githubusercontent.com/talawahtech/b043e2dbf12af746de06b9b86c1a8b80/raw/ -O palette.map
              chown ec2-user:ec2-user /home/ec2-user/palette.map

              # Download network monitor script
              wget -q https://gist.githubusercontent.com/talawahtech/de78601f1201d9586ac19fff420024b8/raw/ -O netmonitor.sh
              chmod a+x netmonitor.sh
              mv netmonitor.sh /usr/local/bin/

              #### Set kernel params to disable speculative execution mitigations. Requires a reboot to take effect, which is handled above
              sed -i 's/^GRUB_CMDLINE_LINUX_DEFAULT="/&nospectre_v1 nospectre_v2 pti=off mds=off tsx_async_abort=off /' /etc/default/grub
              grub2-mkconfig -o /boot/grub2/grub.cfg

              #### Disable syscall auditing (but otherwise leave auditd functioning).
              echo "-a never,task" > /etc/audit/rules.d/disable-syscall-auditing.rules
              /sbin/augenrules --load
              --==BOUNDARY==--