File: //proc/thread-self/root/usr/local/qcloud/nv/harp_setup.sh
#!/bin/bash
########################################################################
#
# This script sets up the execution environment for HARP.
#
# Usage: sudo bash harp_setup.sh [version]
#        (version is optional and defaults to v1.4)
#
########################################################################
# The rest of the script writes under /usr/local and /usr/bin and loads
# kernel modules, so refuse to continue unless running as root.
if [ "$EUID" -ne 0 ]; then
    echo "ERROR: Please run this script as root!" >&2
    # A bare "exit" would return the status of the echo above (0) and make
    # the refused run look successful to the caller.
    exit 1
fi
# Installation log shared by log() and log_exit()
LOG_PATH="/var/log/taco_installation.log"
# Becomes non-empty once this script has created /usr/bin/python itself
python_ln_rm=
# Value of the NAME= entry in /etc/os-release
OS=$(awk -F= '/^NAME=/{print $2}' /etc/os-release)
# TODO: there might be more robust way to get gpu or nic counts
# Number of PCI devices with vendor id 10de (NVIDIA)
GPU_NUM=$(lspci -d 10de: | wc -l)
# Number of lspci lines mentioning "ethernet" (case-insensitive)
NIC_NUM=$(lspci | grep -ic ethernet)
#######################################
# Log a fatal message, remove the temporary python symlink if this script
# created one, and abort.
# Globals:   LOG_PATH (read), python_ln_rm (read)
# Arguments: $1 - message to log
# Outputs:   timestamped message on stdout, also appended to LOG_PATH
# Returns:   does not return; exits the script with status 1
#######################################
log_exit()
{
    # Quoted so a log path containing whitespace is not split into several
    # tee targets.
    echo "[$(date)] $1" | tee -a "$LOG_PATH"
    # Quoted so the emptiness test does not depend on word splitting.
    [ -z "$python_ln_rm" ] || rm -rf /usr/bin/python
    exit 1
}
#######################################
# Print a blank separator line followed by a timestamped message.
# Globals:   LOG_PATH (read)
# Arguments: $1 - message to log
# Outputs:   blank line + message on stdout; only the message line is
#            appended to LOG_PATH
#######################################
log()
{
    # Quoted so a log path containing whitespace is not split into several
    # tee targets.
    echo && echo "[$(date)] $1" | tee -a "$LOG_PATH"
}
# Tools version comes from the first argument and falls back to v1.4
VERSION="${1:-v1.4}"
# Installation root and the tools directory unpacked beneath it
TFABRIC_DIR="/usr/local/tfabric"
TFABRIC_TOOLS_DIR="${TFABRIC_DIR}/tools"
# Archive name and download location derived from the version
TOOLS_TAR="nccl-ztcp-tools_${VERSION}.tar"
TOOLS_URL="https://mirrors.cloud.tencent.com/install/GPU/taco/${TOOLS_TAR}"
mkdir -p "${TFABRIC_DIR}"
# An existing tools directory means a previous setup: tear it down so the
# configuration is always rebuilt from scratch.
if [[ -d "$TFABRIC_TOOLS_DIR" ]]; then
    # When ifconfig shows no eth1..eth8 interface, run bind-kernel.sh from the
    # tools directory first (presumably the NICs are still attached to the
    # user-space driver -- confirm against bind-kernel.sh).
    if ! ifconfig -a | grep "eth[1-8]:" > /dev/null 2>&1; then
        pushd "$TFABRIC_TOOLS_DIR" > /dev/null 2>&1
        log "Bind assistant nic(s) to kernel at first"
        bash bind-kernel.sh
        popd > /dev/null 2>&1
    fi
    log "Remove existing harp configurations"
    rm -rf "$TFABRIC_TOOLS_DIR"
fi
# Download tools package and decompress it
# Up to 20 attempts, one second apart. Success is decided by wget's exit
# status, not by the mere presence of a (possibly partial) file.
log "Start downloading tools package ..."
rm -f "$TOOLS_TAR"
download_ok=0
wait_time=0
while [ "$wait_time" -lt 20 ]; do
    # -O forces every attempt to write to the same file name; without it a
    # leftover partial file makes wget save the retry as "<name>.1".
    if wget -q -O "$TOOLS_TAR" "$TOOLS_URL"; then
        download_ok=1
        break
    fi
    # Drop whatever the failed attempt left behind
    rm -f "$TOOLS_TAR"
    log "retry"
    wait_time=$((wait_time + 1))
    sleep 1
done
if [ "$download_ok" -eq 1 ] && [ -f "$TOOLS_TAR" ]; then
    # A corrupt archive must stop the setup here instead of failing later
    # with a misleading message.
    if ! tar xf "$TOOLS_TAR" -C "$TFABRIC_DIR"; then
        rm -f "$TOOLS_TAR"
        log_exit "Failed to extract $TOOLS_TAR"
    fi
    rm -f "$TOOLS_TAR"
else
    log_exit "Failed to download $TOOLS_URL"
fi
# Build and load igb_uio unless lsmod already lists it
log "Start loading the uio module for $OS ..."
if ! lsmod | grep igb_uio > /dev/null 2>&1; then
    if ! modprobe uio; then
        log_exit "Failed to load uio module"
    fi
    # On Ubuntu, install the headers of the running kernel before building
    case "$OS" in
        *"Ubuntu"*)
            if ! apt-get install -y "linux-headers-$(uname -r)" >> "$LOG_PATH" 2>&1; then
                log_exit "Failed to install kernel headers"
            fi
            ;;
    esac
    pushd "$TFABRIC_TOOLS_DIR/igb-uio" > /dev/null 2>&1
    if ! make >> "$LOG_PATH" 2>&1; then
        log_exit "Failed to make igb_uio"
    fi
    if ! insmod igb_uio.ko >> "$LOG_PATH" 2>&1; then
        log_exit "Failed to insmod igb_uio"
    fi
    popd > /dev/null 2>&1
fi
# Create configuration for HARP
log "Start generating harp config files ..."
# Running the helper scripts from the wrong directory would be meaningless,
# so stop if the tools directory cannot be entered.
pushd "$TFABRIC_TOOLS_DIR" > /dev/null 2>&1 || log_exit "Failed to enter $TFABRIC_TOOLS_DIR"
# config-create.py is launched through a plain "python" command. When only
# python3 exists, provide a temporary /usr/bin/python symlink; python_ln_rm
# records that the link is ours so it is removed again afterwards.
if ! command -v python > /dev/null 2>&1; then
    python3_path=$(command -v python3 2> /dev/null)
    if [ -z "$python3_path" ]; then
        # No trailing "\n": echo prints it literally without -e
        log_exit "Error: No python or python3 installed"
    fi
    # Link to wherever python3 actually lives rather than assuming /usr/bin
    ln -sf "$python3_path" /usr/bin/python
    python_ln_rm=1
fi
python config-create.py >> "$LOG_PATH" 2>&1
# Success of config-create.py is judged by the file it is expected to produce
[ -f "$TFABRIC_TOOLS_DIR/config/ztcp.conf" ] || log_exit "Failed to create harp config file"
bash bind-igd-uio.sh || log_exit "Failed to execute bind-igd-uio.sh"
# Remove the temporary symlink only if this script created it
[ -z "$python_ln_rm" ] || rm -rf /usr/bin/python
popd > /dev/null 2>&1
log "Set up HARP successfully"