HEX
Server: Apache/2.4.52 (Ubuntu)
System: Linux WebLive 5.15.0-79-generic #86-Ubuntu SMP Mon Jul 10 16:07:21 UTC 2023 x86_64
User: ubuntu (1000)
PHP: 7.4.33
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare,
Upload Files
File: //proc/thread-self/root/usr/local/qcloud/nv/harp_setup.sh
#!/bin/bash

########################################################################
#
# 	This script sets up the execution environment for HARP.
# 	Usage (<version> is optional and defaults to v1.4):
#
#		sudo bash harp_setup.sh <version>
#
########################################################################

# The rest of the script loads kernel modules, installs packages and writes
# under /usr/local and /usr/bin, so refuse to continue without root.
if [ "$EUID" -ne 0 ]; then
	echo "ERROR: Please run this script as root!"
	# Explicit non-zero status: a bare `exit` would return echo's status (0)
	# and make this failure look like success to the caller.
	exit 1
fi

# Installation log shared by log() and log_exit()
LOG_PATH="/var/log/taco_installation.log"
# Stays empty unless the script itself creates /usr/bin/python; a non-empty
# value tells the cleanup code to remove that link again.
python_ln_rm=

# Distribution name: value of the NAME= line in /etc/os-release, kept verbatim
OS=$(awk -F= '/^NAME=/{print $2}' /etc/os-release)

# TODO: there might be more robust way to get gpu or nic counts
# GPU count: PCI devices with vendor ID 10de.
# NIC count: lspci lines mentioning "ethernet" (case-insensitive).
GPU_NUM=$(lspci -d 10de: | wc -l)
NIC_NUM=$(lspci | grep -ci ethernet)

#######################################
# Report a fatal error and terminate the script.
# Globals:
#   LOG_PATH      (read) file the message is appended to
#   python_ln_rm  (read) non-empty when this script created /usr/bin/python
# Arguments:
#   $1 - error message
# Outputs:
#   "[<date>] <message>" on stdout and appended to LOG_PATH
# Returns:
#   never returns; exits the shell with status 1
#######################################
log_exit()
{
	# Quoted so a log path containing whitespace or glob characters is passed
	# to tee as a single file name.
	printf '[%s] %s\n' "$(date)" "$1" | tee -a "$LOG_PATH"
	# Undo the temporary python symlink, but only if this script made it.
	[ -z "$python_ln_rm" ] || rm -rf /usr/bin/python
	exit 1
}

#######################################
# Print a timestamped progress message and append it to the log file.
# Globals:
#   LOG_PATH (read) file the message is appended to
# Arguments:
#   $1 - message
# Outputs:
#   an empty line on stdout only, then "[<date>] <message>" on stdout and
#   appended to LOG_PATH
#######################################
log()
{
	# The blank separator line is written to stdout only; LOG_PATH is quoted so
	# a path with whitespace stays one file name.
	echo && printf '[%s] %s\n' "$(date)" "$1" | tee -a "$LOG_PATH"
}

# Installation root and the tools directory unpacked beneath it
TFABRIC_DIR="/usr/local/tfabric"
TFABRIC_TOOLS_DIR="${TFABRIC_DIR}/tools"

# Tools package version: first command-line argument, v1.4 when omitted/empty
VERSION="${1:-v1.4}"
TOOLS_TAR="nccl-ztcp-tools_${VERSION}.tar"
TOOLS_URL="https://mirrors.cloud.tencent.com/install/GPU/taco/${TOOLS_TAR}"

# Make sure the installation root exists before anything is unpacked into it
mkdir -p "${TFABRIC_DIR}"

# A previous installation is never reused: tear it down and start over
if [ -d "$TFABRIC_TOOLS_DIR" ]; then
	# No "eth1:" .. "eth8:" entry in the ifconfig listing: run the bundled
	# bind-kernel.sh from inside the tools directory before deleting it.
	if ! ifconfig -a | grep "eth[1-8]:" > /dev/null 2>&1; then
		(
			# Subshell keeps the directory change local to this step.
			cd "$TFABRIC_TOOLS_DIR" > /dev/null 2>&1
			log "Bind assistant nic(s) to kernel at first"
			bash bind-kernel.sh
		)
	fi
	log "Remove existing harp configurations"
	rm -rf "$TFABRIC_TOOLS_DIR"
fi

# Download tools package and decompress it
log "Start downloading tools package ..."
rm -f "$TOOLS_TAR"

# Try up to 20 times, one second apart. Success is decided by wget's exit
# status, not by whether a file exists afterwards: a failed attempt can leave
# a partial file behind, which must not be mistaken for a good download.
downloaded=
wait_time=0
while [ "$wait_time" -lt 20 ]; do
	# -O pins the output name so retries overwrite the same file instead of
	# wget saving numbered copies next to a leftover.
	if wget -q -O "$TOOLS_TAR" "$TOOLS_URL"; then
		downloaded=1
		break
	fi
	# Drop whatever the failed attempt wrote before trying again.
	rm -f "$TOOLS_TAR"
	log "retry"
	wait_time=$((wait_time + 1))
	sleep 1
done

if [ -n "$downloaded" ] && [ -f "$TOOLS_TAR" ]; then
	if ! tar xf "$TOOLS_TAR" -C "$TFABRIC_DIR"; then
		# Do not continue with a half-extracted tools directory.
		rm -f "$TOOLS_TAR"
		log_exit "Failed to extract $TOOLS_TAR"
	fi
	rm -f "$TOOLS_TAR"
else
	log_exit "Failed to download $TOOLS_URL"
fi

# Insmod uio module
log "Start loading the uio module for $OS ..."
# Build and load igb_uio only when lsmod does not already list it.
if ! lsmod | grep igb_uio > /dev/null 2>&1; then
	modprobe uio || log_exit "Failed to load uio module"
	if [[ $OS == *"Ubuntu"* ]]; then
		# Headers for the running kernel (presumably required by the igb_uio
		# build below); only installed automatically on Ubuntu.
		apt-get install -y "linux-headers-$(uname -r)" >> "$LOG_PATH" 2>&1 || log_exit "Failed to install kernel headers"
	fi
	# Abort if the source directory cannot be entered; otherwise make and insmod
	# would run as root in whatever directory the script was started from.
	pushd "$TFABRIC_TOOLS_DIR/igb-uio" > /dev/null 2>&1 || log_exit "Failed to enter $TFABRIC_TOOLS_DIR/igb-uio"
	make >> "$LOG_PATH" 2>&1 || log_exit "Failed to make igb_uio"
	insmod igb_uio.ko >> "$LOG_PATH" 2>&1 || log_exit "Failed to insmod igb_uio"
	popd > /dev/null 2>&1
fi

# Create configuration for HARP
log "Start generating harp config files ..."
# The helper scripts are invoked by relative name, so running from any other
# directory must be treated as a fatal error.
pushd "$TFABRIC_TOOLS_DIR" > /dev/null 2>&1 || log_exit "Failed to enter $TFABRIC_TOOLS_DIR"

# config-create.py is run through `python`. When only python3 is installed,
# expose it temporarily as /usr/bin/python; python_ln_rm records that the link
# is ours so it is removed again (below on success, in log_exit on failure).
if ! command -v python > /dev/null 2>&1; then
	python3_path=$(command -v python3 2> /dev/null)
	if [ -z "$python3_path" ]; then
		log_exit "Error: No python or python3 installed"
	fi
	# Link to wherever python3 actually lives rather than assuming /usr/bin.
	ln -sf "$python3_path" /usr/bin/python || log_exit "Failed to link $python3_path to /usr/bin/python"
	python_ln_rm=1
fi

# The exit status of config-create.py is not inspected; success is judged by
# the presence of the generated ztcp.conf.
python config-create.py >> "$LOG_PATH" 2>&1
[ -f "$TFABRIC_TOOLS_DIR/config/ztcp.conf" ] || log_exit "Failed to create harp config file"

bash bind-igd-uio.sh || log_exit "Failed to execute bind-igd-uio.sh"

# Remove the temporary python link only if this script created it.
[ -z "$python_ln_rm" ] || rm -rf /usr/bin/python
popd > /dev/null 2>&1

log "Set up HARP successfully"