Adding an installer script that installs Nvidia drivers in Container Optimized OS

The script is packaged as a Docker container stored in gcr.io/google_containers.
A DaemonSet deployment is included to make it easy to consume the installer (a usage sketch follows below).
A cluster e2e test has been added to exercise the installer DaemonSet and to verify the installation by running a sample CUDA application.
The node e2e tests for GPUs have been updated to avoid running on nodes without GPU devices.

Signed-off-by: Vishnu kannan <[email protected]>
vishh committed May 21, 2017
1 parent 95ce463 commit 1e77594
Showing 19 changed files with 665 additions and 14 deletions.
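
For orientation, the rough consumption flow looks like the sketch below. Only the manifest path comes from this commit; the label selector and the way GPU capacity surfaces on nodes are assumptions based on the alpha GPU support of this Kubernetes era, not something this commit defines.

    # Deploy the installer to every node (it exits early on nodes without an NVIDIA PCI device).
    kubectl create -f cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml

    # Watch the installer pods; each one runs installer.sh and then sleeps forever.
    kubectl get pods -n kube-system -l name=cos-nvidia-installer -o wide

    # After the kubelet restarts, GPU capacity should show up on the nodes.
    kubectl describe nodes | grep -i nvidia

The cluster e2e mentioned above then schedules a sample CUDA workload against that advertised capacity.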
5 changes: 4 additions & 1 deletion cluster/gce/BUILD
@@ -32,7 +32,10 @@ filegroup(

filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
srcs = [
":package-srcs",
"//cluster/gce/gci/nvidia-gpus:all-srcs",
],
tags = ["automanaged"],
)

2 changes: 2 additions & 0 deletions cluster/gce/config-default.sh
@@ -68,6 +68,8 @@ fi
# variable. Also please update corresponding image for node e2e at:
# https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml
CVM_VERSION=${CVM_VERSION:-container-vm-v20170214}
# NOTE: Update the kernel commit SHA in cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml
# when updating the COS version here.
GCI_VERSION=${KUBE_GCI_VERSION:-gci-stable-56-9000-84-2}
MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-}
MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers}
1 change: 1 addition & 0 deletions cluster/gce/gci/configure-helper.sh
@@ -1605,4 +1605,5 @@ else
fi
reset-motd
prepare-mounter-rootfs
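# Expose the running kernel's configuration at /proc/config.gz; the Nvidia driver installer reads it to prepare module builds.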
modprobe configs
echo "Done for the configuration for kubernetes"
24 changes: 24 additions & 0 deletions cluster/gce/gci/nvidia-gpus/BUILD
@@ -0,0 +1,24 @@
package(default_visibility = ["//visibility:public"])

load("@io_bazel//tools/build_defs/pkg:pkg.bzl", "pkg_tar")
load("@io_kubernetes_build//defs:build.bzl", "release_filegroup")

filegroup(
name = "sources",
srcs = glob([
"**/*",
]),
)

filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)

filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)
28 changes: 28 additions & 0 deletions cluster/gce/gci/nvidia-gpus/Dockerfile
@@ -0,0 +1,28 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM ubuntu:16.04

# Disable prompts from apt
ENV DEBIAN_FRONTEND noninteractive

RUN apt-get -qq update
RUN apt-get install -qq pciutils gcc g++ git make dpkg-dev bc module-init-tools curl

RUN mkdir /lakitu-kernel
RUN git clone https://chromium.googlesource.com/chromiumos/third_party/kernel /lakitu-kernel

ADD installer.sh /usr/bin/nvidia-installer.sh
RUN chmod a+x /usr/bin/nvidia-installer.sh
CMD ["/usr/bin/nvidia-installer.sh"]
27 changes: 27 additions & 0 deletions cluster/gce/gci/nvidia-gpus/Makefile
@@ -0,0 +1,27 @@
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

TAG?=v0.1
REGISTRY?=gcr.io/google_containers
IMAGE=cos-nvidia-driver-install

all: container

container:
	docker build --pull -t ${REGISTRY}/${IMAGE}:${TAG} .

push:
	gcloud docker -- push ${REGISTRY}/${IMAGE}:${TAG}

.PHONY: all container push
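
To build and publish the image under a different registry or tag, the Make variables can be overridden on the command line; gcr.io/my-project below is a placeholder:

    make container push REGISTRY=gcr.io/my-project TAG=v0.1-test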
57 changes: 57 additions & 0 deletions cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml
@@ -0,0 +1,57 @@
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: cos-nvidia-installer
  namespace: kube-system
spec:
  template:
    metadata:
      labels:
        name: cos-nvidia-installer
        # Update the version tag here and `LAKITU_KERNEL_SHA1` below when targeting a new COS release.
        cos-version: cos-beta-59-9460-20-0
    spec:
      hostNetwork: true
      hostPID: true
      volumes:
      - name: dev
        hostPath:
          path: /dev
      - name: nvidia-overlay
        hostPath:
          path: /home/kubernetes/bin/nvidia
      - name: os-release
        hostPath:
          path: /etc/os-release
      - name: sysrq
        hostPath:
          path: /proc/sysrq-trigger
      containers:
      - name: nvidia-driver-installer
        image: gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
        command: ["/bin/sh", "-c"]
        args: ["/usr/bin/nvidia-installer.sh && sleep infinity"]
        env:
        # The kernel SHA1 here should correspond to the GCI_VERSION specified by default in cluster/gce/config-default.sh.
        - name: LAKITU_KERNEL_SHA1
          value: 26481563cb3788ad254c2bf2126b843c161c7e48
        - name: BASE_DIR
          value: "/rootfs/nvidia"
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        volumeMounts:
        - name: nvidia-overlay
          mountPath: /rootfs/nvidia
        - name: dev
          mountPath: /dev
        - name: os-release
          mountPath: /rootfs/etc/os-release
        - name: sysrq
          mountPath: /sysrq
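
Once created, the DaemonSet can be monitored like any other workload; the pod name below is illustrative. Because the container runs `nvidia-installer.sh && sleep infinity`, the pod stays Running after the installation completes instead of restarting, and the logs show whether the install finished or was skipped as already done.

    kubectl get pods -n kube-system -l name=cos-nvidia-installer
    kubectl logs -n kube-system -f cos-nvidia-installer-xxxxx   # follow the driver build on one node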

207 changes: 207 additions & 0 deletions cluster/gce/gci/nvidia-gpus/installer.sh
@@ -0,0 +1,207 @@
#!/bin/bash

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is for dynamically installing nvidia kernel drivers in Container Optimized OS

set -o errexit
set -o pipefail
set -x

# The script must be run as root.
# Prerequisites:
#
# LAKITU_KERNEL_SHA1 - Env variable expected to be set to the commit SHA of the kernel sources matching the kernel version running on the host.
# BASE_DIR - Directory that is mapped to a stateful partition on host. Defaults to `/rootfs/nvidia`.
#
# The script will output the following artifacts:
# ${BASE_DIR}/lib* --> Nvidia CUDA libraries
# ${BASE_DIR}/bin/* --> Nvidia debug utilities
# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency.
#

BASE_DIR=${BASE_DIR:-"/rootfs/nvidia"}
CACHE_DIR="${BASE_DIR}/.cache"
USR_WORK_DIR="${CACHE_DIR}/usr-work"
USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable"
LIB_WORK_DIR="${CACHE_DIR}/lib-work"
LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable"

LIB_OUTPUT_DIR="${BASE_DIR}/lib"
BIN_OUTPUT_DIR="${BASE_DIR}/bin"

KERNEL_SRC_DIR="/lakitu-kernel"
NVIDIA_DRIVER_DIR="/nvidia"
NVIDIA_DRIVER_VERSION="375.26"

# Source: https://developer.nvidia.com/cuda-downloads
NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run"
NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b"
NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_375.26_linux.run"
NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-375.26.run"

check_nvidia_device() {
lspci
if ! lspci | grep -i -q NVIDIA; then
echo "No NVIDIA devices attached to this instance."
exit 0
fi
echo "Found NVIDIA device on this instance."
}

prepare_kernel_source() {
local kernel_git_repo="https://chromium.googlesource.com/chromiumos/third_party/kernel"
local kernel_version="$(uname -r)"
local kernel_version_stripped="$(echo ${kernel_version} | sed 's/\+//')"

# Checkout the correct tag.
echo "Downloading kernel source at tag ${kernel_version_stripped} ..."
pushd "${KERNEL_SRC_DIR}"
# TODO: Consume KERNEL SHA1 from COS image directly.
# git checkout "tags/v${kernel_version_stripped}"
git checkout ${LAKITU_KERNEL_SHA1}

  # Prepare the kernel config and source tree for building modules.
echo "Preparing kernel sources ..."
zcat "/proc/config.gz" > ".config"
make olddefconfig
make modules_prepare
# Done.
popd
}

download_install_nvidia() {
local pkg_name="${NVIDIA_CUDA_PKG_NAME}"
local url="${NVIDIA_CUDA_URL}"
local log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log"

mkdir -p "${NVIDIA_DRIVER_DIR}"
pushd "${NVIDIA_DRIVER_DIR}"

echo "Downloading Nvidia CUDA package from ${url} ..."
curl -L -s "${url}" -o "${pkg_name}"
echo "${NVIDIA_CUDA_MD5SUM} ${pkg_name}" | md5sum --check

echo "Extracting Nvidia CUDA package ..."
sh ${pkg_name} --extract="$(pwd)"

echo "Running the Nvidia driver installer ..."
if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then
echo "Nvidia installer failed, log below:"
echo "==================================="
tail -50 "${log_file_name}"
echo "==================================="
exit 1
fi
# Create unified memory device file.
nvidia-modprobe -c0 -u
popd
}

unlock_loadpin_and_reboot_if_needed() {
kernel_cmdline="$(cat /proc/cmdline)"
if echo "${kernel_cmdline}" | grep -q -v "lsm.module_locking=0"; then
local -r esp_partition="/dev/sda12"
local -r mount_path="/tmp/esp"
local -r grub_cfg="efi/boot/grub.cfg"

mkdir -p "${mount_path}"
mount "${esp_partition}" "${mount_path}"

pushd "${mount_path}"
cp "${grub_cfg}" "${grub_cfg}.orig"
    sed 's/cros_efi/cros_efi lsm.module_locking=0/g' -i "${grub_cfg}"
cat "${grub_cfg}"
popd
sync
umount "${mount_path}"
# Restart the node for loadpin to be disabled.
echo b > /sysrq
fi
}

create_uvm_device() {
# Create unified memory device file.
nvidia-modprobe -c0 -u
}

verify_base_image() {
mount --bind /rootfs/etc/os-release /etc/os-release
local id="$(grep "^ID=" /etc/os-release)"
if [[ "${id#*=}" != "cos" ]]; then
echo "This installer is designed to run on Container-Optimized OS only"
exit 1
fi
}

setup_overlay_mounts() {
mkdir -p ${USR_WRITABLE_DIR} ${USR_WORK_DIR} ${LIB_WRITABLE_DIR} ${LIB_WORK_DIR}
mount -t overlay -o lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR} none /usr
mount -t overlay -o lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR} none /lib
}

exit_if_install_not_needed() {
if nvidia-smi; then
echo "nvidia drivers already installed. Skipping installation"
post_installation_sequence
exit 0
fi
}

restart_kubelet() {
echo "Sending SIGTERM to kubelet"
pkill -SIGTERM kubelet
}

# Copy user space libraries and debug utilities to a special output directory on the host.
# Make these artifacts world readable and executable.
copy_files_to_host() {
mkdir -p ${LIB_OUTPUT_DIR} ${BIN_OUTPUT_DIR}
cp -r ${USR_WRITABLE_DIR}/lib/x86_64-linux-gnu/* ${LIB_OUTPUT_DIR}/
cp -r ${USR_WRITABLE_DIR}/bin/* ${BIN_OUTPUT_DIR}/
chmod -R a+rx ${LIB_OUTPUT_DIR}
chmod -R a+rx ${BIN_OUTPUT_DIR}
}

post_installation_sequence() {
create_uvm_device
# Copy nvidia user space libraries and debug tools to the host for use from other containers.
copy_files_to_host
# Restart the kubelet for it to pick up the GPU devices.
restart_kubelet
}

main() {
# Do not run the installer unless the base image is Container Optimized OS (COS)
verify_base_image
  # Do not run the installer unless an Nvidia device is found on the PCI bus.
check_nvidia_device
  # Set up overlay mounts to capture the Nvidia driver artifacts in persistent storage on the host.
setup_overlay_mounts
  # Disable module locking (LoadPin), a COS security feature that would otherwise prevent the Nvidia kernel modules from being loaded dynamically.
unlock_loadpin_and_reboot_if_needed
# Exit if installation is not required (for idempotency)
exit_if_install_not_needed
  # Check out the kernel sources appropriate for the base image.
prepare_kernel_source
# Download, compile and install nvidia drivers.
download_install_nvidia
# Verify that the Nvidia drivers have been successfully installed.
nvidia-smi
# Perform post installation steps - copying artifacts, restarting kubelet, etc.
post_installation_sequence
}

main "$@"
3 changes: 2 additions & 1 deletion hack/generate-bindata.sh
@@ -48,7 +48,8 @@ go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \
"examples/..." \
"test/e2e/testing-manifests/..." \
"test/images/..." \
"test/fixtures/..."
"test/fixtures/..." \
"cluster/gce/gci/nvidia-gpus/..."

gofmt -s -w "${BINDATA_OUTPUT}.tmp"

1 change: 1 addition & 0 deletions test/e2e/BUILD
@@ -86,6 +86,7 @@ go_library(
"networking.go",
"networking_perf.go",
"nodeoutofdisk.go",
"nvidia-gpus.go",
"pod_gc.go",
"podpreset.go",
"pods.go",