forked from kubernetes/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding an installer script that installs Nvidia drivers in Container …
…Optimized OS. Packaged the script as a docker container stored in gcr.io/google-containers. A daemonset deployment is included to make it easy to consume the installer. A cluster e2e has been added to test the installation daemonset along with verifying installation by using a sample CUDA application. Node e2e for GPUs updated to avoid running on nodes without GPU devices. Signed-off-by: Vishnu kannan <[email protected]>
- Loading branch information
Showing
19 changed files
with
665 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1605,4 +1605,5 @@ else | |
fi | ||
reset-motd | ||
prepare-mounter-rootfs | ||
modprobe configs | ||
echo "Done for the configuration for kubernetes" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
package(default_visibility = ["//visibility:public"]) | ||
|
||
load("@io_bazel//tools/build_defs/pkg:pkg.bzl", "pkg_tar") | ||
load("@io_kubernetes_build//defs:build.bzl", "release_filegroup") | ||
|
||
# Every file under this package, exposed with the package's default (public) visibility.
filegroup(
    name = "sources",
    srcs = glob(["**/*"]),
)

# Package-private list of all files in this package; tagged as automanaged.
filegroup(
    name = "package-srcs",
    srcs = glob(
        ["**"],
    ),
    tags = ["automanaged"],
    visibility = ["//visibility:private"],
)

# Re-exports ":package-srcs" under the package's default visibility.
filegroup(
    name = "all-srcs",
    srcs = [
        ":package-srcs",
    ],
    tags = ["automanaged"],
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Copyright 2017 The Kubernetes Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
FROM ubuntu:16.04

# Disable prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Refresh the package index and install in the same layer, so that a cached
# `apt-get update` layer can never be combined with a newer package list.
# The downloaded index is removed afterwards to keep the layer small.
RUN apt-get -qq update && \
    apt-get install -qq pciutils gcc g++ git make dpkg-dev bc module-init-tools curl && \
    rm -rf /var/lib/apt/lists/*

# Pre-fetch the kernel repository into the image (git creates the target directory).
RUN git clone https://chromium.googlesource.com/chromiumos/third_party/kernel /lakitu-kernel

# COPY is sufficient for a plain local file; ADD's archive/URL handling is not needed.
COPY installer.sh /usr/bin/nvidia-installer.sh
RUN chmod a+x /usr/bin/nvidia-installer.sh
CMD ["/usr/bin/nvidia-installer.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Copyright 2017 The Kubernetes Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# TAG and REGISTRY may be overridden from the environment or the command line.
TAG ?= v0.1
REGISTRY ?= gcr.io/google_containers
IMAGE = cos-nvidia-driver-install

.PHONY: all container push

# Default goal: build the installer image.
all: container

# Build the image from the Dockerfile in this directory, always pulling the base image.
container:
	docker build --pull -t $(REGISTRY)/$(IMAGE):$(TAG) .

# Push the tagged image using gcloud's docker wrapper.
push:
	gcloud docker -- push $(REGISTRY)/$(IMAGE):$(TAG)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# DaemonSet that runs the Nvidia driver installer container on every node.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: cos-nvidia-installer
  namespace: kube-system
spec:
  template:
    metadata:
      labels:
        name: cos-nvidia-installer
        # Update the version tag here and `LAKITU_KERNEL_SHA1` when targeting a new COS release.
        cos-version: cos-beta-59-9460-20-0
    spec:
      hostNetwork: true
      hostPID: true
      volumes:
      - name: dev
        hostPath:
          path: /dev
      # Host directory backing BASE_DIR (mounted at /rootfs/nvidia in the container).
      - name: nvidia-overlay
        hostPath:
          path: /home/kubernetes/bin/nvidia
      - name: os-release
        hostPath:
          path: /etc/os-release
      # Mounted at /sysrq in the container.
      - name: sysrq
        hostPath:
          path: /proc/sysrq-trigger
      containers:
      - image: gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8
        name: nvidia-driver-installer
        command: ["/bin/sh", "-c"]
        # Absolute path so the command does not depend on the container's working directory.
        # `sleep infinity` keeps the container alive after the installer exits successfully.
        args: ["/usr/bin/nvidia-installer.sh && sleep infinity"]
        resources:
          requests:
            cpu: 0.15
        securityContext:
          privileged: true
        # Single `env` list: a mapping must not contain the same key twice.
        env:
        # The kernel SHA1 here should correspond to the GCI_VERSION specified by default under cluster/gce/config-default.sh
        - name: LAKITU_KERNEL_SHA1
          value: 26481563cb3788ad254c2bf2126b843c161c7e48
        - name: BASE_DIR
          value: "/rootfs/nvidia"
        volumeMounts:
        - name: nvidia-overlay
          mountPath: /rootfs/nvidia
        - name: dev
          mountPath: /dev
        - name: os-release
          mountPath: /rootfs/etc/os-release
        - name: sysrq
          mountPath: /sysrq
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2017 The Kubernetes Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
# This script is for dynamically installing nvidia kernel drivers in Container Optimized OS | ||
|
||
# Abort on the first failing command or pipeline stage, and trace every command.
set -o errexit -o pipefail -o xtrace

# The script must be run as root.
# Prerequisites:
#
# LAKITU_KERNEL_SHA1 - The env variable is expected to be set to HEAD of the kernel version used on the host.
# BASE_DIR - Directory that is mapped to a stateful partition on host. Defaults to `/rootfs/nvidia`.
#
# The script will output the following artifacts:
# ${BASE_DIR}/lib* --> Nvidia CUDA libraries
# ${BASE_DIR}/bin/* --> Nvidia debug utilities
# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency.
#

# Keep a caller-provided BASE_DIR; otherwise fall back to the default.
: "${BASE_DIR:=/rootfs/nvidia}"

# Overlay upper/work directories live under the cache directory.
CACHE_DIR="${BASE_DIR}/.cache"
USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable"
USR_WORK_DIR="${CACHE_DIR}/usr-work"
LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable"
LIB_WORK_DIR="${CACHE_DIR}/lib-work"

# Artifacts published for use outside this container.
LIB_OUTPUT_DIR="${BASE_DIR}/lib"
BIN_OUTPUT_DIR="${BASE_DIR}/bin"

KERNEL_SRC_DIR="/lakitu-kernel"
NVIDIA_DRIVER_DIR="/nvidia"
NVIDIA_DRIVER_VERSION="375.26"

# Source: https://developer.nvidia.com/cuda-downloads
# Package names and the download URL all embed the driver version.
NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_${NVIDIA_DRIVER_VERSION}_linux.run"
NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_${NVIDIA_DRIVER_VERSION}_linux-run"
NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b"
NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run"
|
||
# Prints the PCI device list and exits the script successfully (status 0) when
# no NVIDIA device is present; otherwise returns so installation can proceed.
check_nvidia_device() {
  # Query the PCI bus once and reuse the output. Matching against the captured
  # text (instead of piping lspci into `grep -q`) also avoids a spurious
  # pipeline failure under `pipefail` if grep exits before lspci finishes writing.
  local pci_devices
  pci_devices="$(lspci)"
  echo "${pci_devices}"
  if ! grep -i -q NVIDIA <<< "${pci_devices}"; then
    echo "No NVIDIA devices attached to this instance."
    exit 0
  fi
  echo "Found NVIDIA device on this instance."
}
|
||
# Checks out the kernel commit named by LAKITU_KERNEL_SHA1 inside
# ${KERNEL_SRC_DIR} and prepares that tree for building external modules,
# using the running kernel's configuration from /proc/config.gz.
#
# Requires: LAKITU_KERNEL_SHA1 (environment), KERNEL_SRC_DIR (script global).
prepare_kernel_source() {
  # Fail fast when the SHA is missing: a bare `git checkout` succeeds without
  # switching commits, which would silently build against the wrong sources.
  local -r kernel_sha="${LAKITU_KERNEL_SHA1:?LAKITU_KERNEL_SHA1 must be set to the kernel commit used on the host}"
  # Assign separately from `local` so a failing command is not masked.
  local kernel_version
  kernel_version="$(uname -r)"
  # Drop the first "+" from the kernel release string; used for logging only.
  local -r kernel_version_stripped="${kernel_version/+/}"

  # Checkout the correct commit.
  echo "Downloading kernel source at tag ${kernel_version_stripped} ..."
  pushd "${KERNEL_SRC_DIR}"
  # TODO: Consume KERNEL SHA1 from COS image directly.
  # git checkout "tags/v${kernel_version_stripped}"
  git checkout "${kernel_sha}"

  # Prepare kernel config and source for modules.
  echo "Preparing kernel sources ..."
  zcat "/proc/config.gz" > ".config"
  make olddefconfig
  make modules_prepare
  # Done.
  popd
}
|
||
# Downloads the CUDA package into ${NVIDIA_DRIVER_DIR}, verifies its MD5
# checksum, extracts it, and runs the bundled Nvidia driver installer against
# the kernel sources in ${KERNEL_SRC_DIR}. Exits with status 1 (after printing
# the tail of the installer log) if the driver installer fails.
download_install_nvidia() {
  local -r pkg_name="${NVIDIA_CUDA_PKG_NAME}"
  local -r url="${NVIDIA_CUDA_URL}"
  local -r log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log"

  mkdir -p "${NVIDIA_DRIVER_DIR}"
  pushd "${NVIDIA_DRIVER_DIR}"

  echo "Downloading Nvidia CUDA package from ${url} ..."
  # --fail: treat HTTP errors as failures instead of saving the error page.
  # --show-error: keep the progress meter silent but still report errors.
  curl --fail --location --silent --show-error "${url}" -o "${pkg_name}"
  # Two spaces between digest and file name: the canonical checksum-line format.
  echo "${NVIDIA_CUDA_MD5SUM}  ${pkg_name}" | md5sum --check

  echo "Extracting Nvidia CUDA package ..."
  sh "${pkg_name}" --extract="$(pwd)"

  echo "Running the Nvidia driver installer ..."
  if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then
    echo "Nvidia installer failed, log below:"
    echo "==================================="
    tail -50 "${log_file_name}"
    echo "==================================="
    exit 1
  fi
  # Create unified memory device file.
  nvidia-modprobe -c0 -u
  popd
}
|
||
# If the running kernel was not booted with `lsm.module_locking=0`, appends
# that flag after every `cros_efi` occurrence in the GRUB config on the EFI
# system partition and reboots the machine by writing `b` to /sysrq.
# Returns without doing anything when the flag is already on the command line.
unlock_loadpin_and_reboot_if_needed() {
  local -r module_locking_flag="lsm.module_locking=0"
  local kernel_cmdline
  kernel_cmdline="$(cat /proc/cmdline)"
  # -F: match the flag literally (the "." must not act as a regex wildcard).
  if grep -q -F "${module_locking_flag}" <<< "${kernel_cmdline}"; then
    return 0
  fi

  local -r esp_partition="/dev/sda12"
  local -r mount_path="/tmp/esp"
  local -r grub_cfg="efi/boot/grub.cfg"

  mkdir -p "${mount_path}"
  mount "${esp_partition}" "${mount_path}"

  pushd "${mount_path}"
  # Only patch a config that does not carry the flag yet, so a repeated run
  # (e.g. after a reboot that did not happen) neither duplicates the flag nor
  # overwrites the pristine backup.
  if ! grep -q -F "${module_locking_flag}" "${grub_cfg}"; then
    cp "${grub_cfg}" "${grub_cfg}.orig"
    sed "s/cros_efi/cros_efi ${module_locking_flag}/g" -i "${grub_cfg}"
  fi
  cat "${grub_cfg}"
  popd
  sync
  umount "${mount_path}"
  # Restart the node for loadpin to be disabled.
  echo b > /sysrq
}
|
||
# Creates the unified memory device file by invoking nvidia-modprobe.
create_uvm_device() {
  nvidia-modprobe -c0 -u
}
|
||
# Bind-mounts /rootfs/etc/os-release over the container's /etc/os-release and
# exits with status 1 unless its ID field is exactly "cos".
verify_base_image() {
  mount --bind /rootfs/etc/os-release /etc/os-release

  # A missing ID line must fall through to the error message below rather than
  # abort via errexit, hence the `|| true`.
  local os_id_line
  os_id_line="$(grep "^ID=" /etc/os-release)" || true

  case "${os_id_line#*=}" in
    cos)
      ;;
    *)
      echo "This installer is designed to run on Container-Optimized OS only"
      exit 1
      ;;
  esac
}
|
||
# Mounts overlay filesystems on top of /usr and /lib whose upper (writable)
# and work directories live under the cache directory, so files written to
# /usr and /lib end up in ${USR_WRITABLE_DIR} and ${LIB_WRITABLE_DIR}.
setup_overlay_mounts() {
  # All expansions are quoted so a BASE_DIR containing whitespace or glob
  # characters is passed through as a single argument.
  mkdir -p "${USR_WRITABLE_DIR}" "${USR_WORK_DIR}" "${LIB_WRITABLE_DIR}" "${LIB_WORK_DIR}"
  mount -t overlay -o "lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR}" none /usr
  mount -t overlay -o "lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR}" none /lib
}
|
||
# When `nvidia-smi` already succeeds, runs the post-installation steps and
# exits the script with status 0; otherwise returns so installation proceeds.
exit_if_install_not_needed() {
  # Guard clause: a failing nvidia-smi means the drivers still need installing.
  nvidia-smi || return 0

  echo "nvidia drivers already installed. Skipping installation"
  post_installation_sequence
  exit 0
}
|
||
# Logs the action, then signals every process matching "kubelet" with SIGTERM.
restart_kubelet() {
  printf '%s\n' "Sending SIGTERM to kubelet"
  pkill -SIGTERM kubelet
}
|
||
# Copy user space libraries and debug utilities to a special output directory on the host.
# Make these artifacts world readable and executable.
#
# Reads from ${USR_WRITABLE_DIR} and writes to ${LIB_OUTPUT_DIR} and ${BIN_OUTPUT_DIR}.
copy_files_to_host() {
  # Directory variables are quoted so paths with whitespace survive intact; the
  # trailing /* stays outside the quotes so the shell still expands the glob.
  mkdir -p "${LIB_OUTPUT_DIR}" "${BIN_OUTPUT_DIR}"
  cp -r "${USR_WRITABLE_DIR}"/lib/x86_64-linux-gnu/* "${LIB_OUTPUT_DIR}/"
  cp -r "${USR_WRITABLE_DIR}"/bin/* "${BIN_OUTPUT_DIR}/"
  chmod -R a+rx "${LIB_OUTPUT_DIR}"
  chmod -R a+rx "${BIN_OUTPUT_DIR}"
}
|
||
# Runs the steps that follow a successful driver installation, in order:
#   1. create_uvm_device  - create the unified memory device file.
#   2. copy_files_to_host - copy nvidia user space libraries and debug tools to
#                           the host for use from other containers.
#   3. restart_kubelet    - restart the kubelet for it to pick up the GPU devices.
post_installation_sequence() {
  local step
  for step in create_uvm_device copy_files_to_host restart_kubelet; do
    "${step}"
  done
}
|
||
# Entry point: runs every installation step in order. Any step may terminate
# the script itself (via `exit`, or via errexit on failure).
main() {
  local -a steps=(
    # Do not run the installer unless the base image is Container Optimized OS (COS).
    verify_base_image
    # Do not run the installer unless a Nvidia device is found on the PCI bus.
    check_nvidia_device
    # Setup overlay mounts to capture nvidia driver artifacts in more permanent storage on the host.
    setup_overlay_mounts
    # Disable a critical security feature in COS that will allow for dynamically loading Nvidia drivers.
    unlock_loadpin_and_reboot_if_needed
    # Exit if installation is not required (for idempotency).
    exit_if_install_not_needed
    # Checkout kernel sources appropriate for the base image.
    prepare_kernel_source
    # Download, compile and install nvidia drivers.
    download_install_nvidia
    # Verify that the Nvidia drivers have been successfully installed.
    nvidia-smi
    # Perform post installation steps - copying artifacts, restarting kubelet, etc.
    post_installation_sequence
  )

  local step
  for step in "${steps[@]}"; do
    "${step}"
  done
}
|
||
main "$@" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.