Skip to content

Commit

Permalink
Issue dell#174: Prometheus slurm exporter and test framework
Browse files Browse the repository at this point in the history
Signed-off-by: K <[email protected]>
  • Loading branch information
lwilson authored and DeepikaKrishnaiah committed Jan 8, 2021
2 parents e301ba4 + de81618 commit a607340
Show file tree
Hide file tree
Showing 18 changed files with 474 additions and 58 deletions.
32 changes: 18 additions & 14 deletions omnia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
# limitations under the License.
---

# Play: run the cluster_validation role on the control node (localhost,
# local connection) without gathering facts first.
- name: Validate the cluster
  hosts: localhost
  connection: local
  gather_facts: false
  roles:
    - role: cluster_validation
#- name: Validate the cluster
# hosts: localhost
# connection: local
# gather_facts: no
# roles:
# - cluster_validation

- name: Gather facts from all the nodes
hosts: all
Expand Down Expand Up @@ -76,14 +76,18 @@
gather_facts: false
roles:
- k8s_nfs_server_setup
tags: kubernetes
tags:
- kubernetes
- nfs

# Play: apply the k8s_nfs_client_setup role to every host in the `compute`
# group. Fact gathering is disabled for this play.
# The play previously declared `tags` twice (a scalar and a list); duplicate
# mapping keys are invalid YAML and most parsers silently keep the last one,
# so only the list form is kept here.
- name: Apply NFS client setup on compute nodes
  hosts: compute
  gather_facts: false
  roles:
    - k8s_nfs_client_setup
  tags:
    - kubernetes
    - nfs

- name: Start K8s on manager server
hosts: manager
Expand Down Expand Up @@ -134,9 +138,9 @@
- slurm_start_services
tags: slurm

# Play: apply the slurm_exporter role to the `manager` group, with fact
# gathering turned off. Selected by the `slurm` tag.
- name: Install slurm exporter
  hosts: manager
  gather_facts: false
  tags:
    - slurm
  roles:
    - role: slurm_exporter
#- name: Install slurm exporter
# hosts: manager
# gather_facts: false
# roles:
# - slurm_exporter
# tags: slurm
9 changes: 8 additions & 1 deletion roles/common/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
state: present
tags: install

# Download the repo definition at docker_repo_url and store it at
# docker_repo_dest on the target host.
- name: Add docker community edition repository
  get_url:
    dest: "{{ docker_repo_dest }}"
    url: "{{ docker_repo_url }}"
  tags:
    - install

- name: Disable swap
command: /sbin/swapoff -a
changed_when: true
Expand Down Expand Up @@ -70,4 +76,5 @@

# Pull in nvidia.yml only on hosts whose local `inventory` fact reports at
# least one NVIDIA GPU.
# The `when` key was declared twice with the same expression; duplicate
# mapping keys are invalid YAML, so a single condition is kept.
# NOTE(review): tags on include_tasks presumably apply to the include
# statement only, not to the included tasks - confirm that the tasks in
# nvidia.yml carry their own `install` tag.
- name: Install Nvidia drivers and software components
  include_tasks: nvidia.yml
  when: ansible_local.inventory.nvidia_gpu > 0
  tags: install
54 changes: 37 additions & 17 deletions roles/common/tasks/nvidia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,44 @@
# limitations under the License.
---

# Download the repo file at nvidia_docker_repo_url and write it to
# nvidia_docker_repo_dest.
- name: Add nvidia-docker2 Repo
  get_url:
    dest: "{{ nvidia_docker_repo_dest }}"
    url: "{{ nvidia_docker_repo_url }}"
  tags:
    - install
    - testing

# Register the libnvidia-container yum repository on the target host.
# The task previously carried two action modules (get_url and
# yum_repository) and two `tags` keys; a task may run only one module and
# duplicate keys are invalid YAML. Only the yum_repository definition and
# the last `tags` value (`install`) are kept.
# NOTE(review): package and repo-metadata GPG checks are both disabled even
# though a gpgkey URL is configured - confirm that this is intentional.
- name: Add libnvidia container Repo
  yum_repository:
    name: libnvidia-container
    description: libnvidia-container
    baseurl: https://nvidia.github.io/libnvidia-container/stable/centos7/$basearch
    repo_gpgcheck: false
    gpgcheck: false
    gpgkey: https://nvidia.github.io/libnvidia-container/gpgkey
    sslverify: true
    sslcacert: /etc/pki/tls/certs/ca-bundle.crt
    enabled: true
  tags: install

# Register the nvidia-container-runtime yum repository: enabled, TLS
# verified against the system CA bundle, GPG checks switched off.
- name: Add nvidia-container-runtime Repo
  yum_repository:
    name: nvidia-container-runtime
    description: nvidia-container-runtime
    enabled: true
    baseurl: "https://nvidia.github.io/nvidia-container-runtime/stable/centos7/$basearch"
    gpgkey: "https://nvidia.github.io/nvidia-container-runtime/gpgkey"
    gpgcheck: false
    repo_gpgcheck: false
    sslcacert: /etc/pki/tls/certs/ca-bundle.crt
    sslverify: true
  tags:
    - install

# Register the nvidia-docker yum repository: enabled, TLS verified against
# the system CA bundle, GPG checks switched off.
- name: Add nvidia-docker Repo
  yum_repository:
    name: nvidia-docker
    description: nvidia-docker
    enabled: true
    baseurl: "https://nvidia.github.io/nvidia-docker/centos7/$basearch"
    gpgkey: "https://nvidia.github.io/nvidia-docker/gpgkey"
    gpgcheck: false
    repo_gpgcheck: false
    sslcacert: /etc/pki/tls/certs/ca-bundle.crt
    sslverify: true
  tags:
    - install

- name: Install nvidia driver and nvidia-docker2
package:
Expand Down Expand Up @@ -52,10 +79,3 @@
enabled: yes
daemon_reload: yes
tags: install

# Restart the kubelet service and make sure it is enabled at boot.
- name: Restart and enable kubernetes - kubelet
  service:
    name: kubelet
    enabled: true
    state: restarted
  tags:
    - install
7 changes: 6 additions & 1 deletion roles/common/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ common_packages:
- nvidia-detect
- chrony
- pciutils
- docker-ce

custom_fact_dir: /etc/ansible/facts.d

Expand All @@ -36,6 +37,10 @@ elrepo_gpg_key_url: https://www.elrepo.org/RPM-GPG-KEY-elrepo.org

# RPM that installs the ELRepo release package.
elrepo_rpm_url: "https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm"

# Source URL and on-host destination of the docker-ce yum repo definition.
docker_repo_url: "https://download.docker.com/linux/centos/docker-ce.repo"

docker_repo_dest: "/etc/yum.repos.d/docker-ce.repo"

# Time-sync configuration file paths and the file mode applied to them.
chrony_path: "/etc/chrony.conf"
ntp_path: "/etc/ntp.conf"
ntp_mode: "0644"
Expand Down Expand Up @@ -63,4 +68,4 @@ nvidia_packages:
- nvidia-docker2

# Destination directory and file mode for the docker daemon file.
# The mode is quoted so it stays the string "0644": an unquoted 0644 is read
# as the integer 420 by YAML 1.1 parsers, and Ansible recommends quoting
# octal modes for consistent results. The key was also declared twice;
# duplicate mapping keys are invalid YAML, so one definition is kept.
# NOTE(review): presumably consumed as `mode: "{{ daemon_file_mode }}"` -
# confirm against the task that uses it.
daemon_file_dest: /etc/docker/
daemon_file_mode: "0644"
30 changes: 11 additions & 19 deletions roles/k8s_common/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,16 @@
---

# Register the kubernetes yum repository through yum_repository.
# Previously the repo was defined twice (a copied static kubernetes.repo
# file and a yum_repository entry), and the second definition sat in the
# same task body as the docker get_url call. A task may run only one module,
# so each repository now has exactly one task with exactly one module.
# NOTE(review): package and repo-metadata GPG checks are disabled even
# though gpgkey URLs are listed - confirm that this is intentional.
- name: Add kubernetes repo
  yum_repository:
    name: kubernetes
    description: kubernetes
    baseurl: https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
    enabled: true
    gpgcheck: false
    repo_gpgcheck: false
    gpgkey:
      - https://packages.cloud.google.com/yum/doc/yum-key.gpg
      - https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
  tags: install

# Download the docker-ce repo definition from docker_repo_url to
# docker_repo_dest.
- name: Add docker community edition repository
  get_url:
    url: "{{ docker_repo_url }}"
    dest: "{{ docker_repo_dest }}"
  tags: install

- name: Update sysctl to handle incorrectly routed traffic when iptables is bypassed
Expand All @@ -42,12 +40,6 @@
changed_when: true
tags: install

# Ensure the docker-ce package is installed via the generic package module.
- name: Install docker
  package:
    state: present
    name: docker-ce
  tags:
    - install

- name: Install k8s packages
package:
name: "{{ k8s_packages }}"
Expand All @@ -74,4 +66,4 @@
service:
name: kubelet
state: restarted
enabled: yes
enabled: yes
6 changes: 1 addition & 5 deletions roles/k8s_common/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,8 @@ k8s_packages:

# Directory that receives the kubernetes yum repo file.
k8s_repo_dest: /etc/yum.repos.d/

# Source URL and destination path of the docker-ce yum repo definition.
docker_repo_url: https://download.docker.com/linux/centos/docker-ce.repo

docker_repo_dest: /etc/yum.repos.d/docker-ce.repo

# Directory that receives the kubernetes sysctl configuration.
k8s_conf_dest: /etc/sysctl.d/

# File modes are quoted so they stay the string "0644". An unquoted 0644 is
# read as the integer 420 by YAML 1.1 parsers; once that value passes
# through a quoted template such as `mode: "{{ k8s_repo_file_mode }}"`, the
# module receives "420" and treats it as an octal mode.
k8s_repo_file_mode: "0644"

# This key was previously declared twice; duplicate mapping keys are invalid
# YAML, so a single definition is kept.
k8s_conf_file_mode: "0644"
10 changes: 10 additions & 0 deletions roles/slurm_exporter/files/prometheus-slurm-exporter.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# systemd unit that runs the Prometheus slurm exporter binary.
[Unit]
Description=Start prometheus slurm exporter

[Service]
ExecStart=/usr/bin/prometheus-slurm-exporter
# Always restart the exporter, waiting 15 seconds between attempts.
Restart=always
RestartSec=15

[Install]
# Pulled in by multi-user.target when the unit is enabled.
WantedBy=multi-user.target
30 changes: 30 additions & 0 deletions roles/slurm_exporter/files/slurm_exporter_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Kubernetes v1 Service named prometheus-slurmexporter-metrics-2 in the
# `default` namespace. It exposes TCP port 8080 (named `metrics`) and selects
# pods labelled app=prometheus, component=server.
# NOTE(review): the original nesting is not visible in this view; restore
# the indentation before applying this manifest.
apiVersion: v1
kind: Service
metadata:
name: prometheus-slurmexporter-metrics-2
namespace: default
annotations:
prometheus.io/scrape: 'true'
labels:
app: prometheus
app.kubernetes.io/managed-by: Helm
chart: prometheus-11.12.1
component: server
spec:
ports:
- name: metrics
port: 8080
protocol: TCP
targetPort: 8080
selector:
app: prometheus
component: server
# NOTE(review): additionalScrapeConfigs and serviceMonitorSelector do not
# look like core/v1 Service fields - confirm which resource is meant to
# consume this scrape job definition.
additionalScrapeConfigs:
name: prometheus-config
key: prometheus-config.yaml
job_name: 'prometheus-slurm-exporter'
scrape_interval: 15s
static_configs:
- targets:
# NOTE(review): this target embeds a Jinja expression plus a scheme and a
# path; verify that the file is rendered as a template before use and that
# the consumer accepts this target format.
- http:"{{ inventory_hostname }}":8080/metrics
serviceMonitorSelector: {}
18 changes: 18 additions & 0 deletions roles/slurm_exporter/tasks/configure_prometheus_pod.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Run `kubectl apply` on the role's slurm_config_file with schema validation
# disabled. The task never reports "changed".
- name: Apply slurm exporter configuration to prometheus
  command: >-
    kubectl apply
    -f "{{ role_path }}/files/{{ slurm_config_file }}"
    --validate=false
  changed_when: false
40 changes: 40 additions & 0 deletions roles/slurm_exporter/tasks/install_prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright 2020 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Unpack the archive referenced by prometheus_git_repo into
# installation_dir; remote_src is enabled for the source.
- name: Download and untar prometheus stable version
  unarchive:
    remote_src: true
    src: "{{ prometheus_git_repo }}"
    dest: "{{ installation_dir }}"

# Copy the file at prometheus_exec_path to system_local_path on the same
# host (remote_src) and set its mode to file_permission.
- name: Copy prometheus executable to /usr/local/bin
  copy:
    remote_src: true
    src: "{{ prometheus_exec_path }}"
    dest: "{{ system_local_path }}"
    mode: "{{ file_permission }}"

# Insert a scrape job named my_slurm_exporter at the end of the file at
# prometheus_config_file (insertafter: EOF). The job targets localhost:8080
# with a 30s scrape interval and a 30s scrape timeout, and the file's mode is
# set to file_permission.
# NOTE(review): the block's inner indentation is not visible in this view;
# verify it nests correctly under the target file's scrape job list.
- name: Configure prometheus for slurm exporter
blockinfile:
path: "{{ prometheus_config_file }}"
insertafter: EOF
mode: "{{ file_permission }}"
block: |
# SLURM resource manager:
- job_name: 'my_slurm_exporter'
scrape_interval: 30s
scrape_timeout: 30s
static_configs:
- targets: ['localhost:8080']
Loading

0 comments on commit a607340

Please sign in to comment.