Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Login Node Implementation for slurm #445

Merged
merged 2 commits into from
Aug 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions omnia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
tags: kubernetes

- name: Apply common Slurm installation and config
hosts: manager, compute
hosts: manager, compute, login_node
gather_facts: false
roles:
- slurm_common
Expand All @@ -125,13 +125,21 @@
- slurm_manager
tags: slurm

- name: Start Slurm workers
hosts: compute
# Play: apply the slurm_workers role to the compute and login_node groups.
# serial: 1 makes Ansible finish the play on one host before starting the next.
# NOTE(review): presumably needed because the role round-trips slurm.conf
# through a shared buffer file — confirm against roles/slurm_workers.
- name: Configure Slurm workers
  hosts: compute, login_node
  serial: 1
  gather_facts: false
  roles:
    - slurm_workers
  tags: slurm

# Play: apply the slurm_workers_service role to the compute and login_node
# groups. No `serial` is set, so hosts are processed in normal batches.
- name: Start Slurm workers
  hosts: compute, login_node
  gather_facts: false
  roles:
    - slurm_workers_service
  tags: slurm

- name: Start Slurm services
hosts: manager
gather_facts: false
Expand Down
2 changes: 1 addition & 1 deletion roles/slurm_common/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,4 @@
state: restarted
enabled: yes
tags: install
ignore_errors: yes
failed_when: false
2 changes: 1 addition & 1 deletion roles/slurm_exporter/tasks/install_slurm_exporter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
args:
chdir: "{{ slurm_exporter_inst_dir }}"
changed_when: False
ignore_errors: yes
failed_when: false

- name: Copy executable to /usr/bin
copy:
Expand Down
20 changes: 20 additions & 0 deletions roles/slurm_exporter/tasks/start_services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@
# limitations under the License.
---

# Ensure the firewalld package is installed before any firewalld rule is added.
- name: Install firewalld
  package:
    name: firewalld
    state: present
  tags: firewalld

# Start firewalld now and enable it at boot.
- name: Start and enable firewalld
  service:
    name: firewalld
    state: started
    enabled: true
  tags: firewalld

- name: Firewall port addition for slurm exporter
firewalld:
zone: public
Expand All @@ -29,6 +42,13 @@
changed_when: true
tags: firewalld

# Leave firewalld stopped and disabled at boot.
# NOTE(review): this runs after the firewalld tasks earlier in the file, so the
# service does not stay running once they finish — confirm this is intentional.
- name: Stop and disable firewalld
  service:
    name: firewalld
    state: stopped
    enabled: false
  tags: firewalld

- name: Create systemd unit file
copy:
src: "{{ role_path }}/files/prometheus-slurm-exporter.service"
Expand Down
6 changes: 3 additions & 3 deletions roles/slurm_manager/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,13 @@
lineinfile:
path: "{{ slurmdbd_path }}"
regexp: "DbdAddr="
line: "DbdAddr={{ DbdAddr }}"
line: "DbdAddr={{ dbd_addr }}"

- name: Add db host
lineinfile:
path: "{{ slurmdbd_path }}"
regexp: "DbdHost="
line: "DbdHost={{ DbdHost }}"
line: "DbdHost={{ dbd_host }}"

- name: Add storage password
lineinfile:
Expand Down Expand Up @@ -192,4 +192,4 @@
fetch:
src: "{{ slurm_confpth }}"
dest: "{{ buffer_path }}"
flat: true
flat: true
4 changes: 2 additions & 2 deletions roles/slurm_manager/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ slurmdbd_path: "/etc/slurm/slurmdbd.conf"
slurmdbd_mode: "0600"
slurm_confpth: "/etc/slurm/slurm.conf"
slurm_user: "slurm"
DbdAddr: "localhost"
DbdHost: "localhost"
dbd_addr: "localhost"
dbd_host: "localhost"
logfile: "/var/log/slurm/slurmdbd.log"
pidfile: "/var/run/slurmdbd.pid"
buffer_path: "/tmp/slurm.conf"
Expand Down
66 changes: 58 additions & 8 deletions roles/slurm_workers/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,29 @@
state: present
tags: firewalld

# Start firewalld now and enable it at boot so the rule below can be applied.
- name: Start and enable firewalld
  service:
    name: firewalld
    state: started
    enabled: true
  tags: firewalld

# Permanently enable the ports held in tcp_port2 and udp_port2 in the public
# zone. `permanent: true` writes the rule to the stored configuration; the
# reload task that follows activates it.
- name: Firewall rule for slurm - tcp/udp ports
  firewalld:
    zone: public
    port: "{{ item }}"
    permanent: true
    state: enabled
  with_items:
    - "{{ tcp_port2 }}"
    - "{{ udp_port2 }}"
  tags: firewalld

# Reload firewalld; changed_when forces the task to always report "changed".
- name: Reload firewalld
  command: firewall-cmd --reload
  changed_when: true
  tags: firewalld

- name: Stop and disable firewalld
service:
name: firewalld
Expand Down Expand Up @@ -90,16 +113,43 @@
mode: "{{ slurm_mode }}"
with_items:
- "{{ groups['compute'] }}"
when: '"compute" in group_names'

# Add one "NodeName=... Sockets=... CoresPerSocket=..." line to slurm.conf for
# every host in the login_node group, using that host's node_name, sockets and
# cores hostvars. Runs only on hosts that are themselves in the login_node
# group, and only when login_node_required (read from the 127.0.0.1 host's
# vars) is truthy.
- name: Add login node core & socket info in slurm config file
  lineinfile:
    dest: "{{ slurm_confpth }}"
    line: "NodeName={{ hostvars[item].node_name }} Sockets={{ hostvars[item].sockets }} CoresPerSocket={{ hostvars[item].cores }}"
    state: present
    create: true
    mode: "{{ slurm_mode }}"
  with_items:
    - "{{ groups['login_node'] }}"
  when:
    - hostvars["127.0.0.1"]["login_node_required"]
    - '"login_node" in group_names'

# On compute hosts, replace the literal "Nodes=ALL" in the partition line with
# this host's name (machine_name.stdout, registered outside this excerpt).
# The result is registered as `output` for the follow-up task.
- name: Update hostnames of compute node when ALL in partition nodes
  replace:
    path: "{{ slurm_confpth }}"
    regexp: 'PartitionName=normal Nodes=ALL'
    replace: 'PartitionName=normal Nodes={{ machine_name.stdout }}'
  when:
    - hostvars["127.0.0.1"]["login_node_required"]
    - '"compute" in group_names'
  register: output

# Otherwise append ",<hostname>" in front of the partition options. The last
# condition restricts this to runs where the previous replace task returned an
# empty msg.
# NOTE(review): an empty msg presumably means "Nodes=ALL was not found, nothing
# replaced" — confirm against the replace module's return values.
- name: Update hostnames of compute node in partition nodes
  replace:
    path: "{{ slurm_confpth }}"
    regexp: ' Default=YES MaxTime=INFINITE State=UP'
    replace: ',{{ machine_name.stdout }} Default=YES MaxTime=INFINITE State=UP'
  when:
    - hostvars["127.0.0.1"]["login_node_required"]
    - '"compute" in group_names'
    - output.msg | length == 0

- name: Save slurm conf in buffer
fetch:
src: "{{ slurm_confpth }}"
dest: "{{ buffer_path }}"
flat: true

- name: Start slurmd on compute nodes
systemd:
name: slurmd.service
state: started
enabled: yes
tags: install
flat: true
36 changes: 36 additions & 0 deletions roles/slurm_workers_service/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

# Load the slurm_manager role's variables; the tasks below use buffer_path,
# slurm_confpth and slurm_mode.
# NOTE(review): presumably all three come from this vars file — only
# buffer_path and slurm_confpth are visible in the excerpt reviewed.
- name: Include common variables
  include_vars: ../../slurm_manager/vars/main.yml

# Push the buffered slurm.conf to this host.
- name: Copy slurm conf from buffer
  copy:
    src: "{{ buffer_path }}"
    dest: "{{ slurm_confpth }}"
    mode: "{{ slurm_mode }}"

# Pull the host's slurm.conf back into the buffer path.
# NOTE(review): this directly follows a copy from the same buffer path, so it
# looks like a no-op round trip — confirm before removing.
- name: Save slurm conf in buffer
  fetch:
    src: "{{ slurm_confpth }}"
    dest: "{{ buffer_path }}"
    flat: true

# Start slurmd now and enable it at boot.
- name: Start slurmd on compute nodes
  systemd:
    name: slurmd.service
    state: started
    enabled: true
  tags: install