diff --git a/omnia.yml b/omnia.yml index bb695ebda..06a669d90 100644 --- a/omnia.yml +++ b/omnia.yml @@ -112,7 +112,7 @@ tags: kubernetes - name: Apply common Slurm installation and config - hosts: manager, compute + hosts: manager, compute, login_node gather_facts: false roles: - slurm_common @@ -125,13 +125,21 @@ - slurm_manager tags: slurm -- name: Start Slurm workers - hosts: compute +- name: Configure Slurm workers + hosts: compute, login_node + serial: 1 gather_facts: false roles: - slurm_workers tags: slurm +- name: Start Slurm workers + hosts: compute, login_node + gather_facts: false + roles: + - slurm_workers_service + tags: slurm + - name: Start Slurm services hosts: manager gather_facts: false diff --git a/roles/slurm_common/tasks/main.yml b/roles/slurm_common/tasks/main.yml index a065c963c..71d7abdb6 100644 --- a/roles/slurm_common/tasks/main.yml +++ b/roles/slurm_common/tasks/main.yml @@ -192,4 +192,4 @@ state: restarted enabled: yes tags: install - ignore_errors: yes + failed_when: false \ No newline at end of file diff --git a/roles/slurm_exporter/tasks/install_slurm_exporter.yml b/roles/slurm_exporter/tasks/install_slurm_exporter.yml index 2c0bf7177..85420acda 100644 --- a/roles/slurm_exporter/tasks/install_slurm_exporter.yml +++ b/roles/slurm_exporter/tasks/install_slurm_exporter.yml @@ -55,7 +55,7 @@ args: chdir: "{{ slurm_exporter_inst_dir }}" changed_when: False - ignore_errors: yes + failed_when: false - name: Copy executable to /usr/bin copy: diff --git a/roles/slurm_exporter/tasks/start_services.yml b/roles/slurm_exporter/tasks/start_services.yml index b2d7a6a6b..6c94a3a8d 100644 --- a/roles/slurm_exporter/tasks/start_services.yml +++ b/roles/slurm_exporter/tasks/start_services.yml @@ -13,6 +13,19 @@ # limitations under the License. --- +- name: Install firewalld + package: + name: firewalld + state: present + tags: firewalld + +- name: Start and enable firewalld + service: + name: firewalld + state: started + enabled: yes + tags: firewalld + - name: Firewall port addition for slurm exporter firewalld: zone: public @@ -29,6 +42,13 @@ changed_when: true tags: firewalld +- name: Stop and disable firewalld + service: + name: firewalld + state: stopped + enabled: no + tags: firewalld + - name: Create systemd unit file copy: src: "{{ role_path }}/files/prometheus-slurm-exporter.service" diff --git a/roles/slurm_manager/tasks/main.yml b/roles/slurm_manager/tasks/main.yml index 20dfdd0c6..598e3d057 100644 --- a/roles/slurm_manager/tasks/main.yml +++ b/roles/slurm_manager/tasks/main.yml @@ -156,13 +156,13 @@ lineinfile: path: "{{ slurmdbd_path }}" regexp: "DbdAddr=" - line: "DbdAddr={{ DbdAddr }}" + line: "DbdAddr={{ dbd_addr }}" - name: Add db host lineinfile: path: "{{ slurmdbd_path }}" regexp: "DbdHost=" - line: "DbdHost={{ DbdHost }}" + line: "DbdHost={{ dbd_host }}" - name: Add storage password lineinfile: @@ -192,4 +192,4 @@ fetch: src: "{{ slurm_confpth }}" dest: "{{ buffer_path }}" - flat: true + flat: true \ No newline at end of file diff --git a/roles/slurm_manager/vars/main.yml b/roles/slurm_manager/vars/main.yml index 0387e298b..fcb6462df 100644 --- a/roles/slurm_manager/vars/main.yml +++ b/roles/slurm_manager/vars/main.yml @@ -64,8 +64,8 @@ slurmdbd_path: "/etc/slurm/slurmdbd.conf" slurmdbd_mode: "0600" slurm_confpth: "/etc/slurm/slurm.conf" slurm_user: "slurm" -DbdAddr: "localhost" -DbdHost: "localhost" +dbd_addr: "localhost" +dbd_host: "localhost" logfile: "/var/log/slurm/slurmdbd.log" pidfile: "/var/run/slurmdbd.pid" buffer_path: "/tmp/slurm.conf" diff --git a/roles/slurm_workers/tasks/main.yml b/roles/slurm_workers/tasks/main.yml index c93a04f48..ac01dae44 100644 --- a/roles/slurm_workers/tasks/main.yml +++ b/roles/slurm_workers/tasks/main.yml @@ -40,6 +40,29 @@ state: present tags: firewalld +- name: Start and enable firewalld + service: + name: firewalld + state: started + enabled: yes + tags: firewalld + +- name: Firewall rule for slurm - tcp/udp ports + firewalld: + zone: public + port: "{{ item }}" + permanent: true + state: enabled + with_items: + - "{{ tcp_port2 }}" + - "{{ udp_port2 }}" + tags: firewalld + +- name: Reload firewalld + command: firewall-cmd --reload + changed_when: true + tags: firewalld + - name: Stop and disable firewalld service: name: firewalld @@ -90,16 +113,43 @@ mode: "{{ slurm_mode }}" with_items: - "{{ groups['compute'] }}" + when: '"compute" in group_names' + +- name: Add login node core & socket info in slurm config file + lineinfile: + dest: "{{ slurm_confpth }}" + line: "NodeName={{ hostvars[item].node_name }} Sockets={{ hostvars[item].sockets }} CoresPerSocket={{ hostvars[item].cores }}" + state: present + create: yes + mode: "{{ slurm_mode }}" + with_items: + - "{{ groups['login_node'] }}" + when: + - hostvars["127.0.0.1"]["login_node_required"] + - '"login_node" in group_names' + +- name: Update hostnames of compute node when ALL in partition nodes + replace: + path: "{{ slurm_confpth }}" + regexp: 'PartitionName=normal Nodes=ALL' + replace: 'PartitionName=normal Nodes={{ machine_name.stdout }}' + when: + - hostvars["127.0.0.1"]["login_node_required"] + - '"compute" in group_names' + register: output + +- name: Update hostnames of compute node in partition nodes + replace: + path: "{{ slurm_confpth }}" + regexp: ' Default=YES MaxTime=INFINITE State=UP' + replace: ',{{ machine_name.stdout }} Default=YES MaxTime=INFINITE State=UP' + when: + - hostvars["127.0.0.1"]["login_node_required"] + - '"compute" in group_names' + - output.msg | length == 0 - name: Save slurm conf in buffer fetch: src: "{{ slurm_confpth }}" dest: "{{ buffer_path }}" - flat: true - -- name: Start slurmd on compute nodes - systemd: - name: slurmd.service - state: started - enabled: yes - tags: install + flat: true \ No newline at end of file diff --git a/roles/slurm_workers_service/tasks/main.yml b/roles/slurm_workers_service/tasks/main.yml new file mode 100644 index 000000000..43c347bfe --- /dev/null +++ b/roles/slurm_workers_service/tasks/main.yml @@ -0,0 +1,36 @@ +# Copyright 2021 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Include common variables + include_vars: ../../slurm_manager/vars/main.yml + +- name: Copy slurm conf from buffer + copy: + src: "{{ buffer_path }}" + dest: "{{ slurm_confpth }}" + mode: "{{ slurm_mode }}" + +- name: Save slurm conf in buffer + fetch: + src: "{{ slurm_confpth }}" + dest: "{{ buffer_path }}" + flat: true + +- name: Start slurmd on compute nodes + systemd: + name: slurmd.service + state: started + enabled: yes + tags: install \ No newline at end of file