From 813756495d5ef33fa3cc95d69b6d88418ebe7bb1 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Thu, 5 Jul 2018 16:22:58 +0200 Subject: Squashed 'roles/cuda/' content from commit f82a4fe git-subtree-dir: roles/cuda git-subtree-split: f82a4fedb62a410b1f05454ee5ba5f2e5ff0a16c --- .travis.yml | 47 ++++++++++++++ LICENSE | 22 +++++++ README.md | 64 +++++++++++++++++++ defaults/main.yml | 23 +++++++ files/nvidia_packaging_key.asc | 29 +++++++++ handlers/main.yml | 43 +++++++++++++ meta/main.yml | 140 +++++++++++++++++++++++++++++++++++++++++ tasks/configure_apt.yml | 15 +++++ tasks/configure_yum.yml | 28 +++++++++ tasks/cuda_init.yml | 51 +++++++++++++++ tasks/main.yml | 46 ++++++++++++++ templates/cuda.sh.j2 | 7 +++ templates/cuda_init.service.j2 | 13 ++++ templates/cuda_init.sh.j2 | 9 +++ tests/install.yml | 23 +++++++ tests/inventory | 5 ++ tests/test.yml | 12 ++++ vars/centos-6.yml | 4 ++ vars/centos-7.yml | 4 ++ vars/main.yml | 5 ++ vars/redhat-6.yml | 4 ++ vars/redhat-7.yml | 4 ++ vars/ubuntu-14.04.yml | 4 ++ vars/ubuntu-14.10.yml | 4 ++ vars/ubuntu-15.yml | 4 ++ vars/ubuntu-16.04.yml | 4 ++ vars/ubuntu-17.04.yml | 4 ++ 27 files changed, 618 insertions(+) create mode 100644 .travis.yml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 defaults/main.yml create mode 100644 files/nvidia_packaging_key.asc create mode 100644 handlers/main.yml create mode 100644 meta/main.yml create mode 100644 tasks/configure_apt.yml create mode 100644 tasks/configure_yum.yml create mode 100644 tasks/cuda_init.yml create mode 100644 tasks/main.yml create mode 100755 templates/cuda.sh.j2 create mode 100644 templates/cuda_init.service.j2 create mode 100644 templates/cuda_init.sh.j2 create mode 100644 tests/install.yml create mode 100644 tests/inventory create mode 100644 tests/test.yml create mode 100644 vars/centos-6.yml create mode 100644 vars/centos-7.yml create mode 100644 vars/main.yml create mode 100644 vars/redhat-6.yml create mode 100644 
vars/redhat-7.yml
 create mode 100644 vars/ubuntu-14.04.yml
 create mode 100644 vars/ubuntu-14.10.yml
 create mode 100644 vars/ubuntu-15.yml
 create mode 100644 vars/ubuntu-16.04.yml
 create mode 100644 vars/ubuntu-17.04.yml

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..0cdcab4
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,47 @@
+---
+language: python
+sudo: required
+dist: trusty
+
+matrix:
+  fast_finish: true
+
+cache:
+  directories: [ '$HOME/lxc/' ]
+  pip: true
+
+before_cache:  # mkdir -p: $HOME/lxc may already exist (restored cache); a failing mkdir would skip the tar via '&&'
+  - sudo mkdir -p $HOME/lxc && sudo tar cf $HOME/lxc/cache.tar /var/cache/lxc/ && sudo chown $USER. $HOME/lxc/cache.tar
+
+env:
+  - LXC_DISTRO=ubuntu LXC_RELEASE=xenial
+  - LXC_DISTRO=ubuntu LXC_RELEASE=trusty
+  - LXC_DISTRO=centos LXC_RELEASE=7
+  - LXC_DISTRO=centos LXC_RELEASE=6
+
+install:
+  - sudo tar xf $HOME/lxc/cache.tar -C / || true
+  - sudo apt-get install -y expect-dev
+  - pip install ansible
+  - ansible --version
+  - printf '[defaults]\nroles_path=../\ncallback_whitelist=profile_tasks' >ansible.cfg
+  - ansible-galaxy install lae.travis-lxc
+  - ansible-playbook -vvv tests/install.yml -i tests/inventory
+
+script:
+  # Basic role syntax check
+  - ansible-playbook tests/test.yml -i tests/inventory --syntax-check
+
+  # Perform a test run with the playbook
+  - travis_wait ansible-playbook tests/test.yml -i tests/inventory
+
+  # Perform another test run with the playbook to check for idempotency (second run must report changed=0)
+  - 'unbuffer ansible-playbook tests/test.yml -i tests/inventory >/tmp/idempotency.log 2>&1'
+  - 'grep -A1 "PLAY RECAP" /tmp/idempotency.log | grep -qP "changed=0.*failed=0" &&
+    (echo "Idempotence: PASS"; exit 0) || (echo "Idempotence: FAIL"; cat /tmp/idempotency.log;
+    exit 1)'
+
+notifications:
+  email: false
+  flowdock:
+    secure: 
"lxqZTTUCUYy19JzwXnH/kRcndYwNasUYpa0AK0vcRqTRSoULRzwE2jcJUk2YBz0qTBcTgAkinj1VQbImdrl68NTPDTmOZM4+hPZ8RQNTGR7VJwy4Ynjl/RtvxmwvoW/kSZJI3twDvPpAl2yEKfiPwSE4kYNFs84w43WieNhX3qO0LN9EdFykV6M0xeZgGc71v6oGof3n9HhBMfMUYU6YZJKvirHJNwAxHsWiFSq+dfDA5hqDyilwuQM0toNgjsFi6F7b40vOVJPGJKdSNekrFgp2Gm/Fzd83sO/Sp1ord9v73UflshejKuK2/iMRddPW5JDl05FNrJ7x6xcCfK9sxOj5KDELotagHCDdCTqX4USelBpI7DeO9yV8NAIxyL2KAFi5b3uwvF5reitGhRfdeqA0B9eK+k6vdRQ/xKryYc48hVX46wraL2ibZv2gbic7vpYdxiWFUirKB9NBoQu7JHkTT/LT3LjbC9/uL9c4qRyakAnQwhgi4/sEk7f9euvtZA6MJfZpCzfiYVt3rGe6H9HqcCflnxW5F1ZjLBSkHk02rNn6hcfFxGHSS5x1362F9JCwVkAWgJ43JvQLRxobW4htbx56+niX0zS+vs2kK5K3NxUCJzInUb4UV5/9lwcCBZQJL0fD4u2Gy8/TC7MrhhOVxkSSPmjjoonaRPY497c=" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8b0c2e6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 CSC - IT Center for Science + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d28e484
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+[![Build Status](https://travis-ci.org/CSCfi/ansible-role-cuda.svg)](https://travis-ci.org/CSCfi/ansible-role-cuda)
+[![Galaxy Role](https://img.shields.io/badge/ansible--galaxy-cuda-blue.svg)](https://galaxy.ansible.com/CSCfi/cuda/)
+
+ansible-role-cuda
+=========
+
+Installs CUDA
+
+Tested with Tesla P100, K80 and Tesla M40 GPUs on CentOS 7 and Ubuntu 16.04, with CUDA 7.5 and 8.0.
+
+Optionally also installs cuda_init which initializes the GPUs during boot.
+
+Requirements
+------------
+
+Outbound access to http://developer.download.nvidia.com/compute/cuda/repos/
+
+Role Variables
+--------------
+
+    gpu: False
+    cuda_packages:
+      - cuda
+    cuda_restart_node_on_install: True
+    cuda_init: True
+    cuda_bash_profile: True
+
+- gpu: must be set to True on GPU nodes. Without it this role does nothing.
+- cuda_packages: list of packages that are installed after the Nvidia CUDA repository has been configured. Extend it to install more packages, or replace `cuda` with a specific CUDA package (e.g. `cuda-7-5`).
+- cuda_init: installs a bash script (`cuda_init.sh`) that initializes the GPUs and is executed via systemd.
+- cuda_gpu_name0: the device file Ansible looks for (default `"/dev/nvidia0"`). If it does not exist and both cuda_init and cuda_init_restart_service are True, the cuda_init.sh script is run.
+- cuda_restart_node_on_install: restarts the system when packages are installed or updated.
+- cuda_bash_profile: installs `/etc/profile.d/cuda.sh`, which adds the CUDA directories to PATH, LD_LIBRARY_PATH and CPATH.
+
+Example Playbook
+----------------
+
+`playbook.yml`:
+
+    - hosts: deep_learning
+      roles:
+        - CSC-IT-Center-for-Science.cuda
+
+`inventory`:
+
+    [deep_learning]
+    host1.example gpu=True
+
+Example Errors
+--------------
+
+This error means you are not using a supported OS (like Ubuntu 17.04, which does not have a CUDA URL):
+<pre>
+   "msg": "No file was found when using with_first_found. Use the 'skip: true' option to allow this task to be skipped if no files are found"
+</pre>
+ +License +------- + +MIT + +Author Information +------------------ + diff --git a/defaults/main.yml b/defaults/main.yml new file mode 100644 index 0000000..57efd6b --- /dev/null +++ b/defaults/main.yml @@ -0,0 +1,23 @@ +--- +# defaults file for ansible-role-cuda +# By default gpu is False, set it to True on the gpu nodes in the hosts file + +gpu: False +cuda_repo_url: "http://developer.download.nvidia.com/compute/cuda/repos/" +cuda_rpm_key_path: /etc/rpm/nvidia_packaging_key.asc +cuda_packages: + - cuda +cuda_restart_node_on_install: True +cuda_init: True +cuda_init_restart_service: True +cuda_init_compute_mode: 3 +cuda_init_persistence_mode: 1 +cuda_gpu_name0: "/dev/nvidia0" +cuda_bash_profile: True +cuda_bash_path: "/usr/local/cuda/bin" +cuda_bash_ld_lib_path: "/usr/local/cuda/lib64" +cuda_bash_cuda_root: "/usr/local/cuda" +cuda_bash_cuda_inc_dir: "/usr/local/cuda/bin" +cuda_bash_cpath: "/usr/local/cuda/include" + +# vim:ft=ansible: diff --git a/files/nvidia_packaging_key.asc b/files/nvidia_packaging_key.asc new file mode 100644 index 0000000..0c44d5a --- /dev/null +++ b/files/nvidia_packaging_key.asc @@ -0,0 +1,29 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1 + +mQINBFdtt4UBEAC8FDSWMR07GJZ265giLn7kLF+EsJCWESUq6Cd13QN0JQ/tLibi +QlW4ZjeOnEH9VPlqh/mKqNMG4SwRt8S+GHpePMQrr0aOkiRGfCclnAWIZURSAP+t +PLelCt43fkw1BBTopd/0oOzO8kHu8j8WU4A8GHxqghfFWPv54FQs2iaZ2eWR7a6d +79IJrbDKaVCCiQrkhCM8m648pNKHhuoJ9cQXFV+uvwkpfmKWGQ4ultxlOyjLHJLF +vuML2RuAO9IxbdZjzeYNN+T+wjFIBVcPnwEO+WrYgvGkT4r9aqVqTeg3EPb7QclV +sKBVJdxk4jZl0y22HAWqScVi6SJ15uK9pXxywDZkbpuRBWx4ThWiGe/FiUa2igi9 +/SIvqN2TBY0g18sRTrylVr1wE1UGa/y7nDx6PoGCP1frBt8YUYt3pkM8Xvb2CRxx +CyWwmuFEQHC6jCEWf7FnoBHBYQwTVGNrU0vkuIeDrm+ZAcv8wx+ie1hlFhqCCJnf +jqeQ0/zA9RPmCPOkLyTdSsNZtlxxk7bzCdTdFFKzBjGTR7Gz3SMSp23d11eIyRiF +HQsp2v0SvnPJ6OcgB95Hmo544vi3RuoVfovtDOdfSBCRxP+GhhxkKSrTleQjD0/r +CGkdG2Kox3m9YllAsvZchLXlS7bZV9mGRF61mVMjF3HJRUQfBBm89VPQ+QARAQAB +tCBjdWRhdG9vbHMgPGN1ZGF0b29sc0BudmlkaWEuY29tPokCNwQTAQgAIQUCV223 
+hQIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRD2D0s9f6KvgNArEAChnfcW
+rYItgt7xXXubT6E+KpJyJ0RPrXf51S2mhciFbjDl+3EXRMRjOutVmgWYPWUUZaKR
+8Iez3Lz4BRmwYOWBLtdnOLbKoSsQUX95rnPFjfly/DFLfjKxz4NRBmh4r4/rCYWm
+2hmnXmOAi8kV7fqx3g5XMpJ//N6+T8ctEol2iZ82GrXjadcRWE4rAe7UyuEzJ74y
+6ZKIzk5ijdgEKtcaBhzEWvoV5Pr9nkn7ByGsdehKR/gNnjPMYXrklSHGfphJIsS2
+S32lMk/kuRjihBcWcYBXIPEQ7CV+PNW2TlkZj/YqTg637sZHwkhcjcNzxeqKvRYG
+8V7Ju5hTDxL1UQBmgDS3cRx1lw7tYRG5bS67tbC2dc/CpPkG5agiZ/WyoHQDnn4r
+1fRuOFx694QR6+0rAP6171xEEoNAPaH7gdJdhWKiYiJD0T2EEbW7wBUi/EupeKRv
+kR12R1jUa1mlpxNtWQxJ7qp98T9+DmkxI1XDmWx0/g4ryuicwLDSqoPgNcRNdSQb
+b8YfTqrkqaDdYzwLr/n0YKW3cYIvIeisV0WxRjb6OP7oAlAtaAhImlIc//51qNO7
+/WAud6qMtnhFoZayR/BzLKqnCioN5GYr9BAKskpPHe9cDKVS3fg+Qvc1sNJID+jf
+k52PqyW24Qsr0A9+5zQyE4tH9dfv120gj9avmg==
+=0nKc
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/handlers/main.yml b/handlers/main.yml
new file mode 100644
index 0000000..adba3b7
--- /dev/null
+++ b/handlers/main.yml
@@ -0,0 +1,43 @@
+---
+# Handlers for ansible-role-cuda: systemd reload, GPU initialisation and the optional reboot after a CUDA package change
+- name: reload systemd unit files
+  command: systemctl daemon-reload
+
+- name: Initialize the GPUs
+  command: /bin/bash /usr/local/bin/cuda_init.sh
+  when:
+    - cuda_init
+    - cuda_init_restart_service
+
+- name: Restart cuda_init service
+  service:
+    name: cuda_init
+    state: restarted
+  when:
+    - cuda_init
+    - cuda_init_restart_service
+    - ansible_service_mgr == "systemd"
+
+- name: ZZ CUDA Restart server
+  shell: sleep 2 && /sbin/shutdown -r now "Node software upgrade reboot"  # shell, not command: '&&' is only interpreted by a shell
+  async: 1
+  poll: 0
+  ignore_errors: true
+  when:
+    - cuda_packages_installation.changed
+    - cuda_restart_node_on_install
+
+# The wait below is skipped when running_as_ansible_pull is true; define that variable in the ansible-pull playbook (e.g. local.yml)
+- name: ZZ CUDA Wait for server to restart
+  wait_for:
+    host: "{{ ansible_ssh_host | default(inventory_hostname) }}"
+    state: started
+    delay: 30
+    timeout: 300
+  connection: local
+  become: false
+  when:
+    - cuda_restart_node_on_install
+    - (running_as_ansible_pull is not defined or 
running_as_ansible_pull == False) + +# vim:ft=ansible: diff --git a/meta/main.yml b/meta/main.yml new file mode 100644 index 0000000..f103de5 --- /dev/null +++ b/meta/main.yml @@ -0,0 +1,140 @@ +--- +galaxy_info: + author: Johan Guldmyr + description: Installs CUDA + company: CSC - IT Center for Science + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + # Some suggested licenses: + # - BSD (default) + # - MIT + # - GPLv2 + # - GPLv3 + # - Apache + # - CC-BY + license: MIT + min_ansible_version: 2.4 + # + # Below are all platforms currently available. Just uncomment + # the ones that apply to your role. If you don't see your + # platform on this list, let us know and we'll get it added! + # + platforms: + - name: EL + versions: + # - all + # - 5 + - 6 + - 7 + #- name: GenericUNIX + # versions: + # - all + # - any + #- name: Fedora + # versions: + # - all + # - 16 + # - 17 + # - 18 + # - 19 + # - 20 + # - 21 + # - 22 + #- name: Windows + # versions: + # - all + # - 2012R2 + #- name: SmartOS + # versions: + # - all + # - any + #- name: opensuse + # versions: + # - all + # - 12.1 + # - 12.2 + # - 12.3 + # - 13.1 + # - 13.2 + #- name: Amazon + # versions: + # - all + # - 2013.03 + # - 2013.09 + #- name: GenericBSD + # versions: + # - all + # - any + #- name: FreeBSD + # versions: + # - all + # - 8.0 + # - 8.1 + # - 8.2 + # - 8.3 + # - 8.4 + # - 9.0 + # - 9.1 + # - 9.1 + # - 9.2 + - name: Ubuntu + versions: + # - all + # - lucid + # - maverick + # - natty + # - oneiric + # - precise + # - quantal + # - raring + # - saucy + - trusty + # - utopic + # - vivid + - xenial + #- name: SLES + # versions: + # - all + # - 10SP3 + # - 10SP4 + # - 11 + # - 11SP1 + # - 11SP2 + # - 11SP3 + #- name: GenericLinux + # versions: + # - all + # - any + #- name: Debian + # versions: + # - all + # - etch + # - jessie + # - lenny + # - squeeze + # - wheezy + # + # Below are all 
categories currently available. Just as with
+  # the platforms above, uncomment those that apply to your role.
+  #
+  categories:
+  #- cloud
+  #- cloud:ec2
+  #- cloud:gce
+  #- cloud:rax
+  #- clustering
+  #- database
+  #- database:nosql
+  #- database:sql
+  #- development
+  #- monitoring
+  #- networking
+  #- packaging
+  - system
+  #- web
+dependencies: []
+  # List your role dependencies here, one per line.
+  # Be sure to remove the '[]' above if you add dependencies
+  # to this list.
+
diff --git a/tasks/configure_apt.yml b/tasks/configure_apt.yml
new file mode 100644
index 0000000..53a38a5
--- /dev/null
+++ b/tasks/configure_apt.yml
@@ -0,0 +1,15 @@
+---
+# apt hosts: register Nvidia's packaging key, then the CUDA package repository
+- name: Trust packaging key for Nvidia repositories (apt)
+  apt_key:
+    state: present
+    id: "{{ cuda_packaging_key_id }}"
+    data: "{{ lookup('file', 'files/nvidia_packaging_key.asc') }}"
+
+- name: Configure Nvidia repository (apt)
+  apt_repository:
+    state: present
+    filename: nvidia
+    repo: "deb {{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64 /"
+
+# vim:ft=ansible:
diff --git a/tasks/configure_yum.yml b/tasks/configure_yum.yml
new file mode 100644
index 0000000..e888468
--- /dev/null
+++ b/tasks/configure_yum.yml
@@ -0,0 +1,28 @@
+---
+# yum hosts: install and trust Nvidia's packaging key, drop the old key, then register the CUDA repository
+- name: Upload packaging key for Nvidia repositories
+  copy:
+    src: nvidia_packaging_key.asc
+    dest: "{{ cuda_rpm_key_path }}"
+    mode: "0644"
+
+- name: Trust packaging key for Nvidia repositories (rpm)
+  rpm_key:
+    state: present
+    key: "{{ cuda_rpm_key_path }}"
+
+- name: Remove trust for old Nvidia packaging key
+  rpm_key:
+    state: absent
+    key: "5C37D3BE"
+
+- name: Configure Nvidia repository (yum)
+  yum_repository:
+    name: nvidia
+    description: Official Nvidia repository
+    enabled: true
+    gpgcheck: true
+    gpgkey: "file://{{ cuda_rpm_key_path }}"
+    baseurl: "{{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64/"
+
+# vim:ft=ansible:
diff --git a/tasks/cuda_init.yml b/tasks/cuda_init.yml
new file mode 100644
index 0000000..9a10284
--- /dev/null
+++ b/tasks/cuda_init.yml
@@ -0,0 +1,51 @@
+---
+# Install cuda_init.sh and its systemd unit, then run the script once if the GPU device file is missing
+- name: template in cuda_init.sh used during boot
+  template:
+    src: cuda_init.sh.j2
+    dest: /usr/local/bin/cuda_init.sh
+    mode: "0755"
+  notify: Initialize the GPUs
+
+- name: lineinfile/make sure cuda_init.sh script is absent from rc.local
+  lineinfile:
+    state: absent
+    dest: /etc/rc.local
+    regexp: "^/bin/bash /usr/local/bin/cuda_init.sh$"
+    line: "/bin/bash /usr/local/bin/cuda_init.sh"
+    insertafter: "^touch /var/lock/subsys/local"
+
+- name: template in cuda_init.service systemd script
+  template:
+    src: cuda_init.service.j2
+    dest: /etc/systemd/system/cuda_init.service
+    mode: "0644"
+  when: ansible_service_mgr == 'systemd'
+  notify:
+    - reload systemd unit files
+    - Restart cuda_init service
+
+- name: enable the cuda_init systemd service
+  service:
+    name: cuda_init
+    enabled: true
+  when: ansible_service_mgr == 'systemd'
+
+- name: check if cuda_gpu_name0 ( /dev/nvidia0 ) exists
+  stat:
+    path: "{{ cuda_gpu_name0 }}"
+  check_mode: false
+  register: reg_cuda_gpu_name0
+
+- debug:
+    var: reg_cuda_gpu_name0
+    verbosity: 1
+
+- name: Initialize the GPUs - run cuda_init.sh if there is no /dev/nvidia0
+  command: /bin/bash /usr/local/bin/cuda_init.sh
+  when:
+    - reg_cuda_gpu_name0.stat.exists is defined
+    - not reg_cuda_gpu_name0.stat.exists
+    - cuda_init_restart_service
+
+# vim:ft=ansible:
diff --git a/tasks/main.yml b/tasks/main.yml
new file mode 100644
index 0000000..6d846f8
--- /dev/null
+++ b/tasks/main.yml
@@ -0,0 +1,46 @@
+---
+# Role entry point: load the most specific per-OS vars file, then run the CUDA setup block
+- name: Gather OS specific variables
+  include_vars: "{{ item }}"
+  with_first_found:
+    - "{{ ansible_distribution | lower }}-{{ ansible_distribution_version }}.yml"
+    - "{{ ansible_distribution | lower }}-{{ ansible_distribution_major_version }}.yml"
+    - "{{ ansible_distribution | lower }}.yml"
+    - "{{ ansible_os_family | lower }}.yml"
+
+- block:
+    - include_tasks: configure_yum.yml
+      when: ansible_pkg_mgr == 
'yum'
+
+    - include_tasks: configure_apt.yml
+      when: ansible_pkg_mgr == 'apt'
+
+    - name: Install CUDA and related packages (1.5-2GB download, also restarts if cuda_restart_node_on_install is set to True)
+      package:
+        name: "{{ item }}"
+        state: present
+      with_items: "{{ cuda_packages }}"
+      register: cuda_packages_installation
+      notify:
+        - ZZ CUDA Restart server
+        - ZZ CUDA Wait for server to restart
+
+    - name: Template CUDA paths to user environments
+      template:
+        src: cuda.sh.j2
+        dest: /etc/profile.d/cuda.sh
+        mode: "0755"
+      when: cuda_bash_profile
+
+    - include_tasks: cuda_init.yml
+      when: cuda_init == True
+
+    # Flush here: if slurmd is started later in the same playbook before
+    # cuda_init.sh has run, slurmd does not start and the play fails.
+    # TODO: reload the nvidia modules etc. instead of restarting the node
+    - name: flush the handlers - so that the node is rebooted after CUDA is installed and that the GPUs are initialized before we start slurm
+      meta: flush_handlers
+
+  when: gpu == True
+
+# vim:ft=ansible:
diff --git a/templates/cuda.sh.j2 b/templates/cuda.sh.j2
new file mode 100755
index 0000000..78393a1
--- /dev/null
+++ b/templates/cuda.sh.j2
@@ -0,0 +1,7 @@
+#!/bin/bash
+# {{ ansible_managed }}
+export PATH={{ cuda_bash_path }}:$PATH
+export LD_LIBRARY_PATH={{ cuda_bash_ld_lib_path }}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}  # ${VAR:+:$VAR}: no trailing ':' (an empty entry means the current directory) when VAR is unset
+export CUDA_ROOT={{ cuda_bash_cuda_root }}
+export CUDA_INC_DIR={{ cuda_bash_cuda_inc_dir }}${CUDA_INC_DIR:+:$CUDA_INC_DIR}
+export CPATH={{ cuda_bash_cpath }}${CPATH:+:$CPATH}
diff --git a/templates/cuda_init.service.j2 b/templates/cuda_init.service.j2
new file mode 100644
index 0000000..c61cbad
--- /dev/null
+++ b/templates/cuda_init.service.j2
@@ -0,0 +1,13 @@
+[Unit]
+Description=Initialize nvidia/cuda devices
+Before=slurm.service
+DefaultDependencies=no
+Conflicts=shutdown.target
+
+[Service]
+ExecStart=/usr/local/bin/cuda_init.sh
+Type=oneshot
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
diff --git a/templates/cuda_init.sh.j2 
b/templates/cuda_init.sh.j2
new file mode 100644
index 0000000..3c85ea2
--- /dev/null
+++ b/templates/cuda_init.sh.j2
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Initialize the GPUs during boot (this is what creates /dev/nvidia*); log a message if nvidia-smi is missing
+
+if [ -f "/usr/bin/nvidia-smi" ]; then
+  /usr/bin/nvidia-smi --compute-mode={{ cuda_init_compute_mode }}
+  /usr/bin/nvidia-smi --persistence-mode={{ cuda_init_persistence_mode }}
+else
+  logger -s -t nvidia-smi "Script $0 could not find /usr/bin/nvidia-smi"
+fi
diff --git a/tests/install.yml b/tests/install.yml
new file mode 100644
index 0000000..03b9360
--- /dev/null
+++ b/tests/install.yml
@@ -0,0 +1,23 @@
+---
+- hosts: localhost
+  connection: local
+  vars:
+    host_quantity: 1
+  roles:
+    - name: lae.travis-lxc
+
+# The play below runs inside the containers listed in the inventory
+- hosts: all
+  tasks:
+    # Workaround for the avahi-daemon issue, see https://github.com/lxc/lxc/issues/25
+    - block:
+        - name: Install avahi-daemon early on Ubuntu 16 containers
+          package:
+            name: avahi-daemon
+          ignore_errors: true
+        - name: Remove nproc from avahi-daemon.conf
+          lineinfile:
+            dest: /etc/avahi/avahi-daemon.conf
+            regexp: "^rlimit-nproc="
+            state: absent
+      when: ansible_distribution_release == 'xenial'
diff --git a/tests/inventory b/tests/inventory
new file mode 100644
index 0000000..e45e40f
--- /dev/null
+++ b/tests/inventory
@@ -0,0 +1,5 @@
+[cuda]
+test01.lxc
+
+[cuda:vars]
+ansible_ssh_user=root
diff --git a/tests/test.yml b/tests/test.yml
new file mode 100644
index 0000000..5125bda
--- /dev/null
+++ b/tests/test.yml
@@ -0,0 +1,12 @@
+---
+- hosts: all
+  become: true
+  vars:
+    gpu: true
+    cuda_restart_node_on_install: false
+    cuda_init: true
+    cuda_init_restart_service: false
+  roles:
+    - ansible-role-cuda
+
+# vim:ft=ansible:
diff --git a/vars/centos-6.yml b/vars/centos-6.yml
new file mode 100644
index 0000000..c4322ae
--- /dev/null
+++ b/vars/centos-6.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: rhel6
+
+# vim:ft=ansible:
\ No newline at end of file
diff 
--git a/vars/centos-7.yml b/vars/centos-7.yml new file mode 100644 index 0000000..b331a96 --- /dev/null +++ b/vars/centos-7.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: rhel7 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/main.yml b/vars/main.yml new file mode 100644 index 0000000..ded4141 --- /dev/null +++ b/vars/main.yml @@ -0,0 +1,5 @@ +--- +# vars file for ansible-role-cuda +cuda_packaging_key_id: 7FA2AF80 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/redhat-6.yml b/vars/redhat-6.yml new file mode 100644 index 0000000..c4322ae --- /dev/null +++ b/vars/redhat-6.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: rhel6 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/redhat-7.yml b/vars/redhat-7.yml new file mode 100644 index 0000000..b331a96 --- /dev/null +++ b/vars/redhat-7.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: rhel7 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/ubuntu-14.04.yml b/vars/ubuntu-14.04.yml new file mode 100644 index 0000000..61d04fd --- /dev/null +++ b/vars/ubuntu-14.04.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: ubuntu1404 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/ubuntu-14.10.yml b/vars/ubuntu-14.10.yml new file mode 100644 index 0000000..8c8e53e --- /dev/null +++ b/vars/ubuntu-14.10.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: ubuntu1410 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/ubuntu-15.yml b/vars/ubuntu-15.yml new file mode 100644 index 0000000..d1f56ad --- /dev/null +++ b/vars/ubuntu-15.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: ubuntu1504 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/ubuntu-16.04.yml b/vars/ubuntu-16.04.yml new file mode 100644 index 0000000..f948a1a --- /dev/null +++ b/vars/ubuntu-16.04.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: ubuntu1604 + +# vim:ft=ansible: \ No newline at end of file diff --git a/vars/ubuntu-17.04.yml b/vars/ubuntu-17.04.yml new file 
mode 100644 index 0000000..356467f --- /dev/null +++ b/vars/ubuntu-17.04.yml @@ -0,0 +1,4 @@ +--- +cuda_repo_subfolder: ubuntu1704 + +# vim:ft=ansible: -- cgit v1.2.1