From f3c41dd13a0a86382b80d564e9de0d6b06fb1dbf Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 11 Mar 2018 19:56:38 +0100 Subject: Various fixes before moving to hardware installation --- .gitmodules | 3 + anslib/ansible-role-ntp | 1 + anslib/link_vars.sh | 1 + anslib/openshift-ansible | 2 +- .../etcd-ds-rh1538446-openshift-undefined.patch | 11 ++ ...glusterfs-ds-use_cluster_local_for_heketi.patch | 24 +++ ...networkmanager-ds-use-openshift-interface.patch | 47 +++++ anslib/patches/openshift-ds-update371.patch | 45 +++++ anslib/patches/registry-ds-glusterfs-fixes.patch | 61 ++++++ .../registry-ds-glusterfs-storageclass.patch | 64 +++++++ docs/ands_ansible.txt | 2 +- docs/backup.txt | 26 +++ docs/consistency.txt | 36 ++++ docs/managment.txt | 166 ++++++++++++++++ docs/network.txt | 58 ++++++ docs/pods.txt | 13 ++ docs/regions.txt | 16 ++ docs/samples/templates/00-katrin-restricted.yml.j2 | 44 +++++ docs/samples/vars/run_oc.yml | 6 + docs/samples/vars/variants.yml | 33 ++++ docs/troubleshooting.txt | 210 +++++++++++++++++++++ docs/upgrade.txt | 64 +++++++ group_vars/OSEv3.yml | 65 +++++-- group_vars/ands.yml | 7 - group_vars/staging.yml | 6 +- group_vars/testing.yml | 15 +- group_vars/virtual.yml | 6 +- inventories/staging.erb | 48 +++-- inventories/testing.erb | 60 ++++-- opts.sh | 10 +- playbooks/ands-gluster-ganesha.yml | 1 + playbooks/ands-gluster.yml | 6 +- playbooks/ands-prepare.yml | 13 +- playbooks/openshift-add-masters.yml | 2 +- playbooks/openshift-add-nodes.yml | 2 +- playbooks/openshift-deploy-cluster.yml | 1 - playbooks/openshift-install-service-catalog.yml | 13 ++ playbooks/openshift-redeploy-certificates.yml | 14 +- playbooks/openshift-setup-project.yml | 2 +- playbooks/openshift-setup-projects.yml | 2 +- playbooks/openshift-setup-security.yml | 2 +- playbooks/openshift-setup-users.yml | 2 +- playbooks/openshift-upgrade.yml | 11 ++ roles/ands_backup/defaults/main.yml | 9 + roles/ands_backup/tasks/main.yml | 29 +++ roles/ands_backup/templates/backup.cron.j2 | 4 + roles/ands_backup/templates/backup.sh.j2 | 72 +++++++ roles/ands_common/README | 11 ++ roles/ands_common/default/main.yml | 1 + roles/ands_common/tasks/main.yml | 47 +++++ roles/ands_facts/defaults/main.yml | 8 +- roles/ands_facts/tasks/main.yml | 8 +- roles/ands_facts/tasks/network.yml | 16 +- roles/ands_facts/tasks/storage.yml | 8 +- roles/ands_kaas/templates/50-kaas-pods.yml.j2 | 16 +- roles/ands_network/tasks/common.yml | 21 +-- roles/ands_network/tasks/ganesha.yml | 12 ++ roles/ands_network/tasks/install_post.yml | 2 +- roles/ands_network/tasks/maintain.yml | 6 +- roles/common/README | 11 -- roles/common/default/main.yml | 1 - roles/common/tasks/main.yml | 46 ----- roles/docker/defaults/main.yml | 3 + roles/docker/tasks/main.yml | 16 +- roles/glusterfs/tasks/cfg/vols3.yml | 1 + roles/glusterfs/tasks/data/vols2.yml | 1 + roles/glusterfs/tasks/data/vols3.yml | 1 + roles/glusterfs/tasks/db/vols3.yml | 1 + roles/glusterfs/tasks/la/vols3.yml | 1 + roles/ntp | 1 + scripts/gluster.sh | 18 +- setup.sh | 2 +- setup/configs/labels.yml | 9 + setup/configs/volumes.yml | 11 +- setup/projects/adei/templates/60-adei.yml.j2 | 6 +- setup/projects/adei/vars/globals.yml | 4 + setup/projects/adei/vars/pods.yml | 3 +- setup/projects/kaas/vars/pods.yml | 3 +- 78 files changed, 1436 insertions(+), 194 deletions(-) create mode 160000 anslib/ansible-role-ntp create mode 100644 anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch create mode 100644 anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch create 
mode 100644 anslib/patches/networkmanager-ds-use-openshift-interface.patch create mode 100644 anslib/patches/openshift-ds-update371.patch create mode 100644 anslib/patches/registry-ds-glusterfs-fixes.patch create mode 100644 anslib/patches/registry-ds-glusterfs-storageclass.patch create mode 100644 docs/backup.txt create mode 100644 docs/consistency.txt create mode 100644 docs/managment.txt create mode 100644 docs/network.txt create mode 100644 docs/pods.txt create mode 100644 docs/regions.txt create mode 100644 docs/samples/templates/00-katrin-restricted.yml.j2 create mode 100644 docs/samples/vars/run_oc.yml create mode 100644 docs/samples/vars/variants.yml create mode 100644 docs/troubleshooting.txt create mode 100644 docs/upgrade.txt delete mode 120000 playbooks/openshift-deploy-cluster.yml create mode 100644 playbooks/openshift-install-service-catalog.yml mode change 120000 => 100644 playbooks/openshift-redeploy-certificates.yml create mode 100644 roles/ands_backup/defaults/main.yml create mode 100644 roles/ands_backup/tasks/main.yml create mode 100644 roles/ands_backup/templates/backup.cron.j2 create mode 100755 roles/ands_backup/templates/backup.sh.j2 create mode 100644 roles/ands_common/README create mode 100644 roles/ands_common/default/main.yml create mode 100644 roles/ands_common/tasks/main.yml create mode 100644 roles/ands_network/tasks/ganesha.yml delete mode 100644 roles/common/README delete mode 100644 roles/common/default/main.yml delete mode 100644 roles/common/tasks/main.yml create mode 120000 roles/ntp diff --git a/.gitmodules b/.gitmodules index 1401d9b..1185e39 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "anslib/ansible-ghetto-json"] path = anslib/ansible-ghetto-json url = https://github.com/FauxFaux/ansible-ghetto-json.git +[submodule "anslib/ansible-role-ntp"] + path = anslib/ansible-role-ntp + url = https://github.com/geerlingguy/ansible-role-ntp.git diff --git a/anslib/ansible-role-ntp b/anslib/ansible-role-ntp new file mode 160000 index 0000000..47b40d4 --- /dev/null +++ b/anslib/ansible-role-ntp @@ -0,0 +1 @@ +Subproject commit 47b40d48fce51c79630feeac84659824a746d4aa diff --git a/anslib/link_vars.sh b/anslib/link_vars.sh index 01a9fe9..651c09c 100755 --- a/anslib/link_vars.sh +++ b/anslib/link_vars.sh @@ -24,3 +24,4 @@ function mklink_func { export -f mklink_func find openshift-ansible/playbooks -mindepth 0 -maxdepth 2 -type d -print0 | xargs -0 -L 1 -I {} bash -c 'mklink_func "$@"' _ {} +find openshift-ansible/playbooks/common/openshift-cluster/upgrades -mindepth 0 -maxdepth 1 -type d -print0 | xargs -0 -L 1 -I {} bash -c 'mklink_func "$@"' _ {} diff --git a/anslib/openshift-ansible b/anslib/openshift-ansible index d1fcbd7..22d3a96 160000 --- a/anslib/openshift-ansible +++ b/anslib/openshift-ansible @@ -1 +1 @@ -Subproject commit d1fcbd7a9a8511b895f9a163f7fa2a7bc0d72f2b +Subproject commit 22d3a96deaf74b7aa9aa021a73ef39e2b4da3378 diff --git a/anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch b/anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch new file mode 100644 index 0000000..2301072 --- /dev/null +++ b/anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch @@ -0,0 +1,11 @@ +diff --git a/roles/openshift_etcd_facts/tasks/main.yml b/roles/openshift_etcd_facts/tasks/main.yml +index 86546f4..bda0606 100644 +--- a/roles/openshift_etcd_facts/tasks/main.yml ++++ b/roles/openshift_etcd_facts/tasks/main.yml +@@ -1,2 +1,6 @@ + --- ++- openshift_facts: ++ role: etcd ++ local_facts: {} ++ + - import_tasks: 
set_etcd_ca_host.yml diff --git a/anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch b/anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch new file mode 100644 index 0000000..75a8a43 --- /dev/null +++ b/anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch @@ -0,0 +1,24 @@ +diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml +index 4928e86..b8f3cab 100644 +--- a/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml ++++ b/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml +@@ -293,7 +293,8 @@ + + - name: Determine StorageClass heketi URL + set_fact: +- glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}" ++ glusterfs_heketi_route: "heketi-{{ glusterfs_name }}.{{ glusterfs_namespace }}.svc.cluster.local:8080" ++# glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}" + when: + - glusterfs_heketi_is_native + +@@ -344,7 +345,8 @@ + + - name: Determine StorageClass heketi URL + set_fact: +- glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}" ++ glusterfs_heketi_route: "heketi-{{ glusterfs_name }}.{{ glusterfs_namespace }}.svc.cluster.local:8080" ++# glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}" + when: + - glusterfs_heketi_is_native + - glusterfs_heketi_route is not defined diff --git a/anslib/patches/networkmanager-ds-use-openshift-interface.patch b/anslib/patches/networkmanager-ds-use-openshift-interface.patch new file mode 100644 index 0000000..687be8a --- /dev/null +++ b/anslib/patches/networkmanager-ds-use-openshift-interface.patch @@ -0,0 +1,47 @@ +diff --git a/roles/openshift_node/files/bootstrap.yml b/roles/openshift_node/files/bootstrap.yml +index ea28064..df95ba3 100644 +--- a/roles/openshift_node/files/bootstrap.yml ++++ b/roles/openshift_node/files/bootstrap.yml +@@ -8,7 +8,7 @@ + lines: + - regex: ^listen-address + state: present +- line: "listen-address={{ ansible_default_ipv4.address }}" ++ line: "listen-address={{ openshift_dns_ip }}" + node_dns: + file: /etc/dnsmasq.d/node-dnsmasq.conf + lines: +diff --git a/roles/openshift_node/files/networkmanager/99-origin-dns.sh b/roles/openshift_node/files/networkmanager/99-origin-dns.sh +index acf3e2f..16129a2 100755 +--- a/roles/openshift_node/files/networkmanager/99-origin-dns.sh ++++ b/roles/openshift_node/files/networkmanager/99-origin-dns.sh +@@ -43,10 +43,25 @@ if [[ $2 =~ ^(up|dhcp4-change|dhcp6-change)$ ]]; then + ###################################################################### + # couldn't find an existing method to determine if the interface owns the + # default route +- def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }') +- def_route_int=$(/sbin/ip route get to ${def_route} | awk '{print $3}') +- def_route_ip=$(/sbin/ip route get to ${def_route} | awk '{print $5}') +- if [[ ${DEVICE_IFACE} == ${def_route_int} ]]; then ++ #SDS ++ #def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }') ++ #def_route_int=$(/sbin/ip route get to ${def_route} | awk '{print $3}') ++ #def_route_ip=$(/sbin/ip route get to ${def_route} | awk '{print $5}') ++ #EDS ++ def_route_ip=$(cat /etc/hosts | grep openshift_dns_ip | awk '{ print $1 }') ++ [ -n "$def_route_ip" ] && def_route_int=$(ip -o addr show | grep ${def_route_ip} | awk '{ print $2 }') ++ if [ -z "$def_route_ip" -o -z "$def_route_int" ]; then ++ def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }') 
++ def_route_int=$(/sbin/ip route get to ${def_route} | awk '{print $3}' | head -n 1) ++ def_route_ip=$(/sbin/ip -f inet addr show dev ${def_route_int} scope global up | grep -Po 'inet \K[\d.]+' | head -n 1) ++ fi ++ ++ def_routes=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }') ++ def_routes_int=$(for r in ${def_routes}; do /sbin/ip route get to ${r} | awk '{print $3}'; done) ++ interfaces="${def_route_int} ${def_routes_int}" ++ ++ if [[ "${interfaces}" =~ (^|[[:space:]])${DEVICE_IFACE}($|[[:space:]]) ]]; then ++# if [[ ${DEVICE_IFACE} == ${def_route_int} ]]; then + if [ ! -f /etc/dnsmasq.d/origin-dns.conf ]; then + cat << EOF > /etc/dnsmasq.d/origin-dns.conf + no-resolv diff --git a/anslib/patches/openshift-ds-update371.patch b/anslib/patches/openshift-ds-update371.patch new file mode 100644 index 0000000..a6beff3 --- /dev/null +++ b/anslib/patches/openshift-ds-update371.patch @@ -0,0 +1,45 @@ +diff --git a/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml b/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml +index cc2ec27..6c4ccf8 100644 +--- a/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml ++++ b/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml +@@ -12,7 +12,7 @@ + - pre_upgrade + tasks: + - set_fact: +- openshift_upgrade_target: '3.7' ++ openshift_upgrade_target: '3.7.1' + openshift_upgrade_min: '3.6' + + - import_playbook: ../pre/config.yml +diff --git a/roles/openshift_repos/templates/CentOS-OpenShift-Origin371.repo.j2 b/roles/openshift_repos/templates/CentOS-OpenShift-Origin371.repo.j2 +new file mode 100644 +index 0000000..10b49c0 +--- /dev/null ++++ b/roles/openshift_repos/templates/CentOS-OpenShift-Origin371.repo.j2 +@@ -0,0 +1,26 @@ ++[centos-openshift-origin371] ++name=CentOS OpenShift Origin ++baseurl={{ ands_repo_url }}/openshift371/ ++enabled=1 ++gpgcheck=0 ++ ++[centos-openshift-origin37-testing] ++name=CentOS OpenShift Origin Testing ++baseurl=http://buildlogs.centos.org/centos/7/paas/x86_64/openshift-origin37/ ++enabled={{ 1 if openshift_repos_enable_testing else 0 }} ++gpgcheck=0 ++gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-PaaS ++ ++[centos-openshift-origin37-debuginfo] ++name=CentOS OpenShift Origin DebugInfo ++baseurl=http://debuginfo.centos.org/centos/7/paas/x86_64/ ++enabled=0 ++gpgcheck=1 ++gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-PaaS ++ ++[centos-openshift-origin37-source] ++name=CentOS OpenShift Origin Source ++baseurl=http://vault.centos.org/centos/7/paas/Source/openshift-origin37/ ++enabled=0 ++gpgcheck=1 ++gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-PaaS diff --git a/anslib/patches/registry-ds-glusterfs-fixes.patch b/anslib/patches/registry-ds-glusterfs-fixes.patch new file mode 100644 index 0000000..65f30e5 --- /dev/null +++ b/anslib/patches/registry-ds-glusterfs-fixes.patch @@ -0,0 +1,61 @@ +diff --git a/roles/openshift_hosted/tasks/registry.yml b/roles/openshift_hosted/tasks/registry.yml +index bc4d81e..4720095 100644 +diff --git a/roles/openshift_hosted/tasks/registry_storage.yml b/roles/openshift_hosted/tasks/registry_storage.yml +index aa66a78..e1b8c4e 100644 +diff --git a/roles/openshift_hosted/tasks/storage/glusterfs.yml b/roles/openshift_hosted/tasks/storage/glusterfs.yml +index 7223a5a..3465b6c 100644 +--- a/roles/openshift_hosted/tasks/storage/glusterfs.yml ++++ b/roles/openshift_hosted/tasks/storage/glusterfs.yml +@@ -35,7 +35,7 @@ + mount: + state: mounted + fstype: glusterfs +- src: "{% if 'glusterfs_registry' in groups and groups['glusterfs_registry'] | 
length > 0 %}{% set node = groups.glusterfs_registry[0] %}{% elif 'glusterfs' in groups and groups['glusterfs'] | length > 0 %}{% set node = groups.glusterfs[0] %}{% endif %}{% if openshift_hosted_registry_storage_glusterfs_ips is defined and openshift_hosted_registry_storage_glusterfs_ips|length > 0 %}{{ openshift_hosted_registry_storage_glusterfs_ips[0] }}{% elif 'glusterfs_hostname' in hostvars[node] %}{{ hostvars[node].glusterfs_hostname }}{% elif 'openshift' in hostvars[node] %}{{ hostvars[node].openshift.node.nodename }}{% else %}{{ node }}{% endif %}:/{{ openshift.hosted.registry.storage.glusterfs.path }}" ++ src: "{% if 'glusterfs_registry' in groups and groups['glusterfs_registry'] | length > 0 %}{% set node = groups.glusterfs_registry[0] %}{% elif 'glusterfs' in groups and groups['glusterfs'] | length > 0 %}{% set node = groups.glusterfs[0] %}{% endif %}{% if openshift_hosted_registry_storage_glusterfs_ips is defined and openshift_hosted_registry_storage_glusterfs_ips|length > 0 %}{{ openshift_hosted_registry_storage_glusterfs_ips[0] }}{% elif 'glusterfs_hostname' in hostvars[node] %}{{ hostvars[node].glusterfs_hostname }}{% elif 'openshift' in hostvars[node] %}{{ hostvars[node].openshift.node.nodename }}{% else %}{{ node }}{% endif %}:/{{ openshift_hosted_registry_storage_glusterfs_path }}" + name: "{{ mktemp.stdout }}" + + - name: Set registry volume permissions +@@ -49,10 +49,11 @@ + - block: + - name: Activate registry maintenance mode + oc_env: ++ kind: dc + namespace: "{{ openshift_hosted_registry_namespace }}" + name: "{{ openshift_hosted_registry_name }}" + env_vars: +- - REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true' ++ REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true' + + - name: Get first registry pod name + set_fact: +@@ -72,11 +73,12 @@ + + - name: Deactivate registry maintenance mode + oc_env: ++ kind: dc + namespace: "{{ openshift_hosted_registry_namespace }}" + name: "{{ openshift_hosted_registry_name }}" + state: absent + env_vars: +- - REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true' ++ REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true' + when: openshift_hosted_registry_storage_glusterfs_swap + + - name: Unmount registry volume and clean up mount point/fstab +diff --git a/roles/openshift_persistent_volumes/tasks/main.yml b/roles/openshift_persistent_volumes/tasks/main.yml +index b1d9c8c..1c32a67 100644 +diff --git a/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 b/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 +index ca8b747..ce15533 100644 +--- a/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 ++++ b/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 +@@ -12,7 +12,7 @@ items: + resources: + requests: + storage: "{{ claim.capacity }}" +-{% if claim.storageclass is not None %} ++{% if claim.storageclass is defined and claim.storageclass is not none %} + storageClassName: "{{ claim.storageclass }}" + {% endif %} + {% endfor %} +diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml +index e91e130..f3562b6 100644 diff --git a/anslib/patches/registry-ds-glusterfs-storageclass.patch b/anslib/patches/registry-ds-glusterfs-storageclass.patch new file mode 100644 index 0000000..a189091 --- /dev/null +++ b/anslib/patches/registry-ds-glusterfs-storageclass.patch @@ -0,0 +1,64 @@ +diff --git a/roles/openshift_hosted/tasks/registry.yml 
b/roles/openshift_hosted/tasks/registry.yml +index bc4d81e..4720095 100644 +--- a/roles/openshift_hosted/tasks/registry.yml ++++ b/roles/openshift_hosted/tasks/registry.yml +@@ -112,6 +112,7 @@ + when: + - openshift_hosted_registry_storage_glusterfs_ips|length > 0 + - openshift_hosted_registry_storage_kind | default(none) in ['glusterfs'] ++ - openshift_hosted_registry_storage_class is not defined + + - name: Create OpenShift registry + oc_adm_registry: +diff --git a/roles/openshift_hosted/tasks/registry_storage.yml b/roles/openshift_hosted/tasks/registry_storage.yml +index aa66a78..e1b8c4e 100644 +--- a/roles/openshift_hosted/tasks/registry_storage.yml ++++ b/roles/openshift_hosted/tasks/registry_storage.yml +@@ -2,3 +2,4 @@ + - include_tasks: storage/glusterfs.yml + when: + - openshift_hosted_registry_storage_kind | default(none) == 'glusterfs' or openshift_hosted_registry_storage_glusterfs_swap ++ - openshift_hosted_registry_storage_class is not defined +diff --git a/roles/openshift_hosted/tasks/storage/glusterfs.yml b/roles/openshift_hosted/tasks/storage/glusterfs.yml +index 7223a5a..3465b6c 100644 +diff --git a/roles/openshift_persistent_volumes/tasks/main.yml b/roles/openshift_persistent_volumes/tasks/main.yml +index b1d9c8c..1c32a67 100644 +--- a/roles/openshift_persistent_volumes/tasks/main.yml ++++ b/roles/openshift_persistent_volumes/tasks/main.yml +@@ -23,7 +23,21 @@ + - name: "{{ openshift_hosted_registry_storage_volume_name }}-glusterfs-claim" + capacity: "{{ openshift_hosted_registry_storage_volume_size }}" + access_modes: "{{ openshift_hosted_registry_storage_access_modes }}" +- when: openshift_hosted_registry_storage_glusterfs_swap | default(False) ++ when: ++ - openshift_hosted_registry_storage_glusterfs_swap | default(False) ++ - openshift_hosted_registry_storage_class is not defined ++ ++ ++- set_fact: ++ glusterfs_pv: [] ++ glusterfs_pvc: ++ - name: "{{ openshift_hosted_registry_storage_volume_name }}-claim" ++ storageclass: "{{ openshift_hosted_registry_storage_class }}" ++ capacity: "{{ openshift_hosted_registry_storage_volume_size }}" ++ access_modes: "{{ openshift_hosted_registry_storage_access_modes }}" ++ when: ++ - openshift_hosted_registry_storage_class is defined ++ + + - name: create standard pv and pvc lists + # generate_pv_pvcs_list is a custom action module defined in +diff --git a/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 b/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 +index ca8b747..ce15533 100644 +diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml +index e91e130..f3562b6 100644 +--- a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml ++++ b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml +@@ -12,4 +12,6 @@ + + - name: Create GlusterFS registry volume + command: "{{ glusterfs_heketi_client }} volume create --size={{ openshift_hosted_registry_storage_volume_size | replace('Gi','') }} --name={{ openshift_hosted_registry_storage_glusterfs_path }}" +- when: "openshift_hosted_registry_storage_glusterfs_path not in registry_volume.stdout" ++ when: ++ - "openshift_hosted_registry_storage_glusterfs_path not in registry_volume.stdout" ++ - "openshift_hosted_registry_storage_class is not defined" diff --git a/docs/ands_ansible.txt b/docs/ands_ansible.txt index 80a7cf0..70800e1 100644 --- a/docs/ands_ansible.txt +++ b/docs/ands_ansible.txt @@ -89,7 +89,7 @@ Ansible parameters (global) 
glusterfs_version group_vars glusterfs_transport group_vars - - OPenShift specific + - OpenShift specific ands_openshift_labels setup/configs Labels to assign to the nodes ands_openshift_projects setup/configs List of projects to configure (with GlusterFS endpoints, etc.) ands_openshift_users setup/configs Optional list of user names with contacts diff --git a/docs/backup.txt b/docs/backup.txt new file mode 100644 index 0000000..1b25592 --- /dev/null +++ b/docs/backup.txt @@ -0,0 +1,26 @@ +Critical directories and services +--------------------------------- + - etcd database [ once ] + * There is etcd2 and etcd3 APIs. OpenShift 3.5+ uses etcd3, but documentation + still describes etcd2-style backup. etcd3 is backward compatible with etcd2, + and we can run etcd2 backup as well. Now the question if we need to backup + both ways (OpenShift 3.5 is definitively has etcd3 data) or just etcd3 + considering it is a bug in documentation. + * etcd3 + etcdctl3 --endpoints="192.168.213.1:2379" snapshot save snapshot.db + * etcd2 + etcdctl backup --data-dir /var/lib/etcd/ --backup-dir . + cp "$ETCD_DATA_DIR"/member/snap/db member/snap/db + + - heketi topology [ once ] + heketi-cli -s http://heketi-storage.glusterfs.svc.cluster.local:8080 --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)" topology info --json + + - Gluster volume information [ storage nodes ] + * /var/lib/glusterd/glusterd.info + * /var/lib/glusterd/peers + * /var/lib/glusterd/glustershd - not mentioned in docs + + - etc [ all nodes ] + * /etc/origin/ - Only *.key *.crt from /etc/origin/master in docs + * /etc/etcd - Not mentioned + * /etc/docker - Only certs.d diff --git a/docs/consistency.txt b/docs/consistency.txt new file mode 100644 index 0000000..127d9a7 --- /dev/null +++ b/docs/consistency.txt @@ -0,0 +1,36 @@ +General overview +================= + - etcd services (worth checking both ports) + etcdctl3 --endpoints="192.168.213.1:2379" member list - doesn't check health only reports members + oc get cs - only etcd (other services will fail on Openshift) + - All nodes and pods are fine and running and all pvc are bound + oc get nodes + oc get pods --all-namespaces -o wide + oc get pvc --all-namespaces -o wide + - API health check + curl -k https://apiserver.kube-service-catalog.svc/healthz + +Storage +======= + - Heketi status + heketi-cli -s http://heketi-storage.glusterfs.svc.cluster.local:8080 --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)" topology info + - Status of Gluster Volume (and its bricks which with heketi fails often) + gluster volume info + ./gluster.sh info all_heketi + - Check available storage space on system partition and LVM volumes (docker, heketi, ands) + Run 'df -h' and 'lvdisplay' on each node + +Networking +========== + - Check that both internal and external addresses are resolvable from all hosts. + * I.e. 
we should be able to resolve 'google.com'
+   * And we should be able to resolve 'heketi-storage.glusterfs.svc.cluster.local'
+
+ - Check that the keepalived service is up and that the corresponding IPs are really assigned to one
+   of the nodes (the vagrant provisioner removes the IPs tracked by keepalived, but keepalived will
+   continue running without noticing it)
+
+ - Ensure we don't have an override of cluster_name to the first master (which we do during the
+   provisioning of the OpenShift plays)
+
+ 
\ No newline at end of file
diff --git a/docs/managment.txt b/docs/managment.txt
new file mode 100644
index 0000000..1eca8a8
--- /dev/null
+++ b/docs/managment.txt
@@ -0,0 +1,166 @@
+DOs and DONTs
+=============
+ Here we discuss things we should do and things we should not do!
+
+ - Scaling the cluster up is normally problem-free. Both nodes & masters can be added
+   quickly and without much trouble afterwards.
+
+ - The upgrade procedure may cause problems. The main trouble is that many pods are
+   configured to use the 'latest' tag, and the latest versions have the latest problems (some
+   of the tags can be pinned to an actual version, but finding out what is broken and why takes
+   a lot of effort)...
+   * Currently, there are problems if 'kube-service-catalog' is updated (see the discussion
+     in docs/upgrade.txt). While it seems nothing really changes, the connection between
+     apiserver and etcd breaks down (at least for health checks). The installation remains
+     pretty much usable, but not in a healthy state. This particular update is blocked by
+     setting
+         openshift_enable_service_catalog: false
+     Then, it is left in 'Error' state, but can easily be recovered by deleting the pod and
+     allowing the system to re-create a new one.
+   * However, as the cause is unclear, it is possible that something else will break as time
+     passes and new images are released. It is ADVISED to check the upgrade in staging first.
+   * During the upgrade other system pods may also get stuck in 'Error' state (as explained
+     in troubleshooting) and block the flow of the upgrade. Just delete them and allow the
+     system to re-create them to continue.
+   * After the upgrade, it is necessary to verify that all pods are operational and to
+     restart the ones in 'Error' state.
+
+ - Re-running the install will break on heketi. And it will DESTROY the heketi topology!
+   DON'T DO IT! Instead, individual components can be re-installed.
+   * For instance, to reinstall 'openshift-ansible-service-broker' use
+     openshift-install-service-catalog.yml
+   * There is a way to prevent the plays from touching heketi; we need to define
+         openshift_storage_glusterfs_is_missing: False
+         openshift_storage_glusterfs_heketi_is_missing: False
+     But I am not sure whether this is the only major issue.
+
+ - A few administrative tools can cause trouble. Don't run
+   * oc adm diagnostics
+
+
+Failures / Immediate
+========
+ - We need to remove the failed node from the etcd cluster
+     etcdctl3 --endpoints="192.168.213.1:2379" member list
+     etcdctl3 --endpoints="192.168.213.1:2379" member remove <member_id>
+
+ - Further, the following is required on all remaining nodes if the node is gone for good
+   * Delete the node
+       oc delete node <node_name>
+   * Remove it also from ETCD_INITIAL_CLUSTER in /etc/etcd.conf on all nodes
+   * Remove the failed node from the 'etcdClientInfo' section in /etc/origin/master/master-config.yaml
+       systemctl restart origin-master-api.service
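+   A minimal sketch tying the commands above together; the endpoint, member id and node name are
+   placeholders and must be replaced with the values of the actual cluster:
+     #!/bin/bash
+     # Remove a permanently lost node from etcd and OpenShift (sketch)
+     endpoint="192.168.213.1:2379"        # any healthy etcd member
+     member_id="<id from member list>"    # placeholder, take it from the output below
+     node_name="<failed node>"            # placeholder
+     etcdctl3 --endpoints="$endpoint" member list
+     etcdctl3 --endpoints="$endpoint" member remove "$member_id"
+     oc delete node "$node_name"
+     # On every remaining node: drop the dead peer from ETCD_INITIAL_CLUSTER in the etcd config,
+     # remove it from 'etcdClientInfo' in /etc/origin/master/master-config.yaml (masters only), then
+     systemctl restart origin-master-api.service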
+Scaling / Recovery
+=======
+ - One important point.
+   * If we lost the data on a storage node, it should be re-added with a different name (otherwise
+     the GlusterFS recovery would be significantly more complicated)
+   * If the Gluster bricks are preserved, we may keep the name. I have not tried it, but according to
+     the documentation it should be possible to reconnect the node and synchronize. Still, it may be
+     easier to use a new name again to simplify the procedure.
+   * Simple OpenShift nodes may be re-added with the same name, no problem.
+
+ - Next we need to perform all preparation steps (the --limit should not be applied as we normally
+   need to update CentOS on all nodes to synchronize software versions; list all nodes in the /etc/hosts
+   files; etc).
+     ./setup.sh -i staging prepare
+
+ - The OpenShift scale-up is provided as several ansible plays (scale-masters, scale-nodes, scale-etcd).
+   * Running 'masters' will also install the configured 'nodes' and 'etcd' daemons
+   * I guess running 'nodes' will also handle the 'etcd' daemons, but I have not checked.
+
+Problems
+--------
+ - There should be no problems if a simple node crashed, but things may go wrong if one of the
+   masters has crashed. And things will definitely go wrong if the complete cluster is cut from power.
+   * Some pods will get stuck pulling images. This happens if the node running docker-registry has crashed
+     and no persistent storage was used to back the registry. It can be fixed by re-scheduling the build
+     and rolling out the latest version from the dc.
+       oc -n adei start-build adei
+       oc -n adei rollout latest mysql
+     OpenShift will trigger the rollout automatically after some time, but it will take a while. The builds,
+     it seems, have to be started manually.
+   * In case of a long outage some CronJobs will stop executing. The reason is a protection against
+     excessive load combined with missing defaults. The fix is easy: just set how much time the OpenShift
+     scheduler allows a CronJob to start before considering it failed:
+       oc -n adei patch cronjob/adei-autogen-update --patch '{ "spec": {"startingDeadlineSeconds": 10 }}'
+
+ - If we forgot to remove an old host from the etcd cluster, the OpenShift node will be configured, but etcd
+   will not be installed. We then need to remove the node as explained above and run the scale-up of the etcd
+   cluster.
+   * On multiple occasions the etcd daemon has failed after a reboot and needed to be restarted manually.
+     If half of the daemons are broken, 'oc' will block.
+
+
+
+Storage / Recovery
+=======
+ - Furthermore, it is necessary to configure glusterfs on the new storage nodes. This is not performed
+   automatically by the scale plays. The 'glusterfs' play should be executed with additional options
+   specifying that we are just re-configuring nodes. We can check whether all pods are in service with
+     oc -n glusterfs get pods -o wide
+   Both the OpenShift and etcd clusters should be in a proper state before running this play. Fixing and
+   re-running should not be an issue.
+
+ - More details:
+   https://docs.openshift.com/container-platform/3.7/day_two_guide/host_level_tasks.html
+
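+   A minimal check-list sketch for verifying storage health after re-adding a node (assuming 'oc' is
+   logged in with cluster-admin rights and the gluster commands are run on one of the storage nodes;
+   see also docs/consistency.txt):
+     #!/bin/bash
+     # all gluster pods should be Running and Ready
+     oc -n glusterfs get pods -o wide
+     # heketi should know about the new node
+     heketi-cli topology info
+     # all bricks of all volumes should be online
+     gluster volume status detail
+     # and self-heal should eventually report no pending entries per volume
+     gluster volume heal <volume> info
+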
+Heketi
+------
+ - With heketi things are straightforward: we need to mark the node broken. Then heketi will automatically
+   move the bricks to other servers (as it sees fit).
+   * Accessing heketi
+     heketi-cli -s http://heketi-storage-glusterfs.openshift.suren.me --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)"
+   * Getting the required ids
+     heketi-cli topology info
+   * Removing the node
+     heketi-cli node info <node_id>
+     heketi-cli node disable <node_id>
+     heketi-cli node remove <node_id>
+   * That's it. A few self-healing daemons are running which should bring the volumes in order automatically.
+   * The node will still persist in the heketi topology as failed, but will not be used ('node delete' could
+     potentially remove it, but it is failing)
+
+ - One problem with heketi: it may start volumes before the bricks get ready. Consequently, it may run
+   volumes with several bricks offline. This should be checked and fixed by restarting the volumes.
+
+KaaS Volumes
+------------
+ There are two modes.
+ - If we migrated to a new server, we need to migrate the bricks (force is required because
+   the source brick is dead and the data can't be copied)
+     gluster volume replace-brick <volume> <failed_brick> <new_brick> commit force
+   * There are healing daemons running and nothing else has to be done.
+   * There are plays and scripts available to move all bricks automatically
+
+ - If we kept the name and the data is still there, it should also be relatively easy
+   to perform the migration (not checked). We also should have backups of all this data.
+   * Ensure Gluster is not running on the failed node
+       oadm manage-node ipeshift2 --schedulable=false
+       oadm manage-node ipeshift2 --evacuate
+   * Verify the gluster pod is not active. It may be running, but not ready.
+     Could be double-checked with 'ps'.
+       oadm manage-node ipeshift2 --list-pods
+   * Get the original peer UUID of the failed node (by running on a healthy node)
+       gluster peer status
+   * And create '/var/lib/glusterd/glusterd.info' similar to the one on the
+     healthy nodes, but with the found UUID.
+   * Copy the peers from the healthy nodes to /var/lib/glusterd/peers. We need to
+     copy from 2 nodes as a node does not hold peer information about itself.
+   * Create the mount points and re-schedule the gluster pod. See more details:
+     https://access.redhat.com/documentation/en-us/red_hat_gluster_storage/3/html/administration_guide/sect-replacing_hosts
+   * Start healing
+       gluster volume heal VOLNAME full
+
+ - However, if the data is lost, it is quite complicated to recover using the same server name.
+   We should rename the server and use the first approach instead.
+
+
+
+Scaling
+=======
+We currently have several assumptions which will probably not hold true for larger clusters
+ - Gluster
+   To simplify matters we just reference the servers in the storage group manually
+   Arbiter may work for several groups and we should define several brick paths in this case
diff --git a/docs/network.txt b/docs/network.txt
new file mode 100644
index 0000000..a164d36
--- /dev/null
+++ b/docs/network.txt
@@ -0,0 +1,58 @@
+Configuration
+=============
+openshift_ip                              Infiniband IPs for fast communication (also used for the ADEI/MySQL
+                                          bridge and so should reside on the fast network)
+openshift_hostname                        The 'cluster' host name. Should match the real host name for certificate
+                                          validation. So, it should be set if the default IP does not resolve to the host name
+openshift_public_ip                       We may either skip this or set it to our 192.168.26.xxx network. Usage is unclear
+openshift_public_hostname                 I guess it is also for certificates, but while communicating with external systems
+openshift_master_cluster_hostname         Internal cluster load-balancer or just a pointer to the master host
+openshift_master_cluster_public_hostname  The main cluster gateway
+
+
+Complex Network
+===============
+Some things in the OpenShift ansible scripts are still implemented with the assumption that we have
+a simple network configuration with a single interface communicating with the world. There
+are several options to change this:
+ openshift_set_node_ip - This variable configures nodeIP in the node configuration.
This + variable is needed in cases where it is desired for node traffic to go over an interface + other than the default network interface. + openshift_ip - This variable overrides the cluster internal IP address for the system. + Use this when using an interface that is not configured with the default route. + openshift_hostname - This variable overrides the internal cluster host name for the system. + Use this when the system’s default IP address does not resolve to the system host name. +Furthermore, if we use infiniband which is not accessible to outside world we need to set + openshift_public_ip - Use this for cloud installations, or for hosts on networks using + a network address translation + openshift_public_hostname - Use this for cloud installations, or for hosts on networks + using a network address translation (NAT). + + This is, however, is not used trough all system components. Some provisioning code and +installed scripts are still detect kind of 'main system ip' to look for the +services. This ip is intendified either as 'ansible_default_ip' or by the code trying +to look for the ip which is used to send packet over default route. Ansible in the end does +the some thing. This plays bad for several reasons. + - We have keepalived ips moving between systems. The scripts are actually catching + this moving ips instead of the fixed ip bound to the system. + - There could be several default routes. While it is not a problem, scripts does not expect + that and may fail. + +For instance, the script '99-origin-dns.sh' in /etc/NetworkManager/dispatcher.d. + * def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }') + 1) Does not expect multiple default routes and will find just a random one. Then, + * if [[ ${DEVICE_IFACE} == ${def_route_int} ]]; then + check may fail and the resolv.conf will be not updated because currently up'ed + interface is not on default route, but it actually is. Furthermore, + * def_route_ip=$(/sbin/ip route get to ${def_route} | awk '{print $5}') + 2) ignorant of keepalived and will bound to keepalived. + + But I am not sure the problems are limited to this script. There could be other places with + the same logic. Some details are here: + https://docs.openshift.com/container-platform/3.7/admin_guide/manage_nodes.html#manage-node-change-node-traffic-interface + +Hostnames +========= + The linux host name (uname -a) should match the hostnames assigned to openshift nodes. Otherwise, the certificate verification + will fail. It seems minor issue as system continue functioning, but better to avoid. The check can be performed with etcd: + etcdctl3 --key=/etc/etcd/peer.key --cacert=/etc/etcd/ca.crt --endpoints="192.168.213.1:2379,192.168.213.3:2379,192.168.213.4:2379" diff --git a/docs/pods.txt b/docs/pods.txt new file mode 100644 index 0000000..b84f42f --- /dev/null +++ b/docs/pods.txt @@ -0,0 +1,13 @@ +Updating Daemon Set +=================== + - Not trivial. 
We need to + a) Re-recreate ds + * Manualy change 'imagePullPolicty' to 'Always' if it is set to 'IfNotExisting' + b) Destory all nodes and allow ds to recreate them + + - Sample: Updateing gluster + oc -n glusterfs delete ds/glusterfs-storage + oc -n glusterfs process glusterfs IMAGE_NAME=chsa/gluster-centos IMAGE_VERSION=312 > gluster.json + *** Edit + oc -n glusterfs create -f gluster.json + oc -n glusterfs delete pods -l 'glusterfs=storage-pod' diff --git a/docs/regions.txt b/docs/regions.txt new file mode 100644 index 0000000..88b8f5e --- /dev/null +++ b/docs/regions.txt @@ -0,0 +1,16 @@ +region=infra Infrastructure nodes which are used by OpenShift to run router and registry services. This is + more or less ipekatrin* nodes down in the basement. +region=prod Production servers (ipecompute*, etc.) located anythere, but I expect only basement. +region=dev Temporary nodes + +zone=default Basement +zone=404 Second server room on 4th floor +zone=student Student room +zone=external Other external places + + + +production: 1 Specifies all production servers (no extra load, no occasional reboots) + This includes 'infra' and 'prod' regions. +server: 1 Like production, but with occasional reboots and some extra testing load possible +permanent: 1 Non-production systems, but which are permanently connected to OpenShift diff --git a/docs/samples/templates/00-katrin-restricted.yml.j2 b/docs/samples/templates/00-katrin-restricted.yml.j2 new file mode 100644 index 0000000..6221f30 --- /dev/null +++ b/docs/samples/templates/00-katrin-restricted.yml.j2 @@ -0,0 +1,44 @@ +# Overriding SCC rules to allow arbitrary gluster mounts in restricted containers +--- +allowHostDirVolumePlugin: false +allowHostIPC: false +allowHostNetwork: false +allowHostPID: false +allowHostPorts: false +allowPrivilegedContainer: false +allowedCapabilities: null +apiVersion: v1 +defaultAddCapabilities: null +fsGroup: + type: MustRunAs +groups: +- system:authenticated +kind: SecurityContextConstraints +metadata: + annotations: + kubernetes.io/description: restricted denies access to all host features and requires + pods to be run with a UID, and SELinux context that are allocated to the namespace. This + is the most restrictive SCC. 
+ creationTimestamp: null + name: katrin-restricted +priority: null +readOnlyRootFilesystem: false +requiredDropCapabilities: +- KILL +- MKNOD +- SYS_CHROOT +- SETUID +- SETGID +runAsUser: + type: MustRunAsRange +seLinuxContext: + type: MustRunAs +supplementalGroups: + type: RunAsAny +volumes: +- glusterfs +- configMap +- downwardAPI +- emptyDir +- persistentVolumeClaim +- secret diff --git a/docs/samples/vars/run_oc.yml b/docs/samples/vars/run_oc.yml new file mode 100644 index 0000000..a464549 --- /dev/null +++ b/docs/samples/vars/run_oc.yml @@ -0,0 +1,6 @@ +oc: + - template: "[0-3]*" + - template: "[4-6]*" + - resource: "route/apache" + oc: "expose svc/kaas --name apache --hostname=apache.{{ openshift_master_default_subdomain }}" + - template: "*" diff --git a/docs/samples/vars/variants.yml b/docs/samples/vars/variants.yml new file mode 100644 index 0000000..c7a27b4 --- /dev/null +++ b/docs/samples/vars/variants.yml @@ -0,0 +1,33 @@ +# First port is exposed + +pods: + kaas: + variant: "{{ ands_prefer_docker | default(false) | ternary('docker', 'centos') }}" + centos: + service: { host: "{{ katrin_node }}", ports: [ 80/8080, 443/8043 ] } + sched: { replicas: 1, selector: { master: 1 } } + selector: { master: 1 } + images: + - image: "centos/httpd-24-centos7" + mappings: + - { name: "etc", path: "apache2-kaas-centos", mount: "/etc/httpd" } + - { name: "www", path: "kaas", mount: "/opt/rh/httpd24/root/var/www/html" } + - { name: "log", path: "apache2-kaas", mount: "/var/log/httpd24" } + probes: + - { port: 8080, path: '/index.html' } + docker: + service: { host: "{{ katrin_node }}", ports: [ 80/8080, 443/8043 ] } + sched: { replicas: 1, selector: { master: 1 } } + selector: { master: 1 } + images: + - image: "httpd:2.2" + mappings: + - { name: "etc", path: "apache2-kaas-docker", mount: "/usr/local/apache2/conf" } + - { name: "www", path: "kaas", mount: "/usr/local/apache2/htdocs" } + - { name: "log", path: "apache2-kaas", mount: "/usr/local/apache2/logs" } + probes: + - { port: 8080, path: '/index.html' } + + + + \ No newline at end of file diff --git a/docs/troubleshooting.txt b/docs/troubleshooting.txt new file mode 100644 index 0000000..b4ac8e7 --- /dev/null +++ b/docs/troubleshooting.txt @@ -0,0 +1,210 @@ +The services has to be running +------------------------------ + Etcd: + - etcd + + Node: + - origin-node + + Master nodes: + - origin-master-api + - origin-master-controllers + - origin-master is not running + + Required Services: + - lvm2-lvmetad.socket + - lvm2-lvmetad.service + - docker + - NetworkManager + - firewalld + - dnsmasq + - openvswitch + + Extra Services: + - ssh + - ntp + - openvpn + - ganesha (on master nodes, optional) + +Pods has to be running +---------------------- + Kubernetes System + - kube-service-catalog/apiserver + - kube-service-catalog/controller-manager + + OpenShift Main Services + - default/docker-registry + - default/registry-console + - default/router (3 replicas) + - openshift-template-service-broker/api-server (daemonset, on all nodes) + + OpenShift Secondary Services + - openshift-ansible-service-broker/asb + - openshift-ansible-service-broker/asb-etcd + + GlusterFS + - glusterfs-storage (daemonset, on all storage nodes) + - glusterblock-storage-provisioner-dc + - heketi-storage + + Metrics (openshift-infra): + - hawkular-cassandra + - hawkular-metrics + - heapster + + +Debugging +========= + - Ensure system consistency as explained in 'consistency.txt' (incomplete) + - Check current pod logs and possibly logs for last failed instance + oc logs 
<pod> --tail=100 [-p]       - dc/<name> or ds/<name> work as well
+ - Verify the initialization steps (check that all volumes are mounted)
+     oc describe <pod>
+ - It is worth looking at the pod environment
+     oc env po <pod> --list
+ - It is worth connecting to the running container with an 'rsh' session and looking at the running
+   processes, internal logs, etc. The 'debug' session will start a new instance of the pod.
+ - It is worth checking whether the corresponding pv/pvc are bound. Check the logs for the pv.
+   * Even if the 'pvc' is bound, the 'pv' may have problems with its backend.
+   * Check the logs here: /var/lib/origin/plugins/kubernetes.io/glusterfs/
+ - Another frequent problem is a failing 'postStart' hook or 'livenessProbe'. As the pod
+   immediately crashes it is not possible to connect. Remedies are:
+   * Set a larger initial delay before checking the probe.
+   * Try to remove the hook and execute it using 'rsh'/'debug'
+ - Determine the node running the pod and check the host logs in '/var/log/messages'
+   * Particularly the logs of 'origin-master-controllers' are of interest
+ - Check which docker images are actually downloaded on the node
+     docker images
+
+network
+=======
+ - There is a NetworkManager script which should adjust /etc/resolv.conf to use the local dnsmasq server.
+   This is based on '/etc/NetworkManager/dispatcher.d/99-origin-dns.sh' which does not play well
+   if OpenShift is running on a non-default network interface. I provided a patched version, but it is
+   worth verifying
+   * that the nameserver points to the host itself (but not localhost, this is important
+     to allow running pods to use it)
+   * that the correct upstream nameservers are listed in '/etc/dnsmasq.d/origin-upstream-dns.conf'
+   * In some cases, it was necessary to restart dnsmasq (but it could also be for different reasons)
+   If the script misbehaves, it is possible to call it manually like this:
+     DEVICE_IFACE="eth1" ./99-origin-dns.sh eth1 up
+
+
+etcd (and general operability)
+====
+ - A few of these services may seem to be running according to 'systemctl', but actually misbehave. Then, it
+   may be necessary to restart them manually. I have noticed this with
+   * lvm2-lvmetad.socket (pvscan will complain about problems)
+   * origin-node
+   * etcd, but BEWARE of too enthusiastic restarting:
+     - However, restarting etcd many times is BAD as it may trigger a severe problem with
+       'kube-service-catalog/apiserver'. The bug description is here
+         https://github.com/kubernetes/kubernetes/issues/47131
+     - Due to the problem mentioned above, all 'oc' queries get very slow. There is no proper
+       solution suggested. But killing the 'kube-service-catalog/apiserver' pod helps for a while.
+       The pod is restarted and response times are back in order.
+   * Another way to see this problem is querying the 'healthz' service, which will tell that
+     there are too many clients and to please retry later.
+       curl -k https://apiserver.kube-service-catalog.svc/healthz
+
+ - On a node crash, the etcd database may get corrupted.
+   * There is no easy fix. Backup/restore is not working.
+   * The easiest option is to remove the failed etcd from the cluster.
+       etcdctl3 --endpoints="192.168.213.1:2379" member list
+       etcdctl3 --endpoints="192.168.213.1:2379" member remove <member_id>
+   * Add it to the [new_etcd] section in the inventory and run openshift-etcd to scale up the etcd cluster.
+
+ - There is a health check provided by the cluster
+     curl -k https://apiserver.kube-service-catalog.svc/healthz
+   It may complain about etcd problems. It seems to be triggered by the OpenShift upgrade. The real cause and
+   remedy are unclear, but the installation is mostly working.
Discussion is in docs/upgrade.txt + + - There is also a different etcd which is integral part of the ansible service broker: + 'openshift-ansible-service-broker/asb-etcd'. If investigated with 'oc logs' it complains + on: + 2018-03-07 20:54:48.791735 I | embed: rejected connection from "127.0.0.1:43066" (error "tls: failed to verify client's certificate: x509: certificate signed by unknown authority", ServerName "") + WARNING: 2018/03/07 20:54:48 Failed to dial 0.0.0.0:2379: connection error: desc = "transport: authentication handshake failed: remote error: tls: bad certificate"; please retry. + Nevertheless, it seems working without much trouble. The error message seems caused by + certificate verification code which introduced in etcd 3.2. There are multiple bug repports on + the issue. + +pods (failed pods, rogue namespaces, etc...) +==== + - After crashes / upgrades some pods may end up in 'Error' state. This is quite often happen to + * kube-service-catalog/controller-manager + * openshift-template-service-broker/api-server + Normally, they should be deleted. Then, OpenShift will auto-restart pods and they likely will run without problems. + for name in $(oc get pods -n openshift-template-service-broker | grep Error | awk '{ print $1 }' ); do oc -n openshift-template-service-broker delete po $name; done + for name in $(oc get pods -n kube-service-catalog | grep Error | awk '{ print $1 }' ); do oc -n kube-service-catalog delete po $name; done + + - Other pods will fail with 'ImagePullBackOff' after cluster crash. The problem is that ImageStreams populated by 'builds' will + not be recreated automatically. By default OpenShift docker registry is stored on ephemeral disks and is lost on crash. The build should be + re-executed manually. + oc -n adei start-build adei + + - Furthermore, after long outtages the CronJobs will stop functioning. The reason can be found by analyzing '/var/log/messages' or specially + systemctl status origin-master-controllers + it will contain something like: + 'Cannot determine if / needs to be started: Too many missed start time (> 100). Set or decrease .spec.startingDeadlineSeconds or check clock skew.' + * The reason is that after 100 missed (or failed) launch periods it will stop trying to avoid excive load. The remedy is set 'startingDeadlineSeconds' + which tells the system that if cronJob has failed to start in the allocated interval we stop trying until the next start period. Then, 100 is only + counted the specified period. I.e. we should set period bellow the 'launch period / 100'. + https://github.com/kubernetes/kubernetes/issues/45825 + * The running CronJobs can be easily patched with + oc -n adei patch cronjob/adei-autogen-update --patch '{ "spec": {"startingDeadlineSeconds": 120 }}' + + - Sometimes there is rogue namespaces in 'deleting' state. This is also hundreds of reasons, but mainly + * Crash of both masters during population / destruction of OpenShift resources + * Running of 'oc adm diagnostics' + It is unclear how to remove them manually, but it seems if we run + * OpenShift upgrade, the namespaces are gone (but there could be a bunch of new problems). + * ... i don't know if install, etc. May cause the trouble... + + - There is also rogue pods (mainly due to some problems with unmounting lost storage), etc. If 'oc delete' does not + work for a long time. 
It worth + * Determining the host running failed pod with 'oc get pods -o wide' + * Going to the pod and killing processes and stopping the container using docker command + * Looking in the '/var/lib/origin/openshift.local.volumes/pods' for the remnants of the container + - This can be done with 'find . -name heketi*' or something like... + - There could be problematic mounts which can be freed with lazy umount + - The folders for removed pods may (and should) be removed. + + - Looking into the '/var/log/messages', it is sometimes possible to spot various erros like + * Orphaned pod "212074ca-1d15-11e8-9de3-525400225b53" found, but volume paths are still present on disk. + The volumes can be removed in '/var/lib/origin/openshift.local.volumes/pods' on the corresponding node + * PodSandbox "aa28e9c7605cae088838bb4c9b92172083680880cd4c085d93cbc33b5b9e8910" from runtime service failed: ... + - We can find and remove the corresponding container (the short id is just first letters of the long id) + docker ps -a | grep aa28e9c76 + docker rm + - We further can just destroy all containers which are not running (it will actually try to remove all, + but just error message will be printed for running ones) + docker ps -aq --no-trunc | xargs docker rm + + +Storage +======= + - Running a lot of pods may exhaust available storage. It worth checking if + * There is enough Docker storage for containers (lvm) + * There is enough Heketi storage for dynamic volumes (lvm) + * The root file system on nodes still has space for logs, etc. + Particularly there is a big problem for ansible-ran virtual machines. The system disk is stored + under '/root/VirtualBox VMs' and is not cleaned/destroyed unlike second hard drive on 'vagrant + destroy'. So, it should be cleaned manually. + + - Problems with pvc's can be evaluated by running + oc -n openshift-ansible-service-broker describe pvc etcd + Furthermore it worth looking in the folder with volume logs. For each 'pv' it stores subdirectories + with pods executed on this host which are mount this pod and holds the log for this pods. + /var/lib/origin/plugins/kubernetes.io/glusterfs/ + + - Heketi is problematic. + * Worth checking if topology is fine and running. + heketi-cli -s http://heketi-storage-glusterfs.openshift.suren.me --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)" + - Furthermore, the heketi gluster volumes may be started, but with multiple bricks offline. This can + be checked with + gluster volume status detail + * If not all bricks online, likely it is just enought to restart the volume + gluster volume stop + gluster volume start + * This may break services depending on provisioned 'pv' like 'openshift-ansible-service-broker/asb-etcd' + diff --git a/docs/upgrade.txt b/docs/upgrade.txt new file mode 100644 index 0000000..b4f22d6 --- /dev/null +++ b/docs/upgrade.txt @@ -0,0 +1,64 @@ +Upgrade +------- + - The 'upgrade' may break things causing long cluster outtages or even may require a complete re-install. + Currently, I found problem with 'kube-service-catalog', but I am not sure problems are limited to it. + Furthermore, we currently using 'latest' tag of several docker images (heketi is example of a critical + service on the 'latest' tag). Update may break things down. + +kube-service-catalog +-------------------- + - Update of 'kube-service-catalog' breaks OpenShift health check + curl -k https://apiserver.kube-service-catalog.svc/healthz + It complains on 'etcd'. 
The specific etcd check
+     curl -k https://apiserver.kube-service-catalog.svc/healthz/etcd
+   reports that all servers are unreachable.
+
+ - In fact etcd is working and the cluster is mostly functional. Occasionally, it may suffer from the bug
+   described here:
+     https://github.com/kubernetes/kubernetes/issues/47131
+   The 'oc' queries are extremely slow and the healthz service reports that there are too many connections.
+   Killing the 'kube-service-catalog/apiserver' pod helps for a while, but the problem returns occasionally.
+
+ - The information below is an attempt to understand the reason. In fact, it is a list of things that
+   are NOT the reason. The only solution found is to prevent the update of 'kube-service-catalog' by setting
+     openshift_enable_service_catalog: false
+
+ - The problem only occurs if the 'openshift_service_catalog' role is executed. It results in some
+   miscommunication of 'apiserver' and/or 'controller-manager' with etcd. Still, the cluster is
+   operational, so the connection is not completely lost, but it is not working as expected in some
+   circumstances.
+
+ - There are no significant changes. Exactly the same docker images are installed. The only change in
+   '/etc' is the updated certificates used by 'apiserver' and 'controller-manager'.
+   * The certificates are located in '/etc/origin/service-catalog/' on the first master server.
+     'oc adm ca' is used for generation. However, the certificates in this folder are not used directly. They
+     are merely temporary files used to generate 'secrets/service-catalog-ssl' which is used by
+     'apiserver' and 'controller-manager'. The provisioning code is in:
+       openshift-ansible/roles/openshift_service_catalog/tasks/generate_certs.yml
+     It can't be disabled completely as the registered 'apiserver_ca' variable is used in install.yml, but
+     the actual generation can be skipped and the old files re-used to generate the secret.
+   * I have tried to modify the role to keep the old certificates. The healthz check was still broken afterwards.
+     So, this update is not the problem (or at least not the sole problem).
+
+ - The 'etcd' cluster seems OK. On all nodes, etcd can be verified using
+     etcdctl3 member list
+   * The last command is actually a bash alias which executes
+       ETCDCTL_API=3 /usr/bin/etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints https://`hostname`:2379 member list
+     Actually, etcd is serving two ports: 2379 (clients) and 2380 (peers). One idea was that maybe the
+     second port had problems. I tried to change 2379 to 2380 in the command above and it was failing.
+     However, it does not work either when the cluster is in a healthy state.
+   * One idea was that the certificates are re-generated for the wrong IPs/names and, hence, certificate
+     validation fails. Or that the originally generated CA is registered with etcd. This is certainly not
+     the (only) issue as the problem persists even if we keep the certificates intact. However, I also
+     verified that the newly generated certificates are essentially identical to the old ones and contain
+     the correct hostnames.
+   * The last idea was that actually 'asb-etcd' is broken. It complains:
+       2018-03-07 20:54:48.791735 I | embed: rejected connection from "127.0.0.1:43066" (error "tls: failed to verify client's certificate: x509: certificate signed by unknown authority", ServerName "")
+     However, the same error is present in the log directly after the install while the cluster is completely
+     healthy.
+
+ - The networking also seems not to be an issue. The configurations during install and upgrade are exactly the same.
+ - The networking also seems not to be an issue. The configurations during install and upgrade are exactly
+   the same. All names are defined in /etc/hosts. Furthermore, the names in /etc/hosts are resolved (and
+   back-resolved) by the provided dnsmasq server. I.e. ipeshift1 resolves to 192.168.13.1 using nslookup and
+   192.168.13.1 resolves back to ipeshift1. So, the configuration is indistinguishable from a proper one with
+   properly configured DNS.
+
+
\ No newline at end of file
diff --git a/group_vars/OSEv3.yml b/group_vars/OSEv3.yml index d896677..20bfece 100644 --- a/group_vars/OSEv3.yml +++ b/group_vars/OSEv3.yml @@ -1,14 +1,8 @@ ### Deployment Type openshift_deployment_type: origin openshift_master_cluster_method: "native" -#openshift_release: "v1.5" openshift_release: "v3.7.1" -#openshift_release: "v3.7" -#openshift_image_tag: "v1.5.0-rc.0" #openshift_image_tag: "v3.7.1" -#openshift_pkg_version=-3.7.0 -#openshift_hosted_metrics_deployer_version: "v1.5.0-rc.0" -#openshift_hosted_metrics_deployer_version: "v3.7.1" #containerized: true containerized: false @@ -18,9 +12,32 @@ os_firewall_use_firewalld: true #enable_excluders: false #enable_docker_excluder: false +### Versions +#system packages +#etcd_version="3.1.0" +#docker_version="1.12.1" + +#for some package only latest is available +#openshift_pkg_version=-3.7.0 +#openshift_cockpit_deployer_version=latest +#openshift_metrics_image_prefix=docker.io/openshift/origin- +#openshift_metrics_image_version=v3.7 +#openshift_logging_image_prefix=docker.io/openshift/origin- +#openshift_logging_image_version=v3.7.0 +#openshift_service_catalog_image_prefix=docker.io/openshift/origin- +#openshift_service_catalog_image_version=v3.7.1 +#template_service_broker_version='v3.7' +#ansible_service_broker_image_prefix: ansibleplaybookbundle/ +#ansible_service_broker_registry_url: "registry.access.redhat.com" +ansible_service_broker_etcd_image_tag: v3.2 + +#test +#openshift_enable_service_catalog: false + + ### Network & DNS configuration -openshift_master_cluster_hostname: "{{ ands_openshift_cluster_fqdn }}" +openshift_master_cluster_hostname: "{{ ands_use_inner_lb | ternary(ands_inner_lb_fqdn, ands_openshift_lb) }}" openshift_master_cluster_public_hostname: "{{ ands_openshift_lb }}" openshift_master_default_subdomain: "{{ ands_openshift_subdomain | default(ands_openshift_lb) }}" openshift_master_ingress_ip_network_cidr: "{{ ands_openshift_ingress_network }}" @@ -30,8 +47,8 @@ openshift_master_ingress_ip_network_cidr: "{{ ands_openshift_ingress_network }}" # we may need to put conditionals here (except _ip). Currently values set to '' if undifined (OpenShift uses None which is equivalent in ansible) openshift_ip: "{{ ands_openshift_ip }}" openshift_public_ip: "{{ ands_openshift_public_ip }}" -openshift_hostname: "{{ ands_openshift_fqdn }}" -openshift_public_hostname: "{{ ands_openshift_public_fqdn }}" +openshift_hostname: "{{ ands_openshift_set_hostname | ternary(ands_openshift_fqdn, ands_none) }}" +openshift_public_hostname: "{{ ands_openshift_set_public_hostname | ternary(ands_openshift_public_fqdn, ands_none) }}" #Check configuration to fight dynamic IPs @@ -68,10 +85,35 @@ openshift_docker_log_options: [ max-size=2m, max-file=3 ] openshift_docker_options: --log-driver json-file #openshift_docker_options: --log-opt max-size=2m --log-opt max-file=3 +### Registry +openshift_hosted_registry_storage_kind: glusterfs +openshift_hosted_registry_storage_class: glusterfs-storage +openshift_hosted_registry_storage_volume_size: "{{ ands_registry_volume_size }}" + +# By default dynamic provisioning is not used.
The 'openshift_persistent_volumes' role creates pvc/pv pair if the following +# variables set. The volumes are called 'registry-claim' and 'registry-volume'. The 'openshift_storage_glusterfs' creates +# the corresponding volume using heketi (this can't be disabled, so we patched to skip if openshift_hosted_registry_storage_class set). +# Finally, 'openshift_hosted' role creates the corresponding endpoints (this only happens if ..._ips are set). +# Alternative is triggered if 'openshift_hosted_registry_storage_glusterfs_swap' is set. The 'openshift_persistent_volumes' creates +# registry-glusterfs-claim/registry-volume pair. 'openshift_hosted' role, then, tries first to copy data from the current volume, but +# this path is pretty much broken. +# I have introduced 'openshift_hosted_registry_storage_class' and blocked if it set creatin of above-said components which are not +# possible to disable with variable bellow. Furthermore, I added a simple 'pvc' based on dynamic provisioning to 'openshift_persistent_volumes'. +openshift_hosted_registry_storage_create_pv: false +openshift_hosted_registry_storage_create_pvc: false + +# This is an alternative to go standard way. All above should be commented, then. +# volume size should be given as plain number (without G) if we go without 'sc'. +#openshift_hosted_registry_storage_glusterfs_path: openshift_registry +#openshift_hosted_registry_storage_glusterfs_ips: "{{ openshift_storage_nodes }}" + ### Dynamic Storage openshift_storage_glusterfs_image: chsa/gluster-centos openshift_storage_glusterfs_version: "{{ glusterfs_version }}" - +#Either 5 or 6 corresponds to latest +#openshift_storage_glusterfs_heketi_version: 6 +#Only latest +#openshift_storage_glusterfs_block_version: latest #openshift_storage_glusterfs_version: '3.12.5' # Latest 3.10.1 #openshift_storage_glusterfs_is_native: True @@ -113,6 +155,3 @@ openshift_install_examples: true # Required for IPFailover openshift_clock_enabled: true - -#This is required by OpenShift upgrade (may be something else) -g_ssh_user: "{{ ansible_ssh_user }}" diff --git a/group_vars/ands.yml b/group_vars/ands.yml index d81f11e..faacc40 100644 --- a/group_vars/ands.yml +++ b/group_vars/ands.yml @@ -1,10 +1,3 @@ -ands_configure_heketi: false - -# This should be here, the variables from the role are not propogated to hostvars -#ands_master_id: "{{ ('masters' in group_names) | ternary(groups.masters.index(('masters' in group_names) | ternary(inventory_hostname, groups.masters[0])), -1) }}" -ands_storage_hostname: "{{ ands_storage_network | default(false) | ternary(ands_storage_network | default('') | ipaddr(ands_host_id) | ipaddr('address'), ansible_fqdn) }}" - - ands_repo_url: http://ufo.kit.edu/ands/repos ands_repositories: - name: ands-updates diff --git a/group_vars/staging.yml b/group_vars/staging.yml index 34bf7c7..00ec146 100644 --- a/group_vars/staging.yml +++ b/group_vars/staging.yml @@ -11,13 +11,9 @@ ands_openshift_public_network: 192.168.226.0/24 ands_openshift_ingress_network: 192.168.216.0/24 ands_inner_domain: "" -#ands_inner_lb: true -#ands_openshift_set_hostname: false - -ands_inner_lb: false +ands_use_inner_lb: true ands_openshift_set_hostname: true - #ands_ipfailover_interface: eth1 ands_ipfailover_vips: [141.52.64.28/23] diff --git a/group_vars/testing.yml b/group_vars/testing.yml index 72b2dba..f7e04cf 100644 --- a/group_vars/testing.yml +++ b/group_vars/testing.yml @@ -1,17 +1,20 @@ ands_storage_network: 192.168.12.0/24 ands_cluster_domain: ipe.kit.edu -ands_openshift_lb: katrin.suren.me 
-#ands_openshift_subdomain: katrin.suren.me -ands_openshift_subdomain: apps.suren.me -#ands_openshift_network: 192.168.26.0/24 +ands_hostname_template: ipekatrin +ands_openshift_lb: kaas.kit.edu +ands_openshift_subdomain: kaas.kit.edu ands_openshift_network: 192.168.13.0/24 ands_openshift_public_network: 192.168.26.0/24 ands_openshift_ingress_network: 192.168.16.0/24 -ands_hostname_template: ipekatrin +#ands_inner_domain: "" +ands_openshift_set_hostname: false +# if we provision inner_lb (default), we can turn it on and just re-run ands_network role (or maintain play) +ands_use_inner_lb: false + -ands_ipfailover_interface: eth1 +#ands_ipfailover_interface: eth1 ands_ipfailover_vips: [141.52.64.15/23, 141.52.64.17/23] katrin_openvpn_subnet_bits: 24 diff --git a/group_vars/virtual.yml b/group_vars/virtual.yml index f76bafc..7a61a55 100644 --- a/group_vars/virtual.yml +++ b/group_vars/virtual.yml @@ -1,10 +1,10 @@ glusterfs_transport: tcp ands_data_device: "/dev/sdb" -ands_data_volume_size: "20G" -ands_heketi_volume_size: "20G" +ands_data_volume_size: "15G" +ands_heketi_volume_size: "25G" +ands_registry_volume_size: "5G" docker_storage_device: "/dev/sdb" docker_storage_vg: "ands" -ands_host_id: "{{ ansible_hostname | regex_replace('^[\\w\\d]*\\w(\\d+)(\\.|$)', '\\1') }}" diff --git a/inventories/staging.erb b/inventories/staging.erb index dc3bcb2..aa9e935 100644 --- a/inventories/staging.erb +++ b/inventories/staging.erb @@ -1,46 +1,52 @@ [masters] 192.168.226.[1:2] +[etcd] +192.168.226.[1:3] + [simple_storage_nodes] 192.168.226.[3:3] -[external_storage_servers] -#192.168.226.[4:4] - [simple_nodes] +[external_storage_servers] + [staging:children] nodes +new_nodes +etcd +new_etcd external_storage_servers vagrant [virtual:children] nodes +new_nodes +etcd +new_etcd external_storage_servers - [OSEv3:children] masters +new_masters nodes +new_nodes etcd +new_etcd -[glusterfs:children] -masters -simple_storage_nodes - -[etcd:children] +[nodes:children] masters simple_storage_nodes +simple_nodes [storage_nodes:children] masters +new_masters simple_storage_nodes +new_simple_storage_nodes -[nodes:children] -masters -simple_storage_nodes -simple_nodes - +[glusterfs:children] +storage_nodes #[lb] #master1.example.com @@ -49,9 +55,11 @@ simple_nodes #[glusterfs_registry] #192.168.10.14 glusterfs_ip=192.168.10.14 glusterfs_devices='[ "/dev/xvdc", "/dev/xvdd" ]' - [ands_servers:children] nodes +new_nodes +etcd +new_etcd external_storage_servers [ands_storage_servers:children] @@ -63,3 +71,13 @@ ands_servers [vagrant] ipepdvcompute3.ipe.kit.edu vagrant_project=staging + +[new_masters] +[new_etcd] +[new_simple_storage_nodes] +[new_simple_nodes] + +[new_nodes:children] +new_masters +new_simple_storage_nodes +new_simple_nodes diff --git a/inventories/testing.erb b/inventories/testing.erb index b8b5f48..f9d27ae 100644 --- a/inventories/testing.erb +++ b/inventories/testing.erb @@ -1,50 +1,66 @@ [masters] -ipekatrin[1:2].katrin.kit.edu +ipekatrin[1:2].ipe.kit.edu -[simple_storage_nodes] -ipekatrin[3:3].katrin.kit.edu -#ipetest.katrin.kit.edu ands_host_id=5 +[etcd] +ipekatrin[1:3].ipe.kit.edu -[external_storage_servers] -#ipekatrin[4:4].katrin.kit.edu +[simple_storage_nodes] +ipekatrin[3:3].ipe.kit.edu [simple_nodes] -#ipekatrin[3:3].katrin.kit.edu -#strnage_name.katrin.kit.edu ands_host_id=1 +#ipecompute1.katrin.kit.edu ands_host_id=4 + +[external_storage_servers] [testing:children] nodes +new_nodes +etcd +new_etcd external_storage_servers vagrant [virtual:children] nodes +new_nodes +etcd +new_etcd 
external_storage_servers [OSEv3:children] masters +new_masters nodes +new_nodes etcd +new_etcd -[glusterfs:children] -masters -simple_storage_nodes - -[etcd:children] +[nodes:children] masters simple_storage_nodes +simple_nodes [storage_nodes:children] masters +new_masters simple_storage_nodes +new_simple_storage_nodes -[nodes:children] -masters -simple_storage_nodes -simple_nodes +[glusterfs:children] +storage_nodes + +#[lb] +#master1.example.com +#[nfs] +#master1.example.com +#[glusterfs_registry] +#192.168.10.14 glusterfs_ip=192.168.10.14 glusterfs_devices='[ "/dev/xvdc", "/dev/xvdd" ]' [ands_servers:children] nodes +new_nodes +etcd +new_etcd external_storage_servers [ands_storage_servers:children] @@ -56,3 +72,13 @@ ands_servers [vagrant] ipepdvcompute3.ipe.kit.edu vagrant_project=testing + +[new_masters] +[new_etcd] +[new_simple_storage_nodes] +[new_simple_nodes] + +[new_nodes:children] +new_masters +new_simple_storage_nodes +new_simple_nodes diff --git a/opts.sh b/opts.sh index ac1962a..9cfaf86 100644 --- a/opts.sh +++ b/opts.sh @@ -80,9 +80,17 @@ apply() { hosts: $group remote_user: root roles: - - ands_facts + - { role: ands_facts } + - { role: ands_network, action: install_pre } +- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml - import_playbook: ../$action + +- name: Common setup procedures + hosts: $group + remote_user: root + roles: + - { role: ands_network, action: install_post } END playbook="playbooks/tmp_play.yml" clean="playbooks/tmp_play.*" diff --git a/playbooks/ands-gluster-ganesha.yml b/playbooks/ands-gluster-ganesha.yml index 586dd07..a347c4f 100644 --- a/playbooks/ands-gluster-ganesha.yml +++ b/playbooks/ands-gluster-ganesha.yml @@ -7,6 +7,7 @@ - name: Configure GlusterFS cluster hosts: masters, new_masters roles: + - { role: ands_network, action: ganesha } - { role: glusterfs, action: ganesha } - { role: ganesha } vars: diff --git a/playbooks/ands-gluster.yml b/playbooks/ands-gluster.yml index 8aa30fc..6e71b55 100644 --- a/playbooks/ands-gluster.yml +++ b/playbooks/ands-gluster.yml @@ -3,7 +3,6 @@ roles: - role: ands_facts - - name: Configure GlusterFS cluster hosts: ands_servers roles: @@ -13,3 +12,8 @@ glusterfs_servers: "{{ ands_storage_servers }}" glusterfs_bricks_path: "{{ ands_data_path }}/glusterfs" glusterfs_domains: "{{ ands_storage_domains }}" + +- name: Configure Backup + hosts: ands_servers + roles: + - role: ands_backup diff --git a/playbooks/ands-prepare.yml b/playbooks/ands-prepare.yml index d198ec0..239d292 100644 --- a/playbooks/ands-prepare.yml +++ b/playbooks/ands-prepare.yml @@ -11,10 +11,15 @@ - name: Common setup procedures hosts: ands roles: - - role: common - - role: firewall + - { role: ands_common } + - { role: firewall } - { role: ands_network, action: common } +- name: Setup NTP + hosts: ands:!virtual + roles: + - role: ntp + - name: Keepalived service hosts: masters roles: @@ -22,7 +27,7 @@ #OpenVPN started before Origin-node causes problems #- name: OpenVPN service -# hosts: nodes, new_nodes +# hosts: nodes:new_nodes # roles: # - role: openvpn # vars: @@ -36,7 +41,7 @@ - role: ands_storage - name: Docker setup - hosts: nodes, new_nodes + hosts: nodes:new_nodes roles: - role: docker vars: diff --git a/playbooks/openshift-add-masters.yml b/playbooks/openshift-add-masters.yml index 99672d0..6878137 100644 --- a/playbooks/openshift-add-masters.yml +++ b/playbooks/openshift-add-masters.yml @@ -2,7 +2,7 @@ hosts: nodes:new_nodes roles: - { role: ands_facts } - - { role: common, os_update: true } + - { role: 
ands_common, os_update: true } - { role: ands_network, action: install_pre } # etcd will provisioned as well if node is listed in new_etcd diff --git a/playbooks/openshift-add-nodes.yml b/playbooks/openshift-add-nodes.yml index c788e12..3d3efc4 100644 --- a/playbooks/openshift-add-nodes.yml +++ b/playbooks/openshift-add-nodes.yml @@ -2,7 +2,7 @@ hosts: nodes:new_nodes roles: - { role: ands_facts } - - { role: common, os_update: true } + - { role: ands_common, os_update: true } - { role: ands_network, action: install_pre } # I am not sure if etcd will be automatic here. If not, we may need to run etcd scaleup afterwards diff --git a/playbooks/openshift-deploy-cluster.yml b/playbooks/openshift-deploy-cluster.yml deleted file mode 120000 index 2a18fca..0000000 --- a/playbooks/openshift-deploy-cluster.yml +++ /dev/null @@ -1 +0,0 @@ -../anslib/openshift-ansible/playbooks/deploy_cluster.yml \ No newline at end of file diff --git a/playbooks/openshift-install-service-catalog.yml b/playbooks/openshift-install-service-catalog.yml new file mode 100644 index 0000000..b6c0a10 --- /dev/null +++ b/playbooks/openshift-install-service-catalog.yml @@ -0,0 +1,13 @@ +- name: Configure cluster hosts names + hosts: nodes + roles: + - { role: ands_facts } + - { role: ands_network, action: install_pre } + +- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml +- import_playbook: ../anslib/openshift-ansible/playbooks/openshift-service-catalog/config.yml + +- name: Configure cluster hosts names + hosts: nodes + roles: + - { role: ands_network, action: install_post } diff --git a/playbooks/openshift-redeploy-certificates.yml b/playbooks/openshift-redeploy-certificates.yml deleted file mode 120000 index f812372..0000000 --- a/playbooks/openshift-redeploy-certificates.yml +++ /dev/null @@ -1 +0,0 @@ -../anslib/openshift-ansible/playbooks/redeploy-certificates.yml \ No newline at end of file diff --git a/playbooks/openshift-redeploy-certificates.yml b/playbooks/openshift-redeploy-certificates.yml new file mode 100644 index 0000000..682468f --- /dev/null +++ b/playbooks/openshift-redeploy-certificates.yml @@ -0,0 +1,13 @@ +- name: Configure cluster hosts names + hosts: nodes + roles: + - { role: ands_facts } + - { role: ands_network, action: install_pre } + +- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml +- import_playbook: ../anslib/openshift-ansible/playbooks/redeploy-certificates.yml + +- name: Configure cluster hosts names + hosts: nodes + roles: + - { role: ands_network, action: install_post } diff --git a/playbooks/openshift-setup-project.yml b/playbooks/openshift-setup-project.yml index 6150cdf..a4666e3 100644 --- a/playbooks/openshift-setup-project.yml +++ b/playbooks/openshift-setup-project.yml @@ -1,5 +1,5 @@ - name: Analyze Ands configuration - hosts: masters + hosts: nodes roles: - { role: ands_facts } diff --git a/playbooks/openshift-setup-projects.yml b/playbooks/openshift-setup-projects.yml index 689ecb4..164f91c 100644 --- a/playbooks/openshift-setup-projects.yml +++ b/playbooks/openshift-setup-projects.yml @@ -1,5 +1,5 @@ - name: Analyze Ands configuration - hosts: masters + hosts: nodes roles: - { role: ands_facts } diff --git a/playbooks/openshift-setup-security.yml b/playbooks/openshift-setup-security.yml index f576ba5..ba96354 100644 --- a/playbooks/openshift-setup-security.yml +++ b/playbooks/openshift-setup-security.yml @@ -1,5 +1,5 @@ - name: Analyze Ands configuration - hosts: masters + hosts: nodes roles: - { role: ands_facts } diff --git 
a/playbooks/openshift-setup-users.yml b/playbooks/openshift-setup-users.yml index f54a806..998dd59 100644 --- a/playbooks/openshift-setup-users.yml +++ b/playbooks/openshift-setup-users.yml @@ -1,5 +1,5 @@ - name: Analyze Ands configuration - hosts: masters + hosts: nodes roles: - { role: ands_facts } diff --git a/playbooks/openshift-upgrade.yml b/playbooks/openshift-upgrade.yml index f2680ab..dd60639 100644 --- a/playbooks/openshift-upgrade.yml +++ b/playbooks/openshift-upgrade.yml @@ -3,5 +3,16 @@ roles: - { role: ands_facts } # - { role: ands_openshift, subrole: hostnames } + - { role: ands_network, action: install_pre } +- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml + +# Updating service catalog breaks etcd health checks (see docs/upgrade.txt) - import_playbook: ../anslib/openshift-ansible/playbooks/byo/openshift-cluster/upgrades/v3_7/upgrade.yml + vars: + openshift_enable_service_catalog: false + +- name: Configure cluster hosts names + hosts: nodes + roles: + - { role: ands_network, action: install_post } diff --git a/roles/ands_backup/defaults/main.yml b/roles/ands_backup/defaults/main.yml new file mode 100644 index 0000000..33d1ff1 --- /dev/null +++ b/roles/ands_backup/defaults/main.yml @@ -0,0 +1,9 @@ +ands_script_path: "/opt/scripts" + +ands_backup_frequency: "17 */4 * * *" +ands_backup_volume: "{{ ands_paths.provision }}" +ands_backup_path: "{{ ands_backup_volume }}/backup" +ands_backup_clean_minutes: "720" +ands_borg_path: "{{ ands_backup_volume }}/borg" +ands_borg_args: "-C zlib,6 -x" +ands_borg_prune: "--keep-daily=7 --keep-weekly=4 --keep-monthly=6 --keep-within 1w" diff --git a/roles/ands_backup/tasks/main.yml b/roles/ands_backup/tasks/main.yml new file mode 100644 index 0000000..16a8ec3 --- /dev/null +++ b/roles/ands_backup/tasks/main.yml @@ -0,0 +1,29 @@ +- name: Install required packages + package: name={{item}} state=present + with_items: + - borgbackup + - heketi-client + +- name: Create scripts directory + file: path="{{ ands_script_path }}" state=directory + +- name: Populate backup script + template: src=backup.sh.j2 dest="{{ ands_script_path }}/ands_backup.sh" owner=root group=root mode=0755 + +- name: Populate cron job + template: src=backup.cron.j2 dest="/etc/cron.d/9ands_backup" owner=root group=root mode=0644 + + +- name: Check if backup volume is mounted + command: mountpoint -q "{{ ands_backup_volume }}" + + +- block: + - name: Check if borg is already initialized + stat: path="{{ ands_borg_path }}/config" + register: borg_stat_res + + - name: Initialize borg repository + shell: "borg init {{ ands_borg_path }} --encryption=none" + when: not borg_stat_res.stat.exists + run_once: true diff --git a/roles/ands_backup/templates/backup.cron.j2 b/roles/ands_backup/templates/backup.cron.j2 new file mode 100644 index 0000000..5c017b8 --- /dev/null +++ b/roles/ands_backup/templates/backup.cron.j2 @@ -0,0 +1,4 @@ +SHELL=/bin/bash +PATH=/sbin:/bin:/usr/sbin:/usr/bin +MAILTO=root +{{ ands_backup_frequency }} root /bin/bash {{ ands_script_path }}/ands_backup.sh diff --git a/roles/ands_backup/templates/backup.sh.j2 b/roles/ands_backup/templates/backup.sh.j2 new file mode 100755 index 0000000..74fff85 --- /dev/null +++ b/roles/ands_backup/templates/backup.sh.j2 @@ -0,0 +1,72 @@ +#! 
/bin/bash + +date=$(date -u "+%Y%m%d_%H%M%S") +hostname=$(hostname) + +volume_path="{{ ands_backup_volume }}" +host_path="{{ ands_backup_path }}/${hostname}" +backup_path="${host_path}/${date}" +borg_path="{{ ands_borg_path }}" + +borg_args="{{ ands_borg_args }}" +borg_prune_args="{{ ands_borg_prune }}" + +etcdctl3 () { + ETCDCTL_API=3 /usr/bin/etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints "https://${hostname}:2379" ${@} +} + + +check=$(df | awk '{ print $6 }' | grep -P "^${volume_path}$") +[ $? -ne 0 -o -z "$check" ] && { echo "The volume $volume_path is not mounted. Skipping..." ; exit 1 ; } + +[ -d "$backup_path" ] && { echo "Something wrong, path $backup_path already exists..." ; exit 1 ; } + +# Check the provision volume is mounted +mkdir -p "$backup_path" || { echo "Can't create ${backup_path}" ; exit 1 ; } + +{% if 'masters' in group_names %} +# etcd +mkdir -p "$backup_path/etcd" || { echo "Can't create ${backup_path}/etcd" ; exit 1 ; } +etcdctl3 --endpoints="192.168.213.1:2379" snapshot save "$backup_path/etcd/snapshot.db" > /dev/null + +# heketi +mkdir -p "$backup_path/heketi" || { echo "Can't create ${backup_path}/heketi" ; exit 1 ; } +heketi-cli -s http://heketi-storage.glusterfs.svc.cluster.local:8080 --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)" topology info --json > "$backup_path/heketi/topology.json" +{% endif %} + + +{% if 'ands_storage_servers' in group_names %} +# Gluster +#mkdir -p "$backup_path/gluster" || { echo "Can't create ${backup_path}/gluster" ; exit 1 ; } +#( +# cd /var/lib/ +# tar cjf $backup_path/gluster/var_lib_glusterd.tar.bz2 glusterd +#) +{% endif %} + +# etc +#mkdir -p "$backup_path/etc" || { echo "Can't create ${backup_path}/etc" ; exit 1 ; } +#( +# cd / +# tar cjf $backup_path/etc/etc.tar.bz2 etc --exclude=selinux --exclude=udev --exclude=bash_completion.d --exclude=etc/pki --exclude=etc/services --exclude=postfix --exclude=mc +#) + +if [ -d "$borg_path" ]; then + borg_glusterd="/var/lib/glusterd" + borg_etc="/etc -e */etc/selinux -e */etc/udev -e */etc/bash_completion.d -e */etc/pki -e */etc/services -e */etc/postfix -e */etc/mc" + +{% if 'masters' in group_names %} + borg_list="* ${borg_glusterd} ${borg_etc}" +{% elif 'ands_storage_servers' in group_names %} + borg_list="${borg_glusterd} ${borg_etc}" +{% else %} + borg_list="${borg_etc}" +{% endif %} + + ( + cd ${backup_path} + borg create ${borg_args} "$borg_path::${hostname}-${date}" $borg_list + borg prune ${borg_prune_args} --prefix "${hostname}-" "$borg_path" + ) + find "$host_path" -maxdepth 1 -type d -mmin +{{ands_backup_clean_minutes}} -print0 | xargs -0 rm -rf +fi diff --git a/roles/ands_common/README b/roles/ands_common/README new file mode 100644 index 0000000..c8bd679 --- /dev/null +++ b/roles/ands_common/README @@ -0,0 +1,11 @@ +Dependencies: + - Executed on all nodes + - No dependencies & no facts + +Parameters: + extra_packages: list of extra packages to install + +Actions: + - Enables standard repositories + - Install a set of common packages on all nodes (mc, etc.) 
+ \ No newline at end of file diff --git a/roles/ands_common/default/main.yml b/roles/ands_common/default/main.yml new file mode 100644 index 0000000..d355d15 --- /dev/null +++ b/roles/ands_common/default/main.yml @@ -0,0 +1 @@ +os_update: "{{ ands_update | default(false) }}" \ No newline at end of file diff --git a/roles/ands_common/tasks/main.yml b/roles/ands_common/tasks/main.yml new file mode 100644 index 0000000..e9196ad --- /dev/null +++ b/roles/ands_common/tasks/main.yml @@ -0,0 +1,47 @@ +- name: Ensure all required repositories are configured + package: name={{item}} state=present + with_items: + - epel-release + - centos-release-openshift-origin + +- name: Add our repository with updates and overrides + yum_repository: name="{{ item.name }}" description= "{{ item.description | default('Ands repository') }}" baseurl="{{ item.url }}" enabled="yes" gpgcheck="no" cost="{{ item.cost | default(1) }}" + with_items: "{{ ands_repositories | default([]) }}" + +- name: Ensure GlusterFS repositories are present + yum: name="centos-release-gluster{{ glusterfs_version }}" state=present + +# Seems we need iptables-services at least temporary... +- name: Ensure all required packages are installed + package: name={{item}} state=present + register: result + with_items: + - mc + - bzr + - git + - yamllint + - pyOpenSSL + - python-passlib + - python2-ruamel-yaml + - python2-jmespath + - python-ipaddress + - iptables-services + - PyYAML + - python-rhsm-certificates + - glusterfs-fuse + - telnet + - yum-plugin-versionlock + +# We always update on first install and if requested +- name: Update CentOS + yum: name=* state=latest update_cache=yes + when: (result | changed) or (os_update | default(false)) + +#- name: Add NodeJS required by a few used Ansible extensions +# package: name={{item}} state=present +# with_items: +# - nodejs + +- name: Ensure all extra packages are installed + package: name={{item}} state=present + with_items: "{{ extra_packages | default([]) }}" diff --git a/roles/ands_facts/defaults/main.yml b/roles/ands_facts/defaults/main.yml index fc3fcfd..c74984e 100644 --- a/roles/ands_facts/defaults/main.yml +++ b/roles/ands_facts/defaults/main.yml @@ -3,6 +3,11 @@ ands_none: "{{ None }}" ands_configure_heketi: false ands_data_device_default_threshold: 10 +ands_host_id: "{{ ansible_hostname | regex_replace('^[\\w\\d]*\\w(\\d+)(\\.|$)', '\\1') }}" +# We need to add it to set_fact if enabled +#ands_master_id: "{{ ('masters' in group_names) | ternary(groups.masters.index(('masters' in group_names) | ternary(inventory_hostname, groups.masters[0])), -1) }}" + +ands_storage_hostname: "{{ ands_storage_network | default(false) | ternary(ands_storage_network | default('') | ipaddr(ands_host_id) | ipaddr('address'), ansible_fqdn) }}" ands_storage_servers: "{{ groups.ands_storage_servers | map('extract', hostvars, 'ands_storage_hostname') | list }}" #openshift_storage_nodes: "{{ groups.storage_nodes | map('extract', hostvars, 'ands_storage_hostname') | list }}" @@ -23,7 +28,8 @@ ands_default_ip: "{{ ansible_default_ipv4.address }}" ands_openshift_default_ip: "{{ ands_resolve_public_ip | default(false) | ternary(ands_default_ip, ands_none) }}" ands_openshift_default_hostname: "{{ (ands_hostname_template is defined) | ternary(ands_hostname_template ~ ands_host_id, ansible_hostname) }}" -ands_inner_lb: false +ands_inner_lb: true +ands_use_inner_lb: false ands_inner_lb_id: 254 ands_inner_lb_hostname: 'ands-lb' diff --git a/roles/ands_facts/tasks/main.yml b/roles/ands_facts/tasks/main.yml index 
6b28683..bd23e13 100644 --- a/roles/ands_facts/tasks/main.yml +++ b/roles/ands_facts/tasks/main.yml @@ -1,14 +1,14 @@ --- +# Here we set 'openshift_hostname', 'openshift_ip' and other variables +- name: "Configuring network facts" + include_tasks: "network.yml" # The variables accessed trough 'hostvars' should be set as facts # Here we set 'ands_storage_servers' and other variables - name: "Configuring storage facts" include_tasks: "storage.yml" -# Here we set 'openshift_hostname', 'openshift_ip' and other variables -- name: "Configuring network facts" - include_tasks: "network.yml" - - name: "Confirm that ands facts are configured" set_fact: + ands_none: "{{ ands_none }}" ands_facts_configured: true diff --git a/roles/ands_facts/tasks/network.yml b/roles/ands_facts/tasks/network.yml index 1d0248f..808d7b6 100644 --- a/roles/ands_facts/tasks/network.yml +++ b/roles/ands_facts/tasks/network.yml @@ -1,24 +1,34 @@ +- name: Set some facts + set_fact: + ands_host_id: "{{ ands_host_id }}" + - name: Set network facts set_fact: ands_cluster_domain: "{{ ands_cluster_domain }}" ands_cluster_dot_domain: ".{{ ands_cluster_domain }}" ands_inner_domain: "{{ ands_inner_domain }}" ands_inner_dot_domain: "{{ (ands_inner_domain == ands_none) | ternary('', '.' ~ ands_inner_domain) }}" + ands_inner_lb: "{{ ands_inner_lb }}" + ands_use_inner_lb: "{{ ands_use_inner_lb }}" ands_inner_lb_ip: "{{ ands_openshift_network | ipaddr(ands_inner_lb_id) | ipaddr('address') }}" ands_inner_lb_hostname: "{{ ands_inner_lb_hostname }}" ands_openshift_ip: "{{ ands_openshift_network | ipaddr(ands_host_id) | ipaddr('address') }}" - ands_openshift_hostname: "{{ ands_openshift_hostname | default(ands_openshift_set_hostname | ternary(ands_openshift_default_hostname, ands_none)) }}" + ands_openshift_hostname: "{{ ands_openshift_hostname | default(ands_openshift_default_hostname) }}" ands_openshift_public_ip: "{{ (ands_openshift_public_network is defined) | ternary( ands_openshift_public_network | ipaddr(ands_host_id) | ipaddr('address'), ands_openshift_default_ip) }}" - ands_openshift_public_hostname: "{{ ands_openshift_public_hostname | default(ands_openshift_set_public_hostname | ternary(ands_openshift_default_hostname, ands_none)) }}" + ands_openshift_public_hostname: "{{ ands_openshift_public_hostname | default(ands_openshift_default_hostname) }}" ands_storage_ip: "{{ ands_storage_network | default(ands_openshift_network) | ipaddr(ands_host_id) | ipaddr('address') }}" ands_hostname_storage: "ands_storage{{ ands_host_id }}" ands_hostname_openshift: "ands_openshift{{ ands_host_id }}" + ands_openshift_set_hostname: "{{ ands_openshift_set_hostname }}" + ands_openshift_set_public_hostname: "{{ ands_openshift_set_public_hostname }}" + ands_storage_hostname: "{{ ands_storage_hostname }}" - name: Set more network facts set_fact: ands_openshift_public_fqdn: "{{ (ands_openshift_public_hostname == ands_none) | ternary(ands_none, ands_openshift_public_hostname ~ ands_cluster_dot_domain ) }}" ands_openshift_fqdn: "{{ (ands_openshift_hostname == ands_none) | ternary(ands_none, ands_openshift_hostname ~ ands_inner_dot_domain ) }}" - ands_openshift_cluster_fqdn: "{{ ands_inner_lb | ternary(ands_inner_lb_hostname ~ ands_inner_dot_domain, ands_openshift_lb) }}" + ands_inner_lb_fqdn: "{{ ands_inner_lb_hostname ~ ands_inner_dot_domain }}" + ands_storage_servers: "{{ ands_storage_servers }}" - name: "Detect inner network interface" include_tasks: "find_interface_by_ip.yml" diff --git a/roles/ands_facts/tasks/storage.yml 
b/roles/ands_facts/tasks/storage.yml index cf995a0..888ad70 100644 --- a/roles/ands_facts/tasks/storage.yml +++ b/roles/ands_facts/tasks/storage.yml @@ -1,5 +1,9 @@ - include_vars: dir="vars" +- name: Set facts + set_fact: + ands_configure_heketi: "{{ ands_configure_heketi }}" + - name: Detect Heketi set_fact: ands_storage_domains="{{ ands_storage_domains | union([ands_heketi_domain]) }}" when: @@ -7,10 +11,6 @@ - ands_heketi_domain is defined - ansible_lvm.lvs[ands_heketi_lv] is defined -- name: Set some facts - set_fact: - ands_storage_servers: "{{ ands_storage_servers }}" - - name: Set some facts set_fact: ands_data_vg: "{{ ands_data_vg }}" diff --git a/roles/ands_kaas/templates/50-kaas-pods.yml.j2 b/roles/ands_kaas/templates/50-kaas-pods.yml.j2 index 216dc01..ad1fc58 100644 --- a/roles/ands_kaas/templates/50-kaas-pods.yml.j2 +++ b/roles/ands_kaas/templates/50-kaas-pods.yml.j2 @@ -5,7 +5,7 @@ kind: Template metadata: name: {{ kaas_project }}-pods annotations: - descriptions: {{ kaas_project_config.description | default(kaas_project ~ "auto-generated pod template") }} + descriptions: {{ kaas_project_config.description | default(kaas_project ~ " auto-generated pod template") }} objects: {% for name, pod in kaas_project_pods.iteritems() %} {% set pubkey = "kaas_" ~ name ~ "_pubkey" %} @@ -14,6 +14,9 @@ objects: {% if pod.variant is defined %} {% set pod = pod[pod.variant] %} {% endif %} + {% set sched = pod.sched | default({}) %} + {% set node_selector = (sched.selector is defined) | ternary(sched.selector, ands_default_node_selector | combine(sched.restrict | default({}))) %} + {% if pod.service is defined %} - apiVersion: v1 kind: Service @@ -68,10 +71,10 @@ objects: metadata: name: {{ pod.name | default(name) }} spec: - replicas: {{ ( pod.sched | default({})).replicas | default(1) }} + replicas: {{ ( sched | default({})).replicas | default(1) }} revisionHistoryLimit: 2 strategy: - type: {{ (pod.sched | default({})).strategy | default('Rolling') }} + type: {{ (sched | default({})).strategy | default('Rolling') }} triggers: - type: ConfigChange selector: @@ -82,11 +85,8 @@ objects: labels: name: {{ pod.name | default(name) }} spec: - {% if pod.selector is defined %} - nodeSelector: - {% for skey, sval in pod.selector.iteritems() %} - {{ skey }}: "{{ sval }}" - {% endfor %} + {% if node_selector | length > 0 %} + nodeSelector: {{ node_selector | to_json }} {% endif %} {% set mappings = (pod.images | json_query('[*].mappings') | length) %} {% if mappings > 0 %} diff --git a/roles/ands_network/tasks/common.yml b/roles/ands_network/tasks/common.yml index 384029f..f2fda00 100644 --- a/roles/ands_network/tasks/common.yml +++ b/roles/ands_network/tasks/common.yml @@ -22,27 +22,18 @@ - nodes - new_nodes -- name: Configure all storage ips in /etc/hosts - lineinfile: dest="/etc/hosts" line="{{ ip }} {{ hostname }}" regexp="{{ hostname }}" state="present" - when: - - hostvars[item]['ands_storage_network'] | default(ands_none) != ands_none - - hostvars[item]['ands_facts_configured'] is defined - vars: - ip: "{{ hostvars[item]['ands_storage_ip'] }}" - hostname: "{{ hostvars[item]['ands_hostname_storage'] }}" - with_inventory_hostnames: - - storage_nodes - - new_storage_nodes - - - name: Provision /etc/hosts to ensure that all masters servers are accessing Master API on loopback device lineinfile: dest="/etc/hosts" line="127.0.0.1 {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="present" when: ('masters' in group_names or 'new_masters' in group_names) 
register: result -- name: Provision /etc/hosts to ensure that all masters servers are accessing Master API on loopback device +- name: Provision /etc/hosts with load-balance IP on non master servers lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip }} {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="present" - when: (result | skipped) and (ands_inner_lb | default(false)) + when: (result | skipped) and (ands_use_inner_lb | default(false)) + +- name: Provision inner load-balancer hostname in /etc/hosts + lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip }} {{ ands_inner_lb_hostname }} {{ ands_inner_lb_fqdn }}" regexp=".*{{ ands_inner_lb_fqdn }}$" state="present" + when: openshift_master_cluster_hostname != ands_inner_lb_fqdn - name: Register openshift_dns_ip in /etc/hosts lineinfile: dest="/etc/hosts" line="{{ openshift_dns_ip }} openshift_dns_ip" regexp="openshift_dns_ip$" state="present" diff --git a/roles/ands_network/tasks/ganesha.yml b/roles/ands_network/tasks/ganesha.yml new file mode 100644 index 0000000..0f77ca8 --- /dev/null +++ b/roles/ands_network/tasks/ganesha.yml @@ -0,0 +1,12 @@ +- name: Configure all storage ips in /etc/hosts + lineinfile: dest="/etc/hosts" line="{{ ip }} {{ hostname }}" regexp="{{ hostname }}" state="present" + when: + - hostvars[item]['ands_storage_network'] | default(ands_none) != ands_none + - hostvars[item]['ands_facts_configured'] is defined + vars: + ip: "{{ hostvars[item]['ands_storage_ip'] }}" + hostname: "{{ hostvars[item]['ands_hostname_storage'] }}" + with_inventory_hostnames: + - storage_nodes + - new_storage_nodes + diff --git a/roles/ands_network/tasks/install_post.yml b/roles/ands_network/tasks/install_post.yml index 0bfef34..3f1e57c 100644 --- a/roles/ands_network/tasks/install_post.yml +++ b/roles/ands_network/tasks/install_post.yml @@ -6,4 +6,4 @@ lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip | default('') }} {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="{{ state }}" when: ('masters' not in group_names and 'new_masters' not in group_names) vars: - state: "{{ ands_inner_lb | default(false) | ternary('present', 'absent') }}" + state: "{{ ands_use_inner_lb | default(false) | ternary('present', 'absent') }}" diff --git a/roles/ands_network/tasks/maintain.yml b/roles/ands_network/tasks/maintain.yml index a7af597..6fba5f2 100644 --- a/roles/ands_network/tasks/maintain.yml +++ b/roles/ands_network/tasks/maintain.yml @@ -6,4 +6,8 @@ lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip | default('') }} {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="{{ state }}" when: ('masters' not in group_names and 'new_masters' not in group_names) vars: - state: "{{ ands_inner_lb | default(false) | ternary('present', 'absent') }}" + state: "{{ ands_use_inner_lb | default(false) | ternary('present', 'absent') }}" + +- name: Provision inner load-balancer hostname in /etc/hosts + lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip }} {{ ands_inner_lb_hostname }} {{ ands_inner_lb_fqdn }}" regexp=".*{{ ands_inner_lb_fqdn }}$" state="present" + when: openshift_master_cluster_hostname != ands_inner_lb_fqdn diff --git a/roles/common/README b/roles/common/README deleted file mode 100644 index c8bd679..0000000 --- a/roles/common/README +++ /dev/null @@ -1,11 +0,0 @@ -Dependencies: - - Executed on all nodes - - No dependencies & no facts - -Parameters: - extra_packages: list of extra 
packages to install - -Actions: - - Enables standard repositories - - Install a set of common packages on all nodes (mc, etc.) - \ No newline at end of file diff --git a/roles/common/default/main.yml b/roles/common/default/main.yml deleted file mode 100644 index d355d15..0000000 --- a/roles/common/default/main.yml +++ /dev/null @@ -1 +0,0 @@ -os_update: "{{ ands_update | default(false) }}" \ No newline at end of file diff --git a/roles/common/tasks/main.yml b/roles/common/tasks/main.yml deleted file mode 100644 index fdd7246..0000000 --- a/roles/common/tasks/main.yml +++ /dev/null @@ -1,46 +0,0 @@ -- name: Ensure all required repositories are configured - package: name={{item}} state=present - with_items: - - epel-release - - centos-release-openshift-origin - -- name: Add our repository with updates and overrides - yum_repository: name="{{ item.name }}" description= "{{ item.description | default('Ands repository') }}" baseurl="{{ item.url }}" enabled="yes" gpgcheck="no" cost="{{ item.cost | default(1) }}" - with_items: "{{ ands_repositories | default([]) }}" - -- name: Ensure GlusterFS repositories are present - yum: name="centos-release-gluster{{ glusterfs_version }}" state=present - -# Seems we need iptables-services at least temporary... -- name: Ensure all required packages are installed - package: name={{item}} state=present - register: result - with_items: - - mc - - bzr - - git - - yamllint - - pyOpenSSL - - python-passlib - - python2-ruamel-yaml - - python2-jmespath - - python-ipaddress - - iptables-services - - PyYAML - - python-rhsm-certificates - - glusterfs-fuse - - telnet - -# We always update on first install and if requested -- name: Update CentOS - yum: name=* state=latest update_cache=yes - when: (result | changed) or (os_update | default(false)) - -#- name: Add NodeJS required by a few used Ansible extensions -# package: name={{item}} state=present -# with_items: -# - nodejs - -- name: Ensure all extra packages are installed - package: name={{item}} state=present - with_items: "{{ extra_packages | default([]) }}" diff --git a/roles/docker/defaults/main.yml b/roles/docker/defaults/main.yml index f7b96f5..30b1ff8 100644 --- a/roles/docker/defaults/main.yml +++ b/roles/docker/defaults/main.yml @@ -4,3 +4,6 @@ docker_lv: "docker-pool" docker_min_size: 100 docker_max_log_size: "2m" docker_max_log_files: "3" + +# There are some problems with groups on 1.13 +docker_version: "-1.12*" diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml index 0d040a9..c03d897 100644 --- a/roles/docker/tasks/main.yml +++ b/roles/docker/tasks/main.yml @@ -1,6 +1,20 @@ --- +#- name: Remove docker +# yum: name="{{ item }}" state="absent" +# with_items: [ docker, docker-client, docker-common ] + +- name: Remove versionlock from yum + command: yum versionlock delete docker docker-common docker-client + register: result + failed_when: false + changed_when: result | failed + - name: Ensure docker is installed - yum: name="docker" state="present" + yum: name="docker{{ docker_version | default('') }}" state="{{ docker_version is defined | ternary('latest', 'present') }}" + +- name: Add versionlock to yum + command: yum versionlock add docker docker-common docker-client + when: docker_version is defined - name: start docker service: name="docker" state="started" diff --git a/roles/glusterfs/tasks/cfg/vols3.yml b/roles/glusterfs/tasks/cfg/vols3.yml index d094797..d8ed728 100644 --- a/roles/glusterfs/tasks/cfg/vols3.yml +++ b/roles/glusterfs/tasks/cfg/vols3.yml @@ -3,6 +3,7 @@ 
gluster_volume: state: present name: "{{ name }}" + host: "{{ ands_storage_hostname }}" cluster: "{{ domain_servers | join(',') }}" replicas: "{{ domain_servers | length }}" bricks: "{{ glusterfs_bricks_path }}/brick-{{ name }}" diff --git a/roles/glusterfs/tasks/data/vols2.yml b/roles/glusterfs/tasks/data/vols2.yml index d094797..d8ed728 100644 --- a/roles/glusterfs/tasks/data/vols2.yml +++ b/roles/glusterfs/tasks/data/vols2.yml @@ -3,6 +3,7 @@ gluster_volume: state: present name: "{{ name }}" + host: "{{ ands_storage_hostname }}" cluster: "{{ domain_servers | join(',') }}" replicas: "{{ domain_servers | length }}" bricks: "{{ glusterfs_bricks_path }}/brick-{{ name }}" diff --git a/roles/glusterfs/tasks/data/vols3.yml b/roles/glusterfs/tasks/data/vols3.yml index 866480c..14c3763 100644 --- a/roles/glusterfs/tasks/data/vols3.yml +++ b/roles/glusterfs/tasks/data/vols3.yml @@ -3,6 +3,7 @@ gluster_volume: state: present name: "{{ name }}" + host: "{{ ands_storage_hostname }}" cluster: "{{ domain_servers | join(',') }}" replicas: 3 arbiters: 1 diff --git a/roles/glusterfs/tasks/db/vols3.yml b/roles/glusterfs/tasks/db/vols3.yml index b1beacb..cbd238d 100644 --- a/roles/glusterfs/tasks/db/vols3.yml +++ b/roles/glusterfs/tasks/db/vols3.yml @@ -3,6 +3,7 @@ gluster_volume: state: present name: "{{ name }}" + host: "{{ ands_storage_hostname }}" cluster: "{{ domain_servers | join(',') }}" disperses: "3" redundancies: "1" diff --git a/roles/glusterfs/tasks/la/vols3.yml b/roles/glusterfs/tasks/la/vols3.yml index 9565bb3..ada8f95 100644 --- a/roles/glusterfs/tasks/la/vols3.yml +++ b/roles/glusterfs/tasks/la/vols3.yml @@ -3,6 +3,7 @@ gluster_volume: state: present name: "{{ name }}" + host: "{{ ands_storage_hostname }}" cluster: "{{ domain_servers | join(',') }}" bricks: "{{ glusterfs_bricks_path }}/brick-{{ name }}" transport: "{{ glusterfs_transport }}" diff --git a/roles/ntp b/roles/ntp new file mode 120000 index 0000000..626609b --- /dev/null +++ b/roles/ntp @@ -0,0 +1 @@ +../anslib/ansible-role-ntp/ \ No newline at end of file diff --git a/scripts/gluster.sh b/scripts/gluster.sh index 02a0a3f..9efea45 100755 --- a/scripts/gluster.sh +++ b/scripts/gluster.sh @@ -69,17 +69,21 @@ function migrate { # heal $1 -if [ -n "$1" -a "$1" != "all" ]; then - eval "$action" "$@" -else - [ "$1" == "all" ] && shift +if [ -z "$1" -a "$1" =~ ^all ]; then + all=0 + [ "$1" == "all_heketi" ] && all=1 + [ "$1" =~ ^all ] && shift vols=$(gluster volume info | grep -P '^Volume Name' | awk '{ print $NF }' | tr '\r\n' ' ') for vol in $vols; do - [[ "$vol" =~ [0-9] ]] && continue - [[ "$vol" =~ ^vol_ ]] && continue - [[ "$vol" =~ ^heketi ]] && continue + if [ $all -eq 0 ]; then + [[ "$vol" =~ [0-9] ]] && continue + [[ "$vol" =~ ^vol_ ]] && continue + [[ "$vol" =~ ^heketi ]] && continue + fi eval "$action" "$vol" "$@" done +else + eval "$action" "$@" fi diff --git a/setup.sh b/setup.sh index 1c38536..4ccf94d 100755 --- a/setup.sh +++ b/setup.sh @@ -46,7 +46,7 @@ case "$action" in apply playbooks/openshift-setup-projects.yml "$@" || exit 1 ;; project) - project=$2 + project=$1 shift [ -n "$project" ] || { usage 'project name should be specified...' 
; exit 1; } apply playbooks/openshift-setup-project.yml --extra-vars "ands_configure_project=$project" "$@" || exit 1 diff --git a/setup/configs/labels.yml b/setup/configs/labels.yml index e8ee868..3f8cbe4 100644 --- a/setup/configs/labels.yml +++ b/setup/configs/labels.yml @@ -2,6 +2,9 @@ ands_openshift_labels: region: "infra" zone: "default" + production: 1 + server: 1 + permanent: 1 hostid: "{{ ands_host_id }}" hostname: "{{ ansible_hostname }}" fqdn: "{{ ansible_hostname }}.{{ ansible_domain }}" @@ -11,3 +14,9 @@ ands_openshift_labels: pod_node: 1 compute_node: 0 gpu_node: 0 + + + +ands_default_node_selector: + zone: default + production: "1" diff --git a/setup/configs/volumes.yml b/setup/configs/volumes.yml index f97d485..14aadfa 100644 --- a/setup/configs/volumes.yml +++ b/setup/configs/volumes.yml @@ -18,16 +18,21 @@ ands_nfs_clients: ands_storage_domains: - servers: "ands_storage_servers" - clients: [ "masters", "new_masters" ] + clients: [ "nodes", "new_nodes" ] volumes: provision: { type: "cfg", mount: "{{ ands_paths.provision }}" } + - servers: "ands_storage_servers" + clients: [ "masters", "new_masters" ] + volumes: +# provision: { type: "cfg", mount: "{{ ands_paths.provision }}" } openshift: { type: "cfg", mount: "{{ ands_paths.openshift }}", nfs_clients: "{{ ands_nfs_clients }}" } databases: { type: "db", mount: "{{ ands_paths.databases }}" } temporary: { type: "tmp", mount: "{{ ands_paths.temporary }}", nfs_clients: "{{ ands_nfs_clients }}" } datastore: { type: "data", mount: "{{ ands_paths.datastore }}", nfs_clients: "{{ ands_nfs_clients }}" } katrin_data: { type: "data", mount: "{{ ands_paths.katrin_data }}", nfs_clients: "{{ ands_nfs_clients }}" } -# - servers: "storage_nodes" -# clients: [ "nodes" ] + +# - servers: "ands_storage_servers" +# clients: [ "nodes", "new_nodes" ] # openshift: { type: "cfg", mount: "{{ ands_paths.openshift }}" } # temporary: { type: "tmp", mount: "{{ ands_paths.temporary }}" } # volumes: diff --git a/setup/projects/adei/templates/60-adei.yml.j2 b/setup/projects/adei/templates/60-adei.yml.j2 index ca3c17a..22f4bb0 100644 --- a/setup/projects/adei/templates/60-adei.yml.j2 +++ b/setup/projects/adei/templates/60-adei.yml.j2 @@ -75,12 +75,13 @@ objects: spec: schedule: "{{ cfg.cron }}" concurrencyPolicy: "Forbid" + startingDeadlineSeconds: "{{ cfg.start_tolerance | default(30) }}" successfulJobsHistoryLimit: "{{ adei_pod_history_limit }}" failedJobsHistoryLimit: "{{ adei_pod_history_limit }}" jobTemplate: spec: completions: "1" - activeDeadlineSeconds: "3600" + activeDeadlineSeconds: "{{ cfg.max_run_time | default(600) }}" # restartPolicy: "Never" template: metadata: @@ -125,6 +126,9 @@ objects: adei-setup: "${setup}" spec: restartPolicy: {{ restart_policy }} +{% if (ands_default_node_selector is defined) and (ands_default_node_selector | length > 0) %} + nodeSelector: {{ ands_default_node_selector | to_json }} +{% endif %} volumes: {{ cfg.vols | to_json }} {% if (cfg.groups is defined) or (cfg.run_as is defined) %} securityContext: diff --git a/setup/projects/adei/vars/globals.yml b/setup/projects/adei/vars/globals.yml index f8d7816..01fb495 100644 --- a/setup/projects/adei/vars/globals.yml +++ b/setup/projects/adei/vars/globals.yml @@ -233,3 +233,7 @@ adei_frontends: mounts: "{{ adei_prod_mounts | union(adei_pod_mounts) }}" groups: [ "adei" ] enabled: true + +# Extra options: +# start_tolerance: 30 +# max_run_time: 600 diff --git a/setup/projects/adei/vars/pods.yml b/setup/projects/adei/vars/pods.yml index 182db9c..3923c23 100644 --- 
a/setup/projects/adei/vars/pods.yml +++ b/setup/projects/adei/vars/pods.yml @@ -1,8 +1,7 @@ pods: mysql: service: { ports: [ 3306 ] } - sched: { replicas: 1, strategy: "Recreate", selector: { master: 1 } } - selector: { master: 1 } + sched: { replicas: 1, strategy: "Recreate", restrict: { fat_storage: "1" } } groups: [ "adei_db" ] images: - image: "centos/mysql-57-centos7" diff --git a/setup/projects/kaas/vars/pods.yml b/setup/projects/kaas/vars/pods.yml index 41831ab..8cfa65a 100644 --- a/setup/projects/kaas/vars/pods.yml +++ b/setup/projects/kaas/vars/pods.yml @@ -1,8 +1,7 @@ pods: kaas-router: service: { host: "{{ katrin_node }}", ports: [ 80/8080, 443/8043 ] } - sched: { replicas: 1, selector: { master: 1 } } - selector: { master: 1 } + sched: { replicas: 1, restrict: { master: "1" } } images: - image: "httpd:2.2" mappings: -- cgit v1.2.1