authorSuren A. Chilingaryan <csa@suren.me>2018-03-11 19:56:38 +0100
committerSuren A. Chilingaryan <csa@suren.me>2018-03-11 19:56:38 +0100
commitf3c41dd13a0a86382b80d564e9de0d6b06fb1dbf (patch)
tree3522ce77203da92bb2b6f7cfa2b0999bf6cc132c
parent6bc3a3ac71e11fb6459df715536fec373c123a97 (diff)
downloadands-f3c41dd13a0a86382b80d564e9de0d6b06fb1dbf.tar.gz
ands-f3c41dd13a0a86382b80d564e9de0d6b06fb1dbf.tar.bz2
ands-f3c41dd13a0a86382b80d564e9de0d6b06fb1dbf.tar.xz
ands-f3c41dd13a0a86382b80d564e9de0d6b06fb1dbf.zip
Various fixes before moving to hardware installation
-rw-r--r--.gitmodules3
m---------anslib/ansible-role-ntp0
-rwxr-xr-xanslib/link_vars.sh1
m---------anslib/openshift-ansible0
-rw-r--r--anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch11
-rw-r--r--anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch24
-rw-r--r--anslib/patches/networkmanager-ds-use-openshift-interface.patch47
-rw-r--r--anslib/patches/openshift-ds-update371.patch45
-rw-r--r--anslib/patches/registry-ds-glusterfs-fixes.patch61
-rw-r--r--anslib/patches/registry-ds-glusterfs-storageclass.patch64
-rw-r--r--docs/ands_ansible.txt2
-rw-r--r--docs/backup.txt26
-rw-r--r--docs/consistency.txt36
-rw-r--r--docs/managment.txt166
-rw-r--r--docs/network.txt58
-rw-r--r--docs/pods.txt13
-rw-r--r--docs/regions.txt16
-rw-r--r--docs/samples/templates/00-katrin-restricted.yml.j244
-rw-r--r--docs/samples/vars/run_oc.yml6
-rw-r--r--docs/samples/vars/variants.yml33
-rw-r--r--docs/troubleshooting.txt210
-rw-r--r--docs/upgrade.txt64
-rw-r--r--group_vars/OSEv3.yml65
-rw-r--r--group_vars/ands.yml7
-rw-r--r--group_vars/staging.yml6
-rw-r--r--group_vars/testing.yml15
-rw-r--r--group_vars/virtual.yml6
-rw-r--r--inventories/staging.erb48
-rw-r--r--inventories/testing.erb60
-rw-r--r--opts.sh10
-rw-r--r--playbooks/ands-gluster-ganesha.yml1
-rw-r--r--playbooks/ands-gluster.yml6
-rw-r--r--playbooks/ands-prepare.yml13
-rw-r--r--playbooks/openshift-add-masters.yml2
-rw-r--r--playbooks/openshift-add-nodes.yml2
l---------playbooks/openshift-deploy-cluster.yml1
-rw-r--r--playbooks/openshift-install-service-catalog.yml13
-rw-r--r--[l---------]playbooks/openshift-redeploy-certificates.yml14
-rw-r--r--playbooks/openshift-setup-project.yml2
-rw-r--r--playbooks/openshift-setup-projects.yml2
-rw-r--r--playbooks/openshift-setup-security.yml2
-rw-r--r--playbooks/openshift-setup-users.yml2
-rw-r--r--playbooks/openshift-upgrade.yml11
-rw-r--r--roles/ands_backup/defaults/main.yml9
-rw-r--r--roles/ands_backup/tasks/main.yml29
-rw-r--r--roles/ands_backup/templates/backup.cron.j24
-rwxr-xr-xroles/ands_backup/templates/backup.sh.j272
-rw-r--r--roles/ands_common/README (renamed from roles/common/README)0
-rw-r--r--roles/ands_common/default/main.yml (renamed from roles/common/default/main.yml)0
-rw-r--r--roles/ands_common/tasks/main.yml (renamed from roles/common/tasks/main.yml)1
-rw-r--r--roles/ands_facts/defaults/main.yml8
-rw-r--r--roles/ands_facts/tasks/main.yml8
-rw-r--r--roles/ands_facts/tasks/network.yml16
-rw-r--r--roles/ands_facts/tasks/storage.yml8
-rw-r--r--roles/ands_kaas/templates/50-kaas-pods.yml.j216
-rw-r--r--roles/ands_network/tasks/common.yml21
-rw-r--r--roles/ands_network/tasks/ganesha.yml12
-rw-r--r--roles/ands_network/tasks/install_post.yml2
-rw-r--r--roles/ands_network/tasks/maintain.yml6
-rw-r--r--roles/docker/defaults/main.yml3
-rw-r--r--roles/docker/tasks/main.yml16
-rw-r--r--roles/glusterfs/tasks/cfg/vols3.yml1
-rw-r--r--roles/glusterfs/tasks/data/vols2.yml1
-rw-r--r--roles/glusterfs/tasks/data/vols3.yml1
-rw-r--r--roles/glusterfs/tasks/db/vols3.yml1
-rw-r--r--roles/glusterfs/tasks/la/vols3.yml1
l---------roles/ntp1
-rwxr-xr-xscripts/gluster.sh18
-rwxr-xr-xsetup.sh2
-rw-r--r--setup/configs/labels.yml9
-rw-r--r--setup/configs/volumes.yml11
-rw-r--r--setup/projects/adei/templates/60-adei.yml.j26
-rw-r--r--setup/projects/adei/vars/globals.yml4
-rw-r--r--setup/projects/adei/vars/pods.yml3
-rw-r--r--setup/projects/kaas/vars/pods.yml3
75 files changed, 1376 insertions, 135 deletions
diff --git a/.gitmodules b/.gitmodules
index 1401d9b..1185e39 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
[submodule "anslib/ansible-ghetto-json"]
path = anslib/ansible-ghetto-json
url = https://github.com/FauxFaux/ansible-ghetto-json.git
+[submodule "anslib/ansible-role-ntp"]
+ path = anslib/ansible-role-ntp
+ url = https://github.com/geerlingguy/ansible-role-ntp.git
diff --git a/anslib/ansible-role-ntp b/anslib/ansible-role-ntp
new file mode 160000
+Subproject 47b40d48fce51c79630feeac84659824a746d4a
diff --git a/anslib/link_vars.sh b/anslib/link_vars.sh
index 01a9fe9..651c09c 100755
--- a/anslib/link_vars.sh
+++ b/anslib/link_vars.sh
@@ -24,3 +24,4 @@ function mklink_func {
export -f mklink_func
find openshift-ansible/playbooks -mindepth 0 -maxdepth 2 -type d -print0 | xargs -0 -L 1 -I {} bash -c 'mklink_func "$@"' _ {}
+find openshift-ansible/playbooks/common/openshift-cluster/upgrades -mindepth 0 -maxdepth 1 -type d -print0 | xargs -0 -L 1 -I {} bash -c 'mklink_func "$@"' _ {}
diff --git a/anslib/openshift-ansible b/anslib/openshift-ansible
-Subproject d1fcbd7a9a8511b895f9a163f7fa2a7bc0d72f2
+Subproject 22d3a96deaf74b7aa9aa021a73ef39e2b4da337
diff --git a/anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch b/anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch
new file mode 100644
index 0000000..2301072
--- /dev/null
+++ b/anslib/patches/etcd-ds-rh1538446-openshift-undefined.patch
@@ -0,0 +1,11 @@
+diff --git a/roles/openshift_etcd_facts/tasks/main.yml b/roles/openshift_etcd_facts/tasks/main.yml
+index 86546f4..bda0606 100644
+--- a/roles/openshift_etcd_facts/tasks/main.yml
++++ b/roles/openshift_etcd_facts/tasks/main.yml
+@@ -1,2 +1,6 @@
+ ---
++- openshift_facts:
++ role: etcd
++ local_facts: {}
++
+ - import_tasks: set_etcd_ca_host.yml
diff --git a/anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch b/anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch
new file mode 100644
index 0000000..75a8a43
--- /dev/null
+++ b/anslib/patches/glusterfs-ds-use_cluster_local_for_heketi.patch
@@ -0,0 +1,24 @@
+diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml
+index 4928e86..b8f3cab 100644
+--- a/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml
++++ b/roles/openshift_storage_glusterfs/tasks/glusterfs_common.yml
+@@ -293,7 +293,8 @@
+
+ - name: Determine StorageClass heketi URL
+ set_fact:
+- glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}"
++ glusterfs_heketi_route: "heketi-{{ glusterfs_name }}.{{ glusterfs_namespace }}.svc.cluster.local:8080"
++# glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}"
+ when:
+ - glusterfs_heketi_is_native
+
+@@ -344,7 +345,8 @@
+
+ - name: Determine StorageClass heketi URL
+ set_fact:
+- glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}"
++ glusterfs_heketi_route: "heketi-{{ glusterfs_name }}.{{ glusterfs_namespace }}.svc.cluster.local:8080"
++# glusterfs_heketi_route: "{{ heketi_route.results.results[0]['spec']['host'] }}"
+ when:
+ - glusterfs_heketi_is_native
+ - glusterfs_heketi_route is not defined
diff --git a/anslib/patches/networkmanager-ds-use-openshift-interface.patch b/anslib/patches/networkmanager-ds-use-openshift-interface.patch
new file mode 100644
index 0000000..687be8a
--- /dev/null
+++ b/anslib/patches/networkmanager-ds-use-openshift-interface.patch
@@ -0,0 +1,47 @@
+diff --git a/roles/openshift_node/files/bootstrap.yml b/roles/openshift_node/files/bootstrap.yml
+index ea28064..df95ba3 100644
+--- a/roles/openshift_node/files/bootstrap.yml
++++ b/roles/openshift_node/files/bootstrap.yml
+@@ -8,7 +8,7 @@
+ lines:
+ - regex: ^listen-address
+ state: present
+- line: "listen-address={{ ansible_default_ipv4.address }}"
++ line: "listen-address={{ openshift_dns_ip }}"
+ node_dns:
+ file: /etc/dnsmasq.d/node-dnsmasq.conf
+ lines:
+diff --git a/roles/openshift_node/files/networkmanager/99-origin-dns.sh b/roles/openshift_node/files/networkmanager/99-origin-dns.sh
+index acf3e2f..16129a2 100755
+--- a/roles/openshift_node/files/networkmanager/99-origin-dns.sh
++++ b/roles/openshift_node/files/networkmanager/99-origin-dns.sh
+@@ -43,10 +43,25 @@ if [[ $2 =~ ^(up|dhcp4-change|dhcp6-change)$ ]]; then
+ ######################################################################
+ # couldn't find an existing method to determine if the interface owns the
+ # default route
+- def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }')
+- def_route_int=$(/sbin/ip route get to ${def_route} | awk '{print $3}')
+- def_route_ip=$(/sbin/ip route get to ${def_route} | awk '{print $5}')
+- if [[ ${DEVICE_IFACE} == ${def_route_int} ]]; then
++ #SDS
++ #def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }')
++ #def_route_int=$(/sbin/ip route get to ${def_route} | awk '{print $3}')
++ #def_route_ip=$(/sbin/ip route get to ${def_route} | awk '{print $5}')
++ #EDS
++ def_route_ip=$(cat /etc/hosts | grep openshift_dns_ip | awk '{ print $1 }')
++ [ -n "$def_route_ip" ] && def_route_int=$(ip -o addr show | grep ${def_route_ip} | awk '{ print $2 }')
++ if [ -z "$def_route_ip" -o -z "$def_route_int" ]; then
++ def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }')
++ def_route_int=$(/sbin/ip route get to ${def_route} | awk '{print $3}' | head -n 1)
++ def_route_ip=$(/sbin/ip -f inet addr show dev ${def_route_int} scope global up | grep -Po 'inet \K[\d.]+' | head -n 1)
++ fi
++
++ def_routes=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }')
++ def_routes_int=$(for r in ${def_routes}; do /sbin/ip route get to ${r} | awk '{print $3}'; done)
++ interfaces="${def_route_int} ${def_routes_int}"
++
++ if [[ "${interfaces}" =~ (^|[[:space:]])${DEVICE_IFACE}($|[[:space:]]) ]]; then
++# if [[ ${DEVICE_IFACE} == ${def_route_int} ]]; then
+ if [ ! -f /etc/dnsmasq.d/origin-dns.conf ]; then
+ cat << EOF > /etc/dnsmasq.d/origin-dns.conf
+ no-resolv
diff --git a/anslib/patches/openshift-ds-update371.patch b/anslib/patches/openshift-ds-update371.patch
new file mode 100644
index 0000000..a6beff3
--- /dev/null
+++ b/anslib/patches/openshift-ds-update371.patch
@@ -0,0 +1,45 @@
+diff --git a/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml b/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml
+index cc2ec27..6c4ccf8 100644
+--- a/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml
++++ b/playbooks/common/openshift-cluster/upgrades/v3_7/upgrade.yml
+@@ -12,7 +12,7 @@
+ - pre_upgrade
+ tasks:
+ - set_fact:
+- openshift_upgrade_target: '3.7'
++ openshift_upgrade_target: '3.7.1'
+ openshift_upgrade_min: '3.6'
+
+ - import_playbook: ../pre/config.yml
+diff --git a/roles/openshift_repos/templates/CentOS-OpenShift-Origin371.repo.j2 b/roles/openshift_repos/templates/CentOS-OpenShift-Origin371.repo.j2
+new file mode 100644
+index 0000000..10b49c0
+--- /dev/null
++++ b/roles/openshift_repos/templates/CentOS-OpenShift-Origin371.repo.j2
+@@ -0,0 +1,26 @@
++[centos-openshift-origin371]
++name=CentOS OpenShift Origin
++baseurl={{ ands_repo_url }}/openshift371/
++enabled=1
++gpgcheck=0
++
++[centos-openshift-origin37-testing]
++name=CentOS OpenShift Origin Testing
++baseurl=http://buildlogs.centos.org/centos/7/paas/x86_64/openshift-origin37/
++enabled={{ 1 if openshift_repos_enable_testing else 0 }}
++gpgcheck=0
++gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-PaaS
++
++[centos-openshift-origin37-debuginfo]
++name=CentOS OpenShift Origin DebugInfo
++baseurl=http://debuginfo.centos.org/centos/7/paas/x86_64/
++enabled=0
++gpgcheck=1
++gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-PaaS
++
++[centos-openshift-origin37-source]
++name=CentOS OpenShift Origin Source
++baseurl=http://vault.centos.org/centos/7/paas/Source/openshift-origin37/
++enabled=0
++gpgcheck=1
++gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-SIG-PaaS
diff --git a/anslib/patches/registry-ds-glusterfs-fixes.patch b/anslib/patches/registry-ds-glusterfs-fixes.patch
new file mode 100644
index 0000000..65f30e5
--- /dev/null
+++ b/anslib/patches/registry-ds-glusterfs-fixes.patch
@@ -0,0 +1,61 @@
+diff --git a/roles/openshift_hosted/tasks/registry.yml b/roles/openshift_hosted/tasks/registry.yml
+index bc4d81e..4720095 100644
+diff --git a/roles/openshift_hosted/tasks/registry_storage.yml b/roles/openshift_hosted/tasks/registry_storage.yml
+index aa66a78..e1b8c4e 100644
+diff --git a/roles/openshift_hosted/tasks/storage/glusterfs.yml b/roles/openshift_hosted/tasks/storage/glusterfs.yml
+index 7223a5a..3465b6c 100644
+--- a/roles/openshift_hosted/tasks/storage/glusterfs.yml
++++ b/roles/openshift_hosted/tasks/storage/glusterfs.yml
+@@ -35,7 +35,7 @@
+ mount:
+ state: mounted
+ fstype: glusterfs
+- src: "{% if 'glusterfs_registry' in groups and groups['glusterfs_registry'] | length > 0 %}{% set node = groups.glusterfs_registry[0] %}{% elif 'glusterfs' in groups and groups['glusterfs'] | length > 0 %}{% set node = groups.glusterfs[0] %}{% endif %}{% if openshift_hosted_registry_storage_glusterfs_ips is defined and openshift_hosted_registry_storage_glusterfs_ips|length > 0 %}{{ openshift_hosted_registry_storage_glusterfs_ips[0] }}{% elif 'glusterfs_hostname' in hostvars[node] %}{{ hostvars[node].glusterfs_hostname }}{% elif 'openshift' in hostvars[node] %}{{ hostvars[node].openshift.node.nodename }}{% else %}{{ node }}{% endif %}:/{{ openshift.hosted.registry.storage.glusterfs.path }}"
++ src: "{% if 'glusterfs_registry' in groups and groups['glusterfs_registry'] | length > 0 %}{% set node = groups.glusterfs_registry[0] %}{% elif 'glusterfs' in groups and groups['glusterfs'] | length > 0 %}{% set node = groups.glusterfs[0] %}{% endif %}{% if openshift_hosted_registry_storage_glusterfs_ips is defined and openshift_hosted_registry_storage_glusterfs_ips|length > 0 %}{{ openshift_hosted_registry_storage_glusterfs_ips[0] }}{% elif 'glusterfs_hostname' in hostvars[node] %}{{ hostvars[node].glusterfs_hostname }}{% elif 'openshift' in hostvars[node] %}{{ hostvars[node].openshift.node.nodename }}{% else %}{{ node }}{% endif %}:/{{ openshift_hosted_registry_storage_glusterfs_path }}"
+ name: "{{ mktemp.stdout }}"
+
+ - name: Set registry volume permissions
+@@ -49,10 +49,11 @@
+ - block:
+ - name: Activate registry maintenance mode
+ oc_env:
++ kind: dc
+ namespace: "{{ openshift_hosted_registry_namespace }}"
+ name: "{{ openshift_hosted_registry_name }}"
+ env_vars:
+- - REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true'
++ REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true'
+
+ - name: Get first registry pod name
+ set_fact:
+@@ -72,11 +73,12 @@
+
+ - name: Deactivate registry maintenance mode
+ oc_env:
++ kind: dc
+ namespace: "{{ openshift_hosted_registry_namespace }}"
+ name: "{{ openshift_hosted_registry_name }}"
+ state: absent
+ env_vars:
+- - REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true'
++ REGISTRY_STORAGE_MAINTENANCE_READONLY_ENABLED: 'true'
+ when: openshift_hosted_registry_storage_glusterfs_swap
+
+ - name: Unmount registry volume and clean up mount point/fstab
+diff --git a/roles/openshift_persistent_volumes/tasks/main.yml b/roles/openshift_persistent_volumes/tasks/main.yml
+index b1d9c8c..1c32a67 100644
+diff --git a/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 b/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2
+index ca8b747..ce15533 100644
+--- a/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2
++++ b/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2
+@@ -12,7 +12,7 @@ items:
+ resources:
+ requests:
+ storage: "{{ claim.capacity }}"
+-{% if claim.storageclass is not None %}
++{% if claim.storageclass is defined and claim.storageclass is not none %}
+ storageClassName: "{{ claim.storageclass }}"
+ {% endif %}
+ {% endfor %}
+diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml
+index e91e130..f3562b6 100644
diff --git a/anslib/patches/registry-ds-glusterfs-storageclass.patch b/anslib/patches/registry-ds-glusterfs-storageclass.patch
new file mode 100644
index 0000000..a189091
--- /dev/null
+++ b/anslib/patches/registry-ds-glusterfs-storageclass.patch
@@ -0,0 +1,64 @@
+diff --git a/roles/openshift_hosted/tasks/registry.yml b/roles/openshift_hosted/tasks/registry.yml
+index bc4d81e..4720095 100644
+--- a/roles/openshift_hosted/tasks/registry.yml
++++ b/roles/openshift_hosted/tasks/registry.yml
+@@ -112,6 +112,7 @@
+ when:
+ - openshift_hosted_registry_storage_glusterfs_ips|length > 0
+ - openshift_hosted_registry_storage_kind | default(none) in ['glusterfs']
++ - openshift_hosted_registry_storage_class is not defined
+
+ - name: Create OpenShift registry
+ oc_adm_registry:
+diff --git a/roles/openshift_hosted/tasks/registry_storage.yml b/roles/openshift_hosted/tasks/registry_storage.yml
+index aa66a78..e1b8c4e 100644
+--- a/roles/openshift_hosted/tasks/registry_storage.yml
++++ b/roles/openshift_hosted/tasks/registry_storage.yml
+@@ -2,3 +2,4 @@
+ - include_tasks: storage/glusterfs.yml
+ when:
+ - openshift_hosted_registry_storage_kind | default(none) == 'glusterfs' or openshift_hosted_registry_storage_glusterfs_swap
++ - openshift_hosted_registry_storage_class is not defined
+diff --git a/roles/openshift_hosted/tasks/storage/glusterfs.yml b/roles/openshift_hosted/tasks/storage/glusterfs.yml
+index 7223a5a..3465b6c 100644
+diff --git a/roles/openshift_persistent_volumes/tasks/main.yml b/roles/openshift_persistent_volumes/tasks/main.yml
+index b1d9c8c..1c32a67 100644
+--- a/roles/openshift_persistent_volumes/tasks/main.yml
++++ b/roles/openshift_persistent_volumes/tasks/main.yml
+@@ -23,7 +23,21 @@
+ - name: "{{ openshift_hosted_registry_storage_volume_name }}-glusterfs-claim"
+ capacity: "{{ openshift_hosted_registry_storage_volume_size }}"
+ access_modes: "{{ openshift_hosted_registry_storage_access_modes }}"
+- when: openshift_hosted_registry_storage_glusterfs_swap | default(False)
++ when:
++ - openshift_hosted_registry_storage_glusterfs_swap | default(False)
++ - openshift_hosted_registry_storage_class is not defined
++
++
++- set_fact:
++ glusterfs_pv: []
++ glusterfs_pvc:
++ - name: "{{ openshift_hosted_registry_storage_volume_name }}-claim"
++ storageclass: "{{ openshift_hosted_registry_storage_class }}"
++ capacity: "{{ openshift_hosted_registry_storage_volume_size }}"
++ access_modes: "{{ openshift_hosted_registry_storage_access_modes }}"
++ when:
++ - openshift_hosted_registry_storage_class is defined
++
+
+ - name: create standard pv and pvc lists
+ # generate_pv_pvcs_list is a custom action module defined in
+diff --git a/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2 b/roles/openshift_persistent_volumes/templates/persistent-volume-claim.yml.j2
+index ca8b747..ce15533 100644
+diff --git a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml
+index e91e130..f3562b6 100644
+--- a/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml
++++ b/roles/openshift_storage_glusterfs/tasks/glusterfs_registry.yml
+@@ -12,4 +12,6 @@
+
+ - name: Create GlusterFS registry volume
+ command: "{{ glusterfs_heketi_client }} volume create --size={{ openshift_hosted_registry_storage_volume_size | replace('Gi','') }} --name={{ openshift_hosted_registry_storage_glusterfs_path }}"
+- when: "openshift_hosted_registry_storage_glusterfs_path not in registry_volume.stdout"
++ when:
++ - "openshift_hosted_registry_storage_glusterfs_path not in registry_volume.stdout"
++ - "openshift_hosted_registry_storage_class is not defined"
diff --git a/docs/ands_ansible.txt b/docs/ands_ansible.txt
index 80a7cf0..70800e1 100644
--- a/docs/ands_ansible.txt
+++ b/docs/ands_ansible.txt
@@ -89,7 +89,7 @@ Ansible parameters (global)
glusterfs_version group_vars
glusterfs_transport group_vars
- - OPenShift specific
+ - OpenShift specific
ands_openshift_labels setup/configs Labels to assign to the nodes
ands_openshift_projects setup/configs List of projects to configure (with GlusterFS endpoints, etc.)
ands_openshift_users setup/configs Optional list of user names with contacts
diff --git a/docs/backup.txt b/docs/backup.txt
new file mode 100644
index 0000000..1b25592
--- /dev/null
+++ b/docs/backup.txt
@@ -0,0 +1,26 @@
+Critical directories and services
+---------------------------------
+ - etcd database [ once ]
+ * There are etcd2 and etcd3 APIs. OpenShift 3.5+ uses etcd3, but the documentation
+ still describes an etcd2-style backup. etcd3 is backward compatible with etcd2,
+ and we can run the etcd2 backup as well. The open question is whether we need to
+ back up both ways (OpenShift 3.5 definitely has etcd3 data) or just etcd3,
+ treating the etcd2 instructions as a documentation bug.
+ * etcd3
+ etcdctl3 --endpoints="192.168.213.1:2379" snapshot save snapshot.db
+ * etcd2
+ etcdctl backup --data-dir /var/lib/etcd/ --backup-dir .
+ cp "$ETCD_DATA_DIR"/member/snap/db member/snap/db
+
+ - heketi topology [ once ]
+ heketi-cli -s http://heketi-storage.glusterfs.svc.cluster.local:8080 --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)" topology info --json
+
+ - Gluster volume information [ storage nodes ]
+ * /var/lib/glusterd/glusterd.info
+ * /var/lib/glusterd/peers
+ * /var/lib/glusterd/glustershd - not mentioned in docs
+
+ - etc [ all nodes ]
+ * /etc/origin/ - Only *.key *.crt from /etc/origin/master in docs
+ * /etc/etcd - Not mentioned
+ * /etc/docker - Only certs.d
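
A minimal sketch of how the node-local pieces listed above could be archived by hand
(the destination path /mnt/backup is an assumption; the actual backup is templated in
roles/ands_backup/templates/backup.sh.j2):

    #!/bin/bash
    # Archive the node-local configuration listed above; run on every node.
    dest="/mnt/backup/$(hostname -s)/$(date +%Y%m%d)"      # assumed destination
    mkdir -p "$dest"
    tar czf "$dest/etc-origin.tar.gz" /etc/origin
    [ -d /etc/etcd ]   && tar czf "$dest/etc-etcd.tar.gz" /etc/etcd
    [ -d /etc/docker ] && tar czf "$dest/etc-docker.tar.gz" /etc/docker
    # storage nodes only: gluster peer/volume metadata
    [ -d /var/lib/glusterd ] && tar czf "$dest/glusterd.tar.gz" \
        /var/lib/glusterd/glusterd.info /var/lib/glusterd/peers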
diff --git a/docs/consistency.txt b/docs/consistency.txt
new file mode 100644
index 0000000..127d9a7
--- /dev/null
+++ b/docs/consistency.txt
@@ -0,0 +1,36 @@
+General overview
+=================
+ - etcd services (worth checking both ports)
+ etcdctl3 --endpoints="192.168.213.1:2379" member list - doesn't check health, only reports members
+ oc get cs - only etcd (other services will fail on OpenShift)
+ - All nodes and pods are fine and running and all pvc are bound
+ oc get nodes
+ oc get pods --all-namespaces -o wide
+ oc get pvc --all-namespaces -o wide
+ - API health check
+ curl -k https://apiserver.kube-service-catalog.svc/healthz
+
+Storage
+=======
+ - Heketi status
+ heketi-cli -s http://heketi-storage.glusterfs.svc.cluster.local:8080 --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)" topology info
+ - Status of the Gluster volumes (and their bricks, which often fail with heketi)
+ gluster volume info
+ ./gluster.sh info all_heketi
+ - Check available storage space on system partition and LVM volumes (docker, heketi, ands)
+ Run 'df -h' and 'lvdisplay' on each node
+
+Networking
+==========
+ - Check that both internal and external addresses are resolvable from all hosts.
+ * I.e. we should be able to resolve 'google.com'
+ * And we should be able to resolve 'heketi-storage.glusterfs.svc.cluster.local'
+
+ - Check that the keepalived service is up and the corresponding IPs are really assigned to one
+ of the nodes (the vagrant provisioner would remove the keepalived-tracked IPs, but keepalived will
+ continue running without noticing it)
+
+ - Ensure we don't have an override of cluster_name to the first master (which we set during the
+ provisioning of the OpenShift plays)
+
+ \ No newline at end of file
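
The individual checks above can be strung together into one quick pass; a hedged sketch
(assumes a logged-in 'oc' session with cluster-admin rights and the 'etcdctl3' alias used
elsewhere in these docs):

    #!/bin/bash
    # quick consistency pass: etcd members, nodes, non-running pods, unbound pvc, API health
    etcdctl3 --endpoints="192.168.213.1:2379" member list
    oc get nodes
    oc get pods --all-namespaces -o wide | grep -vE 'Running|Completed'
    oc get pvc --all-namespaces | grep -v Bound
    curl -sk https://apiserver.kube-service-catalog.svc/healthz; echo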
diff --git a/docs/managment.txt b/docs/managment.txt
new file mode 100644
index 0000000..1eca8a8
--- /dev/null
+++ b/docs/managment.txt
@@ -0,0 +1,166 @@
+DOs and DONTs
+=============
+ Here we discuss things we should do and we should not do!
+
+ - Scaling up the cluster is normally painless. Both nodes and masters can be added
+ quickly and without much trouble afterwards.
+
+ - The upgrade procedure may cause problems. The main trouble is that many pods are
+ configured to use the 'latest' tag, and the latest versions come with the latest problems
+ (some of the tags can be pinned to an actual version, but finding out what is broken
+ and why takes a lot of effort)...
+ * Currently, there are problems if 'kube-service-catalog' is updated (see the discussion
+ in docs/upgrade.txt). While it seems nothing really changes, the connection between
+ apiserver and etcd breaks down (at least for health checks). The installation remains
+ pretty much usable, but not in a healthy state. This particular update is blocked by
+ setting
+ openshift_enable_service_catalog: false
+ The pod is then left in 'Error' state, but can easily be recovered by deleting it and
+ allowing the system to re-create a new pod.
+ * However, as the cause is unclear, it is possible that something else will break as time
+ passes and new images are released. It is ADVISED to check the upgrade in staging first.
+ * During the upgrade other system pods may also get stuck in 'Error' state (as explained
+ in the troubleshooting notes) and block the flow of the upgrade. Just delete them and allow
+ the system to re-create them to continue.
+ * After the upgrade, it is necessary to verify that all pods are operational and to
+ restart the ones in 'Error' state.
+
+ - Re-running the install will break on heketi. And it will DESTROY the heketi topology!
+ DON'T DO IT! Instead, individual components can be re-installed.
+ * For instance, to reinstall 'openshift-ansible-service-broker' use
+ openshift-install-service-catalog.yml
+ * There is a way to prevent the plays from touching heketi; we need to define
+ openshift_storage_glusterfs_is_missing: False
+ openshift_storage_glusterfs_heketi_is_missing: False
+ But I am not sure whether this is the only major issue.
+
+ - A few administrative tools can cause trouble. Don't run
+ * oc adm diagnostics
+
+
+Failures / Immediate
+========
+ - We need to remove the failed node from the etcd cluster
+ etcdctl3 --endpoints="192.168.213.1:2379" member list
+ etcdctl3 --endpoints="192.168.213.1:2379" member remove <hexid>
+
+ - Further, the following is required on all remaining nodes if the node is gone for good
+ * Delete the node
+ oc delete node
+ * Also remove it from ETCD_INITIAL_CLUSTER in /etc/etcd.conf on all nodes
+ * Remove the failed nodes from the 'etcdClientInfo' section in /etc/origin/master/master-config.yaml
+ systemctl restart origin-master-api.service
+
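
For reference, a hedged sketch that resolves the member's hex id from the failed host name
before removing it ('ipeshift2' is just an example name and 'etcdctl3' is the alias described
in these docs; verify the 'member list' output before removing anything):

    #!/bin/bash
    # remove a dead member from the etcd cluster and drop the OpenShift node object
    FAILED=ipeshift2                                # example name of the failed node
    EP="192.168.213.1:2379"
    ID=$(etcdctl3 --endpoints="$EP" member list | awk -F', ' -v h="$FAILED" '$0 ~ h {print $1}')
    [ -n "$ID" ] && etcdctl3 --endpoints="$EP" member remove "$ID"
    oc delete node "$FAILED"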
+Scaling / Recovery
+=======
+ - One important point.
+ * If we lost the data on a storage node, it should be re-added with a different name (otherwise
+ the GlusterFS recovery would be significantly more complicated)
+ * If the Gluster bricks are preserved, we may keep the name. I have not tried it, but according to
+ the documentation, it should be possible to reconnect it and synchronize. Still, it may be
+ easier to use a new name again to simplify the procedure.
+ * Simple OpenShift nodes may be re-added with the same name, no problem.
+
+ - Next we need to perform all preparation steps (--limit should not be applied as we normally
+ need to update CentOS on all nodes to synchronize software versions, list all nodes in the /etc/hosts
+ files, etc.).
+ ./setup.sh -i staging prepare
+
+ - Scaling of OpenShift is provided as several Ansible plays (scale-masters, scale-nodes, scale-etcd).
+ * Running 'masters' will also install the configured 'nodes' and 'etcd' daemons
+ * I guess running 'nodes' will also handle the 'etcd' daemons, but I have not checked.
+
+Problems
+--------
+ - There should be no problems if a simple node crashed, but things may go wrong if one of the
+ masters crashes. And things definitely will go wrong if the complete cluster is cut from power.
+ * Some pods will be stuck polling images. This happens if the node running docker-registry has crashed
+ and persistent storage was not used to back the registry. It can be fixed by re-scheduling the build
+ and rolling out the latest version from the dc.
+ oc -n adei start-build adei
+ oc -n adei rollout latest mysql
+ OpenShift will trigger the rollout automatically after some time, but it will take a while. The builds,
+ it seems, have to be started manually.
+ * In case of a long outage some CronJobs will stop executing. The reason is a protection against
+ excessive load combined with missing defaults. The fix is easy: just set how much time the OpenShift
+ scheduler allows a CronJob to start before considering it failed:
+ oc -n adei patch cronjob/adei-autogen-update --patch '{ "spec": {"startingDeadlineSeconds": 10 }}'
+
+ - If we forgot to remove the old host from the etcd cluster, the OpenShift node will be configured, but etcd
+ will not be installed. We then need to remove the node as explained above and run the etcd cluster
+ scale-up.
+ * On multiple occasions, the etcd daemon has failed after a reboot and needed to be restarted manually.
+ If half of the daemons are broken, 'oc' will block.
+
+
+
+Storage / Recovery
+=======
+ - Furthermore, it is necessary to add the glusterfs daemons on new storage nodes. This is not performed
+ automatically by the scale plays. The 'glusterfs' play should be executed with additional options
+ specifying that we are just re-configuring nodes (see the sketch below). We can check if all pods are serviced
+ oc -n glusterfs get pods -o wide
+ Both the OpenShift and etcd clusters should be in a proper state before running this play. Fixing and re-running
+ should not be an issue.
+
+ - More details:
+ https://docs.openshift.com/container-platform/3.7/day_two_guide/host_level_tasks.html
+
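
A hedged example of such a re-configuration run, reusing the variables mentioned in the
DOs and DONTs section to keep the play away from the existing heketi topology (the playbook
and inventory names are assumptions based on this repository's layout, not a verified recipe):

    # re-run only the gluster part, telling it that the cluster and heketi already exist
    ansible-playbook -i inventories/staging.erb playbooks/ands-gluster.yml \
        -e openshift_storage_glusterfs_is_missing=False \
        -e openshift_storage_glusterfs_heketi_is_missing=False
    # afterwards verify that the daemonset pods cover all storage nodes
    oc -n glusterfs get pods -o wide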
+
+Heketi
+------
+ - With heketi things are straightforward: we need to mark the node broken. Then heketi will automatically move the
+ bricks to other servers (as it sees fit).
+ * Accessing heketi
+ heketi-cli -s http://heketi-storage-glusterfs.openshift.suren.me --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)"
+ * Getting the required ids
+ heketi-cli topology info
+ * Removing node
+ heketi-cli node info <failed_node_id>
+ heketi-cli node disable <failed_node_id>
+ heketi-cli node remove <failed_node_id>
+ * That's it. A few self-healing daemons are running which should bring the volumes in order automatically.
+ * The node will still persist in the heketi topology as failed, but will not be used ('node delete' could potentially remove it, but it is failing)
+
+ - One problem with heketi: it may start volumes before the bricks get ready. Consequently, it may run volumes with several bricks offline. This should be
+ checked and fixed by restarting the volumes.
+
+KaaS Volumes
+------------
+ There are two modes.
+ - If we migrated to a new server, we need to migrate the bricks (force is required because
+ the source brick is dead and the data can't be copied)
+ gluster volume replace-brick <volume> <src_brick> <dst_brick> commit force
+ * There are healing daemons running and nothing else has to be done.
+ * There are a play and scripts available to move all bricks automatically
+
+ - If we kept the name and the data is still there, it should also be relatively easy
+ to perform the migration (not checked). We should also have backups of all this data.
+ * Ensure Gluster is not running on the failed node
+ oadm manage-node ipeshift2 --schedulable=false
+ oadm manage-node ipeshift2 --evacuate
+ * Verify the gluster pod is not active. It may be running, but not ready.
+ Could be double checked with 'ps'.
+ oadm manage-node ipeshift2 --list-pods
+ * Get the original Peer UUID of the failed node (by running on healthy node)
+ gluster peer status
+ * And create '/var/lib/glusterd/glusterd.info' similar to the one on the
+ healthy nodes, but with the found UUID.
+ * Copy peers from the healthy nodes to /var/lib/glusterd/peers. We need to
+ copy from 2 nodes as node does not hold peer information on itself.
+ * Create mount points and re-schedule gluster pod. See more details
+ https://access.redhat.com/documentation/en-us/red_hat_gluster_storage/3/html/administration_guide/sect-replacing_hosts
+ * Start healing
+ gluster volume heal VOLNAME full
+
+ - However, if the data is lost, it is quite complicated to recover using the same server name.
+ We should rename the server and use the first approach instead.
+
+
+
+Scaling
+=======
+We currently have several assumptions which will probably not hold true for larger clusters
+ - Gluster
+ To simplify matters we just reference the servers in the storage group manually
+ The arbiter may work for several groups and we should define several brick paths in this case
diff --git a/docs/network.txt b/docs/network.txt
new file mode 100644
index 0000000..a164d36
--- /dev/null
+++ b/docs/network.txt
@@ -0,0 +1,58 @@
+Configuration
+=============
+openshift_ip                                  Infiniband IPs for fast communication (it is also used for the ADEI/MySQL bridge
+                                              and so should reside on the fast network).
+openshift_hostname                            The 'cluster' host name. Should match the real host name for certificate validation.
+                                              So, it should be set if the default IP does not resolve to the host name.
+openshift_public_ip                           We may either skip this or set it to our 192.168.26.xxx network. Usage is unclear.
+openshift_public_hostname                     I guess it is also for certificates, but used while communicating with external systems.
+openshift_master_cluster_hostname             Internal cluster load-balancer or just a pointer to the master host.
+openshift_public_master_cluster_hostname      The main cluster gateway.
+
+
+Complex Network
+===============
+Some things in OpenShift ansible scripts are still implemented with assumption we have
+a simple network configuration with a single interface communicating to the world. There
+are several options to change this:
+ openshift_set_node_ip - This variable configures nodeIP in the node configuration. This
+ variable is needed in cases where it is desired for node traffic to go over an interface
+ other than the default network interface.
+ openshift_ip - This variable overrides the cluster internal IP address for the system.
+ Use this when using an interface that is not configured with the default route.
+ openshift_hostname - This variable overrides the internal cluster host name for the system.
+ Use this when the system’s default IP address does not resolve to the system host name.
+Furthermore, if we use infiniband which is not accessible to outside world we need to set
+ openshift_public_ip - Use this for cloud installations, or for hosts on networks using
+ a network address translation
+ openshift_public_hostname - Use this for cloud installations, or for hosts on networks
+ using a network address translation (NAT).
+
+ This, however, is not respected by all system components. Some provisioning code and
+installed scripts still detect a kind of 'main system IP' to look for the
+services. This IP is identified either as 'ansible_default_ipv4' or by code trying
+to find the IP which is used to send packets over the default route. Ansible in the end does
+the same thing. This plays out badly for several reasons.
+ - We have keepalived IPs moving between systems. The scripts are actually catching
+ these moving IPs instead of the fixed IP bound to the system.
+ - There could be several default routes. While that is not a problem as such, the scripts do not
+ expect it and may fail.
+
+For instance, take the script '99-origin-dns.sh' in /etc/NetworkManager/dispatcher.d.
+ * def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3 }')
+ 1) does not expect multiple default routes and will pick just a random one. Then, the
+ * if [[ ${DEVICE_IFACE} == ${def_route_int} ]]; then
+ check may fail and resolv.conf will not be updated, because the interface that just came up
+ appears not to be on the default route even though it actually is. Furthermore,
+ * def_route_ip=$(/sbin/ip route get to ${def_route} | awk '{print $5}')
+ 2) is ignorant of keepalived and may bind to a keepalived-managed IP.
+
+ But I am not sure the problems are limited to this script. There could be other places with
+ the same logic. Some details are here:
+ https://docs.openshift.com/container-platform/3.7/admin_guide/manage_nodes.html#manage-node-change-node-traffic-interface
+
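
To see which interface and IP the dispatcher script would actually pick on a given node,
the detection steps can be replayed by hand; a minimal sketch (the 'openshift_dns_ip'
marker in /etc/hosts is what the patched script from
anslib/patches/networkmanager-ds-use-openshift-interface.patch relies on):

    # what the stock script would detect (default-route view)
    def_route=$(/sbin/ip route list match 0.0.0.0/0 | awk '{print $3}' | head -n 1)
    def_route_int=$(/sbin/ip route get to ${def_route} | awk '{print $3}' | head -n 1)
    def_route_ip=$(/sbin/ip route get to ${def_route} | awk '{print $5}' | head -n 1)
    echo "default route view: ${def_route_int} / ${def_route_ip}"
    # what the patched script prefers (the openshift_dns_ip entry from /etc/hosts)
    dns_ip=$(grep openshift_dns_ip /etc/hosts | awk '{print $1}')
    echo "openshift_dns_ip view: ${dns_ip} on $(ip -o addr show | grep "${dns_ip}" | awk '{print $2}')"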
+Hostnames
+=========
+ The Linux host name (uname -a) should match the hostnames assigned to the OpenShift nodes. Otherwise, certificate verification
+ will fail. It seems a minor issue as the system continues functioning, but it is better to avoid. The check can be performed with etcd:
+ etcdctl3 --key=/etc/etcd/peer.key --cacert=/etc/etcd/ca.crt --endpoints="192.168.213.1:2379,192.168.213.3:2379,192.168.213.4:2379"
diff --git a/docs/pods.txt b/docs/pods.txt
new file mode 100644
index 0000000..b84f42f
--- /dev/null
+++ b/docs/pods.txt
@@ -0,0 +1,13 @@
+Updating Daemon Set
+===================
+ - Not trivial. We need to
+ a) Re-create the ds
+ * Manually change 'imagePullPolicy' to 'Always' if it is set to 'IfNotPresent'
+ b) Destroy all the pods and allow the ds to recreate them
+
+ - Sample: Updating gluster
+ oc -n glusterfs delete ds/glusterfs-storage
+ oc -n glusterfs process glusterfs IMAGE_NAME=chsa/gluster-centos IMAGE_VERSION=312 > gluster.json
+ *** Edit
+ oc -n glusterfs create -f gluster.json
+ oc -n glusterfs delete pods -l 'glusterfs=storage-pod'
diff --git a/docs/regions.txt b/docs/regions.txt
new file mode 100644
index 0000000..88b8f5e
--- /dev/null
+++ b/docs/regions.txt
@@ -0,0 +1,16 @@
+region=infra Infrastructure nodes which are used by OpenShift to run router and registry services. This is
+ more or less ipekatrin* nodes down in the basement.
+region=prod Production servers (ipecompute*, etc.) located anywhere, but I expect only the basement.
+region=dev Temporary nodes
+
+zone=default Basement
+zone=404 Second server room on 4th floor
+zone=student Student room
+zone=external Other external places
+
+
+
+production: 1 Specifies all production servers (no extra load, no occasional reboots)
+ This includes 'infra' and 'prod' regions.
+server: 1 Like production, but with occasional reboots and some extra testing load possible
+permanent: 1 Non-production systems, but which are permanently connected to OpenShift
diff --git a/docs/samples/templates/00-katrin-restricted.yml.j2 b/docs/samples/templates/00-katrin-restricted.yml.j2
new file mode 100644
index 0000000..6221f30
--- /dev/null
+++ b/docs/samples/templates/00-katrin-restricted.yml.j2
@@ -0,0 +1,44 @@
+# Overriding SCC rules to allow arbitrary gluster mounts in restricted containers
+---
+allowHostDirVolumePlugin: false
+allowHostIPC: false
+allowHostNetwork: false
+allowHostPID: false
+allowHostPorts: false
+allowPrivilegedContainer: false
+allowedCapabilities: null
+apiVersion: v1
+defaultAddCapabilities: null
+fsGroup:
+ type: MustRunAs
+groups:
+- system:authenticated
+kind: SecurityContextConstraints
+metadata:
+ annotations:
+ kubernetes.io/description: restricted denies access to all host features and requires
+ pods to be run with a UID, and SELinux context that are allocated to the namespace. This
+ is the most restrictive SCC.
+ creationTimestamp: null
+ name: katrin-restricted
+priority: null
+readOnlyRootFilesystem: false
+requiredDropCapabilities:
+- KILL
+- MKNOD
+- SYS_CHROOT
+- SETUID
+- SETGID
+runAsUser:
+ type: MustRunAsRange
+seLinuxContext:
+ type: MustRunAs
+supplementalGroups:
+ type: RunAsAny
+volumes:
+- glusterfs
+- configMap
+- downwardAPI
+- emptyDir
+- persistentVolumeClaim
+- secret
diff --git a/docs/samples/vars/run_oc.yml b/docs/samples/vars/run_oc.yml
new file mode 100644
index 0000000..a464549
--- /dev/null
+++ b/docs/samples/vars/run_oc.yml
@@ -0,0 +1,6 @@
+oc:
+ - template: "[0-3]*"
+ - template: "[4-6]*"
+ - resource: "route/apache"
+ oc: "expose svc/kaas --name apache --hostname=apache.{{ openshift_master_default_subdomain }}"
+ - template: "*"
diff --git a/docs/samples/vars/variants.yml b/docs/samples/vars/variants.yml
new file mode 100644
index 0000000..c7a27b4
--- /dev/null
+++ b/docs/samples/vars/variants.yml
@@ -0,0 +1,33 @@
+# First port is exposed
+
+pods:
+ kaas:
+ variant: "{{ ands_prefer_docker | default(false) | ternary('docker', 'centos') }}"
+ centos:
+ service: { host: "{{ katrin_node }}", ports: [ 80/8080, 443/8043 ] }
+ sched: { replicas: 1, selector: { master: 1 } }
+ selector: { master: 1 }
+ images:
+ - image: "centos/httpd-24-centos7"
+ mappings:
+ - { name: "etc", path: "apache2-kaas-centos", mount: "/etc/httpd" }
+ - { name: "www", path: "kaas", mount: "/opt/rh/httpd24/root/var/www/html" }
+ - { name: "log", path: "apache2-kaas", mount: "/var/log/httpd24" }
+ probes:
+ - { port: 8080, path: '/index.html' }
+ docker:
+ service: { host: "{{ katrin_node }}", ports: [ 80/8080, 443/8043 ] }
+ sched: { replicas: 1, selector: { master: 1 } }
+ selector: { master: 1 }
+ images:
+ - image: "httpd:2.2"
+ mappings:
+ - { name: "etc", path: "apache2-kaas-docker", mount: "/usr/local/apache2/conf" }
+ - { name: "www", path: "kaas", mount: "/usr/local/apache2/htdocs" }
+ - { name: "log", path: "apache2-kaas", mount: "/usr/local/apache2/logs" }
+ probes:
+ - { port: 8080, path: '/index.html' }
+
+
+
+ \ No newline at end of file
diff --git a/docs/troubleshooting.txt b/docs/troubleshooting.txt
new file mode 100644
index 0000000..b4ac8e7
--- /dev/null
+++ b/docs/troubleshooting.txt
@@ -0,0 +1,210 @@
+The services that have to be running
+------------------------------------
+ Etcd:
+ - etcd
+
+ Node:
+ - origin-node
+
+ Master nodes:
+ - origin-master-api
+ - origin-master-controllers
+ - origin-master itself is not running (with the 'native' HA method only the api/controllers services run)
+
+ Required Services:
+ - lvm2-lvmetad.socket
+ - lvm2-lvmetad.service
+ - docker
+ - NetworkManager
+ - firewalld
+ - dnsmasq
+ - openvswitch
+
+ Extra Services:
+ - ssh
+ - ntp
+ - openvpn
+ - ganesha (on master nodes, optional)
+
+Pods that have to be running
+----------------------------
+ Kubernetes System
+ - kube-service-catalog/apiserver
+ - kube-service-catalog/controller-manager
+
+ OpenShift Main Services
+ - default/docker-registry
+ - default/registry-console
+ - default/router (3 replicas)
+ - openshift-template-service-broker/api-server (daemonset, on all nodes)
+
+ OpenShift Secondary Services
+ - openshift-ansible-service-broker/asb
+ - openshift-ansible-service-broker/asb-etcd
+
+ GlusterFS
+ - glusterfs-storage (daemonset, on all storage nodes)
+ - glusterblock-storage-provisioner-dc
+ - heketi-storage
+
+ Metrics (openshift-infra):
+ - hawkular-cassandra
+ - hawkular-metrics
+ - heapster
+
+
+Debugging
+=========
+ - Ensure system consistency as explained in 'consistency.txt' (incomplete)
+ - Check current pod logs and possibly logs for last failed instance
+ oc logs <pod name> --tail=100 [-p] - dc/name or ds/name as well
+ - Verify initialization steps (check if all volumes are mounted)
+ oc describe <pod name>
+ - It is worth looking at the pod environment
+ oc env po <pod name> --list
+ - It is worth connecting to the running container with an 'rsh' session to see running processes,
+ internal logs, etc. A 'debug' session will start a new instance of the pod instead.
+ - Try checking if the corresponding pv/pvc are bound. Check the logs for the pv.
+ * Even if the 'pvc' is bound, the 'pv' may have problems with its backend.
+ * Check the logs here: /var/lib/origin/plugins/kubernetes.io/glusterfs/
+ - Another frequent problem is a failing 'postStart' hook or 'livenessProbe'. As the pod
+ immediately crashes, it is not possible to connect. Remedies are:
+ * Set a larger initial delay for the probe.
+ * Try to remove the hook and execute it manually using 'rsh'/'debug'
+ - Determine the node running the pod and check the host logs in '/var/log/messages'
+ * Particularly logs of 'origin-master-controllers' are of interest
+ - Check which docker images are actually downloaded on the node
+ docker images
+
+network
+=======
+ - There is a NetworkManager script which should adjust /etc/resolv.conf to use local dnsmasq server.
+ This is based on '/etc/NetworkManager/dispatcher.d/99-origin-dns.sh' which does not play well
+ if OpenShift is running on a non-default network interface. I provided a patched version, but it
+ is worth verifying
+ * that nameserver is pointing to the host itself (but not localhost, this is important
+ to allow running pods to use it)
+ * that correct upstream nameservers are listed in '/etc/dnsmasq.d/origin-upstream-dns.conf'
+ * In some cases, it was necessary to restart dnsmasq (but it could be also for different reasons)
+ If the script misbehaves, it is possible to call it manually like this:
+ DEVICE_IFACE="eth1" ./99-origin-dns.sh eth1 up
+
+
+etcd (and general operability)
+====
+ - A few of these services may seem to be running according to 'systemctl', but actually misbehave. Then, it
+ may be necessary to restart them manually. I have noticed it with
+ * lvm2-lvmetad.socket (pvscan will complain about problems)
+ * origin-node
+ * etcd, but BEWARE of too enthusiastic restarting:
+ - However, restarting etcd many times is BAD as it may trigger a severe problem with
+ 'kube-service-catalog/apiserver'. The bug description is here
+ https://github.com/kubernetes/kubernetes/issues/47131
+ - Due to the problem mentioned above, all 'oc' queries become very slow. There is no proper
+ solution suggested, but killing the 'kube-service-catalog/apiserver' pod helps for a while.
+ The pod is restarted and response times are back in order.
+ * Another way to see this problem is querying the 'healthz' service, which reports that
+ there are too many clients and asks to retry later.
+ curl -k https://apiserver.kube-service-catalog.svc/healthz
+
+ - On node crash, the etcd database may get corrupted.
+ * There is no easy fix. Backup/restore is not working.
+ * Easiest option is to remove the failed etcd from the cluster.
+ etcdctl3 --endpoints="192.168.213.1:2379" member list
+ etcdctl3 --endpoints="192.168.213.1:2379" member remove <hexid>
+ * Add it to [new_etcd] section in inventory and run openshift-etcd to scale-up etcd cluster.
+
+ - There is a health check provided by the cluster
+ curl -k https://apiserver.kube-service-catalog.svc/healthz
+ it may complain about etcd problems. This seems to be triggered by the OpenShift upgrade. The real cause and
+ remedy are unclear, but the installation is mostly working. The discussion is in docs/upgrade.txt
+
+ - There is also a different etcd which is an integral part of the Ansible service broker:
+ 'openshift-ansible-service-broker/asb-etcd'. If investigated with 'oc logs' it complains
+ about:
+ 2018-03-07 20:54:48.791735 I | embed: rejected connection from "127.0.0.1:43066" (error "tls: failed to verify client's certificate: x509: certificate signed by unknown authority", ServerName "")
+ WARNING: 2018/03/07 20:54:48 Failed to dial 0.0.0.0:2379: connection error: desc = "transport: authentication handshake failed: remote error: tls: bad certificate"; please retry.
+ Nevertheless, it seems to work without much trouble. The error message seems to be caused by
+ certificate verification code introduced in etcd 3.2. There are multiple bug reports on
+ the issue.
+
+pods (failed pods, rogue namespaces, etc...)
+====
+ - After crashes / upgrades some pods may end up in 'Error' state. This quite often happens to
+ * kube-service-catalog/controller-manager
+ * openshift-template-service-broker/api-server
+ Normally, they should be deleted. Then OpenShift will auto-restart the pods and they will likely run without problems.
+ for name in $(oc get pods -n openshift-template-service-broker | grep Error | awk '{ print $1 }' ); do oc -n openshift-template-service-broker delete po $name; done
+ for name in $(oc get pods -n kube-service-catalog | grep Error | awk '{ print $1 }' ); do oc -n kube-service-catalog delete po $name; done
+
+ - Other pods will fail with 'ImagePullBackOff' after a cluster crash. The problem is that ImageStreams populated by 'builds' will
+ not be recreated automatically. By default the OpenShift docker registry is stored on ephemeral disks and is lost on crash. The build should be
+ re-executed manually.
+ oc -n adei start-build adei
+
+ - Furthermore, after long outages the CronJobs will stop functioning. The reason can be found by analyzing '/var/log/messages' or, specifically,
+ systemctl status origin-master-controllers
+ it will contain something like:
+ 'Cannot determine if <namespace>/<cronjob> needs to be started: Too many missed start time (> 100). Set or decrease .spec.startingDeadlineSeconds or check clock skew.'
+ * The reason is that after 100 missed (or failed) launch periods the scheduler stops trying, to avoid excessive load. The remedy is to set 'startingDeadlineSeconds',
+ which tells the system that if a CronJob has failed to start within the allocated interval we stop trying until the next start period. Then the 100 misses are only
+ counted within the specified period, i.e. we should set the deadline below 'launch period / 100'.
+ https://github.com/kubernetes/kubernetes/issues/45825
+ * The running CronJobs can be easily patched with
+ oc -n adei patch cronjob/adei-autogen-update --patch '{ "spec": {"startingDeadlineSeconds": 120 }}'
+
+ - Sometimes there are rogue namespaces stuck in 'deleting' state. There are hundreds of possible reasons, but mainly
+ * Crash of both masters during population / destruction of OpenShift resources
+ * Running of 'oc adm diagnostics'
+ It is unclear how to remove them manually, but it seems that if we run an
+ * OpenShift upgrade, the namespaces are gone (but there could be a bunch of new problems).
+ * ... I don't know if install, etc. may cause the trouble...
+
+ - There are also rogue pods (mainly due to problems with unmounting lost storage), etc. If 'oc delete' does not
+ work for a long time, it is worth
+ * Determining the host running the failed pod with 'oc get pods -o wide'
+ * Going to the host, killing the processes and stopping the container using docker commands
+ * Looking in '/var/lib/origin/openshift.local.volumes/pods' for the remnants of the container
+ - This can be done with 'find . -name heketi*' or something similar...
+ - There could be problematic mounts which can be freed with a lazy umount
+ - The folders of removed pods may (and should) be removed.
+
+ - Looking into '/var/log/messages', it is sometimes possible to spot various errors like
+ * Orphaned pod "212074ca-1d15-11e8-9de3-525400225b53" found, but volume paths are still present on disk.
+ The volumes can be removed in '/var/lib/origin/openshift.local.volumes/pods' on the corresponding node
+ * PodSandbox "aa28e9c7605cae088838bb4c9b92172083680880cd4c085d93cbc33b5b9e8910" from runtime service failed: ...
+ - We can find and remove the corresponding container (the short id is just the first letters of the long id)
+ docker ps -a | grep aa28e9c76
+ docker rm <id>
+ - We can also just destroy all containers which are not running (it will actually try to remove all of them,
+ but only an error message will be printed for the running ones); see also the sketch below
+ docker ps -aq --no-trunc | xargs docker rm
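
A hedged cleanup sketch for the orphaned-pod case described above (run on the affected node;
the pod UID is the example value taken from the log message, double-check before removing anything):

    #!/bin/bash
    # release leftovers of an orphaned pod and sweep dead containers
    POD_UID=212074ca-1d15-11e8-9de3-525400225b53            # example UID from the log line above
    DIR=/var/lib/origin/openshift.local.volumes/pods/$POD_UID
    # lazily unmount anything still mounted below the pod directory, then remove it
    grep "$DIR" /proc/mounts | awk '{print $2}' | xargs -r -n1 umount -l
    rm -rf "$DIR"
    # remove containers that are no longer running (errors for running ones are harmless)
    docker ps -aq --no-trunc | xargs -r docker rm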
+
+
+Storage
+=======
+ - Running a lot of pods may exhaust the available storage. It is worth checking that
+ * There is enough Docker storage for containers (LVM)
+ * There is enough heketi storage for dynamic volumes (LVM)
+ * The root file system on the nodes still has space for logs, etc.
+ This is particularly a problem for Ansible-run virtual machines. The system disk is stored
+ under '/root/VirtualBox VMs' and, unlike the second hard drive, is not cleaned/destroyed on 'vagrant
+ destroy'. So, it should be cleaned manually.
+
+ - Problems with pvc's can be evaluated by running
+ oc -n openshift-ansible-service-broker describe pvc etcd
+ Furthermore, it is worth looking in the folder with volume logs. For each 'pv' it stores subdirectories
+ for the pods executed on this host which mount this volume, and it holds the logs for these pods.
+ /var/lib/origin/plugins/kubernetes.io/glusterfs/
+
+ - Heketi is problematic.
+ * It is worth checking that the topology is fine and all nodes are up.
+ heketi-cli -s http://heketi-storage-glusterfs.openshift.suren.me --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)"
+ - Furthermore, the heketi gluster volumes may be started, but with multiple bricks offline. This can
+ be checked with
+ gluster volume status <vol> detail
+ * If not all bricks are online, it is likely enough to just restart the volume (see the sketch below)
+ gluster volume stop <vol>
+ gluster volume start <vol>
+ * This may break services depending on the provisioned 'pv', like 'openshift-ansible-service-broker/asb-etcd'
+
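
A hedged helper for the bricks-offline case above: check each gluster volume and restart the
ones reporting an offline brick (assumes it is run inside a gluster pod, e.g. via 'oc rsh';
scripts/gluster.sh in this repository automates similar checks):

    #!/bin/bash
    # restart gluster volumes that report at least one offline brick
    for vol in $(gluster volume list); do
        if gluster volume status "$vol" detail | grep -q 'Online.*: N'; then
            echo "restarting $vol"
            gluster --mode=script volume stop "$vol"
            gluster --mode=script volume start "$vol"
        fi
    done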
diff --git a/docs/upgrade.txt b/docs/upgrade.txt
new file mode 100644
index 0000000..b4f22d6
--- /dev/null
+++ b/docs/upgrade.txt
@@ -0,0 +1,64 @@
+Upgrade
+-------
+ - The 'upgrade' may break things, causing long cluster outages, or may even require a complete re-install.
+ Currently, I found a problem with 'kube-service-catalog', but I am not sure the problems are limited to it.
+ Furthermore, we are currently using the 'latest' tag for several docker images (heketi is an example of a critical
+ service on the 'latest' tag). An update may break things.
+
+kube-service-catalog
+--------------------
+ - Update of 'kube-service-catalog' breaks OpenShift health check
+ curl -k https://apiserver.kube-service-catalog.svc/healthz
+ It complains about 'etcd'. The specific etcd check
+ curl -k https://apiserver.kube-service-catalog.svc/healthz/etcd
+ reports that all servers are unreachable.
+
+ - In fact etcd is working and the cluster is mostly functional. Occasionally, it may suffer from the bug
+ described here:
+ https://github.com/kubernetes/kubernetes/issues/47131
+ The 'oc' queries become extremely slow and the healthz service reports that there are too many connections.
+ Killing the 'kube-service-catalog/apiserver' helps for a while, but the problem returns occasionally.
+
+ - The information below is an attempt to understand the reason. In fact, it is a list of what
+ is NOT the reason. The only solution found is to prevent the update of 'kube-service-catalog' by setting
+ openshift_enable_service_catalog: false
+
+ - The problem only occurs if the 'openshift_service_catalog' role is executed. It results in some
+ miscommunication of 'apiserver' and/or 'controller-manager' with etcd. Still, the cluster is
+ operational, so the connection is not completely lost, but it is not working as expected in some
+ circumstances.
+
+ - There are no significant changes. Exactly the same docker images are installed. The only change in
+ '/etc' is the updated certificates used by 'apiserver' and 'controller-manager'.
+ * The certificates are located in '/etc/origin/service-catalog/' on the first master server.
+ 'oc adm ca' is used for generation. However, the certificates in this folder are not used directly. They
+ are merely temporary files used to generate 'secrets/service-catalog-ssl', which is used by
+ 'apiserver' and 'controller-manager'. The provisioning code is in:
+ openshift-ansible/roles/openshift_service_catalog/tasks/generate_certs.yml
+ it can't be disabled completely as the registered 'apiserver_ca' variable is used in install.yml, but
+ the actual generation can be skipped and the old files re-used to generate the secret.
+ * I have tried to modify the role to keep the old certificates. The healthz check was still broken afterwards.
+ So, this update is not the problem (or at least not the sole problem).
+
+ - The 'etcd' cluster seems OK. On all nodes, the etcd can be verified using
+ etcdctl3 member list
+ * The last command is actually a bash alias which executes
+ ETCDCTL_API=3 /usr/bin/etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints https://`hostname`:2379 member list
+ Actually, etcd serves two ports: 2379 (clients) and 2380 (peers). One idea was that maybe the
+ second port has problems. I tried changing 2379 to 2380 in the command above and it failed.
+ However, it does not work either when the cluster is in a healthy state.
+ * Another idea was that the certificates are re-generated for wrong IPs/names and, hence, certificate validation
+ fails. Or that the originally generated CA is registered with etcd. This is certainly not the (only) issue,
+ as the problem persists even if we keep the certificates intact. However, I also verified that the newly generated
+ certificates are completely similar to the old ones and contain the correct hostnames inside.
+ * The last idea was that 'asb-etcd' is actually broken. It complains
+ 2018-03-07 20:54:48.791735 I | embed: rejected connection from "127.0.0.1:43066" (error "tls: failed to verify client's certificate: x509: certificate signed by unknown authority", ServerName "")
+ However, the same error is present in the log directly after install, while the cluster is completely
+ healthy.
+
+ - Networking also seems not to be an issue. The configurations during install and upgrade are exactly the same.
+ All names are defined in /etc/hosts. Furthermore, the names in /etc/hosts are resolved (and reverse-resolved)
+ by the provided dnsmasq server, i.e. ipeshift1 resolves to 192.168.13.1 using nslookup and 192.168.13.1 resolves
+ back to ipeshift1. So, the configuration is indistinguishable from a proper one with properly configured DNS.
+
+ \ No newline at end of file
diff --git a/group_vars/OSEv3.yml b/group_vars/OSEv3.yml
index d896677..20bfece 100644
--- a/group_vars/OSEv3.yml
+++ b/group_vars/OSEv3.yml
@@ -1,14 +1,8 @@
### Deployment Type
openshift_deployment_type: origin
openshift_master_cluster_method: "native"
-#openshift_release: "v1.5"
openshift_release: "v3.7.1"
-#openshift_release: "v3.7"
-#openshift_image_tag: "v1.5.0-rc.0"
#openshift_image_tag: "v3.7.1"
-#openshift_pkg_version=-3.7.0
-#openshift_hosted_metrics_deployer_version: "v1.5.0-rc.0"
-#openshift_hosted_metrics_deployer_version: "v3.7.1"
#containerized: true
containerized: false
@@ -18,9 +12,32 @@ os_firewall_use_firewalld: true
#enable_excluders: false
#enable_docker_excluder: false
+### Versions
+#system packages
+#etcd_version="3.1.0"
+#docker_version="1.12.1"
+
+#for some package only latest is available
+#openshift_pkg_version=-3.7.0
+#openshift_cockpit_deployer_version=latest
+#openshift_metrics_image_prefix=docker.io/openshift/origin-
+#openshift_metrics_image_version=v3.7
+#openshift_logging_image_prefix=docker.io/openshift/origin-
+#openshift_logging_image_version=v3.7.0
+#openshift_service_catalog_image_prefix=docker.io/openshift/origin-
+#openshift_service_catalog_image_version=v3.7.1
+#template_service_broker_version='v3.7'
+#ansible_service_broker_image_prefix: ansibleplaybookbundle/
+#ansible_service_broker_registry_url: "registry.access.redhat.com"
+ansible_service_broker_etcd_image_tag: v3.2
+
+#test
+#openshift_enable_service_catalog: false
+
+
### Network & DNS configuration
-openshift_master_cluster_hostname: "{{ ands_openshift_cluster_fqdn }}"
+openshift_master_cluster_hostname: "{{ ands_use_inner_lb | ternary(ands_inner_lb_fqdn, ands_openshift_lb) }}"
openshift_master_cluster_public_hostname: "{{ ands_openshift_lb }}"
openshift_master_default_subdomain: "{{ ands_openshift_subdomain | default(ands_openshift_lb) }}"
openshift_master_ingress_ip_network_cidr: "{{ ands_openshift_ingress_network }}"
@@ -30,8 +47,8 @@ openshift_master_ingress_ip_network_cidr: "{{ ands_openshift_ingress_network }}"
# we may need to put conditionals here (except _ip). Currently values are set to '' if undefined (OpenShift uses None, which is equivalent in ansible)
openshift_ip: "{{ ands_openshift_ip }}"
openshift_public_ip: "{{ ands_openshift_public_ip }}"
-openshift_hostname: "{{ ands_openshift_fqdn }}"
-openshift_public_hostname: "{{ ands_openshift_public_fqdn }}"
+openshift_hostname: "{{ ands_openshift_set_hostname | ternary(ands_openshift_fqdn, ands_none) }}"
+openshift_public_hostname: "{{ ands_openshift_set_public_hostname | ternary(ands_openshift_public_fqdn, ands_none) }}"
#Check configuration to fight dynamic IPs
@@ -68,10 +85,35 @@ openshift_docker_log_options: [ max-size=2m, max-file=3 ]
openshift_docker_options: --log-driver json-file
#openshift_docker_options: --log-opt max-size=2m --log-opt max-file=3
+### Registry
+openshift_hosted_registry_storage_kind: glusterfs
+openshift_hosted_registry_storage_class: glusterfs-storage
+openshift_hosted_registry_storage_volume_size: "{{ ands_registry_volume_size }}"
+
+# By default dynamic provisioning is not used. The 'openshift_persistent_volumes' role creates a pvc/pv pair if the following
+# variables are set. The volumes are called 'registry-claim' and 'registry-volume'. The 'openshift_storage_glusterfs' role creates
+# the corresponding volume using heketi (this can't be disabled, so we patched it to be skipped if openshift_hosted_registry_storage_class is set).
+# Finally, the 'openshift_hosted' role creates the corresponding endpoints (this only happens if ..._ips are set).
+# The alternative is triggered if 'openshift_hosted_registry_storage_glusterfs_swap' is set. Then 'openshift_persistent_volumes' creates a
+# registry-glusterfs-claim/registry-volume pair and the 'openshift_hosted' role first tries to copy data from the current volume, but
+# this path is pretty much broken.
+# I have introduced 'openshift_hosted_registry_storage_class' and, if it is set, blocked creation of the above-said components, which
+# cannot be disabled with the variables below. Furthermore, I added a simple 'pvc' based on dynamic provisioning to 'openshift_persistent_volumes'.
+openshift_hosted_registry_storage_create_pv: false
+openshift_hosted_registry_storage_create_pvc: false
+
+# This is the alternative of going the standard way. All of the above should be commented out, then.
+# The volume size should be given as a plain number (without G) if we go without 'sc'.
+#openshift_hosted_registry_storage_glusterfs_path: openshift_registry
+#openshift_hosted_registry_storage_glusterfs_ips: "{{ openshift_storage_nodes }}"
+
### Dynamic Storage
openshift_storage_glusterfs_image: chsa/gluster-centos
openshift_storage_glusterfs_version: "{{ glusterfs_version }}"
-
+#Either 5 or 6 corresponds to latest
+#openshift_storage_glusterfs_heketi_version: 6
+#Only latest
+#openshift_storage_glusterfs_block_version: latest
#openshift_storage_glusterfs_version: '3.12.5' # Latest 3.10.1
#openshift_storage_glusterfs_is_native: True
@@ -113,6 +155,3 @@ openshift_install_examples: true
# Required for IPFailover
openshift_clock_enabled: true
-
-#This is required by OpenShift upgrade (may be something else)
-g_ssh_user: "{{ ansible_ssh_user }}"
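
The registry storage comments above can be cross-checked on a running cluster. A sketch, assuming the default object names mentioned in the comments ('registry-claim', the 'glusterfs-storage' storage class, and the registry living in the 'default' project):

    # Storage class provided by the glusterfs playbooks
    oc get storageclass glusterfs-storage

    # The claim created by 'openshift_persistent_volumes' should be bound via dynamic provisioning
    oc get pvc registry-claim -n default

    # The registry deployment should mount the bound volume
    oc get dc docker-registry -n default -o jsonpath='{.spec.template.spec.volumes}'
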
diff --git a/group_vars/ands.yml b/group_vars/ands.yml
index d81f11e..faacc40 100644
--- a/group_vars/ands.yml
+++ b/group_vars/ands.yml
@@ -1,10 +1,3 @@
-ands_configure_heketi: false
-
-# This should be here, the variables from the role are not propogated to hostvars
-#ands_master_id: "{{ ('masters' in group_names) | ternary(groups.masters.index(('masters' in group_names) | ternary(inventory_hostname, groups.masters[0])), -1) }}"
-ands_storage_hostname: "{{ ands_storage_network | default(false) | ternary(ands_storage_network | default('') | ipaddr(ands_host_id) | ipaddr('address'), ansible_fqdn) }}"
-
-
ands_repo_url: http://ufo.kit.edu/ands/repos
ands_repositories:
- name: ands-updates
diff --git a/group_vars/staging.yml b/group_vars/staging.yml
index 34bf7c7..00ec146 100644
--- a/group_vars/staging.yml
+++ b/group_vars/staging.yml
@@ -11,13 +11,9 @@ ands_openshift_public_network: 192.168.226.0/24
ands_openshift_ingress_network: 192.168.216.0/24
ands_inner_domain: ""
-#ands_inner_lb: true
-#ands_openshift_set_hostname: false
-
-ands_inner_lb: false
+ands_use_inner_lb: true
ands_openshift_set_hostname: true
-
#ands_ipfailover_interface: eth1
ands_ipfailover_vips: [141.52.64.28/23]
diff --git a/group_vars/testing.yml b/group_vars/testing.yml
index 72b2dba..f7e04cf 100644
--- a/group_vars/testing.yml
+++ b/group_vars/testing.yml
@@ -1,17 +1,20 @@
ands_storage_network: 192.168.12.0/24
ands_cluster_domain: ipe.kit.edu
-ands_openshift_lb: katrin.suren.me
-#ands_openshift_subdomain: katrin.suren.me
-ands_openshift_subdomain: apps.suren.me
-#ands_openshift_network: 192.168.26.0/24
+ands_hostname_template: ipekatrin
+ands_openshift_lb: kaas.kit.edu
+ands_openshift_subdomain: kaas.kit.edu
ands_openshift_network: 192.168.13.0/24
ands_openshift_public_network: 192.168.26.0/24
ands_openshift_ingress_network: 192.168.16.0/24
-ands_hostname_template: ipekatrin
+#ands_inner_domain: ""
+ands_openshift_set_hostname: false
+# if we provision inner_lb (default), we can turn it on and just re-run ands_network role (or maintain play)
+ands_use_inner_lb: false
+
-ands_ipfailover_interface: eth1
+#ands_ipfailover_interface: eth1
ands_ipfailover_vips: [141.52.64.15/23, 141.52.64.17/23]
katrin_openvpn_subnet_bits: 24
diff --git a/group_vars/virtual.yml b/group_vars/virtual.yml
index f76bafc..7a61a55 100644
--- a/group_vars/virtual.yml
+++ b/group_vars/virtual.yml
@@ -1,10 +1,10 @@
glusterfs_transport: tcp
ands_data_device: "/dev/sdb"
-ands_data_volume_size: "20G"
-ands_heketi_volume_size: "20G"
+ands_data_volume_size: "15G"
+ands_heketi_volume_size: "25G"
+ands_registry_volume_size: "5G"
docker_storage_device: "/dev/sdb"
docker_storage_vg: "ands"
-ands_host_id: "{{ ansible_hostname | regex_replace('^[\\w\\d]*\\w(\\d+)(\\.|$)', '\\1') }}"
diff --git a/inventories/staging.erb b/inventories/staging.erb
index dc3bcb2..aa9e935 100644
--- a/inventories/staging.erb
+++ b/inventories/staging.erb
@@ -1,46 +1,52 @@
[masters]
192.168.226.[1:2]
+[etcd]
+192.168.226.[1:3]
+
[simple_storage_nodes]
192.168.226.[3:3]
-[external_storage_servers]
-#192.168.226.[4:4]
-
[simple_nodes]
+[external_storage_servers]
+
[staging:children]
nodes
+new_nodes
+etcd
+new_etcd
external_storage_servers
vagrant
[virtual:children]
nodes
+new_nodes
+etcd
+new_etcd
external_storage_servers
-
[OSEv3:children]
masters
+new_masters
nodes
+new_nodes
etcd
+new_etcd
-[glusterfs:children]
-masters
-simple_storage_nodes
-
-[etcd:children]
+[nodes:children]
masters
simple_storage_nodes
+simple_nodes
[storage_nodes:children]
masters
+new_masters
simple_storage_nodes
+new_simple_storage_nodes
-[nodes:children]
-masters
-simple_storage_nodes
-simple_nodes
-
+[glusterfs:children]
+storage_nodes
#[lb]
#master1.example.com
@@ -49,9 +55,11 @@ simple_nodes
#[glusterfs_registry]
#192.168.10.14 glusterfs_ip=192.168.10.14 glusterfs_devices='[ "/dev/xvdc", "/dev/xvdd" ]'
-
[ands_servers:children]
nodes
+new_nodes
+etcd
+new_etcd
external_storage_servers
[ands_storage_servers:children]
@@ -63,3 +71,13 @@ ands_servers
[vagrant]
ipepdvcompute3.ipe.kit.edu vagrant_project=staging
+
+[new_masters]
+[new_etcd]
+[new_simple_storage_nodes]
+[new_simple_nodes]
+
+[new_nodes:children]
+new_masters
+new_simple_storage_nodes
+new_simple_nodes
diff --git a/inventories/testing.erb b/inventories/testing.erb
index b8b5f48..f9d27ae 100644
--- a/inventories/testing.erb
+++ b/inventories/testing.erb
@@ -1,50 +1,66 @@
[masters]
-ipekatrin[1:2].katrin.kit.edu
+ipekatrin[1:2].ipe.kit.edu
-[simple_storage_nodes]
-ipekatrin[3:3].katrin.kit.edu
-#ipetest.katrin.kit.edu ands_host_id=5
+[etcd]
+ipekatrin[1:3].ipe.kit.edu
-[external_storage_servers]
-#ipekatrin[4:4].katrin.kit.edu
+[simple_storage_nodes]
+ipekatrin[3:3].ipe.kit.edu
[simple_nodes]
-#ipekatrin[3:3].katrin.kit.edu
-#strnage_name.katrin.kit.edu ands_host_id=1
+#ipecompute1.katrin.kit.edu ands_host_id=4
+
+[external_storage_servers]
[testing:children]
nodes
+new_nodes
+etcd
+new_etcd
external_storage_servers
vagrant
[virtual:children]
nodes
+new_nodes
+etcd
+new_etcd
external_storage_servers
[OSEv3:children]
masters
+new_masters
nodes
+new_nodes
etcd
+new_etcd
-[glusterfs:children]
-masters
-simple_storage_nodes
-
-[etcd:children]
+[nodes:children]
masters
simple_storage_nodes
+simple_nodes
[storage_nodes:children]
masters
+new_masters
simple_storage_nodes
+new_simple_storage_nodes
-[nodes:children]
-masters
-simple_storage_nodes
-simple_nodes
+[glusterfs:children]
+storage_nodes
+
+#[lb]
+#master1.example.com
+#[nfs]
+#master1.example.com
+#[glusterfs_registry]
+#192.168.10.14 glusterfs_ip=192.168.10.14 glusterfs_devices='[ "/dev/xvdc", "/dev/xvdd" ]'
[ands_servers:children]
nodes
+new_nodes
+etcd
+new_etcd
external_storage_servers
[ands_storage_servers:children]
@@ -56,3 +72,13 @@ ands_servers
[vagrant]
ipepdvcompute3.ipe.kit.edu vagrant_project=testing
+
+[new_masters]
+[new_etcd]
+[new_simple_storage_nodes]
+[new_simple_nodes]
+
+[new_nodes:children]
+new_masters
+new_simple_storage_nodes
+new_simple_nodes
diff --git a/opts.sh b/opts.sh
index ac1962a..9cfaf86 100644
--- a/opts.sh
+++ b/opts.sh
@@ -80,9 +80,17 @@ apply() {
hosts: $group
remote_user: root
roles:
- - ands_facts
+ - { role: ands_facts }
+ - { role: ands_network, action: install_pre }
+- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml
- import_playbook: ../$action
+
+- name: Common setup procedures
+ hosts: $group
+ remote_user: root
+ roles:
+ - { role: ands_network, action: install_post }
END
playbook="playbooks/tmp_play.yml"
clean="playbooks/tmp_play.*"
diff --git a/playbooks/ands-gluster-ganesha.yml b/playbooks/ands-gluster-ganesha.yml
index 586dd07..a347c4f 100644
--- a/playbooks/ands-gluster-ganesha.yml
+++ b/playbooks/ands-gluster-ganesha.yml
@@ -7,6 +7,7 @@
- name: Configure GlusterFS cluster
hosts: masters, new_masters
roles:
+ - { role: ands_network, action: ganesha }
- { role: glusterfs, action: ganesha }
- { role: ganesha }
vars:
diff --git a/playbooks/ands-gluster.yml b/playbooks/ands-gluster.yml
index 8aa30fc..6e71b55 100644
--- a/playbooks/ands-gluster.yml
+++ b/playbooks/ands-gluster.yml
@@ -3,7 +3,6 @@
roles:
- role: ands_facts
-
- name: Configure GlusterFS cluster
hosts: ands_servers
roles:
@@ -13,3 +12,8 @@
glusterfs_servers: "{{ ands_storage_servers }}"
glusterfs_bricks_path: "{{ ands_data_path }}/glusterfs"
glusterfs_domains: "{{ ands_storage_domains }}"
+
+- name: Configure Backup
+ hosts: ands_servers
+ roles:
+ - role: ands_backup
diff --git a/playbooks/ands-prepare.yml b/playbooks/ands-prepare.yml
index d198ec0..239d292 100644
--- a/playbooks/ands-prepare.yml
+++ b/playbooks/ands-prepare.yml
@@ -11,10 +11,15 @@
- name: Common setup procedures
hosts: ands
roles:
- - role: common
- - role: firewall
+ - { role: ands_common }
+ - { role: firewall }
- { role: ands_network, action: common }
+- name: Setup NTP
+ hosts: ands:!virtual
+ roles:
+ - role: ntp
+
- name: Keepalived service
hosts: masters
roles:
@@ -22,7 +27,7 @@
#OpenVPN started before Origin-node causes problems
#- name: OpenVPN service
-# hosts: nodes, new_nodes
+# hosts: nodes:new_nodes
# roles:
# - role: openvpn
# vars:
@@ -36,7 +41,7 @@
- role: ands_storage
- name: Docker setup
- hosts: nodes, new_nodes
+ hosts: nodes:new_nodes
roles:
- role: docker
vars:
diff --git a/playbooks/openshift-add-masters.yml b/playbooks/openshift-add-masters.yml
index 99672d0..6878137 100644
--- a/playbooks/openshift-add-masters.yml
+++ b/playbooks/openshift-add-masters.yml
@@ -2,7 +2,7 @@
hosts: nodes:new_nodes
roles:
- { role: ands_facts }
- - { role: common, os_update: true }
+ - { role: ands_common, os_update: true }
- { role: ands_network, action: install_pre }
# etcd will provisioned as well if node is listed in new_etcd
diff --git a/playbooks/openshift-add-nodes.yml b/playbooks/openshift-add-nodes.yml
index c788e12..3d3efc4 100644
--- a/playbooks/openshift-add-nodes.yml
+++ b/playbooks/openshift-add-nodes.yml
@@ -2,7 +2,7 @@
hosts: nodes:new_nodes
roles:
- { role: ands_facts }
- - { role: common, os_update: true }
+ - { role: ands_common, os_update: true }
- { role: ands_network, action: install_pre }
# I am not sure if etcd will be automatic here. If not, we may need to run etcd scaleup afterwards
diff --git a/playbooks/openshift-deploy-cluster.yml b/playbooks/openshift-deploy-cluster.yml
deleted file mode 120000
index 2a18fca..0000000
--- a/playbooks/openshift-deploy-cluster.yml
+++ /dev/null
@@ -1 +0,0 @@
-../anslib/openshift-ansible/playbooks/deploy_cluster.yml \ No newline at end of file
diff --git a/playbooks/openshift-install-service-catalog.yml b/playbooks/openshift-install-service-catalog.yml
new file mode 100644
index 0000000..b6c0a10
--- /dev/null
+++ b/playbooks/openshift-install-service-catalog.yml
@@ -0,0 +1,13 @@
+- name: Configure cluster hosts names
+ hosts: nodes
+ roles:
+ - { role: ands_facts }
+ - { role: ands_network, action: install_pre }
+
+- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml
+- import_playbook: ../anslib/openshift-ansible/playbooks/openshift-service-catalog/config.yml
+
+- name: Configure cluster hosts names
+ hosts: nodes
+ roles:
+ - { role: ands_network, action: install_post }
diff --git a/playbooks/openshift-redeploy-certificates.yml b/playbooks/openshift-redeploy-certificates.yml
index f812372..682468f 120000..100644
--- a/playbooks/openshift-redeploy-certificates.yml
+++ b/playbooks/openshift-redeploy-certificates.yml
@@ -1 +1,13 @@
-../anslib/openshift-ansible/playbooks/redeploy-certificates.yml \ No newline at end of file
+- name: Configure cluster hosts names
+ hosts: nodes
+ roles:
+ - { role: ands_facts }
+ - { role: ands_network, action: install_pre }
+
+- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml
+- import_playbook: ../anslib/openshift-ansible/playbooks/redeploy-certificates.yml
+
+- name: Configure cluster hosts names
+ hosts: nodes
+ roles:
+ - { role: ands_network, action: install_post }
diff --git a/playbooks/openshift-setup-project.yml b/playbooks/openshift-setup-project.yml
index 6150cdf..a4666e3 100644
--- a/playbooks/openshift-setup-project.yml
+++ b/playbooks/openshift-setup-project.yml
@@ -1,5 +1,5 @@
- name: Analyze Ands configuration
- hosts: masters
+ hosts: nodes
roles:
- { role: ands_facts }
diff --git a/playbooks/openshift-setup-projects.yml b/playbooks/openshift-setup-projects.yml
index 689ecb4..164f91c 100644
--- a/playbooks/openshift-setup-projects.yml
+++ b/playbooks/openshift-setup-projects.yml
@@ -1,5 +1,5 @@
- name: Analyze Ands configuration
- hosts: masters
+ hosts: nodes
roles:
- { role: ands_facts }
diff --git a/playbooks/openshift-setup-security.yml b/playbooks/openshift-setup-security.yml
index f576ba5..ba96354 100644
--- a/playbooks/openshift-setup-security.yml
+++ b/playbooks/openshift-setup-security.yml
@@ -1,5 +1,5 @@
- name: Analyze Ands configuration
- hosts: masters
+ hosts: nodes
roles:
- { role: ands_facts }
diff --git a/playbooks/openshift-setup-users.yml b/playbooks/openshift-setup-users.yml
index f54a806..998dd59 100644
--- a/playbooks/openshift-setup-users.yml
+++ b/playbooks/openshift-setup-users.yml
@@ -1,5 +1,5 @@
- name: Analyze Ands configuration
- hosts: masters
+ hosts: nodes
roles:
- { role: ands_facts }
diff --git a/playbooks/openshift-upgrade.yml b/playbooks/openshift-upgrade.yml
index f2680ab..dd60639 100644
--- a/playbooks/openshift-upgrade.yml
+++ b/playbooks/openshift-upgrade.yml
@@ -3,5 +3,16 @@
roles:
- { role: ands_facts }
# - { role: ands_openshift, subrole: hostnames }
+ - { role: ands_network, action: install_pre }
+- import_playbook: ../anslib/openshift-ansible/playbooks/prerequisites.yml
+
+# Updating service catalog breaks etcd health checks (see docs/upgrade.txt)
- import_playbook: ../anslib/openshift-ansible/playbooks/byo/openshift-cluster/upgrades/v3_7/upgrade.yml
+ vars:
+ openshift_enable_service_catalog: false
+
+- name: Configure cluster hosts names
+ hosts: nodes
+ roles:
+ - { role: ands_network, action: install_post }
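
After an upgrade with the service catalog disabled, the health checks discussed in docs/upgrade.txt can be re-verified from the first master. A sketch (cluster-admin login assumed):

    # API-server view of the etcd backend
    oc get --raw /healthz
    oc get --raw /healthz/etcd

    # State of the service-catalog components left untouched by the upgrade
    oc get pods -n kube-service-catalog
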
diff --git a/roles/ands_backup/defaults/main.yml b/roles/ands_backup/defaults/main.yml
new file mode 100644
index 0000000..33d1ff1
--- /dev/null
+++ b/roles/ands_backup/defaults/main.yml
@@ -0,0 +1,9 @@
+ands_script_path: "/opt/scripts"
+
+ands_backup_frequency: "17 */4 * * *"
+ands_backup_volume: "{{ ands_paths.provision }}"
+ands_backup_path: "{{ ands_backup_volume }}/backup"
+ands_backup_clean_minutes: "720"
+ands_borg_path: "{{ ands_backup_volume }}/borg"
+ands_borg_args: "-C zlib,6 -x"
+ands_borg_prune: "--keep-daily=7 --keep-weekly=4 --keep-monthly=6 --keep-within 1w"
diff --git a/roles/ands_backup/tasks/main.yml b/roles/ands_backup/tasks/main.yml
new file mode 100644
index 0000000..16a8ec3
--- /dev/null
+++ b/roles/ands_backup/tasks/main.yml
@@ -0,0 +1,29 @@
+- name: Install required packages
+ package: name={{item}} state=present
+ with_items:
+ - borgbackup
+ - heketi-client
+
+- name: Create scripts directory
+ file: path="{{ ands_script_path }}" state=directory
+
+- name: Populate backup script
+ template: src=backup.sh.j2 dest="{{ ands_script_path }}/ands_backup.sh" owner=root group=root mode=0755
+
+- name: Populate cron job
+ template: src=backup.cron.j2 dest="/etc/cron.d/9ands_backup" owner=root group=root mode=0644
+
+
+- name: Check if backup volume is mounted
+ command: mountpoint -q "{{ ands_backup_volume }}"
+
+
+- block:
+ - name: Check if borg is already initialized
+ stat: path="{{ ands_borg_path }}/config"
+ register: borg_stat_res
+
+ - name: Initialize borg repository
+ shell: "borg init {{ ands_borg_path }} --encryption=none"
+ when: not borg_stat_res.stat.exists
+ run_once: true
diff --git a/roles/ands_backup/templates/backup.cron.j2 b/roles/ands_backup/templates/backup.cron.j2
new file mode 100644
index 0000000..5c017b8
--- /dev/null
+++ b/roles/ands_backup/templates/backup.cron.j2
@@ -0,0 +1,4 @@
+SHELL=/bin/bash
+PATH=/sbin:/bin:/usr/sbin:/usr/bin
+MAILTO=root
+{{ ands_backup_frequency }} root /bin/bash {{ ands_script_path }}/ands_backup.sh
diff --git a/roles/ands_backup/templates/backup.sh.j2 b/roles/ands_backup/templates/backup.sh.j2
new file mode 100755
index 0000000..74fff85
--- /dev/null
+++ b/roles/ands_backup/templates/backup.sh.j2
@@ -0,0 +1,72 @@
+#! /bin/bash
+
+date=$(date -u "+%Y%m%d_%H%M%S")
+hostname=$(hostname)
+
+volume_path="{{ ands_backup_volume }}"
+host_path="{{ ands_backup_path }}/${hostname}"
+backup_path="${host_path}/${date}"
+borg_path="{{ ands_borg_path }}"
+
+borg_args="{{ ands_borg_args }}"
+borg_prune_args="{{ ands_borg_prune }}"
+
+etcdctl3 () {
+ ETCDCTL_API=3 /usr/bin/etcdctl --cert /etc/etcd/peer.crt --key /etc/etcd/peer.key --cacert /etc/etcd/ca.crt --endpoints "https://${hostname}:2379" ${@}
+}
+
+
+check=$(df | awk '{ print $6 }' | grep -P "^${volume_path}$")
+[ $? -ne 0 -o -z "$check" ] && { echo "The volume $volume_path is not mounted. Skipping..." ; exit 1 ; }
+
+[ -d "$backup_path" ] && { echo "Something wrong, path $backup_path already exists..." ; exit 1 ; }
+
+# Create the per-host backup directory
+mkdir -p "$backup_path" || { echo "Can't create ${backup_path}" ; exit 1 ; }
+
+{% if 'masters' in group_names %}
+# etcd
+mkdir -p "$backup_path/etcd" || { echo "Can't create ${backup_path}/etcd" ; exit 1 ; }
+etcdctl3 --endpoints="192.168.213.1:2379" snapshot save "$backup_path/etcd/snapshot.db" > /dev/null
+
+# heketi
+mkdir -p "$backup_path/heketi" || { echo "Can't create ${backup_path}/heketi" ; exit 1 ; }
+heketi-cli -s http://heketi-storage.glusterfs.svc.cluster.local:8080 --user admin --secret "$(oc get secret heketi-storage-admin-secret -n glusterfs -o jsonpath='{.data.key}' | base64 -d)" topology info --json > "$backup_path/heketi/topology.json"
+{% endif %}
+
+
+{% if 'ands_storage_servers' in group_names %}
+# Gluster
+#mkdir -p "$backup_path/gluster" || { echo "Can't create ${backup_path}/gluster" ; exit 1 ; }
+#(
+# cd /var/lib/
+# tar cjf $backup_path/gluster/var_lib_glusterd.tar.bz2 glusterd
+#)
+{% endif %}
+
+# etc
+#mkdir -p "$backup_path/etc" || { echo "Can't create ${backup_path}/etc" ; exit 1 ; }
+#(
+# cd /
+# tar cjf $backup_path/etc/etc.tar.bz2 etc --exclude=selinux --exclude=udev --exclude=bash_completion.d --exclude=etc/pki --exclude=etc/services --exclude=postfix --exclude=mc
+#)
+
+if [ -d "$borg_path" ]; then
+ borg_glusterd="/var/lib/glusterd"
+ borg_etc="/etc -e */etc/selinux -e */etc/udev -e */etc/bash_completion.d -e */etc/pki -e */etc/services -e */etc/postfix -e */etc/mc"
+
+{% if 'masters' in group_names %}
+ borg_list="* ${borg_glusterd} ${borg_etc}"
+{% elif 'ands_storage_servers' in group_names %}
+ borg_list="${borg_glusterd} ${borg_etc}"
+{% else %}
+ borg_list="${borg_etc}"
+{% endif %}
+
+ (
+ cd ${backup_path}
+ borg create ${borg_args} "$borg_path::${hostname}-${date}" $borg_list
+ borg prune ${borg_prune_args} --prefix "${hostname}-" "$borg_path"
+ )
+ find "$host_path" -maxdepth 1 -type d -mmin +{{ands_backup_clean_minutes}} -print0 | xargs -0 rm -rf
+fi
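
Once the cron job has run, the results can be inspected by hand. A sketch, assuming the paths configured in the role defaults (adjust them to the actual ands_backup_volume):

    borg_path=/path/to/provision/borg              # ands_borg_path (example)
    host_path=/path/to/provision/backup/$(hostname)

    # Archives created by 'borg create' and kept by 'borg prune'
    borg list "$borg_path"

    # Sanity-check the newest etcd snapshot (masters only)
    latest=$(ls -1d "$host_path"/*/ | tail -n 1)
    ETCDCTL_API=3 etcdctl snapshot status "${latest}etcd/snapshot.db"
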
diff --git a/roles/common/README b/roles/ands_common/README
index c8bd679..c8bd679 100644
--- a/roles/common/README
+++ b/roles/ands_common/README
diff --git a/roles/common/default/main.yml b/roles/ands_common/default/main.yml
index d355d15..d355d15 100644
--- a/roles/common/default/main.yml
+++ b/roles/ands_common/default/main.yml
diff --git a/roles/common/tasks/main.yml b/roles/ands_common/tasks/main.yml
index fdd7246..e9196ad 100644
--- a/roles/common/tasks/main.yml
+++ b/roles/ands_common/tasks/main.yml
@@ -30,6 +30,7 @@
- python-rhsm-certificates
- glusterfs-fuse
- telnet
+ - yum-plugin-versionlock
# We always update on first install and if requested
- name: Update CentOS
diff --git a/roles/ands_facts/defaults/main.yml b/roles/ands_facts/defaults/main.yml
index fc3fcfd..c74984e 100644
--- a/roles/ands_facts/defaults/main.yml
+++ b/roles/ands_facts/defaults/main.yml
@@ -3,6 +3,11 @@ ands_none: "{{ None }}"
ands_configure_heketi: false
ands_data_device_default_threshold: 10
+ands_host_id: "{{ ansible_hostname | regex_replace('^[\\w\\d]*\\w(\\d+)(\\.|$)', '\\1') }}"
+# We need to add it to set_fact if enabled
+#ands_master_id: "{{ ('masters' in group_names) | ternary(groups.masters.index(('masters' in group_names) | ternary(inventory_hostname, groups.masters[0])), -1) }}"
+
+ands_storage_hostname: "{{ ands_storage_network | default(false) | ternary(ands_storage_network | default('') | ipaddr(ands_host_id) | ipaddr('address'), ansible_fqdn) }}"
ands_storage_servers: "{{ groups.ands_storage_servers | map('extract', hostvars, 'ands_storage_hostname') | list }}"
#openshift_storage_nodes: "{{ groups.storage_nodes | map('extract', hostvars, 'ands_storage_hostname') | list }}"
@@ -23,7 +28,8 @@ ands_default_ip: "{{ ansible_default_ipv4.address }}"
ands_openshift_default_ip: "{{ ands_resolve_public_ip | default(false) | ternary(ands_default_ip, ands_none) }}"
ands_openshift_default_hostname: "{{ (ands_hostname_template is defined) | ternary(ands_hostname_template ~ ands_host_id, ansible_hostname) }}"
-ands_inner_lb: false
+ands_inner_lb: true
+ands_use_inner_lb: false
ands_inner_lb_id: 254
ands_inner_lb_hostname: 'ands-lb'
diff --git a/roles/ands_facts/tasks/main.yml b/roles/ands_facts/tasks/main.yml
index 6b28683..bd23e13 100644
--- a/roles/ands_facts/tasks/main.yml
+++ b/roles/ands_facts/tasks/main.yml
@@ -1,14 +1,14 @@
---
+# Here we set 'openshift_hostname', 'openshift_ip' and other variables
+- name: "Configuring network facts"
+ include_tasks: "network.yml"
# The variables accessed through 'hostvars' should be set as facts
# Here we set 'ands_storage_servers' and other variables
- name: "Configuring storage facts"
include_tasks: "storage.yml"
-# Here we set 'openshift_hostname', 'openshift_ip' and other variables
-- name: "Configuring network facts"
- include_tasks: "network.yml"
-
- name: "Confirm that ands facts are configured"
set_fact:
+ ands_none: "{{ ands_none }}"
ands_facts_configured: true
diff --git a/roles/ands_facts/tasks/network.yml b/roles/ands_facts/tasks/network.yml
index 1d0248f..808d7b6 100644
--- a/roles/ands_facts/tasks/network.yml
+++ b/roles/ands_facts/tasks/network.yml
@@ -1,24 +1,34 @@
+- name: Set some facts
+ set_fact:
+ ands_host_id: "{{ ands_host_id }}"
+
- name: Set network facts
set_fact:
ands_cluster_domain: "{{ ands_cluster_domain }}"
ands_cluster_dot_domain: ".{{ ands_cluster_domain }}"
ands_inner_domain: "{{ ands_inner_domain }}"
ands_inner_dot_domain: "{{ (ands_inner_domain == ands_none) | ternary('', '.' ~ ands_inner_domain) }}"
+ ands_inner_lb: "{{ ands_inner_lb }}"
+ ands_use_inner_lb: "{{ ands_use_inner_lb }}"
ands_inner_lb_ip: "{{ ands_openshift_network | ipaddr(ands_inner_lb_id) | ipaddr('address') }}"
ands_inner_lb_hostname: "{{ ands_inner_lb_hostname }}"
ands_openshift_ip: "{{ ands_openshift_network | ipaddr(ands_host_id) | ipaddr('address') }}"
- ands_openshift_hostname: "{{ ands_openshift_hostname | default(ands_openshift_set_hostname | ternary(ands_openshift_default_hostname, ands_none)) }}"
+ ands_openshift_hostname: "{{ ands_openshift_hostname | default(ands_openshift_default_hostname) }}"
ands_openshift_public_ip: "{{ (ands_openshift_public_network is defined) | ternary( ands_openshift_public_network | ipaddr(ands_host_id) | ipaddr('address'), ands_openshift_default_ip) }}"
- ands_openshift_public_hostname: "{{ ands_openshift_public_hostname | default(ands_openshift_set_public_hostname | ternary(ands_openshift_default_hostname, ands_none)) }}"
+ ands_openshift_public_hostname: "{{ ands_openshift_public_hostname | default(ands_openshift_default_hostname) }}"
ands_storage_ip: "{{ ands_storage_network | default(ands_openshift_network) | ipaddr(ands_host_id) | ipaddr('address') }}"
ands_hostname_storage: "ands_storage{{ ands_host_id }}"
ands_hostname_openshift: "ands_openshift{{ ands_host_id }}"
+ ands_openshift_set_hostname: "{{ ands_openshift_set_hostname }}"
+ ands_openshift_set_public_hostname: "{{ ands_openshift_set_public_hostname }}"
+ ands_storage_hostname: "{{ ands_storage_hostname }}"
- name: Set more network facts
set_fact:
ands_openshift_public_fqdn: "{{ (ands_openshift_public_hostname == ands_none) | ternary(ands_none, ands_openshift_public_hostname ~ ands_cluster_dot_domain ) }}"
ands_openshift_fqdn: "{{ (ands_openshift_hostname == ands_none) | ternary(ands_none, ands_openshift_hostname ~ ands_inner_dot_domain ) }}"
- ands_openshift_cluster_fqdn: "{{ ands_inner_lb | ternary(ands_inner_lb_hostname ~ ands_inner_dot_domain, ands_openshift_lb) }}"
+ ands_inner_lb_fqdn: "{{ ands_inner_lb_hostname ~ ands_inner_dot_domain }}"
+ ands_storage_servers: "{{ ands_storage_servers }}"
- name: "Detect inner network interface"
include_tasks: "find_interface_by_ip.yml"
diff --git a/roles/ands_facts/tasks/storage.yml b/roles/ands_facts/tasks/storage.yml
index cf995a0..888ad70 100644
--- a/roles/ands_facts/tasks/storage.yml
+++ b/roles/ands_facts/tasks/storage.yml
@@ -1,5 +1,9 @@
- include_vars: dir="vars"
+- name: Set facts
+ set_fact:
+ ands_configure_heketi: "{{ ands_configure_heketi }}"
+
- name: Detect Heketi
set_fact: ands_storage_domains="{{ ands_storage_domains | union([ands_heketi_domain]) }}"
when:
@@ -9,10 +13,6 @@
- name: Set some facts
set_fact:
- ands_storage_servers: "{{ ands_storage_servers }}"
-
-- name: Set some facts
- set_fact:
ands_data_vg: "{{ ands_data_vg }}"
when: ands_data_vg != ""
diff --git a/roles/ands_kaas/templates/50-kaas-pods.yml.j2 b/roles/ands_kaas/templates/50-kaas-pods.yml.j2
index 216dc01..ad1fc58 100644
--- a/roles/ands_kaas/templates/50-kaas-pods.yml.j2
+++ b/roles/ands_kaas/templates/50-kaas-pods.yml.j2
@@ -5,7 +5,7 @@ kind: Template
metadata:
name: {{ kaas_project }}-pods
annotations:
- descriptions: {{ kaas_project_config.description | default(kaas_project ~ "auto-generated pod template") }}
+ descriptions: {{ kaas_project_config.description | default(kaas_project ~ " auto-generated pod template") }}
objects:
{% for name, pod in kaas_project_pods.iteritems() %}
{% set pubkey = "kaas_" ~ name ~ "_pubkey" %}
@@ -14,6 +14,9 @@ objects:
{% if pod.variant is defined %}
{% set pod = pod[pod.variant] %}
{% endif %}
+ {% set sched = pod.sched | default({}) %}
+ {% set node_selector = (sched.selector is defined) | ternary(sched.selector, ands_default_node_selector | combine(sched.restrict | default({}))) %}
+
{% if pod.service is defined %}
- apiVersion: v1
kind: Service
@@ -68,10 +71,10 @@ objects:
metadata:
name: {{ pod.name | default(name) }}
spec:
- replicas: {{ ( pod.sched | default({})).replicas | default(1) }}
+ replicas: {{ ( sched | default({})).replicas | default(1) }}
revisionHistoryLimit: 2
strategy:
- type: {{ (pod.sched | default({})).strategy | default('Rolling') }}
+ type: {{ (sched | default({})).strategy | default('Rolling') }}
triggers:
- type: ConfigChange
selector:
@@ -82,11 +85,8 @@ objects:
labels:
name: {{ pod.name | default(name) }}
spec:
- {% if pod.selector is defined %}
- nodeSelector:
- {% for skey, sval in pod.selector.iteritems() %}
- {{ skey }}: "{{ sval }}"
- {% endfor %}
+ {% if node_selector | length > 0 %}
+ nodeSelector: {{ node_selector | to_json }}
{% endif %}
{% set mappings = (pod.images | json_query('[*].mappings') | length) %}
{% if mappings > 0 %}
diff --git a/roles/ands_network/tasks/common.yml b/roles/ands_network/tasks/common.yml
index 384029f..f2fda00 100644
--- a/roles/ands_network/tasks/common.yml
+++ b/roles/ands_network/tasks/common.yml
@@ -22,27 +22,18 @@
- nodes
- new_nodes
-- name: Configure all storage ips in /etc/hosts
- lineinfile: dest="/etc/hosts" line="{{ ip }} {{ hostname }}" regexp="{{ hostname }}" state="present"
- when:
- - hostvars[item]['ands_storage_network'] | default(ands_none) != ands_none
- - hostvars[item]['ands_facts_configured'] is defined
- vars:
- ip: "{{ hostvars[item]['ands_storage_ip'] }}"
- hostname: "{{ hostvars[item]['ands_hostname_storage'] }}"
- with_inventory_hostnames:
- - storage_nodes
- - new_storage_nodes
-
-
- name: Provision /etc/hosts to ensure that all master servers access the Master API on the loopback device
lineinfile: dest="/etc/hosts" line="127.0.0.1 {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="present"
when: ('masters' in group_names or 'new_masters' in group_names)
register: result
-- name: Provision /etc/hosts to ensure that all masters servers are accessing Master API on loopback device
+- name: Provision /etc/hosts with the load-balancer IP on non-master servers
lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip }} {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="present"
- when: (result | skipped) and (ands_inner_lb | default(false))
+ when: (result | skipped) and (ands_use_inner_lb | default(false))
+
+- name: Provision inner load-balancer hostname in /etc/hosts
+ lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip }} {{ ands_inner_lb_hostname }} {{ ands_inner_lb_fqdn }}" regexp=".*{{ ands_inner_lb_fqdn }}$" state="present"
+ when: openshift_master_cluster_hostname != ands_inner_lb_fqdn
- name: Register openshift_dns_ip in /etc/hosts
lineinfile: dest="/etc/hosts" line="{{ openshift_dns_ip }} openshift_dns_ip" regexp="openshift_dns_ip$" state="present"
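
What these tasks end up writing can be checked per host. A sketch, using the names from the testing group vars ('kaas.kit.edu' as the cluster hostname, 'ands-lb' as the inner load-balancer name):

    # Masters should resolve the cluster hostname to 127.0.0.1, other nodes to the inner LB IP
    getent hosts kaas.kit.edu

    # The inner load-balancer entry is added only when it differs from the cluster hostname
    getent hosts ands-lb
    grep openshift_dns_ip /etc/hosts
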
diff --git a/roles/ands_network/tasks/ganesha.yml b/roles/ands_network/tasks/ganesha.yml
new file mode 100644
index 0000000..0f77ca8
--- /dev/null
+++ b/roles/ands_network/tasks/ganesha.yml
@@ -0,0 +1,12 @@
+- name: Configure all storage ips in /etc/hosts
+ lineinfile: dest="/etc/hosts" line="{{ ip }} {{ hostname }}" regexp="{{ hostname }}" state="present"
+ when:
+ - hostvars[item]['ands_storage_network'] | default(ands_none) != ands_none
+ - hostvars[item]['ands_facts_configured'] is defined
+ vars:
+ ip: "{{ hostvars[item]['ands_storage_ip'] }}"
+ hostname: "{{ hostvars[item]['ands_hostname_storage'] }}"
+ with_inventory_hostnames:
+ - storage_nodes
+ - new_storage_nodes
+
diff --git a/roles/ands_network/tasks/install_post.yml b/roles/ands_network/tasks/install_post.yml
index 0bfef34..3f1e57c 100644
--- a/roles/ands_network/tasks/install_post.yml
+++ b/roles/ands_network/tasks/install_post.yml
@@ -6,4 +6,4 @@
lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip | default('') }} {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="{{ state }}"
when: ('masters' not in group_names and 'new_masters' not in group_names)
vars:
- state: "{{ ands_inner_lb | default(false) | ternary('present', 'absent') }}"
+ state: "{{ ands_use_inner_lb | default(false) | ternary('present', 'absent') }}"
diff --git a/roles/ands_network/tasks/maintain.yml b/roles/ands_network/tasks/maintain.yml
index a7af597..6fba5f2 100644
--- a/roles/ands_network/tasks/maintain.yml
+++ b/roles/ands_network/tasks/maintain.yml
@@ -6,4 +6,8 @@
lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip | default('') }} {{ openshift_master_cluster_hostname }}" regexp=".*{{ openshift_master_cluster_hostname }}$" state="{{ state }}"
when: ('masters' not in group_names and 'new_masters' not in group_names)
vars:
- state: "{{ ands_inner_lb | default(false) | ternary('present', 'absent') }}"
+ state: "{{ ands_use_inner_lb | default(false) | ternary('present', 'absent') }}"
+
+- name: Provision inner load-balancer hostname in /etc/hosts
+ lineinfile: dest="/etc/hosts" line="{{ ands_inner_lb_ip }} {{ ands_inner_lb_hostname }} {{ ands_inner_lb_fqdn }}" regexp=".*{{ ands_inner_lb_fqdn }}$" state="present"
+ when: openshift_master_cluster_hostname != ands_inner_lb_fqdn
diff --git a/roles/docker/defaults/main.yml b/roles/docker/defaults/main.yml
index f7b96f5..30b1ff8 100644
--- a/roles/docker/defaults/main.yml
+++ b/roles/docker/defaults/main.yml
@@ -4,3 +4,6 @@ docker_lv: "docker-pool"
docker_min_size: 100
docker_max_log_size: "2m"
docker_max_log_files: "3"
+
+# There are some problems with groups on 1.13
+docker_version: "-1.12*"
diff --git a/roles/docker/tasks/main.yml b/roles/docker/tasks/main.yml
index 0d040a9..c03d897 100644
--- a/roles/docker/tasks/main.yml
+++ b/roles/docker/tasks/main.yml
@@ -1,6 +1,20 @@
---
+#- name: Remove docker
+# yum: name="{{ item }}" state="absent"
+# with_items: [ docker, docker-client, docker-common ]
+
+- name: Remove versionlock from yum
+ command: yum versionlock delete docker docker-common docker-client
+ register: result
+ failed_when: false
+ changed_when: result | failed
+
- name: Ensure docker is installed
- yum: name="docker" state="present"
+ yum: name="docker{{ docker_version | default('') }}" state="{{ docker_version is defined | ternary('latest', 'present') }}"
+
+- name: Add versionlock to yum
+ command: yum versionlock add docker docker-common docker-client
+ when: docker_version is defined
- name: start docker
service: name="docker" state="started"
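
The same pin can be applied or inspected manually, which helps when debugging the lock. A sketch (requires the yum-plugin-versionlock package installed by ands_common):

    # Drop an existing lock, install the pinned release, re-lock it
    yum versionlock delete docker docker-common docker-client || true
    yum install -y 'docker-1.12*'
    yum versionlock add docker docker-common docker-client
    yum versionlock list | grep docker
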
diff --git a/roles/glusterfs/tasks/cfg/vols3.yml b/roles/glusterfs/tasks/cfg/vols3.yml
index d094797..d8ed728 100644
--- a/roles/glusterfs/tasks/cfg/vols3.yml
+++ b/roles/glusterfs/tasks/cfg/vols3.yml
@@ -3,6 +3,7 @@
gluster_volume:
state: present
name: "{{ name }}"
+ host: "{{ ands_storage_hostname }}"
cluster: "{{ domain_servers | join(',') }}"
replicas: "{{ domain_servers | length }}"
bricks: "{{ glusterfs_bricks_path }}/brick-{{ name }}"
diff --git a/roles/glusterfs/tasks/data/vols2.yml b/roles/glusterfs/tasks/data/vols2.yml
index d094797..d8ed728 100644
--- a/roles/glusterfs/tasks/data/vols2.yml
+++ b/roles/glusterfs/tasks/data/vols2.yml
@@ -3,6 +3,7 @@
gluster_volume:
state: present
name: "{{ name }}"
+ host: "{{ ands_storage_hostname }}"
cluster: "{{ domain_servers | join(',') }}"
replicas: "{{ domain_servers | length }}"
bricks: "{{ glusterfs_bricks_path }}/brick-{{ name }}"
diff --git a/roles/glusterfs/tasks/data/vols3.yml b/roles/glusterfs/tasks/data/vols3.yml
index 866480c..14c3763 100644
--- a/roles/glusterfs/tasks/data/vols3.yml
+++ b/roles/glusterfs/tasks/data/vols3.yml
@@ -3,6 +3,7 @@
gluster_volume:
state: present
name: "{{ name }}"
+ host: "{{ ands_storage_hostname }}"
cluster: "{{ domain_servers | join(',') }}"
replicas: 3
arbiters: 1
diff --git a/roles/glusterfs/tasks/db/vols3.yml b/roles/glusterfs/tasks/db/vols3.yml
index b1beacb..cbd238d 100644
--- a/roles/glusterfs/tasks/db/vols3.yml
+++ b/roles/glusterfs/tasks/db/vols3.yml
@@ -3,6 +3,7 @@
gluster_volume:
state: present
name: "{{ name }}"
+ host: "{{ ands_storage_hostname }}"
cluster: "{{ domain_servers | join(',') }}"
disperses: "3"
redundancies: "1"
diff --git a/roles/glusterfs/tasks/la/vols3.yml b/roles/glusterfs/tasks/la/vols3.yml
index 9565bb3..ada8f95 100644
--- a/roles/glusterfs/tasks/la/vols3.yml
+++ b/roles/glusterfs/tasks/la/vols3.yml
@@ -3,6 +3,7 @@
gluster_volume:
state: present
name: "{{ name }}"
+ host: "{{ ands_storage_hostname }}"
cluster: "{{ domain_servers | join(',') }}"
bricks: "{{ glusterfs_bricks_path }}/brick-{{ name }}"
transport: "{{ glusterfs_transport }}"
diff --git a/roles/ntp b/roles/ntp
new file mode 120000
index 0000000..626609b
--- /dev/null
+++ b/roles/ntp
@@ -0,0 +1 @@
+../anslib/ansible-role-ntp/ \ No newline at end of file
diff --git a/scripts/gluster.sh b/scripts/gluster.sh
index 02a0a3f..9efea45 100755
--- a/scripts/gluster.sh
+++ b/scripts/gluster.sh
@@ -69,17 +69,21 @@ function migrate {
# heal $1
-if [ -n "$1" -a "$1" != "all" ]; then
- eval "$action" "$@"
-else
- [ "$1" == "all" ] && shift
+if [ -z "$1" ] || [[ "$1" =~ ^all ]]; then
+    all=0
+    [ "$1" == "all_heketi" ] && all=1
+    [[ "$1" =~ ^all ]] && shift
vols=$(gluster volume info | grep -P '^Volume Name' | awk '{ print $NF }' | tr '\r\n' ' ')
for vol in $vols; do
- [[ "$vol" =~ [0-9] ]] && continue
- [[ "$vol" =~ ^vol_ ]] && continue
- [[ "$vol" =~ ^heketi ]] && continue
+ if [ $all -eq 0 ]; then
+ [[ "$vol" =~ [0-9] ]] && continue
+ [[ "$vol" =~ ^vol_ ]] && continue
+ [[ "$vol" =~ ^heketi ]] && continue
+ fi
eval "$action" "$vol" "$@"
done
+else
+ eval "$action" "$@"
fi
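
A stand-alone equivalent of the 'all' branch, e.g. for checking heal status on every volume including the heketi-managed ones (a sketch built on plain gluster commands):

    for vol in $(gluster volume info | grep -P '^Volume Name' | awk '{ print $NF }'); do
        echo "=== $vol ==="
        gluster volume heal "$vol" info
    done
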
diff --git a/setup.sh b/setup.sh
index 1c38536..4ccf94d 100755
--- a/setup.sh
+++ b/setup.sh
@@ -46,7 +46,7 @@ case "$action" in
apply playbooks/openshift-setup-projects.yml "$@" || exit 1
;;
project)
- project=$2
+ project=$1
shift
[ -n "$project" ] || { usage 'project name should be specified...' ; exit 1; }
apply playbooks/openshift-setup-project.yml --extra-vars "ands_configure_project=$project" "$@" || exit 1
diff --git a/setup/configs/labels.yml b/setup/configs/labels.yml
index e8ee868..3f8cbe4 100644
--- a/setup/configs/labels.yml
+++ b/setup/configs/labels.yml
@@ -2,6 +2,9 @@
ands_openshift_labels:
region: "infra"
zone: "default"
+ production: 1
+ server: 1
+ permanent: 1
hostid: "{{ ands_host_id }}"
hostname: "{{ ansible_hostname }}"
fqdn: "{{ ansible_hostname }}.{{ ansible_domain }}"
@@ -11,3 +14,9 @@ ands_openshift_labels:
pod_node: 1
compute_node: 0
gpu_node: 0
+
+
+
+ands_default_node_selector:
+ zone: default
+ production: "1"
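
The new default selector only takes effect if the nodes actually carry these labels. A quick check (a sketch; the node name is an example from the testing inventory):

    # Nodes eligible for pods scheduled with ands_default_node_selector
    oc get nodes -l zone=default,production=1

    # Labels applied to a particular node
    oc get node ipekatrin1.ipe.kit.edu --show-labels
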
diff --git a/setup/configs/volumes.yml b/setup/configs/volumes.yml
index f97d485..14aadfa 100644
--- a/setup/configs/volumes.yml
+++ b/setup/configs/volumes.yml
@@ -18,16 +18,21 @@ ands_nfs_clients:
ands_storage_domains:
- servers: "ands_storage_servers"
- clients: [ "masters", "new_masters" ]
+ clients: [ "nodes", "new_nodes" ]
volumes:
provision: { type: "cfg", mount: "{{ ands_paths.provision }}" }
+ - servers: "ands_storage_servers"
+ clients: [ "masters", "new_masters" ]
+ volumes:
+# provision: { type: "cfg", mount: "{{ ands_paths.provision }}" }
openshift: { type: "cfg", mount: "{{ ands_paths.openshift }}", nfs_clients: "{{ ands_nfs_clients }}" }
databases: { type: "db", mount: "{{ ands_paths.databases }}" }
temporary: { type: "tmp", mount: "{{ ands_paths.temporary }}", nfs_clients: "{{ ands_nfs_clients }}" }
datastore: { type: "data", mount: "{{ ands_paths.datastore }}", nfs_clients: "{{ ands_nfs_clients }}" }
katrin_data: { type: "data", mount: "{{ ands_paths.katrin_data }}", nfs_clients: "{{ ands_nfs_clients }}" }
-# - servers: "storage_nodes"
-# clients: [ "nodes" ]
+
+# - servers: "ands_storage_servers"
+# clients: [ "nodes", "new_nodes" ]
# openshift: { type: "cfg", mount: "{{ ands_paths.openshift }}" }
# temporary: { type: "tmp", mount: "{{ ands_paths.temporary }}" }
# volumes:
diff --git a/setup/projects/adei/templates/60-adei.yml.j2 b/setup/projects/adei/templates/60-adei.yml.j2
index ca3c17a..22f4bb0 100644
--- a/setup/projects/adei/templates/60-adei.yml.j2
+++ b/setup/projects/adei/templates/60-adei.yml.j2
@@ -75,12 +75,13 @@ objects:
spec:
schedule: "{{ cfg.cron }}"
concurrencyPolicy: "Forbid"
+ startingDeadlineSeconds: "{{ cfg.start_tolerance | default(30) }}"
successfulJobsHistoryLimit: "{{ adei_pod_history_limit }}"
failedJobsHistoryLimit: "{{ adei_pod_history_limit }}"
jobTemplate:
spec:
completions: "1"
- activeDeadlineSeconds: "3600"
+ activeDeadlineSeconds: "{{ cfg.max_run_time | default(600) }}"
# restartPolicy: "Never"
template:
metadata:
@@ -125,6 +126,9 @@ objects:
adei-setup: "${setup}"
spec:
restartPolicy: {{ restart_policy }}
+{% if (ands_default_node_selector is defined) and (ands_default_node_selector | length > 0) %}
+ nodeSelector: {{ ands_default_node_selector | to_json }}
+{% endif %}
volumes: {{ cfg.vols | to_json }}
{% if (cfg.groups is defined) or (cfg.run_as is defined) %}
securityContext:
diff --git a/setup/projects/adei/vars/globals.yml b/setup/projects/adei/vars/globals.yml
index f8d7816..01fb495 100644
--- a/setup/projects/adei/vars/globals.yml
+++ b/setup/projects/adei/vars/globals.yml
@@ -233,3 +233,7 @@ adei_frontends:
mounts: "{{ adei_prod_mounts | union(adei_pod_mounts) }}"
groups: [ "adei" ]
enabled: true
+
+# Extra options:
+# start_tolerance: 30
+# max_run_time: 600
diff --git a/setup/projects/adei/vars/pods.yml b/setup/projects/adei/vars/pods.yml
index 182db9c..3923c23 100644
--- a/setup/projects/adei/vars/pods.yml
+++ b/setup/projects/adei/vars/pods.yml
@@ -1,8 +1,7 @@
pods:
mysql:
service: { ports: [ 3306 ] }
- sched: { replicas: 1, strategy: "Recreate", selector: { master: 1 } }
- selector: { master: 1 }
+ sched: { replicas: 1, strategy: "Recreate", restrict: { fat_storage: "1" } }
groups: [ "adei_db" ]
images:
- image: "centos/mysql-57-centos7"
diff --git a/setup/projects/kaas/vars/pods.yml b/setup/projects/kaas/vars/pods.yml
index 41831ab..8cfa65a 100644
--- a/setup/projects/kaas/vars/pods.yml
+++ b/setup/projects/kaas/vars/pods.yml
@@ -1,8 +1,7 @@
pods:
kaas-router:
service: { host: "{{ katrin_node }}", ports: [ 80/8080, 443/8043 ] }
- sched: { replicas: 1, selector: { master: 1 } }
- selector: { master: 1 }
+ sched: { replicas: 1, restrict: { master: "1" } }
images:
- image: "httpd:2.2"
mappings: