From af658521b46751a20a5953bd9c7f3ef01c0a74d7 Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan"
Date: Sun, 6 Jan 2019 01:56:17 +0100
Subject: Added more checks to kaas and adei

---
 service/check_adei.sh | 13 +++++++++++++
 service/check_kaas.sh | 25 ++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/service/check_adei.sh b/service/check_adei.sh
index f08ab5f..4edad93 100755
--- a/service/check_adei.sh
+++ b/service/check_adei.sh
@@ -72,6 +72,19 @@ size=$(query "$url/info.php?target=size&encoding=text")
 [[ "$size" =~ "Error:" ]] && size=""
 [ -n "$size" ] && msg="\${color gray}/ $((size / 1024 / 1024 / 1024)) GB"
 
+# Check pending administrative scripts
+if [ $healthy -ne 0 ]; then
+    scripts=$(query "$url/info.php?target=scripts")
+    waiting=$(echo $scripts | xmllint --format - | grep "Value" | sed -e "s/^\(.*mtime=\"\([^\"]*\)\".*\)$/\\2\\1/" | awk -v date="$(date +%s)" '{duration=date - $1} duration > 3600 { print duration }' | sort -rn)
+    # Count words, not lines: echo of an empty list still emits one newline, so "wc -l" would never report 0
+    num_waiting=$(echo $waiting | wc -w)
+    long_waiting=$(echo $waiting | cut -d ' ' -f 1)
+    if [ $num_waiting -gt 0 ]; then
+        healthy=2
+        echo "$num_waiting pending scripts, longest for $(format_time $long_waiting)"
+    fi
+fi
+
 # Verify offset (for selected database)
 if [ $healthy -ne 0 -a -n "$src" ]; then
 
diff --git a/service/check_kaas.sh b/service/check_kaas.sh
index d6e7300..e69e73d 100755
--- a/service/check_kaas.sh
+++ b/service/check_kaas.sh
@@ -4,7 +4,9 @@ cd "$(dirname "$0")"
 . opts.sh
 
 e_nodes=$2
-
+e_pods=2
+e_restarts=10
+p_pods=10
 
 online=$(../scripts/ping.pl "$host")
 healthy=$online
@@ -34,4 +36,25 @@ if [ $healthy -ne 0 ]; then
     fi
 fi
 
+# Find pods in wrong states
+if [ $healthy -ne 0 ]; then
+    pods=$(oc get pods --all-namespaces | awk '$6~/s|m/ { next } { print }' | grep -P "Terminating|Pending|CrashLoopBackOff" | wc -l)
+    if [ $pods -ge $e_pods ]; then
+        healthy=2
+        echo "Pods stalled in wrong states:"
+        oc get pods --all-namespaces -o wide | awk '$6~/s|m/ { next } { print }' | grep -P "(Terminating|Pending|CrashLoopBackOff)" | head -n $p_pods | sed -e 's/[[:space:]]\+/ /g'
+    fi
+fi
+
+# Find not-ready running pods with large restart number
+if [ $healthy -ne 0 ]; then
+    # The threshold is handed to awk with -v: inside the single-quoted program the shell does not expand $e_restarts
+    pods=$(oc get pods --all-namespaces | awk -v e_restarts="$e_restarts" '$6~/s|m/ { next } $5<e_restarts { next } $3~/^0/ { print $0 }' | grep Running | wc -l)
+    if [ $pods -ge $e_pods ]; then
+        healthy=2
+        echo "Pods restarting:"
+        oc get pods --all-namespaces -o wide | awk -v e_restarts="$e_restarts" '$6~/s|m/ { next } $5<e_restarts { next } $3~/^0/ { print $0 }' | grep Running | head -n $p_pods | sed -e 's/[[:space:]]\+/ /g'
+    fi
+fi
+
 echo "$online $healthy $version $nodes"
-- 
cgit v1.2.1