From ba144fab071258a97cf3c42a0defeb0aae41a353 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 6 Oct 2019 05:00:55 +0200 Subject: Document latest problems with docker images and resource reclamation, add docker performance checks in the monitoring scripts, helpers to filter the logs --- .../templates/scripts/check_server_status.sh.j2 | 29 +++++++++++++++++++++- .../templates/scripts/check_uptime_status.sh.j2 | 2 +- 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'roles') diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 index 0bef13c..c2849f4 100755 --- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 +++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 @@ -3,7 +3,7 @@ fs=`df -lm / | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4` datafs=`df -lm /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4` mem=`free -g | grep "Mem" | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 7` -cpu=`uptime | sed -e "s/[[:space:]]/\n/g" | tail -n 1` +cpu=`uptime | sed -e "s/[[:space:]]/\n/g" -e s/,/./g | tail -n 1` if [ $fs -le 8192 ]; then echo "Only $(($fs / 1024)) GB left in the root file system" @@ -53,3 +53,30 @@ ping -c 1 -W 2 8.8.8.8 &> /dev/null if [ $? 
-ne 0 ]; then echo "Networkign problems, can't ping Google's public DNS server" fi + +info=$(LC_ALL=C docker info) +if [ -n "$info" ]; then + images=$(echo "$info" | grep -i images | grep -Po "\d+") + [ -n "$images" ] && images=$(docker images -a | wc -l) + c=$(echo "$info" | grep -i containers | grep -Po "\d+") + c_running=$(echo "$info" | grep -i containers -A 5 | grep -i running | grep -Po "\d+") + c_paused=$(echo "$info" | grep -i containers -A 5 | grep -i paused | grep -Po "\d+") + c_stopped=$(echo "$info" | grep -i containers -A 5 | grep -i stopped | grep -Po "\d+") + + data_space=$(echo "$info" | grep -i "\bData Space Available" | grep -Po "[\d.]+\s+\w+") + data_size=$(echo "$data_space" | grep -Po "[\d.]+") + [ -n "$(echo $data_space | grep -P 'TB')" ] && data_size=$(echo "$data_size * 1024" | bc) + [ -z "$(echo $data_space | grep '[TG]B')" ] && data_size=0 + + metadata_space=$(echo "$info" | grep -i "\bMetadata Space Available" | grep -Po "[\d.]+\s+\w+") + metadata_size=$(echo "$metadata_space" | grep -Po "[\d.]+") + [ -n "$(echo $metadata_space | grep -P 'TB')" ] && metadata_size=$(echo "$metadata_size * 1024" | bc) + [ -z "$(echo $metadata_space | grep '[TG]B')" ] && metadata_size=0 + + [ $(echo "$data_size > 300" | bc) -eq 0 ] && echo "Docker Data Space is critically low ($data_space)" + [ $(echo "$metadata_size > 5" | bc) -eq 0 ] && echo "Docker Metadata Space is critically low ($metadata_space)" +else + images=$(docker images -a | wc -l) + echo "docker info has timed out" +fi +[ "$images" -gt 1000 ] && echo "Too many docker images ($images) will cause severe scheduling penalties" diff --git a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 index 7acac5f..df65c50 100755 --- a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 +++ b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 @@ -12,4 +12,4 @@ disks=$(/opt/MegaRAID/storcli/storcli64 /c0 show | 
grep -P "(HDD|SSD)" | grep "O data=`df -lh /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4` #echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load, pods: $pods" -echo -en "1 $up\${color gray}, ${disks}/${data}, $(printf %3u ${containers}) c - $(printf %4.1f ${load}), $(printf %3u ${mem}) GB, $(printf %4u ${iops}) IOPS, $(printf %3u ${net}) MB/s" +echo -en "1 $up\${color gray}, ${disks}/${data}, $(printf %3u ${containers}) c, $(printf %4.1f ${load})%, $(printf %3u ${mem}) GB, $(printf %4u ${iops}) IOPS, $(printf %3u ${net}) MB/s" -- cgit v1.2.1