summaryrefslogtreecommitdiffstats
path: root/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
blob: e49ec972e84105bb2230317c2a2d990ea79e5fbd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash

# ---------------------------------------------------------------------------
# Basic resource checks: free disk space, memory and CPU load.
# Each problem is reported as one line on stdout; these checks print nothing
# when all values are within limits.
# ---------------------------------------------------------------------------

# Succeeds only if $1 is a non-negative integer. Probe results are validated
# with this before any numeric comparison: an empty or garbled value would
# otherwise make `[ ... -le ... ]` abort with a syntax error and the alert
# would be skipped silently.
is_uint() {
    [[ "$1" =~ ^[0-9]+$ ]]
}

# Prints the space (in MB) available on the local file system holding path $1.
# -P selects the POSIX output format (one line per file system), so a long
# device name can't wrap the record and shift the columns.
avail_mb() {
    df -lmP "$1" | awk 'NR == 2 { print $4 }'
}

# Reports a file system that is running out of space.
#   $1 - name used in the message ("root" or "data")
#   $2 - available space in MB, as printed by avail_mb
#   $3 - alert threshold in MB (inclusive)
report_low_space() {
    local name=$1 avail=$2 limit=$3
    if ! is_uint "$avail"; then
        echo "Can't determine the free space of the $name file system"
    elif [ "$avail" -le "$limit" ]; then
        echo "Only $((10#$avail / 1024)) GB left in the $name file system"
    fi
}

# Prints the load average $1 as a whole percentage (rounded down) of $2 CPUs.
# Pure integer arithmetic on hundredths of the load, so no external
# calculator is needed and no floating point rounding is involved.
load_percent() {
    local whole=${1%%.*} frac=0
    if [[ "$1" == *.* ]]; then
        frac=${1#*.}
    fi
    frac="${frac}00"    # pad, then keep exactly two decimals below
    echo $(( (10#${whole:-0} * 100 + 10#${frac:0:2}) / $2 ))
}

fs=$(avail_mb /)
datafs=$(avail_mb /mnt/ands)
report_low_space root "$fs" 8192
report_low_space data "$datafs" 1048576

# 7th column of the "Mem:" row of free(1), in whole GB.
# NOTE(review): presumably the "available" column of current procps-ng;
# older versions print a different column in this position - confirm.
mem=$(free -g | awk '/Mem/ { print $7; exit }')
if ! is_uint "$mem"; then
    echo "Can't determine the amount of free memory"
elif [ "$mem" -le 16 ]; then
    echo "The system is starving on memory, $mem GB left free"
fi

# Last field of uptime(1) is the 15 minute load average; a decimal comma
# (locale dependent) is normalised to a dot.
cpu=$(uptime | awk '{ load = $NF; gsub(/,/, ".", load); print load }')
# Index of the last "processor" entry, i.e. number of CPUs minus one.
max_cpu=$(grep processor /proc/cpuinfo | tail -n 1 | cut -d ':' -f 2 | tr -d '[:space:]')
if [[ "$cpu" =~ ^[0-9]+([.][0-9]+)?$ ]] && is_uint "$max_cpu"; then
    cpu_usage=$(load_percent "$cpu" $((10#$max_cpu + 1)))
    if [ "$cpu_usage" -ge 80 ]; then
        echo "The system is starving on cpu, $cpu ($cpu_usage%) is load average for the last 15 min"
    fi
else
    echo "Can't determine the CPU load"
fi

# RAID volume health. The row of `storcli64 /c0/v0 show` that starts with
# "0/0" has to contain the state "Optl", otherwise the volume is reported.

# Reads the storcli output on stdin and prints a report unless a "0/0" row
# containing "Optl" is present. The output is captured once, so the rows
# shown in the report are exactly the ones the verdict was based on.
report_raid_volume() {
    local rows
    rows=$(grep "^0/0")
    if ! grep -q "Optl" <<< "$rows"; then
        echo "Raid volume is not optimal:"
        if [ -n "$rows" ]; then
            printf '%s\n' "$rows"
        fi
    fi
}

/opt/MegaRAID/storcli/storcli64 /c0/v0 show | report_raid_volume

# Physical disk health. Every row of `storcli64 /c0 show` that mentions an
# HDD or SSD has to contain the state "Onln"; all other disk rows are
# reported.

# Reads the storcli output on stdin and prints the HDD/SSD rows that are not
# marked "Onln", preceded by a header line. Prints nothing if there are none.
# The output is captured once, so the listed rows are exactly the ones that
# triggered the report.
report_offline_disks() {
    local offline
    offline=$(grep -E "HDD|SSD" | grep -v "Onln")
    if [ -n "$offline" ]; then
        echo "Not all disks are online:"
        printf '%s\n' "$offline"
    fi
}

/opt/MegaRAID/storcli/storcli64 /c0 show | report_offline_disks

# Count the veth devices for which `ovs-vsctl show` prints a
# "could not open network device" error and warn once there are more than 50.
stale_veths=$(
    ovs-vsctl show \
        | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" \
        | wc -l
)
[ "$stale_veths" -gt 50 ] && echo "Too many rogue interfaces ($stale_veths) is registered on OpenVSwitch bridge. It could introduce large delays in pod scheduling..."

# Check various known problems

# Warn once the current ovs-vswitchd log occupies more than 128 MB on disk
# (du -sm reports the size in whole megabytes in its first column).
ovs_log_mb=$(du -sm /var/log/openvswitch/ovs-vswitchd.log | awk '{ print $1 }')
[ "$ovs_log_mb" -gt 128 ] && echo "Current OpenVSwitch log is over $ovs_log_mb MB. It could indicate some severe problems in pod networking..."

# Name resolution: look up a public host name; only the exit status of
# host(1) matters, its output is discarded.
if ! host google.com &> /dev/null; then
    echo "DNS problems, can't resolve google.com"
fi

# Outbound connectivity: send a single echo request (-c 1) to 8.8.8.8 and
# wait at most 2 seconds for the reply (-W 2). Only the exit status of
# ping(8) matters, its output is discarded.
if ! ping -c 1 -W 2 8.8.8.8 &> /dev/null; then
    echo "Networking problems, can't ping Google's public DNS server"
fi

# ---------------------------------------------------------------------------
# Docker health: free space reported by `docker info` and the number of
# stored images.
# ---------------------------------------------------------------------------

# Reads `docker info` output on stdin and prints the number found in its
# "Images:" line (nothing if there is no such line). Only the first match is
# used, so the result is never more than one number.
docker_info_images() {
    grep -iE "^[[:space:]]*images:" | grep -oE "[0-9]+" | head -n 1
}

# Reads `docker info` output on stdin and prints the "<number> <unit>" value
# of the first line containing the field name $1. The \b anchor keeps
# "Data Space Available" from also matching "Metadata Space Available".
docker_info_space() {
    grep -i "\b$1" | grep -oE "[0-9.]+[[:space:]]+[[:alnum:]_]+" | head -n 1
}

# Succeeds if the size $1 ("<number> <unit>") is larger than $2 GB.
# TB values are scaled by 1024, GB values are used as they are, and anything
# else (smaller units, empty or unparsable input) counts as 0 GB.
size_exceeds_gb() {
    local number=${1%%[[:space:]]*} factor
    case "$1" in
        *TB*) factor=1024 ;;
        *GB*) factor=1 ;;
        *)    factor=0 ;;
    esac
    awk -v n="$number" -v f="$factor" -v limit="$2" \
        'BEGIN { exit !(n * f > limit) }'
}

# Prints the number of images listed by docker (-a: all of them).
# -q prints bare IDs without the table header, which would otherwise be
# counted as one more image.
count_docker_images() {
    docker images -aq | wc -l
}

# NOTE(review): nothing here bounds the run time of `docker info`; the
# "timed out" message below is printed whenever it produces no output.
info=$(LC_ALL=C docker info)
if [ -n "$info" ]; then
    images=$(docker_info_images <<< "$info")
    # Listing all images is only needed as a fallback, when docker info did
    # not yield a usable count.
    if ! [[ "$images" =~ ^[0-9]+$ ]]; then
        images=$(count_docker_images)
    fi

    data_space=$(docker_info_space "Data Space Available" <<< "$info")
    metadata_space=$(docker_info_space "Metadata Space Available" <<< "$info")

    if ! size_exceeds_gb "$data_space" 300; then
        echo "Docker Data Space is critically low ($data_space)"
    fi
    if ! size_exceeds_gb "$metadata_space" 5; then
        echo "Docker Metadata Space is critically low ($metadata_space)"
    fi
else
    images=$(count_docker_images)
    echo "docker info has timed out"
fi

# A full `if` instead of `[ ... ] && echo`: when this is the last command of
# the script, the short-circuit form leaves exit status 1 on a healthy run.
if [ "$images" -gt 1000 ]; then
    echo "Too many docker images ($images) will cause severe scheduling penalties"
fi