侧边栏壁纸
博主头像
逢尔Seyu 博主等级

星光不负赶路人,时光不负追梦人

  • 累计撰写 30 篇文章
  • 累计创建 20 个标签
  • 累计收到 2 条评论

目 录CONTENT

文章目录

Linux + K8s 巡检脚本

逢尔Seyu
2024-01-11 / 0 评论 / 0 点赞 / 45 阅读 / 0 字

系统巡检脚本

#!/bin/bash
# system-check-scripts
# Daily host inspection.
#
# NOTE: before using this script, point the NIC used for the IP lookup below
#       (MGMT_NIC, default eth0) at the management-network interface.
# NOTE: running this script also invokes the k8s cluster inspection script. On
#       environments that run k8s containers (e.g. UC) the
#       k8s-cluster_check_dandu.sh script can also be executed on its own.
# NOTE: when inspecting systems other than UC, comment out the
#       k8s-cluster_check call.

# PATH is frequently incomplete when the script is started from cron, so many
# commands (ifconfig included) would not be found. Repair it BEFORE any
# external command is executed.
export PATH=/usr//sbin:/usr//bin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin
if [ -r /etc/profile ]; then
  source /etc/profile
fi

# Management NIC; may be overridden from the environment.
MGMT_NIC="${MGMT_NIC:-eth0}"

# Old net-tools print "inet addr:1.2.3.4", newer ones "inet 1.2.3.4"; both are
# handled. When ifconfig is unavailable the address is read through `ip`.
IP=$(ifconfig "$MGMT_NIC" 2>/dev/null \
  | awk '$1 == "inet" {sub(/^addr:/, "", $2); print $2; exit}')
if [ -z "$IP" ]; then
  IP=$(ip -4 -o addr show dev "$MGMT_NIC" 2>/dev/null \
    | awk '{split($4, parts, "/"); print parts[1]; exit}')
fi

if [ "$(id -u)" -gt 0 ]; then
  echo "请用root用户执行此脚本!"
  exit 1
fi

# /etc/redhat-release does not exist on every supported distribution.
centosVersion=""
if [ -r /etc/redhat-release ]; then
  centosVersion=$(awk '{print $(NF-1)}' /etc/redhat-release)
fi

VERSION="V1.0.1"

# Report location: <directory of this script>/report
PROGPATH=$(dirname -- "$0")
LOGPATH="$PROGPATH/report"
if ! mkdir -p -- "$LOGPATH"; then
  echo "无法创建日志目录: $LOGPATH" >&2
  exit 1
fi
RESULTFILE="$LOGPATH/System-Check-${IP:-unknown}-$(date +%Y%m%d).txt"

# Greeting banner.
# Writes the fixed six-line welcome box to stdout; the fourth line embeds the
# global VERSION.
Welcome() {
  printf '%s\n' \
    '################################' \
    '|     欢迎使用日常巡检脚本!      |' \
    '| ^^                        ^^ | ' \
    "| ()  脚本版本为:${VERSION}   () |" \
    '|                              |' \
    '###############################'
}

# Print the "system information" section of the inspection report.
# Globals:  IP (read) - address handed to clockdiff.
# Outputs:  one labelled line per fact on stdout. A fact that cannot be
#           collected is reported as 获取信息失败 instead of leaking tool errors.
function OS_INFO(){
	local failed='获取信息失败'
	local os_name os_version os_type os_num os_kernel os_code os_lang
	local os_date os_uptime os_clock os_last_reboot os_last_shutdown

	# Host name
	os_name=$(uname -n)
	# OS release: Kylin records it in /etc/.kyinfo; otherwise fall back to the
	# PRETTY_NAME from /etc/os-release (read in a subshell to keep scope clean).
	os_version=$(grep dist_id /etc/.kyinfo 2>/dev/null | grep -oE 'Kylin.*')
	if [[ -z "$os_version" && -r /etc/os-release ]]; then
		os_version=$( . /etc/os-release 2>/dev/null && printf '%s' "$PRETTY_NAME" )
	fi
	# OS type
	os_type=$(uname)
	# Hardware serial number
	os_num=$(dmidecode -t system 2>/dev/null | grep 'Serial Number' | awk '{print $3}')
	# Kernel release
	os_kernel=$(uname -r)
	# Machine code.
	# NOTE(review): the original never populated this value, so the report
	# always shows the failure text; the intended source is not visible here.
	os_code=""
	# Locale
	os_lang=$LANG
	# Current time
	os_date=$(date +"%Y-%m-%d %H:%M:%S")
	# Uptime: the text between "up " and the first comma of `uptime`
	os_uptime=$(uptime 2>/dev/null | awk -F',' '{sub(/.*up /,"",$1);print $1}')
	# Clock offset as measured by clockdiff; skipped when the tool or the
	# target address is missing so no error text reaches the report.
	os_clock=""
	if command -v clockdiff >/dev/null 2>&1 && [[ -n "$IP" ]]; then
		os_clock=$(clockdiff -o1 "$IP" 2>/dev/null)
	fi
	# Last reboot time
	os_last_reboot=$(last reboot 2>/dev/null | head -1 | awk '{print $5,$6,$7,$8,$10}')
	# Last shutdown time
	os_last_shutdown=$(last -x 2>/dev/null | grep shutdown | head -1 | awk '{print $5,$6,$7,$8,$10}')

	echo "################################# [ 系统信息巡检区 ] ######################################"
	echo "主机名:$os_name"
	echo "主机类型:$os_type"
	echo "主机序列号:${os_num:-$failed}"
	echo "系统版本:${os_version:-$failed}"
	echo "系统内核版本:$os_kernel"
	echo "系统机器码:${os_code:-$failed}"
	echo "系统语言环境:${os_lang}"
	# The label used ';' while every other label uses ':'.
	echo "系统时间:$os_date"
	echo "系统时区情况:${os_clock:-$failed}"
	echo "系统已运行时间:${os_uptime:-$failed}"
	echo "系统上次重启时间:${os_last_reboot:-$failed}"
	echo "系统上次关机时间:${os_last_shutdown:-$failed}"
}

# Print one /proc/meminfo field in kB; prints 0 when the field is absent.
_hdware_meminfo_kb() {
	awk -v key="$1:" '$1 == key {v = $2} END {print v + 0}' /proc/meminfo 2>/dev/null
}

# Convert kB to GB with two decimals.
_hdware_kb_to_gb() {
	awk -v kb="${1:-0}" 'BEGIN {printf "%.2f", kb / 1048576}'
}

# Print the used share of <total> given <free>; "0.0%" when total is 0
# (e.g. a host without swap) instead of dividing by zero.
_hdware_used_pct() {
	awk -v total="${1:-0}" -v free="${2:-0}" 'BEGIN {
		if (total + 0 <= 0) { print "0.0%"; exit }
		printf "%.1f%%\n", (total - free) / total * 100
	}'
}

# Print the "hardware" section of the inspection report: CPU facts from
# /proc/cpuinfo, memory and swap figures from /proc/meminfo, and per-NIC link
# state plus ring-buffer sizes for every interface in /proc/net/dev except lo.
function OS_HDWARE(){
	local failed='获取信息失败'
	local cpu_arch cpu_type cpu_num cpu_core cpu_hz
	local mem_total_kb mem_free_kb mem_avail_kb swap_total_kb swap_free_kb
	local buffers_kb cached_kb
	local nic nic_state ring nic_rx nic_tx
	local -a net_device=()

	# CPU architecture / model / socket count / cores per socket / frequency
	cpu_arch=$(uname -m)
	cpu_type=$(grep "model name" /proc/cpuinfo 2>/dev/null | uniq | awk -F':' '{sub(/ /,"",$2);print $2}')
	cpu_num=$(grep "physical id" /proc/cpuinfo 2>/dev/null | sort | uniq | wc -l)
	cpu_core=$(grep cores /proc/cpuinfo 2>/dev/null | uniq | awk -F':' '{sub(/ /,"",$2);print $2}')
	cpu_hz=$(grep "cpu MHz" /proc/cpuinfo 2>/dev/null | uniq | awk -F':' '{sub(/ /,"",$2);printf "%s MHz\n",$2}')

	# Memory figures, all in kB
	mem_total_kb=$(_hdware_meminfo_kb MemTotal);   mem_total_kb=${mem_total_kb:-0}
	mem_free_kb=$(_hdware_meminfo_kb MemFree);     mem_free_kb=${mem_free_kb:-0}
	mem_avail_kb=$(_hdware_meminfo_kb MemAvailable); mem_avail_kb=${mem_avail_kb:-0}
	swap_total_kb=$(_hdware_meminfo_kb SwapTotal); swap_total_kb=${swap_total_kb:-0}
	swap_free_kb=$(_hdware_meminfo_kb SwapFree);   swap_free_kb=${swap_free_kb:-0}
	buffers_kb=$(_hdware_meminfo_kb Buffers);      buffers_kb=${buffers_kb:-0}
	cached_kb=$(_hdware_meminfo_kb Cached);        cached_kb=${cached_kb:-0}
	# Kernels without MemAvailable: approximate it the classic way.
	if (( mem_avail_kb == 0 )); then
		mem_avail_kb=$(( mem_free_kb + buffers_kb + cached_kb ))
	fi

	# Every interface except the loopback device (exact name match, so NICs
	# such as "wlo1" are kept).
	mapfile -t net_device < <(awk -F':' 'NR > 2 {gsub(/[ \t]/, "", $1); if ($1 != "lo") print $1}' /proc/net/dev 2>/dev/null)

	echo "################################# [ 系统硬件巡检区 ] ######################################"
	echo "CPU型号:${cpu_type:-$failed}"
	echo "CPU架构:$cpu_arch"
	echo "CPU个数:$cpu_num"
	echo "CPU核数: ${cpu_core:-$failed}"
	echo "CPU频率:${cpu_hz:-$failed}"
	echo "内存容量:$(_hdware_kb_to_gb "$mem_total_kb") GB"
	echo "内存空闲:$(_hdware_kb_to_gb "$mem_free_kb") GB"
	echo "内存可用:$(_hdware_kb_to_gb "$mem_avail_kb") GB"
	echo "内存使用率:$(_hdware_used_pct "$mem_total_kb" "$mem_avail_kb")"
	echo "SWAP容量:$(_hdware_kb_to_gb "$swap_total_kb") GB"
	echo "SWAP可用容量:$(_hdware_kb_to_gb "$swap_free_kb") GB"
	echo "SWAP使用率:$(_hdware_used_pct "$swap_total_kb" "$swap_free_kb")"
	echo "内存Buffer大小:${buffers_kb} KB"
	echo "内存Cache大小:${cached_kb} KB"

	for nic in "${net_device[@]}"; do
		# Query the interface being iterated (the original always asked eth0).
		# The value after the "state" keyword is used because its position
		# shifts when e.g. "master br0" is present.
		nic_state=$(ip link show "$nic" 2>/dev/null \
			| awk 'NR == 1 {for (i = 1; i < NF; i++) if ($i == "state") {print $(i + 1); exit}}')
		# The last RX:/TX: lines of `ethtool -g` are the current settings.
		ring=$(ethtool -g "$nic" 2>/dev/null)
		nic_rx=$(grep "RX:" <<<"$ring" | tail -1 | awk '{print $2}')
		nic_tx=$(grep "TX:" <<<"$ring" | tail -1 | awk '{print $2}')
		echo "网卡:$nic  状态: $nic_state RX: $nic_rx TX: $nic_tx"
	done
}

# Print the "network" section of the inspection report.
# Globals:  IP (read) - management address resolved at script start.
# Outputs:  IP, default gateway, DNS servers and gateway reachability.
# Only the first default route that has a "via" hop is used, and ping is run
# only when such a gateway exists.
function OS_NETWORK(){
	local gateway gateway_label link_state
	local -a dns=()

	# Default gateway: first "default ... via <addr>" route.
	gateway=$(ip route 2>/dev/null \
		| awk '$1 == "default" {for (i = 2; i < NF; i++) if ($i == "via") {print $(i + 1); exit}}')
	# DNS servers: real "nameserver" entries only (commented lines are skipped).
	mapfile -t dns < <(awk '$1 == "nameserver" {print $2}' /etc/resolv.conf 2>/dev/null | uniq)

	if [[ -n "$gateway" ]]; then
		gateway_label=$gateway
		if ping -t 1 -i 1 -c 5 -W 1 "$gateway" &>/dev/null; then
			link_state='正常通信'
		else
			link_state='无法通信'
		fi
	else
		# Nothing to ping: report the same text as before without probing.
		gateway_label='未设置默认网关'
		link_state='无法通信'
	fi

	echo "################################# [ 系统网络巡检区 ] ######################################"
	echo "IP地址:$IP"
	echo "网关地址:$gateway_label"
	echo "DNS地址:${dns[*]}"
	echo "网关[$gateway_label]连接情况: $link_state"
}

# Print the "resource" section of the inspection report: CPU usage, the top
# CPU/memory consumers, disk IO, disk serials and usage warnings, inode usage,
# process counts, file-descriptor limits and socket statistics.
# `top`, `iotop` and `netstat` are each sampled once and the snapshot is reused.
function OS_RESOURCE(){
	local failed='获取信息失败'
	local net_failed='net-tools 未安装,获取信息失败'
	local -a disk_list=() disk_per=()
	local top_snapshot cpu_free cpu_use
	local proc_total proc_running proc_sleeping proc_stopped proc_zombie
	local disk serial pct disk_state io_report sock_report

	# Whole disks (no partitions, no optical drives)
	mapfile -t disk_list < <(lsblk -dn -o NAME 2>/dev/null | grep -v '^sr')
	# Usage percentage per mounted file system; -P keeps one line per entry
	mapfile -t disk_per < <(df -hP 2>/dev/null | awk 'NR>1 && $1 !~/sr/ {gsub(/%/,"",$5);print $5}')

	top_snapshot=$(top -d 1 -n 1 -b 2>/dev/null)
	# Idle share: the number in front of "id". It is located by pattern because
	# top prints "ni,100.0 id" without a space when the CPU is fully idle.
	cpu_free=$(awk 'NR == 3 && match($0, /[0-9.]+[ %]*id/) {
			v = substr($0, RSTART, RLENGTH); sub(/[ %]*id/, "", v); print v
		}' <<<"$top_snapshot")
	if [[ -n "$cpu_free" ]]; then
		cpu_use=$(awk -v idle="$cpu_free" 'BEGIN {printf "%.1f%%\n", 100 - idle}')
	else
		cpu_use=$failed
	fi
	# "Tasks:" line: total, running, sleeping, stopped, zombie
	read -r proc_total proc_running proc_sleeping proc_stopped proc_zombie \
		< <(awk 'NR == 2 {print $2, $4, $6, $8, $10}' <<<"$top_snapshot")

	# Disk usage verdict: anything above 80% makes the overall state abnormal.
	disk_state='正常'
	for pct in "${disk_per[@]}"; do
		if [[ "$pct" =~ ^[0-9]+$ ]] && (( pct > 80 )); then
			disk_state='异常'
		fi
	done

	echo "################################# [ 系统资源巡检区 ] ######################################"
	echo "CPU使用率:$cpu_use"
	echo "CPU使用率前十进程信息:"
	# header line + ten processes
	ps -eo user,pid,pcpu,pmem,args --sort=-pcpu 2>/dev/null | head -n 11
	echo "内存使用率前十进程信息:"
	ps -eo user,pid,pcpu,pmem,args --sort=-pmem 2>/dev/null | head -n 11
	if io_report=$(iotop -bon 1 2>/dev/null); then
		echo "磁盘IO信息:"
		head -n 13 <<<"$io_report"
	else
		echo "磁盘IO信息:io top 未安装信息获取失败"
		echo ""
	fi
	echo "磁盘分区使用率是否正常:$disk_state"
	for disk in "${disk_list[@]}"; do
		serial=$(lsblk --nodeps -no serial "/dev/$disk" 2>/dev/null)
		printf '磁盘:%s\t磁盘序列号:%s\n' "$disk" "${serial:-$failed}"
	done
	for pct in "${disk_per[@]}"; do
		# Skip non-numeric usage values such as "-".
		if [[ "$pct" =~ ^[0-9]+$ ]] && (( pct > 80 )); then
			echo "某分区磁盘使用率为:$pct% > 80% 请及时扩容"
		fi
	done
	# Plain echo does not interpret "\n", so blank lines are printed explicitly.
	echo ""
	echo "系统磁盘分区inode使用情况:"
	df -Thi 2>/dev/null
	echo ""
	echo "系统当前进程数:$proc_total"
	echo "系统当前进程运行数:$proc_running"
	echo "系统当前休眠进程数:$proc_sleeping"
	echo "系统当前停止进程数:$proc_stopped"
	echo "系统当前僵尸进程数:$proc_zombie"

	echo "系统当前允许最大fd数量:$(awk '{print $3}' /proc/sys/fs/file-nr 2>/dev/null)"
	echo "系统当前已打开fd数量:$(awk '{print $1}' /proc/sys/fs/file-nr 2>/dev/null)"
	echo "系统单个进程运行打开fd数量:$(ulimit -n)"

	if sock_report=$(netstat -anp 2>/dev/null); then
		echo "系统当前socket连接数:$(printf '%s' "$sock_report" | awk 'END {print NR}')"
		echo "系统 established socket数量: $(grep -c "ESTABLISHED" <<<"$sock_report")"
		echo "系统 sync socket数量:$(grep -c "SYN" <<<"$sock_report")"
		echo "系统当前已建立socket如下:"
		awk '/ESTABLISHED/ {printf "  本地:%-20s <=>    外部:%-22s\n",$4,$5}' <<<"$sock_report"
	else
		echo "系统当前socket连接数:$net_failed"
		echo "系统 established socket数量: $net_failed"
		echo "系统 sync socket数量:$net_failed"
		echo "系统当前已建立socket如下:"
	fi
}

# Print the "security" section of the inspection report: firewalld and SELinux
# state, the accounts that have a login shell with their last login, and the
# users currently logged in.
function OS_SECURITY(){
	local -a os_user=()
	local os_selinux os_firewalld user

	# Accounts whose shell is not one of the usual "no login" shells
	mapfile -t os_user < <(awk -F':' '$NF !~/nologin|sync|shutdown|halt|false/ {print $1}' /etc/passwd 2>/dev/null)
	# SELinux mode; getenforce is not installed everywhere
	os_selinux=$(getenforce 2>/dev/null) || os_selinux='获取信息失败'
	# Firewall state. Only stderr is discarded: the original redirected stdout
	# to /dev/null before the pipe, so grep never saw any text and the state
	# was always "off".
	if service firewalld status 2>/dev/null | grep -q "running"; then
		os_firewalld=on
	else
		os_firewalld=off
	fi

	echo "################################# [ 系统安全巡检区 ] ######################################"
	echo "防火墙状态: $os_firewalld"
	echo "Selinux状态:${os_selinux}"
	# Plain echo does not interpret "\n"; print the blank line explicitly.
	echo ""
	echo "系统可登录用户数:${#os_user[@]}"
	echo "系统可登录用户:${os_user[*]}"
	for user in "${os_user[@]}"; do
		echo "用户 $user 最后1次登录信息: $(lastlog -u "$user" 2>/dev/null | awk 'NR==2')"
	done
	echo "系统当前登录用户:"
	who | sed 's#[()]##g' | awk '{printf "   用户: %10s 终端: %7s 登录时间: %7s %7s 登录IP: %7s\n",$1,$2,$3,$4,$5}'
}

# Service inspection section: prints only the section header and a placeholder
# line; concrete service checks are meant to be added here.
OS_SERVICE() {
  printf '%s\n' \
    '################################# [ 系统服务巡检区 ] ######################################' \
    '自行添加'
}

# Run the k8s cluster inspection script and pass its output through.
# Arguments: $1 - optional path of the cluster inspection script
#                 (default: /root/monitor/check/k8s-cluster_check.sh)
# Outputs:   the inspection script's output, or a notice when it is missing.
# Exits:     terminates the shell with status 1 when the script does not exist.
function k8s-cluster_check(){
         local k8s_check_scripts_file="${1:-/root/monitor/check/k8s-cluster_check.sh}"
         # Make sure the cluster inspection script exists
         if [ ! -f "${k8s_check_scripts_file}" ];then
             echo 'k8s集群巡检脚本不存在,巡检未执行!'
             exit 1
         else
             # Execute the script that was just checked (the path is no longer
             # repeated as a second literal)
             bash "${k8s_check_scripts_file}"
         fi
}


# Refuse to run the inspection as a non-root user. The original called an
# undefined ERROR command here, which only produced "command not found" and
# let the script continue.
if [ "$(id -u)" -ne 0 ]; then
        echo "请以ROOT用户运行这个脚本" >&2
        exit 1
fi

# Run every inspection section, in report order, writing to stdout.
# pod_check and node_check are deliberately not part of the sequence.
check() {
  local section
  for section in Welcome OS_INFO OS_HDWARE OS_NETWORK OS_RESOURCE OS_SECURITY k8s-cluster_check; do
    "$section"
  done
}

# Run the inspection and save the report. The path is quoted so that a script
# directory containing spaces does not break the redirection.
check > "$RESULTFILE"

K8s巡检脚本

#!/bin/bash
#Author: xwutx
#Version: v1

#k8s集群日常巡检

# Greeting banner.
# Writes the fixed five-line welcome box to stdout; the third line embeds the
# global VERSION.
VERSION=v1
Welcome() {
    printf '%s\n' \
        '################################' \
        '| 欢迎使用k8s集群日常巡检脚本! |' \
        "| 脚本版本为:${VERSION}         |" \
        '|                              |' \
        '###############################'
}


# Check the status of controller-manager, scheduler and etcd.
# `kubectl get cs` is queried once. Components whose STATUS column is not
# "Healthy" are listed in red; a failing kubectl is reported as a failure
# instead of being mistaken for "no unhealthy component".
# Returns: 0 after a successful query, 1 when kubectl fails.
function soft_check(){
    local cs_report unhealthy
    echo -e "----------Controller-manager、Scheduler、Etcd-0检测中----------"
    if ! cs_report=$(kubectl get cs 2>/dev/null); then
      echo -e "\033[31m无法获取组件状态(kubectl get cs 执行失败)\033[0m"
      return 1
    fi
    # Names of the components that are not healthy (header line skipped)
    unhealthy=$(awk 'NR == 1 {next}{if($2 != "Healthy") print $1}' <<<"$cs_report")
    if [[ -n "$unhealthy" ]]; then
      echo -e "\033[31m${unhealthy} Unhealthy\033[0m"
    else
      echo -e "\033[32mcontroller-manager、scheduler、etcd-0无异常\033[0m"
    fi
}

# Check today's kubelet log lines (as shown by `systemctl status`) for errors.
# The date prefix is built in the C locale and in both zero-padded ("Jan 01")
# and space-padded ("Jan  1") form, followed by a space. The original prefix
# came from locale-dependent `date` output and read "Jan 1" on the first nine
# days of a month, which misses "Jan 01" lines and matches "Jan 10".."Jan 19".
function kubeletCheck(){
    local status_out today_padded today_spaced today_lines kubeletError
    status_out=$(systemctl status kubelet.service 2>/dev/null)
    today_padded=$(LC_ALL=C date '+%b %d')
    today_spaced=$(LC_ALL=C date '+%b %e')
    today_lines=$(grep -E "^(${today_padded}|${today_spaced}) " <<<"$status_out")
    # Number of today's log lines that mention an error
    kubeletError=$(grep -ic error <<<"$today_lines")
    echo -e "\n \n----------       Kubelet状态检测中       ----------"
    if (( kubeletError >= 1 ));then
      echo -e "kubelet错误日志:"
      # Print fields 10..NF of each line (the leading nine fields are dropped)
      # so that repeated messages collapse under uniq.
      awk '{for (i=10;i<=NF;i++)printf("%s ", $i);print ""}' <<<"$today_lines" | grep -i error | sort -n | uniq
    else
      echo -e "kubelet无日志报错"
    fi
}

# List every pod whose STATUS column is not "Running".
# kubectl is queried once; when it fails, the failure is reported instead of
# the misleading "no abnormal pods" message.
# Returns: 0 after a successful query, 1 when kubectl fails.
function podsCheck(){
    local pod_report errorPod
    echo -e "\n \n----------      Pods运行状态检测中       ----------"
    if ! pod_report=$(kubectl get pods --all-namespaces 2>/dev/null); then
      echo -e "无法获取Pods状态(kubectl get pods 执行失败)"
      return 1
    fi
    # Pods that are not in Running state (header line excluded)
    errorPod=$(grep -v NAMESPACE <<<"$pod_report" | awk '{if($4 != "Running") print}')
    if [[ -n "$errorPod" ]];then
      echo -e "ErrorPod:"
      printf '%s\n' "$errorPod"
    else
      echo -e "Pods无异常"
    fi
}

# Report nodes with high resource usage: CPU above 60% or memory above 80%
# according to `kubectl top`, or memory requests above 95% of allocatable
# according to `kubectl describe`.
# `describe` and `top` are each called once per node. Missing metrics (no
# metrics-server, "<unknown>") are reported per node rather than causing an
# arithmetic syntax error followed by a false "no anomaly" summary.
# Returns: 0 after a successful node listing, 1 when kubectl get nodes fails.
function nodeCheck(){
    local node_report node mem_line memRq memLim cpuUsed memUsed
    local memWarn=0
    echo -e "\n \n----------    Nodes资源使用状态检测中    ----------"
    if ! node_report=$(kubectl get nodes 2>/dev/null); then
      echo -e "无法获取节点列表(kubectl get nodes 执行失败)"
      return 1
    fi
    while IFS= read -r node; do
      [[ -n "$node" ]] || continue
      # Memory line containing percentages: $3 = requests, $5 = limits
      mem_line=$(kubectl describe node "$node" 2>/dev/null | grep memory | grep % | head -n 1)
      memRq=$(awk '{print $3}' <<<"$mem_line" | sed "s/[^0-9]//g")       # memory_request
      memLim=$(awk '{print $5}' <<<"$mem_line" | sed "s/[^0-9]//g")      # memory_limit
      # $3 = CPU%, $5 = MEMORY% of the data line
      read -r cpuUsed memUsed < <(kubectl top nodes "$node" 2>/dev/null | awk 'NR == 1 {next}{print $3, $5}')
      cpuUsed=${cpuUsed//[^0-9]/}                                        # CPU usage
      memUsed=${memUsed//[^0-9]/}                                        # memory usage
      if [[ -z "$cpuUsed" || -z "$memUsed" ]]; then
        memWarn=$(( memWarn + 1 ))
        echo -e "$node\t无法获取CPU、内存使用率"
      elif (( cpuUsed > 60 || memUsed > 80 ));then
        memWarn=$(( memWarn + 1 ))
        echo -e "$node\tCPU使用率:$cpuUsed%\t内存使用率:$memUsed%"
      fi
      if [[ -n "$memRq" ]] && (( memRq > 95 ));then
        memWarn=$(( memWarn + 1 ))
        echo -e "$node\tMem_Requests:$memRq%\tMem_Limits:$memLim%"
      fi
    done < <(awk 'NR == 1 {next}{print $1}' <<<"$node_report")
    if (( memWarn == 0 ));then
      echo -e "无节点CPU、内存使用异常"
    fi
}

# Detect pod restarts by diffing the current "NAME RESTARTS" list against the
# list saved by the previous run.
# Globals:  POD_RESTART_STATE_DIR (read, optional) - directory holding
#           podsold.txt / podsnew.txt; defaults to /opt.
# The first run only records a baseline without invoking diff on a missing
# file. A failing kubectl leaves the stored baseline untouched, so the next
# run does not report every pod as changed.
# Returns: 0 normally, 1 when kubectl fails.
function podRestart(){
    local state_dir="${POD_RESTART_STATE_DIR:-/opt}"
    local old_file="$state_dir/podsold.txt"
    local new_file="$state_dir/podsnew.txt"
    local pod_report changes=""
    echo -e "\n \n----------      Pods自动重启检测中      ----------"
    if ! pod_report=$(kubectl get pods --all-namespaces 2>/dev/null); then
      echo -e "无法获取Pods重启信息(kubectl get pods 执行失败)"
      return 1
    fi
    # Same filter as before ($5 > 0), so the stored file format is unchanged
    awk '{if($5 > 0) print}' <<<"$pod_report" | awk '{print $2,$5}' >"$new_file"
    if [[ -f "$old_file" ]]; then
      changes=$(diff "$old_file" "$new_file")
    fi
    if [[ -n "$changes" ]];then
      echo -e "有以下pod重启:"
      printf '%s\n' "$changes"
    else
      echo -e "无自动重启pod"
    fi
    mv -f "$new_file" "$old_file"
}

# Entry point: run every cluster check in order.
# The component check defined in this script is soft_check; the original
# called the undefined name masterCheck, so that check never ran.
main(){
    Welcome
    soft_check
    kubeletCheck
    podsCheck
    nodeCheck
    podRestart
}
main "$@"

0

评论区