#!/bin/bash
# Redis health check. Uses redis-cli to inspect fork latency, delayed AOF
# fsyncs and replica offsets, and prints an alarm line for every value that
# exceeds its threshold below.

# Alarm thresholds.
# Compared against latest_fork_usec from "redis-cli info stats".
fork_usec_alarm=400
# Compared against aof_delayed_fsync from "redis-cli info persistence".
delayed_fsync_alarm=2
# Compared against master_repl_offset minus each replica's offset.
repl_offset_alarm=5

#######################################
# Print a simple separator line made of "=" characters, with an optional
# label embedded near the middle.
# Arguments:
#   $1 - number of "=" characters to print; defaults to 50 when omitted or
#        when it is not a plain string of digits.
#   $2 - optional label, written verbatim right after the "=" at which the
#        countdown from $1 reaches $1 / 2 (integer division).
# Outputs:
#   Three lines on stdout: a single space, the separator followed by a
#   single space, and a single space.
#######################################
printSeparator() {
	local width=50
	local label=${2-}
	local mid
	local remaining

	# Accept only plain digit strings; anything else keeps the default width.
	case ${1-} in
		'' | *[!0-9]*) ;;
		*) width=$1 ;;
	esac

	mid=$((width / 2))
	remaining=$width
	printf ' \n'
	while [ "$remaining" -gt 0 ]; do
		printf '='
		# The label is passed as a printf argument so that spaces and glob
		# characters in it are printed exactly as given.
		if [ -n "$label" ] && [ "$remaining" -eq "$mid" ]; then
			printf '%s' "$label"
		fi
		remaining=$((remaining - 1))
	done
	printf ' \n'
	printf ' \n'
}


# Check latest_fork_usec. A fork has to copy the main process's page table,
# and the main process is blocked while that copy is in progress.

#######################################
# Read latest_fork_usec from "redis-cli info stats" and compare it with
# fork_usec_alarm.
# Globals:
#   fork_usec_alarm (read)
#   data (written: the value read, empty when it could not be read)
# Outputs:
#   The value read, followed by an alarm, "normal" or "unavailable" line,
#   all on stdout.
#######################################
check_latest_fork_usec() {
	# redis-cli prints INFO with CRLF line endings; a trailing CR would end up
	# in the value and can make the integer comparison fail, so drop it.
	# The key is matched exactly because a substring match on "fork" also
	# selects other fork-related fields such as total_forks.
	data=$(redis-cli info stats | tr -d '\r' \
		| awk -F ':' '$1 == "latest_fork_usec" { print $2 }')
	echo "latest_fork_usec: $data"
	case $data in
		'' | *[!0-9]*)
			# No usable number (for example redis-cli failed): reporting
			# "normal" here would hide the problem.
			echo "latest_fork_usec unavailable, please check redis-cli"
			;;
		*)
			if [ "$data" -gt "$fork_usec_alarm" ]; then
				# TODO: send an alert mail; memory is too large and fork takes too long.
				echo "fork_usec_alarm!!!please check the data memory used"
			else
				echo "latest_fork_usec normal"
			fi
			;;
	esac
}

check_latest_fork_usec
printSeparator 50 "latest_fork_usec_check_end"



# With appendfsync set to everysec, check aof_delayed_fsync. Redis documents
# this field as a counter of delayed fsyncs (not a duration); a value above
# the threshold suggests that AOF writes are being blocked, possibly by disk
# I/O.

#######################################
# Read the appendfsync policy and, when it is "everysec", compare
# aof_delayed_fsync from "redis-cli info persistence" with
# delayed_fsync_alarm.
# Globals:
#   delayed_fsync_alarm (read)
#   config (written: the appendfsync value, empty when it could not be read)
# Outputs:
#   The policy, and for "everysec" an alarm, "normal" or "unavailable" line,
#   all on stdout.
#######################################
check_aof_delayed_fsync() {
	local delayed

	# CONFIG GET prints the parameter name on one line and its value on the
	# next; keep the line that follows the name.
	config=$(redis-cli config get appendfsync | tr -d '\r' \
		| awk 'seen { printf "%s", $1; seen = 0 } $0 == "appendfsync" { seen = 1 }')
	echo "appendfsync: $config"
	if [ "$config" != "everysec" ]; then
		return 0
	fi

	# Strip the CR of INFO's CRLF line endings and match the key exactly.
	delayed=$(redis-cli info persistence | tr -d '\r' \
		| awk -F ':' '$1 == "aof_delayed_fsync" { print $2 }')
	case $delayed in
		'' | *[!0-9]*)
			# No usable number: do not claim the counter is normal.
			echo "aof_delayed_fsync unavailable, please check redis-cli"
			;;
		*)
			if [ "$delayed" -gt "$delayed_fsync_alarm" ]; then
				echo "delayed_fsync_alarm!!!please check the disk IO"
				# TODO: send an alert mail; AOF is blocked, possibly an I/O problem.
			else
				echo "aof_delayed_fsync normal"
			fi
			;;
	esac
}

check_aof_delayed_fsync
printSeparator 50 "aof_delayed_fsync_check_end"



# In a master/replica setup the heartbeat sent every second reports each
# replica's replication offset; the difference from the master's offset
# shows how far a replica lags behind.

#######################################
# Compare every replica listed by "redis-cli info replication" with the
# master's offset and report replicas that are not online or lag by more
# than repl_offset_alarm.
# Globals:
#   repl_offset_alarm (read)
#   count (written: number of replica lines processed)
#   master_repl_offset (written: the master's offset, empty if unavailable)
# Arguments:
#   $1 - optional path of the alarm log; defaults to /tmp/log1.
#        NOTE(review): the default is a fixed name under /tmp; it is kept so
#        that anything already reading that file keeps working.
# Outputs:
#   One line per offending replica on stdout and in the alarm log, then a
#   final alarm, "normal" or "unavailable" line on stdout.
#######################################
check_replication_offsets() {
	local log_file=${1:-/tmp/log1}
	local replication replicas host port state offset minus_offset msg

	count=0
	# Start from an empty log: the -s test below relies on the file being
	# non-empty only when at least one replica was reported. A file is used
	# instead of an array because the original target shell (busybox sh) has
	# no arrays.
	: > "$log_file"

	# Take a single snapshot so that the master and replica offsets come from
	# the same reply, and strip the CR of INFO's CRLF line endings.
	replication=$(redis-cli info replication | tr -d '\r')
	master_repl_offset=$(printf '%s\n' "$replication" \
		| awk -F ':' '$1 == "master_repl_offset" { print $2 }')

	# Only "slaveN:" lines describe replicas; a substring match on "slave"
	# would also select fields such as role:slave or slave_repl_offset.
	# Each replica becomes one "ip port state offset" line; missing values
	# get placeholders so the columns stay aligned (offset is last and may
	# be empty).
	replicas=$(printf '%s\n' "$replication" | awk '
		/^slave[0-9]+:/ {
			sub(/^slave[0-9]+:/, "")
			ip = "?"; port = "?"; state = "unknown"; offset = ""
			n = split($0, fields, ",")
			for (i = 1; i <= n; i++) {
				eq = index(fields[i], "=")
				if (eq == 0) continue
				key = substr(fields[i], 1, eq - 1)
				val = substr(fields[i], eq + 1)
				if (val == "") continue
				if (key == "ip") ip = val
				else if (key == "port") port = val
				else if (key == "state") state = val
				else if (key == "offset") offset = val
			}
			print ip, port, state, offset
		}')

	# A here-document keeps the loop in the current shell, so count survives.
	while read -r host port state offset; do
		# The here-document yields one empty line when there are no replicas.
		[ -n "$host" ] || continue
		count=$((count + 1))

		# Subtract only when both offsets are plain numbers; otherwise the lag
		# cannot be verified and the replica is reported.
		case "${master_repl_offset}:${offset}" in
			*[!0-9:]* | :* | *:) minus_offset="unknown" ;;
			*) minus_offset=$((master_repl_offset - offset)) ;;
		esac

		if [ "$state" != "online" ] || [ "$minus_offset" = "unknown" ] \
			|| [ "$minus_offset" -gt "$repl_offset_alarm" ]; then
			msg="machine ${host} ${port} state:${state} minus offset: ${minus_offset}"
			echo "$msg"
			echo "$msg" >> "$log_file"
		fi
	done <<EOF
$replicas
EOF

	if [ -z "$replication" ]; then
		# redis-cli returned nothing: do not claim replication is healthy.
		echo "replication info unavailable, please check redis-cli"
	elif [ -s "$log_file" ]; then
		# TODO: send an alert mail; a replica is down or lagging.
		echo "repl_offset_alarm!!! please check slave machine "
	else
		echo "replication normal"
	fi
}

check_replication_offsets
printSeparator 50 "repl_offset_alarm_check_end"