#!/bin/bash
# Purpose:
#   - restart one squid process when the available system memory gets very low
#   - check the health of every squid instance
MAIL="[email protected]" # notification recipient; set your own address
MAIL_FROM="[email protected]"
CHECK_INTERVAL="3s" # delay between two checks
THRESHOLD_LOW_MEM="90" # MB, low-memory threshold; must be larger than vm.min_free_kbytes
THRESHOLD_CHILD_MAX_MEM="200" # MB, a child holding more memory than this is restarted
THRESHOLD_EMERG="100" # MB, child restart threshold while LEVEL="emerg"
LEVEL="normal" # memory level: "emerg" while available memory is low, "normal" otherwise
LOG="$0.log"
CONFIG_LOCATION="/etc/squid/"

#######################################
# Print the names of the files in a directory that match a glob pattern.
# Arguments:
#   $1 - directory to look in
#   $2 - glob pattern, e.g. 'squid7[0-9].conf'
# Outputs:
#   Matching file names on one line, separated by spaces; an empty line
#   when the directory is not accessible or nothing matches.
#######################################
list_conf_files() {
    (
        cd "$1" 2>/dev/null || exit 0
        # Without nullglob an unmatched pattern would be printed literally
        # and later be mistaken for an instance name.
        shopt -s nullglob
        # $2 is intentionally unquoted so that the shell expands the pattern.
        echo $2
    )
}

# Discover the configuration files: ports 70-79 are parents, 80-87 are
# children (the child pattern itself matches 80-89).
PARENT_CONF=$(list_conf_files "$CONFIG_LOCATION" 'squid7[0-9].conf')
CHILD_CONF=$(list_conf_files "$CONFIG_LOCATION" 'squid8[0-9].conf')
ALL_CONF="$PARENT_CONF $CHILD_CONF"

# Keep the OOM killer away from this watchdog. oom_score_adj is the current
# interface (-1000 disables OOM kills); oom_adj (-17) is the deprecated one
# and is only tried as a fallback.
if ! { echo "-1000" > /proc/self/oom_score_adj; } 2>/dev/null &&
    ! { echo "-17" > /proc/self/oom_adj; } 2>/dev/null; then
    echo "$0: warning: unable to disable the OOM killer for this process" >&2
fi
#######################################
# Print the amount of memory that is free or easily reclaimable, in MB.
# The value is MemFree + Buffers + Cached taken from the kernel's meminfo
# table. Parsing `free -m` by column position is avoided because the column
# layout differs between procps versions.
# Arguments:
#   $1 - optional path of the meminfo file (default: /proc/meminfo)
# Outputs:
#   One integer (MB) on stdout.
# Returns:
#   1 (with no output) when the meminfo file cannot be read.
#######################################
all_free_mem() {
    local meminfo="${1:-/proc/meminfo}"
    [[ -r $meminfo ]] || return 1
    # meminfo values are in kB; exact field-name matches keep SwapCached out.
    awk '
        $1 == "MemFree:" || $1 == "Buffers:" || $1 == "Cached:" { kb += $2 }
        END { printf "%d\n", kb / 1024 }
    ' "$meminfo"
}
#######################################
# Measure the resident memory of the squid instance selected by $CONF.
# Globals:
#   CONF (read)  - configuration file name, e.g. squid80.conf
#   LOG  (read)  - diagnostics are appended to this file
#   INSTANCE, PID_FILE, PID_FROM_FILE, RSS (written)
# Outputs:
#   RSS is set to the VmRSS of the instance in MB. It is set to 0 when the
#   pid file is missing or does not point to a squid process, so callers can
#   always compare it numerically.
# Returns:
#   1 when the pid file is missing or does not point to squid, 0 otherwise.
#######################################
squid_instance_mem() {
    INSTANCE=${CONF%.conf}
    PID_FILE="/var/run/$INSTANCE.pid"
    PID_FROM_FILE=""
    RSS=0

    if [[ ! -e $PID_FILE ]]; then
        echo "$INSTANCE pid file not found: $PID_FILE" >> "$LOG"
        return 1
    fi
    PID_FROM_FILE=$(cat "$PID_FILE" 2>/dev/null)

    # The recorded pid must be numeric and must really belong to squid;
    # otherwise the memory of an unrelated process would be measured.
    if [[ ! $PID_FROM_FILE =~ ^[0-9]+$ ]] ||
        ! grep -q squid "/proc/$PID_FROM_FILE/comm" 2>/dev/null; then
        echo "$INSTANCE pid file not match to squid" >> "$LOG"
        return 1
    fi

    # VmRSS is reported in kB; convert to whole MB.
    RSS=$(awk '/^VmRSS:/ {printf "%d\n", $2/1024}' "/proc/$PID_FROM_FILE/status" 2>/dev/null)
    RSS=${RSS:-0}
}
#######################################
# Check every configured squid instance and repair what can be repaired:
# start instances that are not running and rewrite missing or stale pid files.
# Globals:
#   ALL_CONF (read) - space separated list of configuration file names
#   LOG      (read) - events are appended to this file
#   CONF, INSTANCE, PID_FILE, STATE_FILE, PID_FROM_FILE, PID_RUNNING
#            (written; INSTANCE and process_state are read by notify_admin)
#   process_state (written, unset again before returning)
#######################################
keep_processes_health() {
    local pid running_pids
    for CONF in $ALL_CONF; do
        INSTANCE=${CONF%.conf}
        PID_FILE="/var/run/$INSTANCE.pid"
        STATE_FILE="/var/lib/init.d/started/$INSTANCE"

        # First word of the pid file, or empty when it is missing/unreadable.
        PID_FROM_FILE=""
        [[ -r $PID_FILE ]] && read -r PID_FROM_FILE _ < "$PID_FILE"

        # Processes owned by user squid whose command line names this config.
        # CONF is matched as a fixed string, not spliced in as a regex.
        running_pids=$(ps axo user,pid,cmd |
            awk -v conf="$CONF" '/^squid/ && index($0, conf) {print $2}')

        # Several processes may match. Prefer the one recorded in the pid
        # file, otherwise fall back to the first match, so that PID_RUNNING
        # is always a single word.
        PID_RUNNING=""
        for pid in $running_pids; do
            [[ -z $PID_RUNNING ]] && PID_RUNNING=$pid
            if [[ $pid == "$PID_FROM_FILE" ]]; then
                PID_RUNNING=$pid
                break
            fi
        done

        if [[ -z $PID_RUNNING ]]; then
            # No process: clean up leftovers and start the instance.
            process_state="not_running"
            echo "$(date +%F\ %T) $INSTANCE state is $process_state, restarted" >> "$LOG"
            /bin/rm -f "$PID_FILE" "$STATE_FILE"
            /etc/init.d/squid_multi_instance start "${INSTANCE#squid}" >/dev/null 2>&1
            notify_admin &
        elif [[ -z $PID_FROM_FILE ]]; then
            # Process is running but the pid file is missing or empty: rewrite it.
            process_state="bad_pid_file"
            if echo "$PID_RUNNING" > "$PID_FILE"; then
                process_state="good"
                echo "fixed pid file of $INSTANCE at $(date +%F\ %T)" >> "$LOG"
            else
                echo "can not write $PID_FILE" >> "$LOG"
                process_state="pid_file_not_writeable"
                notify_admin &
            fi
        elif [[ $PID_FROM_FILE != "$PID_RUNNING" ]]; then
            # The pid file names a different process: rewrite it and report.
            process_state="pid_not_equal"
            if echo "$PID_RUNNING" > "$PID_FILE"; then
                echo "fixed $process_state of $INSTANCE at $(date +%F\ %T)" >> "$LOG"
            else
                echo "can not write $PID_FILE" >> "$LOG"
                process_state="$process_state pid_file_not_writeable"
            fi
            notify_admin &
        else
            # Pid file and running process agree.
            process_state="good"
        fi
    done
    unset process_state
}
# Restart the squid instance selected by the global $CONF through the
# squid_multi_instance init script, then mail the administrator from a
# background job. INSTANCE is set globally because notify_admin reads it.
restart_process() {
    INSTANCE="${CONF%.conf}"
    # The init script receives the instance name without its "squid" prefix.
    local instance_id="${INSTANCE#squid}"
    /etc/init.d/squid_multi_instance restart $instance_id &>/dev/null
    notify_admin &
}
#######################################
# Restart the parent instance that currently holds the most memory.
# Globals:
#   PARENT_CONF (read) - space separated list of parent configuration files
#   LOG         (read) - the outcome is appended to this file
#   CONF (written)     - left pointing at the restarted instance
#   RSS  (read)        - set by squid_instance_mem for each candidate
#######################################
pick_and_restart_parent() {
    # The candidate list is rebuilt on every call so that readings from
    # earlier calls can never be picked.
    local candidates="" biggest=""
    for CONF in $PARENT_CONF; do
        squid_instance_mem
        # Skip instances whose memory could not be measured.
        [[ $RSS =~ ^[0-9]+$ ]] && ((RSS > 0)) || continue
        candidates+="$RSS $CONF"$'\n'
    done

    # Each line is "<rss> <conf>"; the numerically largest one sorts last.
    biggest=$(printf '%s' "$candidates" | sort -n | tail -n 1)
    if [[ -n $biggest ]]; then
        CONF=${biggest#* }
        restart_process
        echo "restarted parent ${CONF%.conf} at $(date +%F\ %T)" >> "$LOG"
    else
        echo "unknow error in pick_and_restart_parent" >> "$LOG"
    fi
}
#######################################
# Restart child instances that hold too much memory.
# Every child above THRESHOLD_CHILD_MAX_MEM is restarted; while LEVEL is
# "emerg" a second pass also restarts every child above THRESHOLD_EMERG.
# Globals:
#   CHILD_CONF, THRESHOLD_CHILD_MAX_MEM, THRESHOLD_EMERG, LEVEL, LOG (read)
#   CONF (written), RSS (read; set by squid_instance_mem)
#   process_state (written for notify_admin, unset again before returning)
#######################################
pick_and_restart_child() {
    local rss
    for CONF in $CHILD_CONF; do
        squid_instance_mem
        # An unmeasurable instance counts as 0 MB instead of breaking the test.
        rss=0
        [[ $RSS =~ ^[0-9]+$ ]] && rss=$RSS
        if ((rss > THRESHOLD_CHILD_MAX_MEM)); then
            process_state="over_THRESHOLD_CHILD_MAX_MEM"
            echo "$process_state restarted child ${CONF%.conf} at $(date +%F\ %T)" >> "$LOG"
            restart_process
        fi
    done

    if [[ $LEVEL == "emerg" ]]; then
        # Memory is re-read here, so children restarted above are seen with
        # their new size.
        for CONF in $CHILD_CONF; do
            squid_instance_mem
            rss=0
            [[ $RSS =~ ^[0-9]+$ ]] && rss=$RSS
            if ((rss > THRESHOLD_EMERG)); then
                process_state="over_THRESHOLD_EMERG"
                echo "$process_state restarted child ${CONF%.conf} at $(date +%F\ %T)" >> "$LOG"
                restart_process
            fi
        done
    fi
    # Do not let this state show up in notifications sent for other reasons.
    unset process_state
}
# Escalating reaction to low memory:
#   1. restart oversized children,
#   2. if memory is still below THRESHOLD_LOW_MEM, restart a parent,
#   3. if memory is still below THRESHOLD_LOW_MEM, call restart_process with
#      CONF="child" (per the original note this restarts all children;
#      how the init script treats "child" is not visible here).
# Returns 1 when step 3 was not needed, otherwise the status of step 3.
pick_and_restart_one() {
    pick_and_restart_child

    if [[ $(all_free_mem) -lt $THRESHOLD_LOW_MEM ]]; then
        pick_and_restart_parent
    fi

    [[ $(all_free_mem) -lt $THRESHOLD_LOW_MEM ]] || return 1
    # Subshell: the caller's CONF is left untouched.
    (
        CONF="child"
        restart_process
    )
}
#######################################
# Mail the administrator about the instance that was just handled.
# Globals:
#   MAIL, MAIL_FROM (read)        - recipient and sender address
#   INSTANCE, process_state (read) - instance name and its last known state
# The message is handed to `sendmail -t`, which takes the recipients from
# the headers. The body reports the host name, the instance, its state and
# the current output of all_free_mem.
#######################################
notify_admin() {
    local host
    host=$(hostname)
    # The empty line after Subject separates the headers from the body.
    # The single "." line is kept from the original message text.
    sendmail -t -f "$MAIL_FROM" <<EOF
To: $MAIL
From: $MAIL_FROM
Subject: $INSTANCE on $host restarted

重启过了 $host 上的 $INSTANCE
进程 $INSTANCE 的最后状态为: $process_state
当前系统总剩余内存为: $(all_free_mem)
.
EOF
}
# Main watchdog loop; never terminates on its own.
while :; do
    keep_processes_health
    # Children above THRESHOLD_CHILD_MAX_MEM are restarted right away,
    # without waiting for the total free memory to run low.
    pick_and_restart_child
    if ! [ $(all_free_mem) -lt $THRESHOLD_LOW_MEM ]; then
        # Enough memory (or the reading could not be compared): idle.
        LEVEL="normal"
        sleep $CHECK_INTERVAL
        continue
    fi
    # Low memory: escalate. There is no sleep on this path, so the next
    # check starts immediately.
    LEVEL="emerg"
    echo "low memory at $(date +%F\ %T)" >> $LOG
    pick_and_restart_one
done
# vim: set sw=4 ts=4:
|