Another little script I wrote to check capacity aspects of an AIX LPAR. I call them capacity checks because most of the checks are based on cumulative counters, scaled to a 90-day reference period. Some of this is based on Earl Jew’s excellent vmstat presentation to the IBM POWER VUG.
The script checks memory and I/O buffer overflow counters as well as LPAR SRAD spreading.
#!/bin/ksh93
# Performance recommendation tool
#
# Copyright Henrik Morsing, 2022
#
# Initial version 1.0
# 09-11-2022 Henrik Morsing 1.1 Added more informative output
# and corrected when to alert (6 digits, not 5)
# Set a reference to the number of days the system has been up.
# ref stays empty when the uptime output contains no "days" field,
# because the grep then matches nothing.
ref="$(uptime | grep days | awk '{ print $3 }')"
#
# No day count at all: exit. Fewer than twenty days: warn but carry on.
# ('elif' plus 'then' is required here; 'elseif' is not a ksh keyword and
# would simply be run as an unknown command, so the warning never fired.)
if [[ -z "${ref}" ]]
then
    echo "System uptime too low."
    exit 1
elif [[ "${ref}" -lt 20 ]]
then
    echo "System uptime too low to give accurate results."
fi
echo
echo "Starting System Performance Analyser v1.0"
echo
echo "System Name: $(uname -n) - System Uptime Days: ${ref}"
echo
echo "Please bear in mind, as stats used are accumulated over time,"
echo "they can be a view of the past and issues may already have been rectified."
echo
echo
#####################
# MEMORY
#####################
echo "\t *** MEMORY CHECKS ***"
echo
echo "Add more memory to rectify these"
echo
# Read three counters from 'vmstat -s': paging space page outs, revolutions
# of the clock hand and free frame waits. The values are assigned
# positionally, so this relies on vmstat listing them in that order.
vmstat -s | grep -E 'paging space page outs|revolutions of the clock hand|free frame waits' | awk '{ print $1 } ' | tr '\n' ' ' | read page_outs revolutions frame_waits
# Scale each counter to a 90-day reference. Multiply before dividing:
# integer division first would truncate, and would yield 0 whenever the
# counter is smaller than the uptime in days.
page_outs_90=$(( ${page_outs}*90/${ref} ))
revolutions_90=$(( ${revolutions}*90/${ref} ))
frame_waits_90=$(( ${frame_waits}*90/${ref} ))
# Thresholds for page outs and frame waits are expressed as the number of
# digits in the 90-day figure.
page_outs_digits=${#page_outs_90}
frame_waits_digits=${#frame_waits_90}
# Check on numbers and warn as needed, most severe level first.
# 'revolutions' is compared unscaled against multiples of the uptime in days.
if [[ ${page_outs_digits} -gt 7 || ${revolutions} -gt $(( ${ref}*100 )) || ${frame_waits_digits} -gt 6 ]]
then
    echo "You are extremely memory constrained:"
    [[ ${page_outs_digits} -gt 7 ]] && echo "- \033[1;31m'paging space page outs' is extremely high:\033[m ${page_outs} -> ${page_outs_90} per 90 days (${page_outs_digits} digits)"
    [[ ${revolutions} -gt $(( ${ref}*100 )) ]] && echo "- \033[1;31m'revolutions of the clock hand' is extremely high:\033[m ${revolutions} -> ${revolutions_90} per 90 days"
    [[ ${frame_waits_digits} -gt 6 ]] && echo "- \033[1;31m'free frame waits' is extremely high:\033[m ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"
elif [[ ${page_outs_digits} -gt 6 || ${revolutions} -gt $(( ${ref}*10 )) || ${frame_waits_digits} -gt 5 ]]
then
    echo "You are very memory constrained:"
    [[ ${page_outs_digits} -gt 6 ]] && echo "- \033[1;33m'paging space page outs' is very high:\033[m ${page_outs} -> ${page_outs_90} per 90 days (${page_outs_digits} digits)"
    [[ ${revolutions} -gt $(( ${ref}*10 )) ]] && echo "- \033[1;33m'revolutions of the clock hand' is very high:\033[m ${revolutions} -> ${revolutions_90} per 90 days"
    [[ ${frame_waits_digits} -gt 5 ]] && echo "- \033[1;33m'free frame waits' is very high:\033[m ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"
elif [[ ${page_outs_digits} -gt 5 || ${revolutions} -gt ${ref} || ${frame_waits_digits} -gt 4 ]]
then
    echo "You could benefit from adding more memory:"
    [[ ${page_outs_digits} -gt 5 ]] && echo "- 'paging space page outs' is high: ${page_outs} -> ${page_outs_90} per 90 days (${page_outs_digits} digits)"
    [[ ${revolutions} -gt ${ref} ]] && echo "- 'revolutions of the clock hand' is high: ${revolutions} -> ${revolutions_90} per 90 days"
    [[ ${frame_waits_digits} -gt 4 ]] && echo "- 'free frame waits' is high: ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"
fi
#####################
# PROCESSOR
#####################
echo
echo "\t *** PROCESSOR CHECKS ***"
echo
# Checking for LPAR SRAD spreading.
# num_srads is the number of lines 'lssrad -a' prints that do not contain
# the string "SRAD".
num_srads="$(lssrad -a | grep -v SRAD | wc -l)"
# Query lparstat once and pick the individual fields from the saved output.
lpar_info="$(lparstat -i)"
# The ".0" suffix turns the vCPU count into a floating-point literal so the
# ksh93 division below is not truncated to an integer.
vCPUs_online="$(printf '%s\n' "${lpar_info}" | grep 'Online Virtual CPUs' | awk '{ print $NF }').0"
vCPUs_max="$(printf '%s\n' "${lpar_info}" | grep "Maximum Virtual CPUs" | awk '{ print $NF }')"
# "Entitled Capacity" also matches a pool-level line, which is excluded.
Entitlement="$(printf '%s\n' "${lpar_info}" | grep "Entitled Capacity" | grep -v "Pool" | awk '{ print $NF }')"
if [[ ${num_srads} -gt "2" ]]
then
    echo "LPAR is spread across multiple SRADs (${num_srads}). If memory (2TB?) and max processor allocations (less than 15 vCPUs, currently ${vCPUs_max}) suggests it can be contained within one SRAD, powering the LPAR off and on again might align it correctly."
fi
# Spreading factor: online virtual processors per unit of entitled capacity.
# This was previously tested without ever being assigned, so the check
# always reported OK. Left at 0 when the entitlement is missing or zero to
# avoid a division error.
spreading=0
if [[ -n "${Entitlement}" ]] && (( Entitlement > 0 ))
then
    spreading=$(( ${vCPUs_online}/${Entitlement} ))
fi
echo
printf "*** Checking spreading factor ***"
# Only evaluated with more than one online vCPU; otherwise no verdict is
# printed after the heading.
if [[ ${vCPUs_online} -gt "1" ]]
then
    if [[ ${spreading} -gt 2 ]]
    then
        echo "\t[\033[1;33mWARNING\033[m]"
        echo "Number of virtual processors is high compared to entitlement."
    else
        echo "\t[\033[1;32mOK\033[m]"
    fi
fi
#####################
# I/O
#####################
# Starting from the top, VGs first
echo
echo "\t *** I/O CHECKS ***"
echo
# url records whether any I/O buffer warning was raised; it is read after
# the buffer checks to decide whether to print the reference link.
url=false
for volgroup in $(lsvg -o)
do
    # Pass the VG name as an argument rather than as part of the format string.
    printf "*** Checking %s ***" "${volgroup}"
    ##################
    # Checking pbufs #
    ##################
    # Count blocked I/Os with no pbuf
    pervg_blocked_io_count=$(/usr/sbin/lvmo -v ${volgroup} -o pervg_blocked_io_count)
    # Reference to 90 days. Multiply before dividing so integer division
    # does not truncate the result (or zero it when count < uptime days).
    pbio_90=$(( ${pervg_blocked_io_count}*90/${ref} ))
    # Find number of digits
    pbio_digits=${#pbio_90}
    # Recommendation based on number of digits
    if [[ ${pbio_digits} -gt 6 ]]
    then
        url=true
        echo "\t[\033[1;33mWARNING\033[m]"
        # Recommend the current pv_pbuf_count plus 16384 for this VG
        pbuf_curr=$(/usr/sbin/lvmo -v ${volgroup} -o pv_pbuf_count)
        pbuf_vg=$(( ${pbuf_curr}+16384 ))
        echo "Volume group ${volgroup} is extremely low on pbufs"
        # Report the blocked I/O counter itself; the current pv_pbuf_count
        # is shown only as the starting point of the recommended change.
        echo "- \033[1;31m'pending disk I/Os blocked with no pbuf' is extremely high:\033[m ${pervg_blocked_io_count} -> ${pbio_90} per 90 days (${pbio_digits} digits). Increase 'pv_pbuf_count' from ${pbuf_curr} to ${pbuf_vg}.\n"
    else
        echo "\t[\033[1;32mOK\033[m]"
    fi
done
###################
# Checking psbufs #
###################
# Read two counters from 'vmstat -v': paging space I/Os blocked with no
# psbuf, and external pager filesystem I/Os blocked with no fsbuf. They are
# assigned positionally, so this relies on vmstat printing them in that order.
vmstat -v | grep -E 'paging space I/Os blocked with no psbuf|external pager filesystem I/Os blocked with no fsbuf' | awk '{ print $1 } ' | tr '\n' ' ' | read psbuf fsbuf
# Reference to 90 days. Multiply before dividing so integer division does
# not truncate the result (or zero it when the count < uptime in days).
psio_90=$(( ${psbuf}*90/${ref} ))
# Warn once the 90-day figure has two or more digits
if [[ ${#psio_90} -gt 1 ]]
then
    url=true
    printf "[\033[1;33mWARNING\033[m] "
    echo "\033[1;31mpsbufs is above 10\033[m, indicating severe memory restriction causing excessive paging. If you cannot add memory, alleviate by adding parallel paging spaces."
fi
###################
# Checking fsbufs #
###################
echo
# Blocked external pager filesystem I/O with no fsbuf (read above)
# Reference to 90 days, same multiply-first scaling
fsio_90=$(( ${fsbuf}*90/${ref} ))
# Warn once the 90-day figure has three or more digits
if [[ ${#fsio_90} -gt 2 ]]
then
    url=true
    printf "[\033[1;33mWARNING\033[m] "
    echo "\033[1;31mfsbufs is above 100\033[m, indicating filesystem I/O over-load. Increase j2_dynamicBufferPreallocation with ioo to fix this. Start by doubling value."
    echo "Also consider splitting into smaller file systems."
fi
[[ "${url}" == "true" ]] && echo "Info on I/O buffers: https://www.ibm.com/support/pages/blocked-ios-due-buffers-shortage"
# Reset the flag so an I/O buffer warning does not also trigger the
# unrelated link printed by the checks that follow.
url=false
###################
# Fibre Adapters #
###################
# Names of all adapters whose lsdev line contains "fcs"
adapters=$(lsdev -Ccadapter | grep fcs | awk '{ print $1 }')
# Check No Command Resource Count (Update num_cmd_elems)
for adapter in ${adapters}
do
    ncrc=$(fcstat -D ${adapter} | grep "No Command Resource Count" | awk '{ print $NF }')
    # Reference to 90 days. Multiply before dividing so integer division
    # does not truncate the result (or zero it when count < uptime days).
    ncrc_90=$(( ${ncrc}*90/${ref} ))
    # Not sure how many is bad, let's start with 6 digits.
    # More than 6 digits is "extremely" high, exactly 6 is "very" high.
    severity=""
    if [[ ${#ncrc_90} -gt 6 ]]
    then
        severity="extremely"
    elif [[ ${#ncrc_90} -gt 5 ]]
    then
        severity="very"
    fi
    # Both levels share the same message apart from the severity word
    if [[ -n "${severity}" ]]
    then
        url=true
        printf "[\033[1;33mWARNING\033[m] "
        echo "- \033[1;31mNo Command Resource Count for adapter ${adapter} is ${severity} high:\033[m ${ncrc} -> ${ncrc_90} per 90 days (${#ncrc_90} digits)"
        echo "Increase num_cmd_elems on ${adapter} to fix, but not higher than num_cmd_elems on the VIO physical adapter."
    fi
done
[[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
url=false
echo
# Check High water mark of active/pending commands (Update num_cmd_elems)
for adapter in ${adapters}
do
    # Run fcstat once per adapter and keep only the paragraph(s) containing
    # "FC SCSI Adapter Driver Queue" (grep -p is paragraph mode).
    queue_stats="$(fcstat -D ${adapter} | grep -p "FC SCSI Adapter Driver Queue")"
    hwmac=$(printf '%s\n' "${queue_stats}" | grep "High water mark of active commands" | awk '{ print $NF }')
    hwmpc=$(printf '%s\n' "${queue_stats}" | grep "High water mark of pending commands" | awk '{ print $NF }')
    # The two high water marks are summed and compared as-is; they are not
    # scaled to the 90-day reference.
    hwm_summ=$(( ${hwmac} + ${hwmpc} ))
    # We need the current num_cmd_elems setting of the adapter being checked
    # (previously always read from fcs0, whichever adapter was in the loop).
    nce=$(lsattr -El ${adapter} -a num_cmd_elems -F value)
    if [[ ${hwm_summ} -gt ${nce} ]]
    then
        url=true
        printf "[\033[1;33mWARNING\033[m] "
        echo "- \033[1;31mHigh water mark for active/pending command for adapter ${adapter} is higher than num_cmd_elems:\033[m ${hwm_summ} vs. ${nce}"
        echo "Increase num_cmd_elems on ${adapter} to fix, but not higher than num_cmd_elems on the VIO physical adapter."
    fi
done
# Link to helpful web page.
echo
[[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
url=false
echo
# Check No DMA Resource Count (Update max_xfer_size)
for adapter in ${adapters}
do
    nodma=$(fcstat -D ${adapter} | grep "No DMA Resource Count" | awk '{ print $NF }')
    # Reference to 90 days. Multiply before dividing so integer division
    # does not truncate the result (or zero it when count < uptime days).
    nodma_90=$(( ${nodma}*90/${ref} ))
    # Warn once the 90-day figure has more than three digits
    if [[ ${#nodma_90} -gt 3 ]]
    then
        url=true
        printf "[\033[1;33mWARNING\033[m] "
        echo "- \033[1;31mNo DMA Resource Count for adapter ${adapter} is higher than 3 digits per 90 days:\033[m ${nodma_90}"
        echo "Increase max_xfer_size on ${adapter} to fix, but not higher than max_xfer_size on the VIO physical adapter."
    fi
done
# Link to helpful web page.
echo
[[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
url=false
echo
exit 0