Capacity and performance check script

Another little script I wrote to check capacity aspects of an AIX LPAR. I call them capacity checks because most of them are based on cumulative counters, averaged out over 90 days. Some of this is based on Earl Jew’s excellent vmstat presentation to the IBM POWER VUG.

The script checks memory and I/O buffer overflow counters as well as LPAR SRAD spreading.

#!/bin/ksh93

# Performance recommendation tool
#
# Copyright Henrik Morsing, 2022
#
# Initial version 1.0
# 09-11-2022    Henrik Morsing  1.1     Added more informative output
#                                       and corrected when to alert (6 digits, not 5)

# Establish the reference period: the number of days the system has been up.
# Every cumulative counter checked further down is scaled against this figure.
# The awk pattern only matches when the uptime output contains "days", so
# ${ref} stays empty when that word is absent.

ref="$(uptime | awk '/days/ { print $3 }')"

# Without a usable day count the averages cannot be computed, so exit.
# A non-numeric or zero value is rejected too, because ${ref} is used as a
# divisor by the later checks. Below twenty days we only warn and carry on.

if [[ -z "${ref}" || "${ref}" != +([0-9]) || "${ref}" -eq 0 ]]
then
   echo "System uptime too low."
   exit 1
elif [[ "${ref}" -lt 20 ]]
then
   echo "System uptime too low to give accurate results."
fi

# Banner. The version printed here follows the change log in the file header.

echo
echo "Starting System Performance Analyser v1.1"
echo
echo "System Name: $(uname -n) - System Uptime Days: ${ref}"
echo
echo "Please bear in mind, as stats used are accumulated over time,"
echo "they can be a view of the past and issues may already have been rectified."
echo
echo

#####################
# MEMORY
#####################

# Memory pressure checks.
#
# Reads three cumulative counters from 'vmstat -s', scales them to a 90 day
# period using ${ref} (days of uptime) and grades the result in three
# severity bands. Page outs and free frame waits are graded on the number of
# digits in their 90 day figure; clock hand revolutions are graded on
# revolutions per day of uptime (more than 1, 10 or 100).

echo "\t *** MEMORY CHECKS ***"
echo
echo "Add more memory to rectify these"
echo

# Read paging space page outs, revolutions of the clock hand, free frame waits.
# The values are assigned in the order the lines appear in the vmstat output.
# The trailing 'read' has to run in the current shell for the variables to
# survive the pipeline; ksh runs the last pipeline stage in the current shell.

vmstat -s | grep -E 'paging space page outs|revolutions of the clock hand|free frame waits' | awk '{ print $1 }' | tr '\n' ' ' | read page_outs revolutions frame_waits

# A counter that could not be read is treated as zero so that the arithmetic
# below does not fail on an empty operand.
page_outs=${page_outs:-0}
revolutions=${revolutions:-0}
frame_waits=${frame_waits:-0}

# Convert to the 90 day reference. Multiply before dividing: integer division
# by ${ref} first would throw away the remainder and under-report the rate.
page_outs_90=$(( ${page_outs} * 90 / ${ref} ))
revolutions_90=$(( ${revolutions} * 90 / ${ref} ))
frame_waits_90=$(( ${frame_waits} * 90 / ${ref} ))

# Number of digits in the 90 day figures
page_outs_digits=${#page_outs_90}
frame_waits_digits=${#frame_waits_90}

# Check on numbers and warn as needed. Only the most severe band that matches
# is reported; inside a band, each counter that qualifies gets its own line.

if [[ ${page_outs_digits} -gt 7 || ${revolutions} -gt $(( ${ref}*100 )) || ${frame_waits_digits} -gt 6 ]]
then
   echo "You are extremely memory constrained:"
   [[ ${page_outs_digits} -gt 7 ]] && echo "- \033[1;31m'paging space page outs' is extremely high:\033[m ${page_outs} -> ${page_outs_90} per 90 days (${page_outs_digits} digits)"
   [[ ${revolutions} -gt $(( ${ref}*100 )) ]] && echo "- \033[1;31m'revolutions of the clock hand' is extremely high:\033[m ${revolutions} -> ${revolutions_90} per 90 days"
   [[ ${frame_waits_digits} -gt 6 ]] && echo "- \033[1;31m'free frame waits' is extremely high:\033[m ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"

elif [[ ${page_outs_digits} -gt 6 || ${revolutions} -gt $(( ${ref}*10 )) || ${frame_waits_digits} -gt 5 ]]
then
   echo "You are very memory constrained:"
   [[ ${page_outs_digits} -gt 6 ]] && echo "- \033[1;33m'paging space page outs' is very high:\033[m ${page_outs} -> ${page_outs_90} per 90 days (${page_outs_digits} digits)"
   [[ ${revolutions} -gt $(( ${ref}*10 )) ]] && echo "- \033[1;33m'revolutions of the clock hand' is very high:\033[m ${revolutions} -> ${revolutions_90} per 90 days"
   [[ ${frame_waits_digits} -gt 5 ]] && echo "- \033[1;33m'free frame waits' is very high:\033[m ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"

elif [[ ${page_outs_digits} -gt 5 || ${revolutions} -gt ${ref} || ${frame_waits_digits} -gt 4 ]]
then
   echo "You could benefit from adding more memory:"
   [[ ${page_outs_digits} -gt 5 ]] && echo "- 'paging space page outs' is high: ${page_outs} -> ${page_outs_90} per 90 days  (${page_outs_digits} digits)"
   [[ ${revolutions} -gt ${ref} ]] && echo "- 'revolutions of the clock hand' is high: ${revolutions} -> ${revolutions_90} per 90 days"
   [[ ${frame_waits_digits} -gt 4 ]] && echo "- 'free frame waits' is high: ${frame_waits} -> ${frame_waits_90} per 90 days (${frame_waits_digits} digits)"
fi


#####################
# PROCESSOR
#####################

echo
echo "\t *** PROCESSOR CHECKS ***"
echo

# Processor placement checks.
#
# 1. Counts the SRADs reported by 'lssrad -a' and advises when the LPAR spans
#    more than two of them.
# 2. Compares the number of online virtual CPUs with the entitled capacity
#    (the "spreading factor") and warns when the ratio exceeds 2.

# lparstat -i is run once and parsed three times, rather than once per value.
lpar_info="$(lparstat -i)"

# Every lssrad line that does not contain "SRAD" is counted as one SRAD.
num_srads="$(lssrad -a | grep -v SRAD | wc -l)"

# ".0" is appended so that ksh93 evaluates the division below in floating
# point instead of truncating it to an integer.
vCPUs_online="$(printf '%s\n' "${lpar_info}" | grep 'Online Virtual CPUs' | awk '{ print $NF }').0"
vCPUs_max="$(printf '%s\n' "${lpar_info}" | grep "Maximum Virtual CPUs" | awk '{ print $NF }')"
# The "Pool" line also contains "Entitled Capacity", so it is filtered out.
Entitlement="$(printf '%s\n' "${lpar_info}" | grep "Entitled Capacity" | grep -v "Pool" | awk '{ print $NF }')"

if [[ ${num_srads} -gt 2 ]]
then
   echo "LPAR is spread across multiple SRADs (${num_srads}). If memory (2TB?) and max processor allocations (less than 15 vCPUs, currently ${vCPUs_max}) suggests it can be contained within one SRAD, powering the LPAR off and on again might align it correctly."
fi

echo
printf "*** Checking spreading factor ***"

# Only meaningful with more than one virtual CPU online.
if (( ${vCPUs_online} > 1 ))
then
   # Spreading factor = online virtual CPUs per unit of entitled capacity.
   # NOTE(review): the original tested ${spreading} without ever assigning it,
   # so the check could never warn. This ratio is reconstructed from the
   # warning text and the otherwise unused Entitlement value - confirm.
   # When the entitlement is missing or zero the factor stays 0 (no warning),
   # which also avoids a division by zero.
   spreading=0
   if [[ -n "${Entitlement}" ]] && (( ${Entitlement} > 0 ))
   then
      spreading=$(( ${vCPUs_online} / ${Entitlement} ))
   fi

   if (( ${spreading} > 2 ))
   then
      echo "\t[\033[1;33mWARNING\033[m]"
      echo "Number of virtual processors is high compared to entitlement."
   else
      echo "\t[\033[1;32mOK\033[m]"
   fi
fi


#####################
# I/O
#####################

# Starting from the top, VGs first

echo
echo "\t *** I/O CHECKS ***"
echo

# Per volume group pbuf check.
#
# For every varied-on volume group (lsvg -o) the blocked I/O counter is
# scaled to 90 days using ${ref} (days of uptime). More than six digits
# triggers a warning with a suggested pv_pbuf_count, and sets ${url} so that
# the documentation link is printed later in the script.

for volgroup in $(lsvg -o)
do

   # The volume group name is passed as data, not as part of the format string.
   printf "*** Checking %s ***" "${volgroup}"

   ##################
   # Checking pbufs #
   ##################

   # Count blocked I/Os with no pbuf.
   # NOTE(review): only the last field of the lvmo output is kept, so this
   # works whether lvmo prints a bare value or "name = value" - confirm the
   # actual output format on the target AIX level.
   pervg_blocked_io_count=$(/usr/sbin/lvmo -v "${volgroup}" -o pervg_blocked_io_count | awk '{ print $NF }')
   pervg_blocked_io_count=${pervg_blocked_io_count:-0}

   # Reference to 90 days. Multiply first so that integer division does not
   # discard the remainder.
   pbio_90=$(( ${pervg_blocked_io_count} * 90 / ${ref} ))

   # Find number of digits
   pbio_digits=${#pbio_90}

   # Recommendation based on number of digits
   if [[ ${pbio_digits} -gt 6 ]]
   then
      url=true
      echo "\t[\033[1;33mWARNING\033[m]"

      # Calculate recommended pv_pbuf_count for VG: current value plus 16384
      pbuf_curr=$(/usr/sbin/lvmo -v "${volgroup}" -o pv_pbuf_count | awk '{ print $NF }')
      pbuf_curr=${pbuf_curr:-0}
      pbuf_vg=$(( ${pbuf_curr}+16384 ))

      echo "Volume group ${volgroup} is extremely low on pbufs"
      # Report the blocked I/O counter itself; the current pv_pbuf_count is
      # shown only as the starting point of the recommendation.
      echo "- \033[1;31m'pending disk I/Os blocked with no pbuf' is extremely high:\033[m ${pervg_blocked_io_count} -> ${pbio_90} per 90 days (${pbio_digits} digits). Increase 'pv_pbuf_count' from ${pbuf_curr} to ${pbuf_vg}.\n"
   else
      echo "\t[\033[1;32mOK\033[m]"
   fi
done


   ###################
   # Checking psbufs #
   ###################

   # These two checks are system-wide: they run once, after the volume group
   # loop has finished, despite the indentation.

   # Count blocked paging space I/O with no psbuf and blocked external pager
   # filesystem I/O with no fsbuf. The values are assigned in the order the
   # lines appear in the 'vmstat -v' output. 'read' is the last pipeline
   # stage, which ksh runs in the current shell, so the variables persist.

   vmstat -v | grep -E 'paging space I/Os blocked with no psbuf|external pager filesystem I/Os blocked with no fsbuf' | awk '{ print $1 }' | tr '\n' ' ' | read psbuf fsbuf

   # Treat an unreadable counter as zero so the arithmetic cannot fail.
   psbuf=${psbuf:-0}
   fsbuf=${fsbuf:-0}

   # Reference to 90 days. Multiply first: dividing by ${ref} first truncates
   # any counter smaller than the uptime in days to zero.
   psio_90=$(( ${psbuf} * 90 / ${ref} ))

   # Any psbufs blocked is bad; two or more digits raises the warning
   if [[ ${#psio_90} -gt 1 ]]
   then
      url=true
      printf "[\033[1;33mWARNING\033[m] "
      echo "\033[1;31mpsbufs is above 10\033[m, indicating severe memory restriction causing excessive paging. If you cannot add memory, alleviate by adding parallel paging spaces."
   fi


   ###################
   # Checking fsbufs #
   ###################
   echo

   # Reference to 90 days
   fsio_90=$(( ${fsbuf} * 90 / ${ref} ))

   # Any fsbufs blocked is bad; three or more digits raises the warning
   if [[ ${#fsio_90} -gt 2 ]]
   then
      url=true
      printf "[\033[1;33mWARNING\033[m] "
      echo "\033[1;31mfsbufs is above 100\033[m, indicating filesystem I/O over-load. Increase j2_dynamicBufferPreallocation with ioo to fix this. Start by doubling value."
      echo "Also consider splitting into smaller file systems."
   fi

   # ${url} may also have been set by the volume group pbuf check.
   [[ "${url}" == "true" ]] && echo "Info on I/O buffers: https://www.ibm.com/support/pages/blocked-ios-due-buffers-shortage"

   # Clear the flag so that later sections do not print their own link because
   # of a warning raised here.
   url=false

   ###################
   # Fibre Adapters  #
   ###################

# Names of all fcs adapters listed by lsdev; used by every adapter loop below.
adapters=$(lsdev -Ccadapter | grep fcs | awk '{ print $1 }')

# Check No Command Resource Count (Update num_cmd_elems)
#
# The counter is scaled to 90 days using ${ref} (days of uptime) and graded on
# its number of digits: more than 6 is "extremely" high, more than 5 is "very"
# high.

   # Start from a clean flag so that a warning from an earlier section does
   # not cause the fcs link to be printed.
   url=false

   for adapter in ${adapters}
   do
      ncrc=$(fcstat -D "${adapter}" | grep "No Command Resource Count" | awk '{ print $NF }')
      ncrc=${ncrc:-0}

      # Reference to 90 days. Multiply first so that integer division does
      # not discard the remainder.
      ncrc_90=$(( ${ncrc} * 90 / ${ref} ))

      # Not sure how many is bad, let's start with 6 digits

      severity=""
      if [[ ${#ncrc_90} -gt 6 ]]
      then
         severity="extremely"
      elif [[ ${#ncrc_90} -gt 5 ]]
      then
         severity="very"
      fi

      # Both bands print the same two lines, differing only in the adverb.
      if [[ -n "${severity}" ]]
      then
         url=true
         printf "[\033[1;33mWARNING\033[m] "
         echo "- \033[1;31mNo Command Resource Count for adapter ${adapter} is ${severity} high:\033[m ${ncrc} -> ${ncrc_90} per 90 days (${#ncrc_90} digits)"
         echo "Increase num_cmd_elems on ${adapter} to fix, but not higher than num_cmd_elems on the VIO physical adapter."
      fi
   done

   [[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
   url=false

   echo


# Check High water mark of active/pending commands (Update num_cmd_elems)
#
# For each adapter in ${adapters}, the sum of the two high water marks is
# compared with that adapter's own num_cmd_elems setting. High water marks
# are not scaled to 90 days: the raw values are what is compared.

   for adapter in ${adapters}
   do
      # Run fcstat once per adapter and keep only the paragraph (grep -p)
      # that holds the driver queue statistics; both marks are read from it.
      queue_stats=$(fcstat -D "${adapter}" | grep -p "FC SCSI Adapter Driver Queue")

      # The double space in the first pattern is deliberate and matches the
      # string the original script searched for.
      hwmac=$(printf '%s\n' "${queue_stats}" | grep "High water mark  of active commands" | awk '{ print $NF }')
      hwmpc=$(printf '%s\n' "${queue_stats}" | grep "High water mark of pending commands" | awk '{ print $NF }')

      # A missing value counts as zero so the sum cannot fail.
      hwm_summ=$(( ${hwmac:-0} + ${hwmpc:-0} ))

      # We need the current num_cmd_elems setting of THIS adapter (the
      # original always queried fcs0, whichever adapter was being checked).

      nce=$(lsattr -El "${adapter}" -a num_cmd_elems -F value)

      # Skip the comparison when the attribute could not be read.
      if [[ -n "${nce}" && ${hwm_summ} -gt ${nce} ]]
      then
         url=true
         printf "[\033[1;33mWARNING\033[m] "
         echo "- \033[1;31mHigh water mark for active/pending command for adapter ${adapter} is higher than num_cmd_elems:\033[m ${hwm_summ} vs. ${nce}"
         echo "Increase num_cmd_elems on ${adapter} to fix, but not higher than num_cmd_elems on the VIO physical adapter."
      fi
   done

   # Link to helpful web page.
   echo
   [[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
   url=false
   echo


# Check No DMA Resource Count (Update max_xfer_size)
#
# For each adapter in ${adapters}, the counter is scaled to 90 days using
# ${ref} (days of uptime). More than three digits raises a warning.

   for adapter in ${adapters}
   do
      nodma=$(fcstat -D "${adapter}" | grep "No DMA Resource Count" | awk '{ print $NF }')
      # A counter that could not be read is treated as zero.
      nodma=${nodma:-0}

      # Reference to 90 days. Multiply first: dividing by ${ref} first
      # truncates and can drop the result below the warning threshold.
      nodma_90=$(( ${nodma} * 90 / ${ref} ))

      if [[ ${#nodma_90} -gt 3 ]]
      then
         url=true
         printf "[\033[1;33mWARNING\033[m] "
         echo "- \033[1;31mNo DMA Resource Count for adapter ${adapter} is higher than 3 digits per 90 days:\033[m ${nodma_90}"
         echo "Increase max_xfer_size on ${adapter} to fix, but not higher than max_xfer_size on the VIO physical adapter."
      fi
   done

   # Link to helpful web page.
   echo
   [[ "${url}" == "true" ]] && echo "Info on fcs buffers: https://www.ibm.com/support/pages/no-command-resource-count-and-high-water-mark-active-and-pending-commands"
   url=false

echo
exit 0
This entry was posted in AIX, IBM POWER, Performance tuning, Scripting. Bookmark the permalink.

Leave a Reply