Check Storage Script

Ce script sert à surveiller l’état SMART des disques et à vérifier le RAID système (mdadm). Il permet également de surveiller un stockage externe, de repérer les disques dégradés, les températures anormales, les secteurs défectueux et les erreurs PCI/contrôleur, puis d’envoyer un rapport par mail pour une maintenance préventive.

Smartctl

L’outil smartctl fait partie du paquet smartmontools et permet de :

  • lire les informations SMART des disques
  • lancer des auto-tests
  • détecter des signes de panne imminente

L’outil smartctl fonctionne avec :

  • disques SATA
  • SAS
  • NVMe
  • disques USB (souvent partiellement)
  • RAID matériels (via des drivers spécifiques)

L’accès aux disques se fait via :

  • le driver cciss (pour les anciens serveurs)
  • smartctl -a -d cciss,0 /dev/sdX
  • hpsa (actuel)
  • smartctl -a -d cciss,0 /dev/sdX (smartctl n’a pas de type « hpsa » : avec ce pilote, on utilise aussi -d cciss,N)

Script bash

Ce script génère un rapport dans le dossier /root/ et l’envoie par mail.

#!/bin/bash

# ============================================= #
# SCRIPT: CHECK STORAGE #
# VERSION: 2.0 #
# UPDATE: 2026/01/15 #
# ============================================= #

# =========================
# CONFIGURATION
# Recipient of the report mail.
MAIL='[email protected]'
# Report file; it is deleted and rebuilt on every run.
REPORT='/root/check_storage.txt'
# Device handed to smartctl together with "-d cciss,N" to reach the storage disks.
DISK='/dev/disk/by-id/scsi-adcde12345fghij67890klmno01234pqr'
# Disks listed one by one in the "SYSTEM DISK" section of the report.
SYSTEM_DISKS=(
  '/dev/disk/by-id/ata-2.5__SATA_SSD_3MG2-P_ABCDEF0011223344'
  '/dev/disk/by-id/ata-2.5__SATA_SSD_3MG2-P_FEDCBA9988776655'
)
# File whose content is the slot number of the spare disk.
SPAREFILE='/root/sparedisk.txt'

# =========================
# DELETE OLD REPORT
# Quoted and guarded with "--" so the path is always passed to rm as a single
# operand, never word-split, globbed or read as an option.
rm -f -- "$REPORT"

# =========================
# MANAGE SPARE DISK
# SPAREFILE holds the slot number of the spare disk; that slot is left out of
# the detailed storage section of the report.
# /!\ Update sparedisk.txt after replacing a disk, and keep the DISK variable
#     in line with the disks actually installed.

# Print the slot number stored in file $1, or the fallback $2 when the file is
# missing, empty or does not start with an integer.
_read_spare_slot() {
  local file=$1 fallback=$2 slot=""

  if [[ -f "$file" ]]; then
    # -d '' makes read consume the whole file; with the default IFS, leading
    # blanks and newlines are dropped and only the first word lands in "slot".
    # read returns non-zero when it hits end-of-file, hence "|| true".
    read -r -d '' slot _ < "$file" || true
  fi

  if [[ "$slot" =~ ^-?[0-9]+$ ]]; then
    printf '%s\n' "$slot"
    return 0
  fi

  # An invalid value would break the numeric comparison done later on and
  # hide every storage disk from the report, so fall back instead.
  if [[ -f "$file" ]]; then
    printf 'warning: %s holds no valid slot number, using %s\n' \
      "$file" "$fallback" >&2
  fi
  printf '%s\n' "$fallback"
}

SPAREDISK=$(_read_spare_slot "$SPAREFILE" 5)

# =========================
# GENERAL INFORMATION
# Starts a fresh report: banner with date and host name, followed by the
# release string, the running kernel and the uptime.
{
  printf '%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '┃ DATE: %s\n' "$(date)"
  printf '┃ HOST: %s\n' "$(hostname)"
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'

  printf '\n%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '%s\n' '┣━━━━━ GENERAL INFORMATIONS'
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf 'System: %s\n' "$(cat /etc/redhat-release)"
  printf 'Kernel: %s\n' "$(uname -r)"
  # Keep what follows "up " in the uptime line, cut at the first comma.
  printf 'Uptime: %s\n' \
    "$(uptime | awk -F'up ' '{split($2, parts, ","); print parts[1]}')"
} > "$REPORT"

# =========================
# SUMMARY DISK
# One line per controller slot (0-25) that answers to smartctl: health status,
# temperature and number of grown defects.

# Read smartctl output on stdin and print the value of the first line that
# contains label $1: everything after the first ':' with surrounding blanks
# removed. Prints nothing when no line matches.
_smart_field() {
  awk -v label="$1" '
    index($0, label) && index($0, ":") {
      sub(/^[^:]*:[ \t]*/, "")
      sub(/[ \t]+$/, "")
      print
      exit
    }'
}

{
  printf '\n%s\n' '--------------------------------------------------------------'
  printf '%s\n' '| DISK - SUMMARY'
  printf '%s\n' '--------------------------------------------------------------'
  for ((slot = 0; slot <= 25; slot++)); do
    # Probe first: slots where smartctl cannot identify a drive are skipped.
    /usr/sbin/smartctl -i -d "cciss,${slot}" "$DISK" >/dev/null 2>&1 || continue

    # Query the drive once and parse the captured text, instead of running
    # "smartctl -a" again for each field.
    smart_out=$(/usr/sbin/smartctl -a -d "cciss,${slot}" "$DISK")
    health=$(_smart_field 'SMART Health Status' <<<"$smart_out")
    temp=$(_smart_field 'Current Drive Temperature' <<<"$smart_out")
    temp=${temp% C}
    defects=$(_smart_field 'Elements in grown defect list' <<<"$smart_out")

    # Attach the unit only to a real reading so a missing value shows as
    # "N/A" rather than "N/A°C".
    temp_text=${temp:+${temp}°C}
    printf '| DISK %d | Health: %s | Temperature: %s | Grown Defects: %s\n' \
      "$slot" "${health:-N/A}" "${temp_text:-N/A}" "${defects:-N/A}"
  done
  printf '%s\n' '-----------------------------------------------------------------------------------------'
} >> "$REPORT"

# =========================
# RAID STATUS
# Appends the content of /proc/mdstat under its own heading.
{
  printf '\n%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '%s\n' '┣━━━━━ SYSTEM RAID STATUS'
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  cat /proc/mdstat
} >> "$REPORT"

# =========================
# SYSTEM DISKS
# Filtered SMART extract (at most 19 lines) for every entry of SYSTEM_DISKS.
SMART_FILTER="Serial Number|SMART overall-health|Power_On_Hours|Temperature_Celsius|Raw_Read_Error_Rate|Reallocated_Event_Count|Current_Pending_Sector|Offline_Uncorrectable|UDMA_CRC_Error_Count|Multi_Zone_Error_Rate|Self-test|LifeTime|offline"
disk_num=1
{
  printf '\n%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '%s\n' '┣━━━━━ SYSTEM DISK - SDC'
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  for sys_disk in "${SYSTEM_DISKS[@]}"; do
    printf '\n┣━━━ DISK %d ━━━━━━━━━━━━━━━━━━━ \n\n' "$disk_num"
    # "grep -E" replaces the obsolescent egrep wrapper, which prints a
    # warning on recent GNU grep releases.
    smart_lines=$(/usr/sbin/smartctl -a "$sys_disk" \
      | grep -E "$SMART_FILTER" \
      | head -n 19 2>&1)
    if [[ -n "$smart_lines" ]]; then
      printf '%s\n' "$smart_lines"
    else
      # Say so explicitly: a disk that returns nothing would otherwise leave
      # an empty section that is easy to overlook.
      printf 'No SMART data returned for %s\n' "$sys_disk"
    fi
    disk_num=$((disk_num + 1))
  done
} >> "$REPORT"


# =========================
# LOG
# Three extracts appended to the report: hpsa lines from /var/log/messages*,
# the count of "Corrected error received" lines in dmesg, and the lines of
# /var/log/messages that mention "disk".
{
  printf '\n%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '%s\n' '┣━━━━━ HPSSA LOG'
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  grep hpsa /var/log/messages*

  printf '\n%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '%s\n' '┣━━━━━ Number PCI error'
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  dmesg | grep -c "Corrected error received"

  printf '\n%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '%s\n' '┣━━━━━ /var/log/messages'
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  grep disk < /var/log/messages
} >> "$REPORT"

# =========================
# STORAGE DISKS
# Filtered SMART extract (at most 15 lines) for storage slots 0-5, leaving out
# the slot recorded in SPAREDISK.

# Print $1 as a decimal slot number, or -1 (which matches no slot) when $1 is
# not a plain non-negative integer once whitespace is removed.
_spare_slot_number() {
  local raw=${1//[[:space:]]/}
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so values such as "08" are not read as octal.
    printf '%d\n' "$((10#$raw))"
  else
    printf '%s\n' '-1'
  fi
}

# An empty or malformed SPAREDISK now means "no slot is excluded"; it used to
# make the numeric test fail, which dropped every disk from the report.
spare_slot=$(_spare_slot_number "${SPAREDISK:-}")
storage_filter="Serial number|SMART Health|Current Drive Temperature|Elements in grown defect|Errors Corrected|algorithm|read:|write:|verify:|Non-medium|Background short|Self-test log|LifeTime"
{
  printf '\n%s\n' '┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf '%s\n' '┣━━━━━ DISK OF STORAGE D3700 - SDA'
  printf '%s\n' '┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━'
  printf 'Spare Disk: %s\n' "${SPAREDISK:-}"
  for ((slot = 0; slot <= 5; slot++)); do
    if ((slot == spare_slot)); then
      continue
    fi
    printf '\n┣━━━ DISK %d ━━━━━━━━━━━━━━━━━━━ \n\n' "$slot"
    # "grep -E" replaces the obsolescent egrep wrapper.
    smart_lines=$(/usr/sbin/smartctl -a -d "cciss,${slot}" "$DISK" \
      | grep -E "$storage_filter" \
      | head -n 15 2>&1)
    if [[ -n "$smart_lines" ]]; then
      printf '%s\n' "$smart_lines"
    else
      # Make a slot that returns nothing visible instead of leaving a blank.
      printf 'No SMART data returned for slot %d\n' "$slot"
    fi
  done
} >> "$REPORT"

# =========================
# SEND MAIL
# Mails a one-line message to MAIL with the report file attached.
host_name=$(hostname)
printf '%s\n' "Please find the reports for the preventive maintenance of the storage ${host_name}." \
  | mail \
    -s "${host_name} : Preventive maintenance" \
    -S smtp=smtp://mail.my.company.org \
    -S from="[email protected]" \
    -a "$REPORT" \
    "$MAIL"

Documentation

MAN smartctl

🡅 Partager