Thursday, November 28, 2013

iotop -CP

The DTrace Toolkit provides a way to directly measure disk utilization via the iotop -CP command. This command shows UIDs, process IDs and device names, which can help identify a culprit. (The -C option provides a rolling output rather than having it clear at each time step. The -P option shows the %I/O utilization.)



#!/usr/bin/ksh
#
# iotop - display top disk I/O events by process.
#         Written using DTrace (Solaris 10 3/05).
#
# This is measuring disk events that have made it past system caches.
#
# 20-Apr-2006, ver 0.76
#
# USAGE:    iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename]
#              [-m mount_point] [-t top] [interval [count]]
#
#        iotop       # default output, 5 second intervals
#
#        -C        # don't clear the screen
#        -D        # print delta times, elapsed, us
#        -j        # print project ID
#        -o        # print disk delta times, us
#        -P        # print %I/O (disk delta times)
#        -Z        # print zone ID
#        -d device    # instance name to snoop (eg, dad0)
#        -f filename    # full pathname of file to snoop
#        -m mount_point    # this FS only (will skip raw events)
#        -t top        # print top number only
#    eg,
#        iotop 1      # 1 second samples
#        iotop -C    # don't clear the screen
#        iotop -P    # print %I/O (time based)
#        iotop -j    # print project IDs
#        iotop -Z     # print zone IDs
#        iotop -t 20     # print top 20 lines only
#        iotop -C 5 12    # print 12 x 5 second samples
#    
# FIELDS:
#        UID        user ID
#        PID        process ID
#        PPID        parent process ID
#        PROJ        project ID
#        ZONE        zone ID
#        CMD        process command name
#        DEVICE      device name
#        MAJ         device major number
#        MIN         device minor number
#        D        direction, Read or Write
#        BYTES        total size of operations, bytes
#        ELAPSED        total elapsed from request to completion, us
#        DISKTIME    total time for disk to complete request, us
#        %I/O        percent disk I/O, based on time (DISKTIME)
#        load        1 min load average
#        disk_r        total disk read Kbytes for sample
#        disk_w        total disk write Kbytes for sample
#
# NOTE:
# * There are two different delta times reported. -D prints the
#   elapsed time from the disk request (strategy) to the disk completion
#   (iodone); -o prints the time for the disk to complete that event
#   since it's last event (time between iodones), or, the time to the
#   strategy if the disk had been idle.
# * The %I/O value can exceed 100%. It represents how busy a process is
#   making the disks, in terms of a single disk. A value of 200% could
#   mean 2 disks are busy at 100%, or 4 disks at 50%...
#
# SEE ALSO: iosnoop
#        BigAdmin: DTrace, http://www.sun.com/bigadmin/content/dtrace
#        Solaris Dynamic Tracing Guide, http://docs.sun.com
#        DTrace Tools, http://www.brendangregg.com/dtrace.html
#
# INSPIRATION:  top(1) by William LeFebvre
#
# COPYRIGHT: Copyright (c) 2005, 2006 Brendan Gregg.
#
# CDDL HEADER START
#
#  The contents of this file are subject to the terms of the
#  Common Development and Distribution License, Version 1.0 only
#  (the "License").  You may not use this file except in compliance
#  with the License.
#
#  You can obtain a copy of the license at Docs/cddl1.txt
#  or http://www.opensolaris.org/os/licensing.
#  See the License for the specific language governing permissions
#  and limitations under the License.
#
# CDDL HEADER END
#
# KNOWN BUGS:
# - This can print errors while running on servers with Veritas volumes.
#
# Author: Brendan Gregg  [Sydney, Australia]
#
# 15-Jul-2005    Brendan Gregg    Created this.
#


##############################
# --- Process Arguments ---
#

### default variables
opt_device=0; opt_file=0; opt_mount=0; opt_clear=1; opt_proj=0; opt_zone=0
opt_percent=0; opt_def=1; opt_bytes=1; filter=0; device=.; filename=.; mount=.
opt_top=0; opt_elapsed=0; opt_dtime=0; interval=5; count=-1; top=0

### process options
while getopts CDd:f:hjm:oPt:Z name
do
    case $name in
    C)    opt_clear=0 ;;
    D)    opt_elapsed=1; opt_bytes=0 ;;
    d)    opt_device=1; device=$OPTARG ;;
    f)    opt_file=1; filename=$OPTARG ;;
    j)    opt_proj=1; opt_def=0 ;;
    m)    opt_mount=1; mount=$OPTARG ;;
    o)    opt_dtime=1; opt_bytes=0 ;;
    P)    opt_percent=1; opt_dtime=1; opt_bytes=0 ;;
    t)    opt_top=1; top=$OPTARG ;;
    Z)    opt_zone=1; opt_def=0 ;;
    h|?)    cat <<-END >&2
        USAGE: iotop [-C] [-D|-o|-P] [-j|-Z] [-d device] [-f filename]
                     [-m mount_point] [-t top] [interval [count]]

                        -C      # don't clear the screen
                        -D      # print delta times, elapsed, us
                        -j      # print project ID
                        -o      # print disk delta times, us
                        -P      # print %I/O (disk delta times)
                        -Z      # print zone ID
                        -d device       # instance name to snoop
                        -f filename     # snoop this file only
                        -m mount_point  # this FS only
                        -t top      # print top number only
           eg,
                iotop         # default output, 5 second samples
                iotop 1       # 1 second samples
                iotop -P      # print %I/O (time based)
                iotop -m /    # snoop events on filesystem / only
                iotop -t 20   # print top 20 lines only
                iotop -C 5 12 # print 12 x 5 second samples
        END
        exit 1
    esac
done

shift $(( $OPTIND - 1 ))

### option logic
if [[ "$1" > 0 ]]; then
        interval=$1; shift
fi
if [[ "$1" > 0 ]]; then
        count=$1; shift
fi
if (( opt_proj && opt_zone )); then
        opt_proj=0
fi
if (( opt_elapsed && opt_dtime )); then
        opt_elapsed=0
fi
if (( opt_device || opt_mount || opt_file )); then
    filter=1
fi
if (( opt_clear )); then
        clearstr=`clear`
else
        clearstr=.
fi



#################################
# --- Main Program, DTrace ---
#
/usr/sbin/dtrace -n '
 /*
  * Command line arguments
  */
 inline int OPT_def     = '$opt_def';
 inline int OPT_proj     = '$opt_proj';
 inline int OPT_zone     = '$opt_zone';
 inline int OPT_clear     = '$opt_clear';
 inline int OPT_bytes     = '$opt_bytes';
 inline int OPT_elapsed = '$opt_elapsed';
 inline int OPT_dtime     = '$opt_dtime';
 inline int OPT_percent    = '$opt_percent';
 inline int OPT_device     = '$opt_device';
 inline int OPT_mount     = '$opt_mount';
 inline int OPT_file     = '$opt_file';
 inline int OPT_top     = '$opt_top';
 inline int INTERVAL     = '$interval';
 inline int COUNTER     = '$count';
 inline int FILTER     = '$filter';
 inline int TOP     = '$top';
 inline string DEVICE     = "'$device'";
 inline string FILENAME = "'$filename'";
 inline string MOUNT     = "'$mount'";
 inline string CLEAR     = "'$clearstr'";

 #pragma D option quiet

 /*
  * Print header
  */
 dtrace:::BEGIN
 {
    last_event[""] = 0;

        /* starting values */
        counts = COUNTER;
        secs = INTERVAL;
        disk_r = 0;
        disk_w = 0;

        printf("Tracing... Please wait.\n");
 }

 /*
  * Check event is being traced
  */
 io:genunix::start,
 io:genunix::done
 {
    /* default is to trace unless filtering, */
    this->ok = FILTER ? 0 : 1;

    /* check each filter, */
    (OPT_device == 1 && DEVICE == args[1]->dev_statname)? this->ok = 1 : 1;
    (OPT_file == 1 && FILENAME == args[2]->fi_pathname) ? this->ok = 1 : 1;
    (OPT_mount == 1 && MOUNT == args[2]->fi_mount)  ? this->ok = 1 : 1;
 }

 /*
  * Reset last_event for disk idle -> start
  * this prevents idle time being counted as disk time.
  */
 io:genunix::start
 /! pending[args[1]->dev_statname]/
 {
    /* save last disk event */
    last_event[args[1]->dev_statname] = timestamp;
 }

 /*
  * Store entry details
  */
 io:genunix::start
 /this->ok/
 {
    /* these are used as a unique disk event key, */
     this->dev = args[0]->b_edev;
     this->blk = args[0]->b_blkno;

    /* save disk event details, */
     start_uid[this->dev, this->blk] = uid;
     start_pid[this->dev, this->blk] = pid;
     start_ppid[this->dev, this->blk] = ppid;
     start_comm[this->dev, this->blk] = execname;
     start_time[this->dev, this->blk] = timestamp;
     start_proj[this->dev, this->blk] = curpsinfo->pr_projid;
     start_zone[this->dev, this->blk] = curpsinfo->pr_zoneid;
     start_rw[this->dev, this->blk] = args[0]->b_flags & B_READ ? "R" : "W";
    disk_r += args[0]->b_flags & B_READ ? args[0]->b_bcount : 0;
    disk_w += args[0]->b_flags & B_READ ? 0 : args[0]->b_bcount;

    /* increase disk event pending count */
    pending[args[1]->dev_statname]++;
 }

 /*
  * Process and Print completion
  */
 io:genunix::done
 /this->ok/
 {
    /* decrease disk event pending count */
    pending[args[1]->dev_statname]--;

    /*
     * Process details
     */

     /* fetch entry values */
     this->dev = args[0]->b_edev;
     this->blk = args[0]->b_blkno;
     this->suid = start_uid[this->dev, this->blk];
     this->spid = start_pid[this->dev, this->blk];
     this->sppid = start_ppid[this->dev, this->blk];
     this->sproj = start_proj[this->dev, this->blk];
     this->szone = start_zone[this->dev, this->blk];
     self->scomm = start_comm[this->dev, this->blk];
     this->stime = start_time[this->dev, this->blk];
    this->etime = timestamp; /* endtime */
    this->elapsed = this->etime - this->stime;
     self->rw = start_rw[this->dev, this->blk];
    this->dtime = last_event[args[1]->dev_statname] == 0 ? 0 :
        timestamp - last_event[args[1]->dev_statname];

     /* memory cleanup */
     start_uid[this->dev, this->blk]  = 0;
     start_pid[this->dev, this->blk]  = 0;
     start_ppid[this->dev, this->blk] = 0;
     start_time[this->dev, this->blk] = 0;
     start_comm[this->dev, this->blk] = 0;
     start_zone[this->dev, this->blk] = 0;
     start_proj[this->dev, this->blk] = 0;
     start_rw[this->dev, this->blk]   = 0;

    /*
     * Choose statistic to track
     */
    OPT_bytes   ? this->value = args[0]->b_bcount    : 1;
    OPT_elapsed ? this->value = this->elapsed / 1000 : 1;
    OPT_dtime   ? this->value = this->dtime / 1000   : 1;
   
    /*
     * Save details
     */
    OPT_def ? @out[this->suid, this->spid, this->sppid, self->scomm,
        args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
        self->rw] = sum(this->value) : 1;
    OPT_proj ? @out[this->sproj, this->spid, this->sppid, self->scomm,
        args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
        self->rw] = sum(this->value) : 1;
    OPT_zone ? @out[this->szone, this->spid, this->sppid, self->scomm,
        args[1]->dev_statname, args[1]->dev_major, args[1]->dev_minor,
        self->rw] = sum(this->value) : 1;

    /* save last disk event */
    last_event[args[1]->dev_statname] = timestamp;

    self->scomm = 0;
    self->rw = 0;
 }

 /*
  * Prevent pending from underflowing
  * this can happen if this program is started during disk events.
  */
 io:genunix::done
 /pending[args[1]->dev_statname] < 0/
 {
    pending[args[1]->dev_statname] = 0;
 }

 /*
  * Timer
  */
 profile:::tick-1sec
 {
    secs--;
 }

 /*
  * Print Report
  */
 profile:::tick-1sec
 /secs == 0/
 {
    /* fetch 1 min load average */
    this->load1a  = `hp_avenrun[0] / 65536;
    this->load1b  = ((`hp_avenrun[0] % 65536) * 100) / 65536;

    /* convert counters to Kbytes */
    disk_r /= 1024;
    disk_w /= 1024;

    /* print status */
    OPT_clear ? printf("%s", CLEAR) : 1;
    printf("%Y,  load: %d.%02d,  disk_r: %6d KB,  disk_w: %6d KB\n\n",
        walltimestamp, this->load1a, this->load1b, disk_r, disk_w);

    /* print headers */
    OPT_def  ? printf("  UID ") : 1;
    OPT_proj ? printf(" PROJ ") : 1;
    OPT_zone ? printf(" ZONE ") : 1;
    printf("%6s %6s %-16s %-7s %3s %3s %1s",
        "PID", "PPID", "CMD", "DEVICE", "MAJ", "MIN", "D");
    OPT_bytes   ? printf(" %16s\n", "BYTES") : 1;
    OPT_elapsed ? printf(" %16s\n", "ELAPSED") : 1;
    OPT_dtime && ! OPT_percent  ? printf(" %16s\n", "DISKTIME") : 1;
    OPT_dtime && OPT_percent    ? printf(" %6s\n", "%I/O") : 1;

    /* truncate to top lines if needed */
    OPT_top ? trunc(@out, TOP) : 1;

    /* normalise to percentage if needed */
    OPT_percent ? normalize(@out, INTERVAL * 10000) : 1;

    /* print data */
    ! OPT_percent ?
        printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %16@d\n", @out) :
        printa("%5d %6d %6d %-16s %-7s %3d %3d %1s %6@d\n", @out);
    printf("\n");

    /* clear data */
    trunc(@out);
    disk_r = 0;
    disk_w = 0;
    secs = INTERVAL;
    counts--;
 }

 /*
  * End of program
  */
 profile:::tick-1sec
 /counts == 0/
 {
    exit(0);
 }

 /*
  * Cleanup for Ctrl-C
  */
 dtrace:::END
 {
    trunc(@out);
 }
'


No comments:

Post a Comment