Monday, 2 October 2017

Using truss to track system callls in Solaris

The truss command in Solaris is used to trace system/library calls made by new or existing processes. A system call is a C library that requests and interacts with kernel services.

The truss command is very helpful in debugging process hung and core dump issues. It can also be used to view which processes are taking more time and what parameters are being passed for each system call.

To demonstrate the usage I've written a small script which runs for five iterations, prints something to stdout, runs the date command and sleeps for 5 seconds.

root@sandbox:/# cat hup.bash
#!/bin/bash

i=0
while [ $i -le 5 ]

do
echo "printting endlessly"
date
sleep 5
i=$[$i+1]
done


When I ran the script here's what I got on the terminal:

root@sandbox:/# ./hup.bash
printting endlessly
Mon Oct  2 12:56:46 IST 2017
printting endlessly
Mon Oct  2 12:56:51 IST 2017
printting endlessly
Mon Oct  2 12:56:56 IST 2017
printting endlessly
Mon Oct  2 12:57:01 IST 2017
printting endlessly
Mon Oct  2 12:57:06 IST 2017
printting endlessly
Mon Oct  2 12:57:11 IST 2017


On another terminal I did a pgrep for the process id (PID) of the script and ran truss over it.

root@sandbox:/# pgrep hup
1446
root@sandbox:/# truss -p 1446
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED) (sleeping...)
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED)  = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AA0, 0x08047B10)       = 0
setcontext(0x080479B0)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
write(1, " p r i n t t i n g   e n".., 20)      = 20
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1453
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
    Received signal #18, SIGCLD [caught]
      siginfo: SIGCLD CLD_EXITED pid=1453 status=0x0000
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
waitid(P_ALL, 0, 0x08047630, WEXITED|WTRAPPED|WNOHANG) = 0
waitid(P_ALL, 0, 0x08047630, WEXITED|WTRAPPED|WNOHANG) Err#10 ECHILD
setcontext(0x08047530)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A20, 0x08047A90)       = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A10, 0x08047A80)       = 0
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1454
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AB0, 0x08047B20)       = 0
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED) (sleeping...)
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED)  = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AA0, 0x08047B10)       = 0
setcontext(0x080479B0)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
write(1, " p r i n t t i n g   e n".., 20)      = 20
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1457
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
    Received signal #18, SIGCLD [caught]
      siginfo: SIGCLD CLD_EXITED pid=1457 status=0x0000
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
waitid(P_ALL, 0, 0x08047630, WEXITED|WTRAPPED|WNOHANG) = 0
waitid(P_ALL, 0, 0x08047630, WEXITED|WTRAPPED|WNOHANG) Err#10 ECHILD
setcontext(0x08047530)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A20, 0x08047A90)       = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A10, 0x08047A80)       = 0
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1458
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AB0, 0x08047B20)       = 0
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED) (sleeping...)
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED)  = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AA0, 0x08047B10)       = 0
setcontext(0x080479B0)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
write(1, " p r i n t t i n g   e n".., 20)      = 20
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1461
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
    Received signal #18, SIGCLD [caught]
      siginfo: SIGCLD CLD_EXITED pid=1461 status=0x0000
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
waitid(P_ALL, 0, 0x08047630, WEXITED|WTRAPPED|WNOHANG) = 0
waitid(P_ALL, 0, 0x08047630, WEXITED|WTRAPPED|WNOHANG) Err#10 ECHILD
setcontext(0x08047530)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A20, 0x08047A90)       = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A10, 0x08047A80)       = 0
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1462
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AB0, 0x08047B20)       = 0
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED) (sleeping...)
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED)  = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AA0, 0x08047B10)       = 0
setcontext(0x080479B0)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
write(1, " p r i n t t i n g   e n".., 20)      = 20
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1467
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A20, 0x08047A90)       = 0
waitid(P_ALL, 0, 0x080479E0, WEXITED|WTRAPPED)  = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047A10, 0x08047A80)       = 0
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
fork1()                                         = 1468
lwp_sigmask(SIG_SETMASK, 0x00020002, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AB0, 0x08047B20)       = 0
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED) (sleeping...)
waitid(P_ALL, 0, 0x08047A70, WEXITED|WTRAPPED)  = 0
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
sigaction(SIGINT, 0x08047AA0, 0x08047B10)       = 0
setcontext(0x080479B0)
lwp_sigmask(SIG_SETMASK, 0x00020000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
lwp_sigmask(SIG_SETMASK, 0x00000000, 0x00000000) = 0xFFBFFEFF [0x0000FFFF]
read(255, 0x080FCA08, 95)                       = 0
_exit(0)


While the truss command was tracing my script I also ran ptree on the script to view the PIDs of the sub shells it was creating just to have a more thorough picture of what was going on.

I looped that over while to view the ptree output continously throughout the lifetime of the script execution.

root@sandbox:/# while true ; do ptree 1446 ;sleep 4; done
565   /usr/lib/ssh/sshd
  804   /usr/lib/ssh/sshd
    805   /usr/lib/ssh/sshd
      838   -bash
        1446  /bin/bash ./hup.bash
          1454  sleep 5
565   /usr/lib/ssh/sshd
  804   /usr/lib/ssh/sshd
    805   /usr/lib/ssh/sshd
      838   -bash
        1446  /bin/bash ./hup.bash
          1458  sleep 5
565   /usr/lib/ssh/sshd
  804   /usr/lib/ssh/sshd
    805   /usr/lib/ssh/sshd
      838   -bash
        1446  /bin/bash ./hup.bash
          1462  sleep 5
565   /usr/lib/ssh/sshd
  804   /usr/lib/ssh/sshd
    805   /usr/lib/ssh/sshd
      838   -bash
        1446  /bin/bash ./hup.bash
          1462  sleep 5
565   /usr/lib/ssh/sshd
  804   /usr/lib/ssh/sshd
    805   /usr/lib/ssh/sshd
      838   -bash
        1446  /bin/bash ./hup.bash
          1468  sleep 5



Now let's try to briefly understand and interpret the truss output in the next few lines:

The sigaction() function allows the calling process to examine or specify the action to be taken on delivery of a specific signal.

The setcontext() function restores the user context pointed to by ucp. A successful call to setcontext() does not return; program execution resumes at the point specified by the ucp argument passed to setcontext().

The write() function will write from the buffer to the file associated with the open file descriptor. For our example the write function will write the strings "printing endlessly" to the terminal.

The programmer can control which signals will be blocked by the running application process by setting a signal mask which is a bit array. SIG_SETMASK specifies that the new mask should replace the old mask. Signals are blocked if the  corresponding  bit in mask is a 1; the macro sigmask is provided to construct the mask for a given signum.

The fork() function create a new process. The address space of the new process (child process) is an exact copy of the address space of the calling process (parent process). In our example the first fork() function creates a child process for executing the date command.
This child process receives the SIGCLD signal immediately after execution implying that the child process status has changed as it exited after completing execution.

The waitid() function suspends the calling/parent process until one of its child processes changes state. It records the current state of a child in the structure pointed to by infop. It returns immediately if a child process changed state prior to the call.
Once the chiild process termination is acknowledged by the parent process and it's entry removed from the process table, the parent process continues it's execution run.

Next we see another call to the fork() function this time executing the 5 second sleep mentioned in the script.

Notice that the signal mask is set after each fork() system call.

This continues for five iterations specified in the script.

The read() function here towards the end of the truss output indicates an end of file since no process has the pipe or file descriptor open for writting.
Finally we exit after the completion of the fifth iteration of the script using the _exit() function to terminate the process.

I kept the ptree command running continously because I wanted to demonstrate how the new PIDs for the sleep child process are being generated by the fork() system call being executed. In the truss output, the fork() function gives the process id of the child process everytime it creates one.
Ptree did not show the child process for the date command being run as perhaps the context switch was too fast. I am open to other justifications though.


In the above scenario we executed the truss command on a script but we can do the same for a operating system command as well to better understand its behavior or perform some troubleshooting if the command is not generating the intented results.

Here is an example of using truss on the date command:

root@sandbox:/# truss -d date
Base time stamp:  1506928158.6656  [ Mon Oct  2 12:39:18 IST 2017 ]
 0.0000 execve("/usr/bin/date", 0x08047DE8, 0x08047DF0)  argc = 1
 0.0183 mmap(0x00000000, 4096, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON, -1, 0) = 0xFEFF0000
 0.0184 resolvepath("/usr/lib/ld.so.1", "/lib/ld.so.1", 1023) = 12
 0.0186 resolvepath("/usr/bin/date", "/usr/bin/date", 1023) = 13
 0.0186 sysconfig(_CONFIG_PAGESIZE)                     = 4096
 0.0188 stat64("/usr/bin/date", 0x08047B80)             = 0
 0.0189 open("/var/ld/ld.config", O_RDONLY)             Err#2 ENOENT
 0.0190 stat64("/lib/libc.so.1", 0x08047420)            = 0
 0.0191 resolvepath("/lib/libc.so.1", "/lib/libc.so.1", 1023) = 14
 0.0192 open("/lib/libc.so.1", O_RDONLY)                = 3
 0.0193 mmap(0x00010000, 32768, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_ALIGN, 3, 0) = 0xFEFB0000
 0.0195 mmap(0x00010000, 1212416, PROT_NONE, MAP_PRIVATE|MAP_NORESERVE|MAP_ANON|MAP_ALIGN, -1, 0) = 0xFEE80000
 0.0196 mmap(0xFEE80000, 1102437, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_TEXT, 3, 0) = 0xFEE80000
 0.0196 mmap(0xFEF9E000, 30183, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_INITDATA, 3, 1105920) = 0xFEF9E000
 0.0198 mmap(0xFEFA6000, 4240, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANON, -1, 0) = 0xFEFA6000
 0.0198 munmap(0xFEF8E000, 65536)                       = 0
 0.0200 memcntl(0xFEE80000, 124080, MC_ADVISE, MADV_WILLNEED, 0, 0) = 0
 0.0201 close(3)                                        = 0
 0.0204 mmap(0x00010000, 24576, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANON|MAP_ALIGN, -1, 0) = 0xFEF90000
 0.0205 munmap(0xFEFB0000, 32768)                       = 0
 0.0206 getcontext(0x080479F0)
 0.0207 getrlimit(RLIMIT_STACK, 0x080479E8)             = 0
 0.0208 getpid()                                        = 1052 [1051]
 0.0209 lwp_private(0, 1, 0xFEF92A00)                   = 0x000001C3
 0.0210 setustack(0xFEF92A60)
 0.0211 sysi86(SI86FPSTART, 0xFEFA6740, 0x0000133F, 0x00001F80) = 0x00000001
 0.0212 brk(0x08062ED0)                                 = 0
 0.0213 brk(0x08064ED0)                                 = 0
 0.0214 time()                                          = 1506928158
 0.0215 brk(0x08064ED0)                                 = 0
 0.0216 brk(0x08066ED0)                                 = 0
 0.0217 open("/usr/share/lib/zoneinfo/Asia/Calcutta", O_RDONLY) = 3
 0.0218 fstat64(3, 0x08047C30)                          = 0
 0.0219 read(3, " T Z i f\0\0\0\0\0\0\0\0".., 109)      = 109
 0.0220 close(3)                                        = 0
 0.0221 sysconfig(_CONFIG_PAGESIZE)                     = 4096
 0.0222 ioctl(1, TCGETA, 0x08047C54)                    = 0
 0.0223 fstat64(1, 0x08047BC0)                          = 0
Mon Oct  2 12:39:18 IST 2017
 0.0224 write(1, " M o n   O c t     2   1".., 29)      = 29
 0.0225 _exit(0)
root@sandbox:/#


The -d option prints a timestamp along side each operation being performed.

if we are interested in viewing only the number, names and time duration of the various system calls being executed while a command is run then we may use the -c option with truss as shown below:


root@sandbox:/# truss -c -d date
Mon Oct  2 12:43:24 IST 2017

syscall               seconds   calls  errors
_exit                    .000       1
read                     .000       1
write                    .000       1
open                     .000       3       1
close                    .000       2
time                     .000       1
brk                      .000       4
getpid                   .000       1
sysi86                   .000       1
ioctl                    .000       1
execve                   .000       1
getcontext               .000       1
setustack                .000       1
mmap                     .000       7
munmap                   .000       2
getrlimit                .000       1
memcntl                  .000       1
sysconfig                .000       2
lwp_private              .000       1
resolvepath              .000       3
stat64                   .000       2
fstat64                  .000       2
                     --------  ------   ----
sys totals:              .000      40      1
usr time:                .000
elapsed:                 .020


I hope this article was helpful to you and I thank you for reading.

No comments:

Post a Comment

Using capture groups in grep in Linux

Introduction Let me start by saying that this article isn't about capture groups in grep per se. What we are going to do here with gr...