[Search for users] [Overall Top Noters] [List of all Conferences] [Download this site]

Conference turris::digital_unix

Title:DIGITAL UNIX(FORMERLY KNOWN AS DEC OSF/1)
Notice:Welcome to the Digital UNIX Conference
Moderator:SMURF::DENHAM
Created:Thu Mar 16 1995
Last Modified:Fri Jun 06 1997
Last Successful Update:Fri Jun 06 1997
Number of topics:10068
Total number of notes:35879

8699.0. "Problem with hang in xticlose during exit processing." by CSC64::BLAYLOCK (If at first you doubt,doubt again.) Tue Feb 04 1997 13:37

From a system crash dump that I am looking at, I have a process
that is stuck in xticlose during exit processing.  It would appear
to be closing stderr and the process is sleeping (mpsleep) while
waiting for some event to clear.  Unfortunately, nothing seems
to timeout or otherwise end the connection (the remote system
has disconnected).  What needs to be set on the socket (if anything)
to get the close to complete?

The customer originally found this (these) process to show up
as a defunct process with ps(1) that never went away.  Unfortuanately,
I have not been able to determine from the customer how they get
this into this state or how they setup the socket that stderr points to.
There is also a 'sth' module in this stream that I have not received
any information on as yet.

This is Digital UNIX 4.0a (rev 464).

The PID in question is 25176 and is running 'ca' (still no info on
this process).  Its parent is '25390   listen'.

>  0 thread_block() ["../../../../src/kernel/kern/sched_prim.c":2036,
0xfffffc00
   1 mpsleep(0x0, 0xffffffffa83e3500, 0x0, 0x0, 0x0)
["../../../../src/kernel/bsd/kern_synch.c":563, 0xfffffc00002f9bc4]
   2 streams_mpsleep(0xfffffc000f8bdc00, 0x0, 0x0, 0x0, 0xfffffc0032778ef8)
["../../../../src/kernel/streams/str_env.c":574, 0xfffffc00003c3718]
   3 xticlose(q = 0xfffffc001fe08e00, credp = 0xfffffc000ac36d00)
["../../../../src/kernel/streamsm/xtiso.c":965, 0xfffffc00003e593c]
   4 close_wrapper(0xfffffc00005c5ce0, 0xfffffc00003d1950,
0xffffffffa83e35e0, 0xfffffc0032778ef8, 0xfffffc00003d2f84)
["../../../../src/kernel/streams/str_subr.c":317, 0xfffffc00003d19]
   5 csq_protect(q1 = 0xfffffc001fe08e00, q2 = 0xfffffc001fe08ee8, func =
0xfffffc00003d1950, arg = 0xffffffffa83e35e0, sq = 0xfffffc0032778ef8,
which = 0) ["../../../../src/kernel/streams/str_synch.c":770,
0xfffffc00003d2f80
   6 osr_pop_subr(0xfffffc000ac36d00, 0xfffffc0032778ef8,
0xfffffc001fe08e00, 0x400, 0xfffffc0032778e00)
["../../../../src/kernel/streams/str_osr.c":1686, 0xfffffc00003c8a00]
   7 osr_close_subr(osrq = 0xffffffffa83e36b8)
["../../../../src/kernel/streams/str_scalls.c":1345, 0xfffffc00003ce924]
   8 pse_close(dev = 5606416, flag = 5606416, mode = 5606416, cred = (nil),
private = (nil)) ["../../../../src/kernel/streams/str_scalls.c":1196,
0xfffffc00
   9 speclose(0xfffffc0032778e00, 0xfffffc003d968f80, 0x2000, 0x400000d,
0xfffff000ac36d00) ["../../../../src/kernel/vfs/spec_vnops.c":2462,
0xfffffc000040f13c]
  10 spec_close(0x2, 0xfffffc000abe6ef0, 0x0, 0x2, 0xfffffc0000000000)
["../../../../src/kernel/vfs/spec_vnops.c":2616, 0xfffffc000040f430]
  11 vn_close(0xfffffc00002e07f0, 0xfffffc001f41d300, 0xfffffc0000598b40,
0x2, 0x2) ["../../../../src/kernel/vfs/vfs_vnops.c":1292,
0xfffffc000042fa04]
  12 closef(0x2, 0x6400, 0xfffffc00002e3884, 0xfffffc000abe6ca0,
0xfffffc001f41d300) ["../../../../src/kernel/bsd/kern_descrip.c":1539,
0xfffffc0
  13 exit(0x0, 0xfffffc000abe6a80, 0x6400, 0xfffffc0039c42220, 0x1)
["../../../../src/kernel/bsd/kern_exit.c":936, 0xfffffc00002e3880]
  14 rexit(0x6400, 0xfffffc0039c42220, 0x1, 0x1, 0xfffffc000046fed4)
["../../../../src/kernel/bsd/kern_exit.c":755, 0xfffffc00002e3338]
  15 syscall(0x1, 0x0, 0x2a33332e4d824, 0x3ff81d34830, 0x1)
["../../../../src/kernel/arch/alpha/syscall_trap.c":555,
0xfffffc000046fed0]
  16 _Xsyscall(0x8, 0x3ff800d5fa8, 0x3ffc00931b0, 0x64, 0x6)
["../../../../src/kernel/arch/alpha/locore.s":1177, 0xfffffc0000466f54]

At the point we call 'exit' we are working on the following loop
(kern/std_exit.c)

        ufp = &u.u_file_state;
        for (i = 0; i <= u.u_lastfile; i++ {
                f = U_OFILE(i, ufp);
                U_POFILE_SET(i, 0, ufp);
                if (f != NULL && f != U_FD_RESERVED) {
                        U_OFILE_SET(i, NULL, ufp);
                        (void)closef(f);
                }
                if (f == U_FD_RESERVED)
                        aprintf("exit: stumbled on reserved fd\n");
        }


the 3rd entry in this list ($r15 is 2 and $r15 is the value of 'i').
This would be stderr.

(kdbx) $pc?50i
...
  [exit:931, 0xfffffc00002e37c0]        bis     r31, r31, r15
...
(kdbx) p $r15
2

Since we are in xticlose code at the top of the stack:
The code in xticlose is:

xticlose(q, flag, credp)
        register queue_t *q;
        int flag;
        cred_t *credp;
{
        register struct xticb *xtip;
        int error = 0, error2;

        CHECKPOINT("xticlose");

        /*
         * If xti context block pointer is NULL, then
         * no more cleanup
         */
        if (xtip = (struct xticb *)q->q_ptr) {
                if (! (flag & (O_NDELAY | O_NONBLOCK))) {
                        xtip->xti_flags |= XTI_ISCLOSING;
                        while ( OTHERQ(q)->q_first || xtip->xti_pendcall){
                                if (error = tsleep((caddr_t)xtip,
                                    (PZERO + 1) | PCATCH, "xtiso", 0))
                                        break;


The value of the q structure is moved from $r16 to $r9 at the
begining of the routine:

  [xticlose:947, 0xfffffc00003e58d0]    stq     r11, 24(sp)
  [xticlose:947, 0xfffffc00003e58d4]    stq     r12, 32(sp)
  [xticlose:947, 0xfffffc00003e58d8]    stq     r13, 40(sp)
  [xticlose:947, 0xfffffc00003e58dc]    bis     r31, r16, r9
  [xticlose:961, 0xfffffc00003e58e0]    ldq     r11, 40(r16)
  [xticlose:947, 0xfffffc00003e58e4]    bis     r31, r18, r10
  [xticlose:947, 0xfffffc00003e58e8]    stq     r26, 0(sp)
  [xticlose:951, 0xfffffc00003e58ec]    addl    r31, r17, r12
  [xticlose:953, 0xfffffc00003e58f0]    bis     r31, r31, r13

So q -> $r9, flag -> $r12 and cred -> $r10


Now the 'struct queue' referenced is from the streams.h file
and conflicts with the 'sys/queue.h' in the kernel.  So for
example, q->q_ptr where we get the xtip is at offset 40
from the q structure

(kdbx) px $r9+40
0xfffffc001fe08e28
(kdbx) 0xfffffc001fe08e28/X
fffffc001fe08e28:  fffffc003a769e28


So in this case, xtip->xti_pendcall is zero:

(kdbx) cast 0xfffffc003a769e28 struct xticb
struct {
    xti_state = 4
    xti_flags = 1097
    xti_minor = 13
    xti_so = 0xfffffc003a7697c0
    xti_sostate = 49152
    xti_soerror = 54
    xti_sorcount = 0
    xti_sowspace = 840
    xti_sosnap = 0
    xti_lso = (nil)
    xti_maxqlen = 1024
    xti_seq = 0xffffffff80844000
    xti_qlen = 0
    xti_cindno = 0
    xti_seqcnt = 1
    xti_pendind = 0
    xti_tsdu = 0
    xti_etsdu = 0
    xti_rq = 0xfffffc001fe08e00
    xti_wq = 0xfffffc001fe08ee8
    xti_wndata = 0xfffffc000cf72a00
    xti_wexpdata = (nil)
    xti_wnam = (nil)
    xti_rdata = (nil)
    xti_rnam = (nil)
    xti_rflags = 0
    xti_pendcall = (nil)
    xti_bufcallid = 0
    xti_tlierr = 0
    xti_unixerr = 0
    xti_errtype = 0
    xti_cfg = 0xfffffc003f509e00
    xti_def_opts = (nil)
    xti_userstate = -1
}

(kdbx) set $xticb=0xfffffc003a769e28

So we appear to be waiting on the queue to empty.

The queue structure is:

struct queue {
        struct qinit *  q_qinfo;        /* procedures and limits for queue
*/
        struct msgb *   q_first;        /* head of message queue */
        struct msgb *   q_last;         /* tail of message queue */
        struct queue *  q_next;         /* next QUEUE in Stream */
        struct queue *  q_link;         /* link to scheduling queue */
        caddr_t         q_ptr;          /* to private data structure */
        ulong           q_count;        /* weighted count of characters on
q */
        ulong           q_flag;         /* QUEUE state */
        long            q_minpsz;       /* min packet size accepted */
        long            q_maxpsz;       /* max packet size accepted */
        ulong           q_hiwat;        /* high water mark, for flow
control */
        ulong           q_lowat;        /* low water mark */
        struct qband *  q_bandp;        /* band information */
        unsigned char   q_nband;        /* number of bands */
        unsigned char   q_pad1[3];      /* reserved */
        struct queue *  q_other;        /* pointer to other Q in queue
pair */
        QUEUE_KERNEL_FIELDS
};


/** Get pointer to the mate queue */

#define OTHERQ(q)       ((q)->q_other)

struct queue * q_other;

So this becomes:

             ((q)->q_other)->q_first

So the q_first for the primary structure is:


fffffc001fe08e00:  fffffc00005927a0 0000000000000000
fffffc001fe08e10:  0000000000000000 fffffc000f8bdc00
fffffc001fe08e20:  0000000000000000 fffffc003a769e28
fffffc001fe08e30:  0000000000000000 0000000000001021
fffffc001fe08e40:  0000000000000000 ffffffffffffffff
fffffc001fe08e50:  0000000000001900 0000000000001700
fffffc001fe08e60:  0000000000000000 0000000000000000
fffffc001fe08e70:  fffffc001fe08ee8 fffffc000f8bdc00
(kdbx) px $r9+8
0xfffffc001fe08e08
(kdbx) 0xfffffc001fe08e08/X
fffffc001fe08e08:  0000000000000000

Now q_other is at offset 112 (14*8)

(kdbx) px $r9+112
0xfffffc001fe08e70                      # Offset of q+q_other
(kdbx) 0xfffffc001fe08e70/X             # contents of q->q_other
fffffc001fe08e70:  fffffc001fe08ee8
(kdbx) px 0xfffffc001fe08ee8+8          # Offset of q->q_other + q_first
0xfffffc001fe08ef0
(kdbx) 0xfffffc001fe08ef0/X             # Contents of q->q_other->q_first
fffffc001fe08ef0:  fffffc00074d1e00
So the value of q->q_other->q_first is 0xfffffc00074d1e00
so we are going to hang up here until something releases this.

(kdbx) cast 0xfffffc00074d1e00 struct msgb  # The q_first value

struct {
    b_next = 0xfffffc0009a3b200
    b_prev = (nil)
    b_cont = 0xfffffc0009670a00
    b_rptr = 0xfffffc00074d1ec8 = "^C"
    b_wptr = 0xfffffc00074d1ed0 = ""
    b_datap = 0xfffffc00074d1e80
    b_band = '^@'
    b_pad1 = '^@'
    b_flag = 0
    b_pad2 = 0
    b_sq = struct {
        sq_next = (nil)
        sq_prev = (nil)
        sq_flags = 0
        sq_cpuno = 0
        sq_target = (nil)
        sq_queue = 0xfffffc001fe08ee8
        sq_entry = 0xfffffc00003e59f0
        sq_arg0 = 0xfffffc001fe08ee8
        sq_arg1 = (nil)
    }
}


...
        sq_entry = 0xfffffc00003e59f0
...
(kdbx) 0xfffffc00003e59f0/i
[xtiwput:1013, 0xfffffc00003e59f0]    lda     sp, -80(sp)


At this point, there are 110 references to q_first in sources and it is
not clear as to which one actually clears this field
(kern/streams/str_util.c seems to be the place, but there is a lack of
documentation ...)

(kdbx) 0xfffffc001fe08ee8/16X           # The q->q_other structure
fffffc001fe08ee8:  fffffc00005927d8 fffffc00074d1e00
fffffc001fe08ef8:  fffffc00063a9d00 0000000000000000
fffffc001fe08f08:  0000000000000000 fffffc003a769e28
fffffc001fe08f18:  0000000000002134 0000000000001034
fffffc001fe08f28:  0000000000000000 ffffffffffffffff
fffffc001fe08f38:  0000000000001900 0000000000001700
fffffc001fe08f48:  0000000000000000 0000000000000000
fffffc001fe08f58:  fffffc001fe08e00 0000000000000000

(kdbx) cast 0xfffffc00005927d8 struct qinit  # The q_info value
struct {
    qi_putp = 0xfffffc00003e59f0
    qi_srvp = 0xfffffc00003e61f8
    qi_qopen = (nil)
    qi_qclose = (nil)
    qi_qadmin = (nil)
    qi_minfo = 0xfffffc0000592770
    qi_mstat = (nil)
}

(kdbx) 0xfffffc00003e59f0/i
[xtiwput:1013, 0xfffffc00003e59f0]    lda     sp, -80(sp)
(kdbx) 0xfffffc00003e61f8/i
[xtiwsrv:1555, 0xfffffc00003e61f8]    lda     sp, -32(sp)
(kdbx) cast 0xfffffc0000592770 struct module_info
struct {
    mi_idnum = 5010
    mi_idname = 0xfffffc00005bd270 = "xtiso"
    mi_minpsz = 0
    mi_maxpsz = -1
    mi_hiwat = 6400
    mi_lowat = 5888
}

(kdbx) px 0xfffffc001fe08ee8 + 112
0xfffffc001fe08f58
(kdbx) 0xfffffc001fe08f58/X
fffffc001fe08f58:  fffffc001fe08e00
(kdbx) 0xfffffc001fe08e00/16X
fffffc001fe08e00:  fffffc00005927a0 0000000000000000
fffffc001fe08e10:  0000000000000000 fffffc000f8bdc00
fffffc001fe08e20:  0000000000000000 fffffc003a769e28
fffffc001fe08e30:  0000000000000000 0000000000001021
fffffc001fe08e40:  0000000000000000 ffffffffffffffff
fffffc001fe08e50:  0000000000001900 0000000000001700
fffffc001fe08e60:  0000000000000000 0000000000000000
fffffc001fe08e70:  fffffc001fe08ee8 fffffc000f8bdc00

(kdbx) cast 0xfffffc00005927a0 struct qinit
struct {
    qi_putp = (nil)
    qi_srvp = 0xfffffc00003e68a8
    qi_qopen = 0xfffffc00003e55f8
    qi_qclose = 0xfffffc00003e58b8
    qi_qadmin = (nil)
    qi_minfo = 0xfffffc0000592770
    qi_mstat = (nil)
    }
(kdbx) cast 0xfffffc0000592770 struct module_info
struct {
    mi_idnum = 5010
    mi_idname = 0xfffffc00005bd270 = "xtiso"
    mi_minpsz = 0
    mi_maxpsz = -1
    mi_hiwat = 6400
    mi_lowat = 5888
}
(kdbx) 0xfffffc00003e58b8/i
[xticlose:947, 0xfffffc00003e58b8]    lda     sp, -48(sp)
(kdbx)
(kdbx) 0xfffffc001fe08ee8/16X
fffffc001fe08ee8:  fffffc00005927d8 fffffc00074d1e00
fffffc001fe08ef8:  fffffc00063a9d00 0000000000000000
fffffc001fe08f08:  0000000000000000 fffffc003a769e28
fffffc001fe08f18:  0000000000002134 0000000000001034
fffffc001fe08f28:  0000000000000000 ffffffffffffffff
fffffc001fe08f38:  0000000000001900 0000000000001700
fffffc001fe08f48:  0000000000000000 0000000000000000
fffffc001fe08f58:  fffffc001fe08e00 0000000000000000


So it would appear that the q->q_other chain has a loop in it
or more properly is a matched pair of structures. (A guess from
the comments)

The addresses of the 'struct qinit' above match that of the
following structures in xtiso.c

(kdbx) px &xtirinit
0xfffffc00005927a0
(kdbx) px &xtiwinit
0xfffffc00005927d8
(kdbx) 0xfffffc000f8bdc00/16X                   # q_next structure
fffffc000f8bdc00:  fffffc00005916e0 0000000000000000
fffffc000f8bdc10:  0000000000000000 0000000000000000
fffffc000f8bdc20:  0000000000000000 fffffc0032778e00
fffffc000f8bdc30:  0000000000000000 0000000000001029
fffffc000f8bdc40:  0000000000000000 0000000000007d00
fffffc000f8bdc50:  0000000000000800 0000000000000400
fffffc000f8bdc60:  0000000000000000 0000000000000000
fffffc000f8bdc70:  fffffc000f8bdce8 0000000000000000
(kdbx) 0xfffffc000f8bdce8/16X                   # q_other struct
fffffc000f8bdce8:  fffffc0000591718 0000000000000000
fffffc000f8bdcf8:  0000000000000000 fffffc001fe08ee8
fffffc000f8bdd08:  0000000000000000 fffffc0032778e00
fffffc000f8bdd18:  0000000000000000 000000000000102a
fffffc000f8bdd28:  0000000000000000 0000000000007d00
fffffc000f8bdd38:  0000000000000800 0000000000000400
fffffc000f8bdd48:  0000000000000000 0000000000000000
fffffc000f8bdd58:  fffffc000f8bdc00 fffffc001fe08ee8
(kdbx) cast 0xfffffc00005916e0 struct qinit
struct {
    qi_putp = 0xfffffc00003d0490
    qi_srvp = (nil)
    qi_qopen = (nil)
    qi_qclose = (nil)
    qi_qadmin = (nil)
    qi_minfo = 0xfffffc00005916b0
    qi_mstat = (nil)
    }
(kdbx) cast 0xfffffc00005916b0 struct module_info
struct {
    mi_idnum = 0
    mi_idname = 0xfffffc00005bce60 = "sth"
    mi_minpsz = 0
    mi_maxpsz = 32000
    mi_hiwat = 2048
    mi_lowat = 1024
}
(kdbx)

The q_first structure is 0 in this case so there are no messages (first
or last).  The process has two modules on it.  The base xtiso module and
the module 'sth'.  I cannot find this 'sth' module on my local
systems.
T.RTitleUserPersonal
Name
DateLines