-
Type: Task
-
Resolution: Done
-
None
-
Affects Version/s: None
-
Component/s: None
-
None
A Jenkins job is hanging while compacting an LSM tree.
http://mjc.homeunix.org:8180/job/wiredtiger-perf-med-lsm-compact/977/
Potentially relevant stack traces:
(gdb) thread apply all where

Thread 9 (Thread 0x7fdf97fff700 (LWP 26291)):
#0  pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
#1  0x000000000042386f in __wt_cond_wait (session=0x7fdf98041280, cond=0x7fdf98016160, usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
#2  0x0000000000444b88 in __evict_server (arg=0x7fdf98041280) at ../src/btree/bt_evict.c:194
#3  0x00007fdf98ca1ddb in start_thread (arg=0x7fdf97fff700) at pthread_create.c:301
#4  0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115

Thread 6 (Thread 0x7fdf967fc700 (LWP 26294)):
#0  0x00007fdf989e8aa3 in select () at ../sysdeps/unix/syscall-template.S:82
#1  0x0000000000425958 in __wt_sleep (seconds=<value optimized out>, micro_seconds=<value optimized out>) at ../src/os_posix/os_sleep.c:22
#2  0x000000000041eb77 in __wt_lsm_compact (session=0x7fdf98041a00, name=<value optimized out>, skip=<value optimized out>) at ../src/lsm/lsm_tree.c:1134
#3  0x00000000004318c7 in __wt_schema_worker (session=0x7fdf98041a00, uri=0x7fdf75879060 "lsm:test", file_func=0, name_func=0x41e910 <__wt_lsm_compact>, cfg=0x7fdf967fbd20, open_flags=0) at ../src/schema/schema_worker.c:37
#4  0x0000000000431deb in __wt_schema_worker (session=0x7fdf98041a00, uri=<value optimized out>, file_func=0, name_func=0x41e910 <__wt_lsm_compact>, cfg=0x7fdf967fbd20, open_flags=0) at ../src/schema/schema_worker.c:86
#5  0x000000000043538a in __wt_session_compact (wt_session=0x7fdf98041a00, uri=0x7fdf98020270 "table:test", config=<value optimized out>) at ../src/session/session_compact.c:229
#6  0x000000000044195f in __async_worker_execop (arg=0x7fdf98041a00) at ../src/async/async_worker.c:198
#7  __async_worker_op (arg=0x7fdf98041a00) at ../src/async/async_worker.c:253
#8  __wt_async_worker (arg=0x7fdf98041a00) at ../src/async/async_worker.c:310
#9  0x00007fdf98ca1ddb in start_thread (arg=0x7fdf967fc700) at pthread_create.c:301
#10 0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115

Thread 5 (Thread 0x7fdf95ffb700 (LWP 26295)):
#0  0x00007fdf989e8aa3 in select () at ../sysdeps/unix/syscall-template.S:82
#1  0x0000000000425958 in __wt_sleep (seconds=<value optimized out>,
---Type <return> to continue, or q <return> to quit---
    micro_seconds=<value optimized out>) at ../src/os_posix/os_sleep.c:22
#2  0x000000000041bece in __lsm_manager_run_server (arg=<value optimized out>) at ../src/lsm/lsm_manager.c:312
#3  __lsm_worker_manager (arg=<value optimized out>) at ../src/lsm/lsm_manager.c:377
#4  0x00007fdf98ca1ddb in start_thread (arg=0x7fdf95ffb700) at pthread_create.c:301
#5  0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115

Thread 4 (Thread 0x7fdf957fa700 (LWP 26296)):
#0  pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
#1  0x000000000042386f in __wt_cond_wait (session=0x7fdf98042180, cond=0x7fdf98016080, usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
#2  0x000000000041f177 in __lsm_worker (arg=0x7fdf9800f0a0) at ../src/lsm/lsm_worker.c:149
#3  0x00007fdf98ca1ddb in start_thread (arg=0x7fdf957fa700) at pthread_create.c:301
#4  0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115

Thread 3 (Thread 0x7fdf947ff700 (LWP 26297)):
#0  pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
#1  0x000000000042386f in __wt_cond_wait (session=0x7fdf98042400, cond=0x7fdf98016080, usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
#2  0x000000000041f177 in __lsm_worker (arg=0x7fdf9800f0c0) at ../src/lsm/lsm_worker.c:149
#3  0x00007fdf98ca1ddb in start_thread (arg=0x7fdf947ff700) at pthread_create.c:301
#4  0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115

Thread 2 (Thread 0x7fdf93ffe700 (LWP 26298)):
#0  pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:239
#1  0x000000000042386f in __wt_cond_wait (session=0x7fdf98042680, cond=0x7fdf98016080, usecs=<value optimized out>) at ../src/os_posix/os_mtx.c:77
#2  0x000000000041f177 in __lsm_worker (arg=0x7fdf9800f0e0) at ../src/lsm/lsm_worker.c:149
#3  0x00007fdf98ca1ddb in start_thread (arg=0x7fdf93ffe700) at pthread_create.c:301
#4  0x00007fdf989efa1d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:115
Information:
(gdb) p /x lsm_tree->flags $2 = 0x33 (gdb) list lsm.h:197 192 int freeing_old_chunks; /* Whether chunks are being freed */ 193 uint32_t merge_aggressiveness; /* Increase amount of work per merge */ 194 195 #define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */ 196 #define WT_LSM_TREE_COMPACT_FLUSH 0x02 /* Flushed for compact */ 197 #define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */ 198 #define WT_LSM_TREE_NEED_SWITCH 0x08 /* New chunk needs creating */ 199 #define WT_LSM_TREE_OPEN 0x10 /* The tree is open */ 200 #define WT_LSM_TREE_THROTTLE 0x20 /* Throttle updates */ 201 uint32_t flags; (gdb) p lsm_tree->chunk[0] $4 = (WT_LSM_CHUNK *) 0x7fdf86045480 (gdb) p lsm_tree->chunk[1] $5 = (WT_LSM_CHUNK *) 0x7fdf8640e580 (gdb) p *lsm_tree->chunk[0] $6 = {uri = 0x7fdf3d5c3520 "file:test-000110.lsm", bloom_uri = 0x0, create_ts = { tv_sec = 1410541054, tv_nsec = 666855286}, count = 49624376, size = 6401531904, switch_txn = 0, id = 110, generation = 3, refcnt = 0, bloom_busy = 0, empty = 0 '\000', evicted = 0 '\000', flags = 4} (gdb) p *lsm_tree->chunk[1] $7 = {uri = 0x7fdf3df38ca0 "file:test-000106.lsm", bloom_uri = 0x0, create_ts = { tv_sec = 1410540986, tv_nsec = 133071004}, count = 375624, size = 0, switch_txn = 50001018, id = 106, generation = 0, refcnt = 1, bloom_busy = 0, empty = 0 '\000', evicted = 0 '\000', flags = 0}
There are only three entries in the app queue:
(gdb) p manager->appqh $17 = {tqh_first = 0x7fdf948a93a0, tqh_last = 0x7fdf948a9460} (gdb) p $17->tqh_first $19 = (struct __wt_lsm_work_unit *) 0x7fdf948a93a0 (gdb) p *$19 $20 = {q = {tqe_next = 0x7fdf948a9400, tqe_prev = 0x7fdf9802be50}, flags = 2, lsm_tree = 0x7fdf9808d000} (gdb) p *$19->q.tqe_next $21 = {q = {tqe_next = 0x7fdf948a9460, tqe_prev = 0x7fdf948a93a0}, flags = 4, lsm_tree = 0x7fdf9808d000} (gdb) p *$21->q.tqe_next $22 = {q = {tqe_next = 0x0, tqe_prev = 0x7fdf948a9400}, flags = 1, lsm_tree = 0x7fdf9808d000} (gdb) p *$22->q.tqe_next Cannot access memory at address 0x0
One entry in the manager queue:
(gdb) p manager->managerqh $18 = {tqh_first = 0x7fdf948a9280, tqh_last = 0x7fdf948a94a0} (gdb) p *$18->tqh_first->q.tqe_next $23 = {q = {tqe_next = 0x0, tqe_prev = 0x7fdf948a9280}, flags = 8, lsm_tree = 0x7fdf9808d000} (gdb) p *$23->q.tqe_next
None in the switch queue.
The worker code is never retrieving a chunk to flush (chunk is always NULL at line 56 of lsm_worker.c).
The compact code is never pushing more forced flushes. I think it's a problem with:
/* * The flush flag is cleared when the the chunk has been * flushed. Continue to push forced flushes until the * chunk is on disk. Once it is on disk move to the compacting * phase. */ if (flushing && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACT_FLUSH)) { if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Compact flush retry %s chunk %u", name, chunk->id)); F_SET(lsm_tree, WT_LSM_TREE_COMPACT_FLUSH); WT_ERR(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_FLUSH | WT_LSM_WORK_FORCE, lsm_tree)); } else { if (ref) { WT_ASSERT(session, chunk != NULL); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Compact flush done %s chunk %u", name, chunk->id)); (void)WT_ATOMIC_SUB(chunk->refcnt, 1); } flushing = ref = 0; compacting = 1; F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); } }
The condition should be (flushing && F_ISSET(lsm_tree, WT_LSM_TREE_COMPACT_FLUSH)), but I need to think through the other implications of that change.
- is related to
-
WT-1201 medium-lsm-multi failure in WT_CONNECTION::close
- Closed
-
WT-1247 compaction still hanging
- Closed
- related to
-
WT-2 What does metadata look like?
- Closed
-
WT-3 What file formats are required?
- Closed
-
WT-4 Flexible cursor traversals
- Closed
-
WT-5 How does pget work: is it necessary?
- Closed
-
WT-6 Complex schema example
- Closed
-
WT-7 Do we need the handle->err/errx methods?
- Closed
-
WT-8 Do we need table load, bulk-load and/or dump methods?
- Closed
-
WT-9 Does adding schema need to be transactional?
- Closed
-
WT-10 Basic "getting started" tutorial
- Closed
-
WT-1226 Fix a hang in LSM compact. Refs #1225
- Closed