-
Type: Bug
-
Resolution: Fixed
-
Priority: Critical - P2
-
Affects Version/s: None
-
Component/s: None
-
5
-
2023-05-30 - 7.0 Readiness
-
v7.0
In the testing of migrating the old history store cursor to the new history store cursor, we found test_wt4333_handle_locks is failing with checkpoint blocked by waiting the eviction gen to drain. While one eviction thread is blocked by opening a hs dhandle.
#0 0x00007f07609b1fb9 in futex_reltimed_wait_cancelable (private=<optimized out>, reltime=0x7f0737ffdac0, expected=0, futex_word=0x22bfadc) at ../sysdeps/unix/sysv/linux/futex-internal.h:142 #1 __pthread_cond_wait_common (abstime=0x7f0737ffdb80, mutex=0x22bfa88, cond=0x22bfab0) at pthread_cond_wait.c:533 #2 __pthread_cond_timedwait (cond=cond@entry=0x22bfab0, mutex=mutex@entry=0x22bfa88, abstime=abstime@entry=0x7f0737ffdb80) at pthread_cond_wait.c:667 #3 0x000000000048fcb2 in __wt_cond_wait_signal () at ../src/os_posix/os_mtx_cond.c:115 #4 0x00000000004c5f4e in __wt_cond_wait (run_func=0x0, usecs=10000, cond=<optimized out>, session=0x7f07613ba770) at ../src/include/misc_inline.h:19 #5 __wt_readlock () at ../src/support/mtx_rw.c:219 #6 0x00000000004c12e2 in __session_find_shared_dhandle (checkpoint=0x0, uri=0x61aa78 "file:WiredTigerHS.wt", session=0x7f07613ba770) at ../src/session/session_dhandle.c:408 #7 __session_get_dhandle (checkpoint=0x0, uri=0x61aa78 "file:WiredTigerHS.wt", session=0x7f07613ba770) at ../src/session/session_dhandle.c:445 #8 __wt_session_get_dhandle () at ../src/session/session_dhandle.c:474 #9 0x00000000004c193e in __wt_session_get_btree_ckpt (session=session@entry=0x7f07613ba770, uri=uri@entry=0x61aa78 "file:WiredTigerHS.wt", cfg=cfg@entry=0x7f0737ffded0, flags=0) at ../src/session/session_dhandle.c:319 #10 0x0000000000456080 in __wt_curfile_open () at ../src/cursor/cur_file.c:801 #11 0x00000000004bbf2f in __session_open_cursor_int () at ../src/session/session_api.c:479 #12 0x00000000004bbc07 in __wt_open_cursor (session=session@entry=0x7f07613ba770, uri=uri@entry=0x61aa78 "file:WiredTigerHS.wt", owner=owner@entry=0x0, cfg=cfg@entry=0x7f0737ffded0, cursorp=cursorp@entry=0x7f0737ffdec8) at ../src/session/session_api.c:543 #13 0x00000000005992e5 in __hs_cursor_open_int (cursorp=0x7f072447aca8, session=0x7f07613ba770) at ../src/cursor/cur_hs.c:26 #14 __wt_curhs_open (session=session@entry=0x7f07613ba770, owner=owner@entry=0x0, cursorp=cursorp@entry=0x7f0737ffdf70) at ../src/cursor/cur_hs.c:1015 #15 0x00000000005bf00f in __wt_hs_delete_key_from_ts () at ../src/history/hs_rec.c:652 #16 0x00000000005bfea1 in __wt_hs_insert_updates () at ../src/history/hs_rec.c:458 #17 0x000000000049fff6 in __rec_hs_wrapup (r=0x7f0724b100e0, session=0x7f07613ba770) at ../src/reconcile/rec_write.c:2302 #18 __rec_write_wrapup () at ../src/reconcile/rec_write.c:2127 #19 0x00000000004a1ffe in __reconcile (page_lockedp=<synthetic pointer>, flags=560, salvage=<optimized out>, ref=0x7f06e4105ce0, session=0x7f07613ba770) at ../src/reconcile/rec_write.c:214 #20 __wt_reconcile () at ../src/reconcile/rec_write.c:99 #21 0x000000000046ddef in __evict_review (inmem_splitp=<synthetic pointer>, evict_flags=0, ref=0x7f06e4105ce0, session=0x7f07613ba770) at ../src/evict/evict_page.c:709 #22 __wt_evict () at ../src/evict/evict_page.c:160 #23 0x00000000004653c3 in __evict_page () at ../src/evict/evict_lru.c:2280 #24 0x000000000046696e in __wt_cache_eviction_worker () at ../src/evict/evict_lru.c:2386 #25 0x00000000004d84fa in __wt_cache_eviction_check (busy=false, readonly=false, didworkp=0x0, didworkp=0x0, readonly=false, busy=false, session=0x7f07613ba770) at ../src/include/cache_inline.h:512 #26 __wt_txn_commit () at ../src/txn/txn.c:1653 #27 0x000000000045209a in __curfile_insert () at ../src/cursor/cur_file.c:282 #28 0x00000000004324b9 in op (session=<optimized out>, rnd=rnd@entry=0x7f0737ffee80, cpp=cpp@entry=0x7f0737ffeea0) at ../../../test/csuite/wt4333_handle_locks/main.c:133 #29 0x000000000043277b in wthread (arg=<optimized out>) at ../../../test/csuite/wt4333_handle_locks/main.c:166 #30 0x00007f07609ab6db in start_thread (arg=0x7f0737fff700) at pthread_create.c:463 #31 0x00007f07602af71f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51 #1 0x00007f07601ce921 in __GI_abort () at abort.c:79 #2 0x000000000041076d in __wt_abort (session=session@entry=0x7f07613b87f0) at ../src/os_common/os_abort.c:30 #3 0x00000000004189d4 in __wt_gen_drain (session=0x7f07613b87f0, which=2, generation=1639) at ../src/support/generation.c:169 #4 0x00000000004c3111 in __wt_gen_next_drain (session=session@entry=0x7f07613b87f0, which=which@entry=2) at ../src/support/generation.c:97 #5 0x0000000000561423 in __wt_sync_file () at ../src/btree/bt_sync.c:543 #6 0x00000000004dbf33 in __checkpoint_tree.constprop.11 () at ../src/txn/txn_ckpt.c:1672 #7 0x00000000004dc3b0 in __wt_checkpoint () at ../src/txn/txn_ckpt.c:1832 #8 0x0000000000484c1c in __wt_meta_track_off () at ../src/meta/meta_track.c:287 #9 0x00000000004de8e4 in __wt_checkpoint_close () at ../src/txn/txn_ckpt.c:1912 #10 0x0000000000440cf3 in __wt_conn_dhandle_close () at ../src/conn/conn_dhandle.c:342 #11 0x00000000004410d6 in __conn_dhandle_close_one (session=0x7f07613bcae0, uri=<optimized out>, checkpoint=<optimized out>, removed=<optimized out>, mark_dead=<optimized out>) at ../src/conn/conn_dhandle.c:707 #12 0x0000000000442047 in __wt_conn_dhandle_close_all () at ../src/conn/conn_dhandle.c:746 #13 0x00000000004adb63 in __wt_exclusive_handle_operation () at ../src/schema/schema_worker.c:26 #14 0x00000000004ade9f in __wt_schema_worker.localalias.0 () at ../src/schema/schema_worker.c:70 #15 0x00000000004ae063 in __wt_schema_worker.localalias.0 () at ../src/schema/schema_worker.c:100 #16 0x000000000041590b in __session_verify (wt_session=0x7f07613bcae0, uri=0x232af30 "table:466", config=0x0) at ../src/session/session_api.c:1475 #17 0x0000000000432679 in vthread (arg=<optimized out>) at ../../../test/csuite/wt4333_handle_locks/main.c:195 #18 0x00007f07609ab6db in start_thread (arg=0x7f07237fe700) at pthread_create.c:463 #19 0x00007f07602af71f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 (gdb) f 4 #4 0x00000000004c3111 in __wt_gen_next_drain (session=session@entry=0x7f07613b87f0, which=which@entry=2) at ../src/support/generation.c:97 97 ../src/support/generation.c: No such file or directory. (gdb) p session->dhandle $2 = (WT_DATA_HANDLE *) 0x22df1e0 (gdb) p *session->dhandle $3 = {rwlock = {u = {v = 81604378881, s = {current = 1 '\001', next = 1 '\001', reader = 0 '\000', readers_queued = 0 '\000', readers_active = 19}}, stat_read_count_off = -1, stat_write_count_off = -1, stat_app_usecs_off = -1, stat_int_usecs_off = -1, stat_session_usecs_off = 0, cond_readers = 0x22df660, cond_writers = 0x22e0970}, q = {tqe_next = 0x0, tqe_prev = 0x22f40a8}, hashq = {tqe_next = 0x0, tqe_prev = 0x40d00b8}, name = 0x22c98d0 "file:WiredTiger.wt", name_hash = 1045034099109282882, checkpoint = 0x0, cfg = 0x22c82a0, meta_base = 0x22e3190 "access_pattern_hint=none,allocation_size=4KB,app_metadata=,assert=(read_timestamp=none,write_timestamp=off),block_allocation=best,block_compressor=,cache_resident=false,checksum=uncompressed,collator="..., session_ref = 19, session_inuse = 19, excl_ref = 0, timeofdeath = 0, excl_session = 0x0, dsrc = 0x0, handle = 0x22df3a0, type = WT_DHANDLE_TYPE_BTREE, compact_skip = false, close_lock = {lock = {__data = { __lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}, name = 0x61a987 "data handle close", stat_count_off = -1, stat_app_usecs_off = -1, stat_int_usecs_off = -1, stat_session_usecs_off = 0, initialized = 1 '\001'}, stats = {0x22e34b0, 0x22e3b18, 0x22e4180, 0x22e47e8, 0x22e4e50, 0x22e54b8, 0x22e5b20, 0x22e6188, 0x22e67f0, 0x22e6e58, 0x22e74c0, 0x22e7b28, 0x22e8190, 0x22e87f8, 0x22e8e60, 0x22e94c8, 0x22e9b30, 0x22ea198, 0x22ea800, 0x22eae68, 0x22eb4d0, 0x22ebb38, 0x22ec1a0}, stat_array = 0x22e34b0, flags = 160, ts_flags = 0} (gdb)
The failure signature is the same as WT-5785. We attempted to solve it by caching the history store cursor in session before each eviction in WT-5785. However, it is not perfect as there is a chance that the cached hs cursor might be swept away by __session_dhandle_sweep.
This is likely what is happening in this failure. Here's a stack that I found the hs cursor might be swept away:
#0 0x00007fc25329ffb7 in raise () from /lib/x86_64-linux-gnu/libc.so.6 #1 0x00007fc2532a1921 in abort () from /lib/x86_64-linux-gnu/libc.so.6 #2 0x00000000004107bd in __wt_abort (session=session@entry=0x297e800) at ../src/os_common/os_abort.c:30 #3 0x000000000041807b in __session_dhandle_sweep (session=0x297e800) at ../src/session/session_dhandle.c:395 #4 __session_get_dhandle (checkpoint=0x0, uri=0x2a24620 "table:49", session=0x297e800) at ../src/session/session_dhandle.c:443 #5 __wt_session_get_dhandle (session=session@entry=0x297e800, uri=uri@entry=0x2a24620 "table:49", checkpoint=checkpoint@entry=0x0, cfg=cfg@entry=0x0, flags=flags@entry=0) at ../src/session/session_dhandle.c:478 #6 0x00000000004a8221 in __wt_schema_get_table_uri (session=session@entry=0x297e800, uri=uri@entry=0x2a24620 "table:49", ok_incomplete=ok_incomplete@entry=false, flags=flags@entry=0, tablep=tablep@entry=0x7fc1f47f7c28) at ../src/schema/schema_list.c:27 #7 0x00000000005ba7fa in __wt_curtable_open (session=session@entry=0x297e800, uri=<optimized out>, uri@entry=0x2a24620 "table:49", owner=owner@entry=0x0, cfg=cfg@entry=0x7fc1f47f7d40, cursorp=cursorp@entry=0x7fc1f47f7d30) at ../src/cursor/cur_table.c:970 #8 0x00000000004bbd23 in __session_open_cursor_int (session=0x297e800, uri=<optimized out>, owner=0x0, other=<optimized out>, cfg=0x7fc1f47f7d40, cursorp=0x7fc1f47f7d30) at ../src/session/session_api.c:447 #9 0x00000000004bc685 in __session_open_cursor (wt_session=0x297e800, uri=<optimized out>, to_dup=0x0, config=<optimized out>, cursorp=0x7fc1f47f7db8) at ../src/session/session_api.c:563 #10 0x0000000000432456 in op (session=0x297e800, rnd=rnd@entry=0x7fc1f47f7e80, cpp=cpp@entry=0x7fc1f47f7eb0) at ../../../test/csuite/wt4333_handle_locks/main.c:115 #11 0x00000000004327fb in wthread (arg=<optimized out>) at ../../../test/csuite/wt4333_handle_locks/main.c:166 #12 0x00007fc253a7e6db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0 #13 0x00007fc25338271f in clone () from /lib/x86_64-linux-gnu/libc.so.6
I managed to reproduce the issue on an evergreen spawn host. Disabling dhandle sweep prevented the problem from reproducing.
- is duplicated by
-
WT-13498 Race between sweep server and checkpoint interacting with the same dhandle associated with a named checkpoint being removed (7.0)
- Closed