diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 8292a08c4..aa15940c8 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -285,6 +285,57 @@ __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref) } } +/* + * __wt_ref_addr_free_copy -- + * Free the address in a reference, if necessary. + */ +void +__wt_ref_addr_free_copy(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_PAGE *home; + void *ref_addr; + + /* + * In order to free the WT_REF.addr field we need to read and clear the address without a race. + * The WT_REF may be a child of a page being split, in which case the addr field could be + * instantiated concurrently which changes the addr field. Once we swap in NULL we effectively + * own the addr. Then provided the addr is off page we can free the memory. + * + * However as we could be the child of a page being split the ref->home pointer which tells us + * whether the addr is on or off page could change concurrently. To avoid this we save the home + * pointer before we do the compare and swap. While the second ordered read should be sufficient + * we use an ordered read on the ref->home pointer as that is the standard mechanism to + * guarantee we read the current value. + * + * We don't reread this value inside loop as if it was to change then we would be pointing at a + * new parent, which would mean that our ref->addr must have been instantiated and thus we are + * safe to free it at the end of this function. + */ + WT_ORDERED_READ(home, ref->home); + do { + WT_ORDERED_READ(ref_addr, ref->addr); + if (ref_addr == NULL) + return; + } while (!__wt_atomic_cas_ptr(&ref->addr, ref_addr, NULL)); + + if (home == NULL || __wt_off_page(home, ref_addr)) { + __wt_free(session, ((WT_ADDR *)ref_addr)->addr); + __wt_free(session, ref_addr); + } + + /* WT-11062 race attempt + * Signal to ref_addr_copy_copy that we've freed the ref. Don't move forward in case this thread + * ends up populating ref->addr with new (copy-able) data + */ + if (F_ISSET(ref, WT_REF_FLAG_WT11062_TRY_RACE)) { + F_SET(ref, WT_REF_FLAG_WT11062_REF_FREED); + while (F_ISSET(ref, WT_REF_FLAG_WT11062_TRY_RACE)) { + __wt_yield(); + } + F_CLR(ref, WT_REF_FLAG_WT11062_REF_FREED); + } +} + /* * __wt_free_ref -- * Discard the contents of a WT_REF structure (optionally including the pages it references). diff --git a/src/include/btmem.h b/src/include/btmem.h index dd2b8023f..2eb19266d 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -938,10 +938,12 @@ struct __wt_ref { * depending on it to be "!leaf" instead. */ /* AUTOMATIC FLAG VALUE GENERATION START 0 */ -#define WT_REF_FLAG_INTERNAL 0x1u /* Page is an internal page */ -#define WT_REF_FLAG_LEAF 0x2u /* Page is a leaf page */ -#define WT_REF_FLAG_READING 0x4u /* Page is being read in */ - /* AUTOMATIC FLAG VALUE GENERATION STOP 8 */ +#define WT_REF_FLAG_INTERNAL 0x01u /* Page is an internal page */ +#define WT_REF_FLAG_LEAF 0x02u /* Page is a leaf page */ +#define WT_REF_FLAG_READING 0x04u /* Page is being read in */ +#define WT_REF_FLAG_WT11062_REF_FREED 0x08u /* Signal we've 86ed the ref->addr */ +#define WT_REF_FLAG_WT11062_TRY_RACE 0x10u /* Start attempt to trigger WT-11062 race */ + /* AUTOMATIC FLAG VALUE GENERATION STOP 8 */ uint8_t flags; #define WT_REF_DISK 0 /* Page is on disk */ diff --git a/src/include/btree_inline.h b/src/include/btree_inline.h index e13f59b0d..b7e775ae8 100644 --- a/src/include/btree_inline.h +++ b/src/include/btree_inline.h @@ -1589,6 +1589,121 @@ __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy) return (true); } +/* + * __wt_ref_addr_copy_copy -- + * A copy of ref_addr_copy only called by __wt_btcur_skip_page. Adds a section which attempts to + * force the proposed race in WT-11062 where a page gets dirtied and reconciled between us + * entering this function and attempting to memcpy addr->addr. + */ +static inline bool +__wt_ref_addr_copy_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy) +{ + WT_ADDR *addr; + WT_CELL_UNPACK_ADDR *unpack, _unpack; + WT_PAGE *page; + int i; + + unpack = &_unpack; + page = ref->home; + copy->del_set = false; + + /* + * To look at an on-page cell, we need to look at the parent page's disk image, and that can be + * dangerous. The problem is if the parent page splits, deepening the tree. As part of that + * process, the WT_REF WT_ADDRs pointing into the parent's disk image are copied into off-page + * WT_ADDRs and swapped into place. The content of the two WT_ADDRs are identical, and we don't + * care which version we get as long as we don't mix-and-match the two. + */ + WT_ORDERED_READ(addr, (WT_ADDR *)ref->addr); + + /* If NULL, there is no information. */ + if (addr == NULL) + return (false); + + /* WT-11062 race start ===================================== */ + i = 0; + if (ref->page != NULL && ref->page->modify != NULL && ref->page->modify->rec_result == 0 && + __wt_random(&session->rnd) % 100 == 0) { + /* + * The page needs to be fresh in-mem to hit the correct path in rec_write_wrapup. No prior + * reconciliations Only run this once every 100 calls. We'll wait for longer on a subset of + * pages and increase the time window for the issue to fire + */ + F_SET(ref, WT_REF_FLAG_WT11062_TRY_RACE); + + /* Wait for the page to get dirtied */ + while (ref->page->modify->page_state != WT_PAGE_DIRTY) { + /* Give up after 5 second so we don't deadlock. Consistently poll every 1ms in case we + * miss the dirtied page before it's cleaned. */ + usleep(1000); + if (i++ == 5000) { + goto give_up_race; + } + } + + /* Wait for the addr to get freed */ + __wt_errx(session, + "WT-11062 - Detected a page that has been dirtied during ref_addr_copy! Waiting for it " + "to be reconciled."); + while (!F_ISSET(ref, WT_REF_FLAG_WT11062_REF_FREED)) { + __wt_yield(); + } + + /* Make sure ref->home hasn't changed, that could be a separate issue */ + if (ref->home == page) { + __wt_errx(session, "WT-11062 - Attempting to hit segfault from memcpy"); + memcpy(copy->addr, addr->addr, copy->size = addr->size); + } else { + __wt_errx(session, "WT-11062 - ref->home changed. dropping race attempt"); + } + +give_up_race: + __wt_errx(session, "give up race"); + F_CLR(ref, WT_REF_FLAG_WT11062_TRY_RACE); + } + /* WT-11062 race end ===================================== */ + + /* If off-page, the pointer references a WT_ADDR structure. */ + if (__wt_off_page(page, addr)) { + WT_TIME_AGGREGATE_COPY(©->ta, &addr->ta); + copy->type = addr->type; + memcpy(copy->addr, addr->addr, copy->size = addr->size); + return (true); + } + + /* If on-page, the pointer references a cell. */ + __wt_cell_unpack_addr(session, page->dsk, (WT_CELL *)addr, unpack); + WT_TIME_AGGREGATE_COPY(©->ta, &unpack->ta); + + switch (unpack->raw) { + case WT_CELL_ADDR_INT: + copy->type = WT_ADDR_INT; + break; + case WT_CELL_ADDR_LEAF: + copy->type = WT_ADDR_LEAF; + break; + case WT_CELL_ADDR_DEL: + /* Copy out any fast-truncate information. */ + copy->del_set = true; + if (F_ISSET(page->dsk, WT_PAGE_FT_UPDATE)) + copy->del = unpack->page_del; + else { + /* It's a legacy page; create default delete information. */ + copy->del.txnid = WT_TXN_NONE; + copy->del.timestamp = copy->del.durable_timestamp = WT_TS_NONE; + copy->del.prepare_state = 0; + copy->del.previous_ref_state = WT_REF_DISK; + copy->del.committed = true; + } + /* FALLTHROUGH */ + case WT_CELL_ADDR_LEAF_NO: + copy->type = WT_ADDR_LEAF_NO; + break; + } + memcpy(copy->addr, unpack->data, copy->size = (uint8_t)unpack->size); + return (true); +} + /* * __wt_ref_block_free -- * Free the on-disk block for a reference and clear the address. @@ -1604,7 +1719,7 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) WT_RET(__wt_btree_block_free(session, addr.addr, addr.size)); /* Clear the address (so we don't free it twice). */ - __wt_ref_addr_free(session, ref); + __wt_ref_addr_free_copy(session, ref); return (0); } @@ -2330,7 +2445,7 @@ __wt_btcur_skip_page( */ if ((previous_state == WT_REF_DISK || (previous_state == WT_REF_MEM && !__wt_page_is_modified(ref->page))) && - __wt_ref_addr_copy(session, ref, &addr)) { + __wt_ref_addr_copy_copy(session, ref, &addr)) { /* If there's delete information in the disk address, we can use it. */ if (addr.del_set && __wt_page_del_visible(session, &addr.del, true)) { *skipp = true; diff --git a/src/include/extern.h b/src/include/extern.h index 6c915c8ff..59bf519f7 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -1922,6 +1922,7 @@ extern void __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, uint32_t extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r); extern void __wt_rec_dictionary_reset(WT_RECONCILE *r); extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref); +extern void __wt_ref_addr_free_copy(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_rollback_to_stable_init(WT_CONNECTION_IMPL *conn); extern void __wt_root_ref_init( @@ -2077,6 +2078,8 @@ static inline bool __wt_rec_need_split(WT_RECONCILE *r, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_ref_addr_copy(WT_SESSION_IMPL *session, WT_REF *ref, WT_ADDR_COPY *copy) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool __wt_ref_addr_copy_copy(WT_SESSION_IMPL *session, WT_REF *ref, + WT_ADDR_COPY *copy) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_ref_cas_state_int(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t old_state, uint8_t new_state, const char *func, int line) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_ref_is_root(WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/session/session_api.c b/src/session/session_api.c index dcff7b1d9..00a6719d7 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -480,6 +480,9 @@ __session_reconfigure(WT_SESSION *wt_session, const char *config) else F_CLR(session, WT_SESSION_DEBUG_RELEASE_EVICT); } + /* Force this setting on. We want as many pages as possible to have rec_result = 0 which we'll + * get from restoring pages from disk. */ + F_SET(session, WT_SESSION_DEBUG_RELEASE_EVICT); WT_ERR_NOTFOUND_OK(ret, false); diff --git a/test/format/checkpoint.c b/test/format/checkpoint.c index e1d5f0a01..f77f3eda2 100644 --- a/test/format/checkpoint.c +++ b/test/format/checkpoint.c @@ -48,9 +48,9 @@ wts_checkpoints(void) if (g.checkpoint_config != CHECKPOINT_WIREDTIGER) return; - testutil_check( - __wt_snprintf(config, sizeof(config), ",checkpoint=(wait=%" PRIu32 ",log_size=%" PRIu32 ")", - GV(CHECKPOINT_WAIT), MEGABYTE(GV(CHECKPOINT_LOG_SIZE)))); + /* Hard code maximum frequency for checkpoints */ + testutil_check(__wt_snprintf(config, sizeof(config), + ",checkpoint=(wait=1,log_size=%" PRIu32 ")", MEGABYTE(GV(CHECKPOINT_LOG_SIZE)))); testutil_check(g.wts_conn->reconfigure(g.wts_conn, config)); }