MongoDB replica set secondaries crashing and failing to restart
I have a 3-node replica set running MongoDB 3.4.13 on Ubuntu 16.
I upgraded a secondary node from 3.4.13 to 3.6.5 and switched it to become primary. This node became primary and is running fine with no issues.
Soon after that, the remaining two secondary nodes crashed with the stack trace given below.
I tried to restart them, but I am still getting the same errors and mongod aborts on startup.
Any pointers?
2018-06-21T16:58:00.493-0700 I FTDC [initandlisten] Initializing full-time diagnostic data capture with directory '/xxxx/diagnostic.data' 2018-06-21T16:58:00.499-0700 I REPL [initandlisten] Replaying stored operations from { ts: Timestamp 1529618536000|1, t: 18 } (exclusive) to { ts: Timestamp 1529618546000|1, t: 18 } (inclusive). 2018-06-21T16:58:00.532-0700 E REPL [initandlisten] Failed command { create: "system.sessions", idIndex: { v: 2, key: { _id: 1 }, name: "_id_", ns: "config.system.sessions" } } on config with status BadValue: cannot write to 'config.system.sessions' during oplog application 2018-06-21T16:58:00.532-0700 I - [initandlisten] Fatal assertion 40294 BadValue: cannot write to 'config.system.sessions' at src/mongo/db/repl/replication_coordinator_external_state_impl.cpp 669 2018-06-21T16:58:00.532-0700 I - [initandlisten] ***aborting after fassert() failure 2018-06-21T16:58:00.570-0700 F - [initandlisten] Got signal: 6 (Aborted). 0x7efde8e5a6a1 0x7efde8e598b9 0x7efde8e59d9d 0x7efde64dc390 0x7efde6136428 0x7efde613802a 0x7efde8104eb7 0x7efde88c129d 0x7efde88d0423 0x7efde88d1f4c 0x7efde80f28d1 0x7efde8110a46 0x7efde6121830 0x7efde81700f9 ----- BEGIN BACKTRACE ----- 
{"backtrace":[{"b":"7EFDE78E8000","o":"15726A1","s":"_ZN5mongo15printStackTraceERSo"},{"b":"7EFDE78E8000","o":"15718B9"},{"b":"7EFDE78E8000","o":"1571D9D"},{"b":"7EFDE64CB000","o":"11390"},{"b":"7EFDE6101000","o":"35428","s":"gsignal"},{"b":"7EFDE6101000","o":"3702A","s":"abort"},{"b":"7EFDE78E8000","o":"81CEB7","s":"_ZN5mongo42fassertFailedWithStatusNoTraceWithLocationEiRKNS_6StatusEPKcj"},{"b":"7EFDE78E8000","o":"FD929D","s":"_ZN5mongo4repl39ReplicationCoordinatorExternalStateImpl21cleanUpLastApplyBatchEPNS_16OperationContextE"},{"b":"7EFDE78E8000","o":"FE8423","s":"_ZN5mongo4repl26ReplicationCoordinatorImpl21_startLoadLocalConfigEPNS_16OperationContextE"},{"b":"7EFDE78E8000","o":"FE9F4C","s":"_ZN5mongo4repl26ReplicationCoordinatorImpl7startupEPNS_16OperationContextE"},{"b":"7EFDE78E8000","o":"80A8D1"},{"b":"7EFDE78E8000","o":"828A46","s":"main"},{"b":"7EFDE6101000","o":"20830","s":"__libc_start_main"},{"b":"7EFDE78E8000","o":"8880F9","s":"_start"}],"processInfo":{ "mongodbVersion" : "3.4.13", "gitVersion" : "fbdef2ccc53e0fcc9afb570063633d992b2aae42", "compiledModules" : [], "uname" : { "sysname" : "Linux", "release" : "3.13.0-112-generic", "version" : "#159-Ubuntu SMP Fri Mar 3 15:26:07 UTC 2017", "machine" : "x86_64" }, "somap" : [ { "b" : "7EFDE78E8000", "elfType" : 3, "buildId" : "9A34D266FEE22AA39FD157A54318E11A8F6D072B" }, { "b" : "7FFF268AB000", "elfType" : 3, "buildId" : "012E1338BA43AF7C0DC7D069F64F0A6490CC6D9C" }, { "b" : "7EFDE7457000", "path" : "/lib/x86_64-linux-gnu/libssl.so.1.0.0", "elfType" : 3, "buildId" : "DCF10134B91ED2139E3E8C72564668F5CDBA8522" }, { "b" : "7EFDE7013000", "path" : "/lib/x86_64-linux-gnu/libcrypto.so.1.0.0", "elfType" : 3, "buildId" : "1649272BE0CA9FA22F082DC86372B6C9959779B0" }, { "b" : "7EFDE6E0B000", "path" : "/lib/x86_64-linux-gnu/librt.so.1", "elfType" : 3, "buildId" : "89C34D7A182387D76D5CDA1F7718F5D58824DFB3" }, { "b" : "7EFDE6C07000", "path" : "/lib/x86_64-linux-gnu/libdl.so.2", "elfType" : 3, "buildId" : 
"8CC8D0D119B142D839800BFF71FB71E73AEA7BD4" }, { "b" : "7EFDE68FE000", "path" : "/lib/x86_64-linux-gnu/libm.so.6", "elfType" : 3, "buildId" : "DFB85DE42DAFFD09640C8FE377D572DE3E168920" }, { "b" : "7EFDE66E8000", "path" : "/lib/x86_64-linux-gnu/libgcc_s.so.1", "elfType" : 3, "buildId" : "68220AE2C65D65C1B6AAA12FA6765A6EC2F5F434" }, { "b" : "7EFDE64CB000", "path" : "/lib/x86_64-linux-gnu/libpthread.so.0", "elfType" : 3, "buildId" : "CE17E023542265FC11D9BC8F534BB4F070493D30" }, { "b" : "7EFDE6101000", "path" : "/lib/x86_64-linux-gnu/libc.so.6", "elfType" : 3, "buildId" : "B5381A457906D279073822A5CEB24C4BFEF94DDB" }, { "b" : "7EFDE76C0000", "path" : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3, "buildId" : "5D7B6259552275A3C17BD4C3FD05F5A6BF40CAA5" } ] }} mongod(_ZN5mongo15printStackTraceERSo+0x41) [0x7efde8e5a6a1] mongod(+0x15718B9) [0x7efde8e598b9] mongod(+0x1571D9D) [0x7efde8e59d9d] libpthread.so.0(+0x11390) [0x7efde64dc390] libc.so.6(gsignal+0x38) [0x7efde6136428] libc.so.6(abort+0x16A) [0x7efde613802a] mongod(_ZN5mongo42fassertFailedWithStatusNoTraceWithLocationEiRKNS_6StatusEPKcj+0x0) [0x7efde8104eb7] mongod(_ZN5mongo4repl39ReplicationCoordinatorExternalStateImpl21cleanUpLastApplyBatchEPNS_16OperationContextE+0xA0D) [0x7efde88c129d] mongod(_ZN5mongo4repl26ReplicationCoordinatorImpl21_startLoadLocalConfigEPNS_16OperationContextE+0x363) [0x7efde88d0423] mongod(_ZN5mongo4repl26ReplicationCoordinatorImpl7startupEPNS_16OperationContextE+0x1DC) [0x7efde88d1f4c] mongod(+0x80A8D1) [0x7efde80f28d1] mongod(main+0x966) [0x7efde8110a46] libc.so.6(__libc_start_main+0xF0) [0x7efde6121830] mongod(_start+0x29) [0x7efde81700f9] ----- END BACKTRACE -----
Fix Implementation
1. Return an error status from the SessionsCollection-derived ::setupSessionsCollection methods if the featureCompatibilityVersion (FCV) is not fully upgraded to 3.6. This status will be checked in the _refresh method ( https://github.com/mongodb/mongo/blob/r3.6.6/src/mongo/db/logical_session_cache_impl.cpp#L275-L279 ). This is the main defense against setting up the sessions collection too early.
2. Return early from _refresh if the FCV is not fully upgraded to 3.6, at https://github.com/mongodb/mongo/blob/r3.6.6/src/mongo/db/logical_session_cache_impl.cpp#L262 , i.e. after computing statistics but before calling setupSessionsCollection. This way the logs will not be polluted with the resulting error messages. Full downgrade support is tracked in SERVER-36104.
3. This is not required for this fix, but is nice to have while we are at it: change the return type of initializeOperationSessionInfo ( https://github.com/mongodb/mongo/blob/r3.6.6/src/mongo/db/initialize_operation_session_info.cpp#L41 ) from void to Status, so that it no longer calls uassert internally, and uassert on the returned Status in the callers instead.
- is related to
-
SERVER-35957 While upgrading from Shard 3.4 to Shard 3.6 config.system.sessions is not sharded.
- Closed
- related to
-
SERVER-36104 LogicalSessions should destroy cache on setting FCV from 3.6 to 3.4
- Closed
-
SERVER-36223 Add hook for the fuzzer to not send lsid in the preamble on v3.6
- Closed