Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-47554

Replica Set member suddenly stopped replication process

    • Type: Icon: Bug Bug
    • Resolution: Gone away
    • Priority: Icon: Major - P3 Major - P3
    • None
    • Affects Version/s: 4.2.5
    • Component/s: None
    • None
    • Service Arch
    • ALL
    • Show
      db1-4-diag.zip

      We've recently updated our 3.4 Replica Set to 4.2.5 and faced an issue tonight.

      One of replica set members (db3) suddenly stopped replicating. All data was stale on it and optime showed that it lags 9+ hours behind primary.

      The last meaningfull log message on db3 was:

      2020-04-14T22:32:57.641+0000 I  CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db3:27017 due to ShutdownInProgress: Pool for db3:27017 has expired.
      2020-04-14T22:32:57.641+0000 I  CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db1:27017 due to ShutdownInProgress: Pool for db1:27017 has expired.
      2020-04-14T22:32:57.641+0000 I  CONNPOOL [ReplicaSetMonitor-TaskExecutor] Dropping all pooled connections to db2:27017 due to ShutdownInProgress: Pool for db2:27017 has expired.
      

      Replica Set status as seen from Primary at the time of incident (db1):

      {
      	"set" : "***",
      	"date" : ISODate("2020-04-15T08:26:26.804Z"),
      	"myState" : 1,
      	"term" : NumberLong(22),
      	"syncingTo" : "",
      	"syncSourceHost" : "",
      	"syncSourceId" : -1,
      	"heartbeatIntervalMillis" : NumberLong(2000),
      	"majorityVoteCount" : 3,
      	"writeMajorityCount" : 3,
      	"optimes" : {
      		"lastCommittedOpTime" : {
      			"ts" : Timestamp(1586939186, 813),
      			"t" : NumberLong(22)
      		},
      		"lastCommittedWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
      		"readConcernMajorityOpTime" : {
      			"ts" : Timestamp(1586939186, 813),
      			"t" : NumberLong(22)
      		},
      		"readConcernMajorityWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
      		"appliedOpTime" : {
      			"ts" : Timestamp(1586939186, 813),
      			"t" : NumberLong(22)
      		},
      		"durableOpTime" : {
      			"ts" : Timestamp(1586939186, 813),
      			"t" : NumberLong(22)
      		},
      		"lastAppliedWallTime" : ISODate("2020-04-15T08:26:26.798Z"),
      		"lastDurableWallTime" : ISODate("2020-04-15T08:26:26.798Z")
      	},
      	"lastStableRecoveryTimestamp" : Timestamp(1586939153, 301),
      	"lastStableCheckpointTimestamp" : Timestamp(1586939153, 301),
      	"electionCandidateMetrics" : {
      		"lastElectionReason" : "priorityTakeover",
      		"lastElectionDate" : ISODate("2020-04-07T06:22:02.485Z"),
      		"electionTerm" : NumberLong(22),
      		"lastCommittedOpTimeAtElection" : {
      			"ts" : Timestamp(1586240522, 526),
      			"t" : NumberLong(21)
      		},
      		"lastSeenOpTimeAtElection" : {
      			"ts" : Timestamp(1586240522, 526),
      			"t" : NumberLong(21)
      		},
      		"numVotesNeeded" : 3,
      		"priorityAtElection" : 2,
      		"electionTimeoutMillis" : NumberLong(10000),
      		"priorPrimaryMemberId" : 9,
      		"numCatchUpOps" : NumberLong(4),
      		"newTermStartDate" : ISODate("2020-04-07T06:22:02.539Z"),
      		"wMajorityWriteAvailabilityDate" : ISODate("2020-04-07T06:22:03.616Z")
      	},
      	"members" : [
      		{
      			"_id" : 7,
      			"name" : "db4:27017",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 748567,
      			"optime" : {
      				"ts" : Timestamp(1586939185, 78),
      				"t" : NumberLong(22)
      			},
      			"optimeDurable" : {
      				"ts" : Timestamp(1586939185, 78),
      				"t" : NumberLong(22)
      			},
      			"optimeDate" : ISODate("2020-04-15T08:26:25Z"),
      			"optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
      			"lastHeartbeat" : ISODate("2020-04-15T08:26:25.058Z"),
      			"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:25.034Z"),
      			"pingMs" : NumberLong(0),
      			"lastHeartbeatMessage" : "",
      			"syncingTo" : "db1:27017",
      			"syncSourceHost" : "db1:27017",
      			"syncSourceId" : 8,
      			"infoMessage" : "",
      			"configVersion" : 128
      		},
      		{
      			"_id" : 8,
      			"name" : "db1:27017",
      			"health" : 1,
      			"state" : 1,
      			"stateStr" : "PRIMARY",
      			"uptime" : 751010,
      			"optime" : {
      				"ts" : Timestamp(1586939186, 813),
      				"t" : NumberLong(22)
      			},
      			"optimeDate" : ISODate("2020-04-15T08:26:26Z"),
      			"syncingTo" : "",
      			"syncSourceHost" : "",
      			"syncSourceId" : -1,
      			"infoMessage" : "",
      			"electionTime" : Timestamp(1586240522, 538),
      			"electionDate" : ISODate("2020-04-07T06:22:02Z"),
      			"configVersion" : 128,
      			"self" : true,
      			"lastHeartbeatMessage" : ""
      		},
      		{
      			"_id" : 9,
      			"name" : "db2:27017",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 751002,
      			"optime" : {
      				"ts" : Timestamp(1586939185, 995),
      				"t" : NumberLong(22)
      			},
      			"optimeDurable" : {
      				"ts" : Timestamp(1586939185, 991),
      				"t" : NumberLong(22)
      			},
      			"optimeDate" : ISODate("2020-04-15T08:26:25Z"),
      			"optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
      			"lastHeartbeat" : ISODate("2020-04-15T08:26:25.969Z"),
      			"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:24.893Z"),
      			"pingMs" : NumberLong(0),
      			"lastHeartbeatMessage" : "",
      			"syncingTo" : "db1:27017",
      			"syncSourceHost" : "db1:27017",
      			"syncSourceId" : 8,
      			"infoMessage" : "",
      			"configVersion" : 128
      		},
      		{
      			"_id" : 10,
      			"name" : "db3:27017",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 751002,
      			"optime" : {
      				"ts" : Timestamp(1586903526, 1069),
      				"t" : NumberLong(22)
      			},
      			"optimeDurable" : {
      				"ts" : Timestamp(1586903526, 1069),
      				"t" : NumberLong(22)
      			},
      			"optimeDate" : ISODate("2020-04-14T22:32:06Z"),
      			"optimeDurableDate" : ISODate("2020-04-14T22:32:06Z"),
      			"lastHeartbeat" : ISODate("2020-04-15T08:26:25.034Z"),
      			"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:25.594Z"),
      			"pingMs" : NumberLong(0),
      			"lastHeartbeatMessage" : "",
      			"syncingTo" : "db1:27017",
      			"syncSourceHost" : "db1:27017",
      			"syncSourceId" : 8,
      			"infoMessage" : "",
      			"configVersion" : 128
      		},
      		{
      			"_id" : 11,
      			"name" : "db1-dc2:27017",
      			"health" : 1,
      			"state" : 2,
      			"stateStr" : "SECONDARY",
      			"uptime" : 751002,
      			"optime" : {
      				"ts" : Timestamp(1586939185, 627),
      				"t" : NumberLong(22)
      			},
      			"optimeDurable" : {
      				"ts" : Timestamp(1586939185, 622),
      				"t" : NumberLong(22)
      			},
      			"optimeDate" : ISODate("2020-04-15T08:26:25Z"),
      			"optimeDurableDate" : ISODate("2020-04-15T08:26:25Z"),
      			"lastHeartbeat" : ISODate("2020-04-15T08:26:25.660Z"),
      			"lastHeartbeatRecv" : ISODate("2020-04-15T08:26:24.804Z"),
      			"pingMs" : NumberLong(1),
      			"lastHeartbeatMessage" : "",
      			"syncingTo" : "db1:27017",
      			"syncSourceHost" : "db1:27017",
      			"syncSourceId" : 8,
      			"infoMessage" : "",
      			"configVersion" : 128
      		}
      	],
      	"ok" : 1,
      	"$clusterTime" : {
      		"clusterTime" : Timestamp(1586939186, 814),
      		"signature" : {
      			"hash" : BinData(0,"zGvmG5qYkIif17pfXKXh2dgvwIg="),
      			"keyId" : NumberLong("6812519156018054020")
      		}
      	},
      	"operationTime" : Timestamp(1586939186, 813)
      }
      
      ***:PRIMARY> rs.printSlaveReplicationInfo()
      source: db4:27017
              syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
              1 secs (0 hrs) behind the primary 
      source: db2:27017
              syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
              1 secs (0 hrs) behind the primary 
      source: db3:27017
              syncedTo: Tue Apr 14 2020 22:32:06 GMT+0000 (UTC)
              35410 secs (9.84 hrs) behind the primary 
      source: db1-dc2:27017
              syncedTo: Wed Apr 15 2020 08:22:15 GMT+0000 (UTC)
              1 secs (0 hrs) behind the primary
      

      All other metrics (CPU Usage, Memory, Network, etc) were fine.
      Moreover db3 served read queries perfectly fine with outdated data and that caused major problems due to write conflicts on primary db1.

      Additionally attaching diagnostic.data from all dbs.

        1. SERVER-47554.png
          SERVER-47554.png
          350 kB
        2. SERVER-47554_apr26.png
          SERVER-47554_apr26.png
          192 kB
        3. decommit.png
          decommit.png
          122 kB
        4. db1-4-diag.zip
          39.65 MB

            Assignee:
            backlog-server-servicearch [DO NOT USE] Backlog - Service Architecture
            Reporter:
            neanton@gmail.com Anton Neznaienko
            Votes:
            1 Vote for this issue
            Watchers:
            17 Start watching this issue

              Created:
              Updated:
              Resolved: