-
Type: Bug
-
Resolution: Cannot Reproduce
-
Priority: Major - P3
-
None
-
Affects Version/s: 3.0.0
-
Component/s: Replication, WiredTiger
-
None
-
Fully Compatible
-
ALL
We noticed that one of our 4 shards had a secondary with high replication lag.
Replica shard log has periodic very slow serverStatus with large values for "after oplog":
2015-03-18T18:30:44.539+0000 I COMMAND [conn82] serverStatus was very slow:
{ after basic: 0, after asserts: 0, after connections: 0, after cursors: 0, after extra_info: 0, after globalLock: 0, after locks: 0, after network: 0, after opcounters: 0, after opcountersRepl: 0, after oplog: 72189, after repl: 72189, after storageEngine: 72189, after wiredTiger: 72189, at end: 72189 }Restarted the replica shard hoping that this was a fluke, but the shard quickly went back to a low insertion rate.
Running db.getSiblingDB("local").oplog.rs.stats() produces:
{ "ns" : "local.oplog.rs", "count" : 174896077, "size" : 125833992359, "avgObjSize" : 719, "storageSize" : 62140203008, "capped" : true, "max" : -1, "maxSize" : NumberLong("125829120000"), "wiredTiger" : { "metadata" : { "formatVersion" : 1, "oplogKeyExtractionVersion" : 1 }, "creationString" : "allocation_size=4KB,app_metadata=(formatVersion=1,oplogKeyExtractionVersion=1),block_allocation=best,block_compressor=snappy,cache_resident=0,checkpoint=(WiredTigerCheckpoint.23998=(addr=\"01e30ad2de81e436162abee3ddf13ebde46538e310e3ddf17bb3e43c3b848d808080e50e77d81fc0e50bad737fc0\",order=23998,time=1426703669,size=50154676224,write_gen=20018662)),checkpoint_lsn=(7514,101134976),checksum=uncompressed,collator=,columns=,dictionary=0,format=btree,huffman_key=,huffman_value=,id=9,internal_item_max=0,internal_key_max=0,internal_key_truncate=,internal_page_max=4KB,key_format=q,key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=1MB,memory_page_max=10m,os_cache_dirty_max=0,os_cache_max=0,prefix_compression=0,prefix_compression_min=4,split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90,value_format=u,version=(major=1,minor=1)", "type" : "file", "uri" : "statistics:table:local/collection-2--4790446482890547801", "LSM" : { "bloom filters in the LSM tree" : 0, "bloom filter false positives" : 0, "bloom filter hits" : 0, "bloom filter misses" : 0, "bloom filter pages evicted from cache" : 0, "bloom filter pages read into cache" : 0, "total size of bloom filters" : 0, "sleep for LSM checkpoint throttle" : 0, "chunks in the LSM tree" : 0, "highest merge generation in the LSM tree" : 0, "queries that could have benefited from a Bloom filter that did not exist" : 0, "sleep for LSM merge throttle" : 0 }, "block-manager" : { "file allocation unit size" : 4096, "blocks allocated" : 176633, "checkpoint size" : 50154676224, "allocations requiring file extension" : 0, "blocks freed" : 206417, "file magic number" : 120897, "file major version number" : 1, "minor version number" : 0, "file bytes available for reuse" : 12083351552, "file size in bytes" : 62140203008 }, "btree" : { "column-store variable-size deleted values" : 0, "column-store fixed-size leaf pages" : 0, "column-store internal pages" : 0, "column-store variable-size leaf pages" : 0, "pages rewritten by compaction" : 0, "number of key/value pairs" : 0, "fixed-record size" : 0, "maximum tree depth" : 32, "maximum internal page key size" : 368, "maximum internal page size" : 4096, "maximum leaf page key size" : 3276, "maximum leaf page size" : 32768, "maximum leaf page value size" : 1048576, "overflow pages" : 0, "row-store internal pages" : 0, "row-store leaf pages" : 0 }, "cache" : { "bytes read into cache" : 5684148385, "bytes written from cache" : 5626150236, "checkpoint blocked page eviction" : 11413, "unmodified pages evicted" : 175446, "page split during eviction deepened the tree" : 1, "modified pages evicted" : 3911, "data source pages selected for eviction unable to be evicted" : 2675, "hazard pointer blocked page eviction" : 589, "internal pages evicted" : 1882, "pages split during eviction" : 585, "in-memory page splits" : 553, "overflow values cached in memory" : 0, "pages read into cache" : 178344, "overflow pages read into cache" : 0, "pages written from cache" : 176607 }, "compression" : { "raw compression call failed, no additional data available" : 0, "raw compression call failed, additional data available" : 0, "raw compression call succeeded" : 0, "compressed pages read" : 176000, "compressed pages written" : 173316, "page written failed to compress" : 3, "page written was too small to compress" : 3288 }, "cursor" : { "create calls" : 128, "insert calls" : 10935859, "bulk-loaded cursor-insert calls" : 0, "cursor-insert key and value bytes inserted" : 5679356593, "next calls" : 7327404, "prev calls" : 35, "remove calls" : 561, "cursor-remove key bytes removed" : 5049, "reset calls" : 10937358, "search calls" : 0, "search near calls" : 1122, "update calls" : 0, "cursor-update value bytes updated" : 0 }, "reconciliation" : { "dictionary matches" : 0, "internal page multi-block writes" : 25, "leaf page multi-block writes" : 605, "maximum blocks required for a page" : 236, "internal-page overflow keys" : 0, "leaf-page overflow keys" : 0, "overflow values written" : 0, "pages deleted" : 2780, "page checksum matches" : 5294, "page reconciliation calls" : 6475, "page reconciliation calls for eviction" : 1885, "leaf page key bytes discarded using prefix compression" : 0, "internal page key bytes discarded using suffix compression" : 175625 }, "session" : { "object compaction" : 0, "open cursor count" : 128 }, "transaction" : { "update conflicts" : 0 } }, "nindexes" : 0, "totalIndexSize" : 0, "indexSizes" : { }, "ok" : 1 }