-
Type: Improvement
-
Resolution: Unresolved
-
Priority: Major - P3
-
None
-
Affects Version/s: None
-
Component/s: None
-
Storage Engines
-
StorEng - Defined Pipeline
Background :
The following is a slow check of a MongoDB cluster. During the slow check, the usage of mongod node resources (CPU and MEM) is low, and the _id query is simple. diagnose.data finally confirms that the problem is caused by wt reconcile and page lock. The mongod.log slow log is as follows:
{"t":{"$date":"2024-06-01T15:15:36.781+08:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn934679","msg":"Slow query","attr":{"type":"command","ns":"ycsb.test","command":{"insert":"test","ordered":true,"writeConcern":{"w":2},"$db":"ycsb","$clusterTime":{"clusterTime":{"$timestamp":{"t":1717658125,"i":10261}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"lsid":{"id":{"$uuid":"ae524131-e013-4e4a-b38c-764148abc4e7"}}},"ninserted":1,"keysInserted":1,"numYields":0,"reslen":230,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"w":1}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"w":1}},"Database":{"acquireCount":{"w":1}},"Collection":{"acquireCount":{"w":1}},"Mutex":{"acquireCount":{"r":1}}},"flowControl":{"acquireCount":1,"timeAcquiringMicros":8},"readConcern":{"level":"local","provenance":"implicitDefault"},"writeConcern":{"w":2,"wtimeout":0,"provenance":"clientSupplied"},"storage":{},"remote":"xx.xx.0.39:41134","protocol":"op_msg","durationMillis":10580}} {"t":{"$date":"2024-06-01T15:15:36.782+08:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn934699","msg":"Slow query","attr":{"type":"command","ns":"ycsb.test","command":{"insert":"test","ordered":true,"writeConcern":{"w":2},"$db":"ycsb","$clusterTime":{"clusterTime":{"$timestamp":{"t":1717658125,"i":10261}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"lsid":{"id":{"$uuid":"41acb827-f32b-4671-b5c4-39e6432d94af"}}},"ninserted":1,"keysInserted":1,"numYields":0,"reslen":230,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"w":1}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"w":1}},"Database":{"acquireCount":{"w":1}},"Collection":{"acquireCount":{"w":1}},"Mutex":{"acquireCount":{"r":1}}},"flowControl":{"acquireCount":1,"timeAcquiringMicros":1},"readConcern":{"level":"local","provenance":"implicitDefault"},"writeConcern":{"w":2,"wtimeout":0,"provenance":"clientSupplied"},"storage":{},"remote":"xx.xx.0.39:41336","protocol":"op_msg","durationMillis":10579}} {"t":{"$date":"2024-06-01T15:15:36.782+08:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn934701","msg":"Slow query","attr":{"type":"command","ns":"ycsb.test","command":{"insert":"test","ordered":true,"writeConcern":{"w":2},"$db":"ycsb","$clusterTime":{"clusterTime":{"$timestamp":{"t":1717658125,"i":10261}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"lsid":{"id":{"$uuid":"b986eada-8c78-4d38-85af-d79918c004fe"}}},"ninserted":1,"keysInserted":1,"numYields":0,"reslen":230,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"w":1}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"w":1}},"Database":{"acquireCount":{"w":1}},"Collection":{"acquireCount":{"w":1}},"Mutex":{"acquireCount":{"r":1}}},"flowControl":{"acquireCount":1,"timeAcquiringMicros":1},"readConcern":{"level":"local","provenance":"implicitDefault"},"writeConcern":{"w":2,"wtimeout":0,"provenance":"clientSupplied"},"storage":{},"remote":"xx.xx.0.39:41366","protocol":"op_msg","durationMillis":10579}} {"t":{"$date":"2024-06-01T15:15:36.782+08:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn934650","msg":"Slow query","attr":{"type":"command","ns":"ycsb.test","command":{"insert":"test","ordered":true,"writeConcern":{"w":2},"$db":"ycsb","$clusterTime":{"clusterTime":{"$timestamp":{"t":1717658125,"i":10261}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"lsid":{"id":{"$uuid":"8e807dea-e498-42a1-a961-ce05a4446c9d"}}},"ninserted":1,"keysInserted":1,"numYields":0,"reslen":230,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"w":1}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"w":1}},"Database":{"acquireCount":{"w":1}},"Collection":{"acquireCount":{"w":1}},"Mutex":{"acquireCount":{"r":1}}},"flowControl":{"acquireCount":1,"timeAcquiringMicros":1},"readConcern":{"level":"local","provenance":"implicitDefault"},"writeConcern":{"w":2,"wtimeout":0,"provenance":"clientSupplied"},"storage":{},"remote":"xx.xx.0.39:40870","protocol":"op_msg","durationMillis":10580}} {"t":{"$date":"2024-06-01T15:15:36.782+08:00"},"s":"I", "c":"COMMAND", "id":51803, "ctx":"conn934714","msg":"Slow query","attr":{"type":"command","ns":"ycsb.test","command":{"insert":"test","ordered":true,"writeConcern":{"w":2},"$db":"ycsb","$clusterTime":{"clusterTime":{"$timestamp":{"t":1717658125,"i":10261}},"signature":{"hash":{"$binary":{"base64":"AAAAAAAAAAAAAAAAAAAAAAAAAAA=","subType":"0"}},"keyId":0}},"lsid":{"id":{"$uuid":"65f9547c-9637-4b59-8466-71111b0f8ded"}}},"ninserted":1,"keysInserted":1,"numYields":0,"reslen":230,"locks":{"ParallelBatchWriterMode":{"acquireCount":{"r":1}},"FeatureCompatibilityVersion":{"acquireCount":{"w":1}},"ReplicationStateTransition":{"acquireCount":{"w":1}},"Global":{"acquireCount":{"w":1}},"Database":{"acquireCount":{"w":1}},"Collection":{"acquireCount":{"w":1}},"Mutex":{"acquireCount":{"r":1}}},"flowControl":{"acquireCount":1,"timeAcquiringMicros":1},"readConcern":{"level":"local","provenance":"implicitDefault"},"writeConcern":{"w":2,"wtimeout":0,"provenance":"clientSupplied"},"storage":{},"remote":"xx.xx.0.39:41486","protocol":"op_msg","durationMillis":10579}}
From the slow log, you can see :"storage":{}, there is no storage information. It took me three days to analyze the problem
Through this example, I think there are the following problems:
1. It is not possible to visually determine whether the slow query is caused by mongo server or wt using mongod.log.
2. It is not possible to intuitively determine which module of wt is causing the problem
3. In the scenario of diagnosing a small number of slow searches, the diagnosis data of wt is basically useless, because the statistics are easy to be averaged out