Uploaded image for project: 'Core Server'
  1. Core Server
  2. SERVER-17423

sh.stopBalancer does not respect the passed timeout and interval

    • Type: Icon: Bug Bug
    • Resolution: Done
    • Priority: Icon: Major - P3 Major - P3
    • None
    • Affects Version/s: None
    • Component/s: Shell
    • Sharding
    • ALL
    • Hide

      jstest is attached. Basically:

      1. Start sharded cluster
      2. sh.stopBalancer()
      3. db.getSiblingDB("config").locks.update({_id:"balancer"},{$set:{state:2}})
      4. sh.stopBalancer(1000) takes 15 minutes to time out, not 1 second
      Show
      jstest is attached. Basically: (1) Start sharded cluster; (2) sh.stopBalancer(); (3) db.getSiblingDB("config").locks.update({_id:"balancer"},{$set:{state:2}}); (4) sh.stopBalancer(1000) takes 15 minutes to time out, not 1 second.

      sh.stopBalancer() accepts a timeout and interval, and passes them to sh.waitForBalancer(), which in turn passes them to sh.waitForBalancerOff():

      /**
       * Turns the balancer setting off and then waits for balancing to stop.
       *
       * @param timeout  forwarded unchanged to sh.waitForBalancer (the report's
       *                 example treats 1000 as one second, i.e. milliseconds)
       * @param interval forwarded unchanged to sh.waitForBalancer
       */
      sh.stopBalancer = function( timeout, interval ) {
          // Request the "off" state first ...
          sh.setBalancerState( false )
          // ... then block; passing false selects the sh.waitForBalancerOff branch
          sh.waitForBalancer( false, timeout, interval )
      }
      
      /**
       * Waits for the balancer either to show activity or to stop, depending on onOrNot.
       *
       * @param onOrNot  truthy: wait on the "balancer" lock via sh.waitForDLock;
       *                 falsy: wait for balancing to stop via sh.waitForBalancerOff
       * @param timeout  forwarded unchanged to whichever helper is called
       * @param interval forwarded unchanged to whichever helper is called
       */
      sh.waitForBalancer = function( onOrNot, timeout, interval ){
          
          // If we're waiting for the balancer to turn on or switch state or
          // go to a particular state
          if( onOrNot ){
              // Just wait for the balancer lock to change, can't ensure we'll ever see it
              // actually locked
              // (undefined is passed as the second argument instead of a specific on/off value)
              sh.waitForDLock( "balancer", undefined, timeout, interval )
          }
          else {
              // Otherwise we need to wait until we're sure balancing stops
              sh.waitForBalancerOff( timeout, interval )
          }
          
      }
      

      However, sh.waitForBalancerOff does not pass these values through to sh.waitForDLock, instead passing a hardcoded value of 15 minutes:

      /**
       * Waits until balancing appears to have stopped, in three phases: wait on the
       * pings of the active mongos hosts, wait on the "balancer" distributed lock,
       * then wait briefly on the pings again and warn about the hosts that remain.
       *
       * NOTE(review): timeout and interval are accepted but never referenced in this
       * body. The lock wait below always uses a fixed 15 minute timeout and passes no
       * interval, so values supplied by callers such as sh.stopBalancer have no effect.
       *
       * @param timeout  not used by this body (see note above)
       * @param interval not used by this body (see note above)
       * @throws Error wrapping whatever sh.waitForDLock threw
       */
      sh.waitForBalancerOff = function( timeout, interval ){
          
          // Read every document from config.mongos and keep those whose
          // "waiting" field is falsy
          var pings = db.getSisterDB( "config" ).mongos.find().toArray()
          var activePings = []
          for( var i = 0; i < pings.length; i++ ){
              if( ! pings[i].waiting ) activePings.push( pings[i] )
          }
          
          print( "Waiting for active hosts..." )
          
          // Fixed 60 second wait; the result replaces activePings
          // (presumably the hosts whose ping did not change - confirm in sh.waitForPingChange)
          activePings = sh.waitForPingChange( activePings, 60 * 1000 )
          
          // After 1min, we assume that all hosts with unchanged pings are either 
          // offline (this is enough time for a full errored balance round, if a network
          // issue, which would reload settings) or balancing, which we wait for next
          // Legacy hosts we always have to wait for
          
          print( "Waiting for the balancer lock..." )
          
          // Wait for the balancer lock to become inactive
          // We can guess this is stale after 15 mins, but need to double-check manually
          try{ 
              // Hardcoded timeout and no interval argument: the function's
              // timeout/interval parameters are ignored here
              sh.waitForDLock( "balancer", false, 15 * 60 * 1000 )
          }
          catch( e ){
              print( "Balancer still may be active, you must manually verify this is not the case using the config.changelog collection." )
              // Rethrows a new Error built from e rather than e itself
              throw Error(e);
          }
              
          print( "Waiting again for active hosts after balancer is off..." )
          
          // Wait a short time afterwards, to catch the host which was balancing earlier
          // (fixed 5 second wait)
          activePings = sh.waitForPingChange( activePings, 5 * 1000 )
          
          // Warn about all the stale host pings remaining
          for( var i = 0; i < activePings.length; i++ ){
              print( "Warning : host " + activePings[i]._id + " seems to have been offline since " + activePings[i].ping )
          }
          
      }
      

      The 15-minute timeout should be a default that can be overridden, and the interval should be respected, i.e.:

      Unable to find source-code formatter for language: diff. Available languages are: actionscript, ada, applescript, bash, c, c#, c++, cpp, css, erlang, go, groovy, haskell, html, java, javascript, js, json, lua, none, nyan, objc, perl, php, python, r, rainbow, ruby, scala, sh, sql, swift, visualbasic, xml, yaml
      diff --git a/src/mongo/shell/utils_sh.js b/src/mongo/shell/utils_sh.js
      index d9c05a3..f9215bb 100644
      --- a/src/mongo/shell/utils_sh.js
      +++ b/src/mongo/shell/utils_sh.js
      @@ -225,7 +225,7 @@ sh.waitForBalancerOff = function( timeout, interval ){
           // Wait for the balancer lock to become inactive
           // We can guess this is stale after 15 mins, but need to double-check manually
           try{
      -        sh.waitForDLock( "balancer", false, 15 * 60 * 1000 )
      +        sh.waitForDLock( "balancer", false, timeout || 15 * 60 * 1000, interval )
           }
           catch( e ){
               print( "Balancer still may be active, you must manually verify this is not the case using the config.changelog collection." )
      

            Assignee:
            backlog-server-sharding [DO NOT USE] Backlog - Sharding Team
            Reporter:
            kevin.pulo@mongodb.com Kevin Pulo
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

              Created:
              Updated:
              Resolved: