@@ -395,6 +395,7 @@ MtmRefreshClusterStatus()
395395
396396 /*
397397 * Check for referee decision when only half of nodes are visible.
398+ * Do not hold the lock here, but recheck later whether the mask changed.
398399 */
399400 if (MtmRefereeConnStr && * MtmRefereeConnStr && !Mtm -> refereeWinnerId &&
400401 countZeroBits (SELF_CONNECTIVITY_MASK , Mtm -> nAllNodes ) == Mtm -> nAllNodes /2 )
@@ -406,29 +407,40 @@ MtmRefreshClusterStatus()
406407 Mtm -> refereeWinnerId = winner_node_id ;
407408 if (!BIT_CHECK (SELF_CONNECTIVITY_MASK , winner_node_id - 1 ))
408409 {
409- MTM_LOG1 ("[STATE] Referee allowed to proceed with half of the nodes (winner_id = %d)" ,
410- winner_node_id );
411- Mtm -> refereeGrant = true;
410+ /*
411+ * By the time we enter this block we can already see other nodes.
412+ * So recheck old conditions under lock.
413+ */
412414 MtmLock (LW_EXCLUSIVE );
413- if (countZeroBits (SELF_CONNECTIVITY_MASK , Mtm -> nAllNodes ) == 1 )
415+ if (countZeroBits (SELF_CONNECTIVITY_MASK , Mtm -> nAllNodes ) == Mtm -> nAllNodes /2 &&
416+ !BIT_CHECK (SELF_CONNECTIVITY_MASK , winner_node_id - 1 ))
414417 {
415- // XXXX: that is valid for two nodes. Better idea is to parametrize MtmPollStatus*
416- // functions.
417- int neighbor_node_id = MtmNodeId == 1 ? 2 : 1 ;
418- MtmPollStatusOfPreparedTransactionsForDisabledNode (neighbor_node_id , true);
418+ MTM_LOG1 ("[STATE] Referee allowed to proceed with half of the nodes (winner_id = %d)" ,
419+ winner_node_id );
420+ Mtm -> refereeGrant = true;
421+ if (countZeroBits (SELF_CONNECTIVITY_MASK , Mtm -> nAllNodes ) == 1 )
422+ {
423+ // XXXX: that is valid for two nodes. Better idea is to parametrize MtmPollStatus*
424+ // functions.
425+ int neighbor_node_id = MtmNodeId == 1 ? 2 : 1 ;
426+ MtmPollStatusOfPreparedTransactionsForDisabledNode (neighbor_node_id , true);
427+ }
428+ MtmEnableNode (MtmNodeId );
429+ MtmCheckState ();
419430 }
420- MtmEnableNode (MtmNodeId );
421- MtmCheckState ();
422431 MtmUnlock ();
423432 }
424433 }
425434 }
426435
427436 /*
428- * Clear winner if we again have all nodes online.
437+ * Clear winner if we again have all nodes recovered.
438+ * We should clear the old value based on disabledNodeMask instead of SELF_CONNECTIVITY_MASK,
439+ * because we can clear the old value before the failed node starts its recovery, and that node
440+ * can get refereeGrant before the walsender starts, so it starts in recovered mode.
429441 */
430- if (MtmRefereeConnStr && * MtmRefereeConnStr && Mtm -> refereeWinnerId &&
431- countZeroBits (SELF_CONNECTIVITY_MASK , Mtm -> nAllNodes ) == Mtm -> nAllNodes )
442+ if (MtmRefereeConnStr && * MtmRefereeConnStr && Mtm -> refereeWinnerId &&
443+ countZeroBits (Mtm -> disabledNodeMask , Mtm -> nAllNodes ) == Mtm -> nAllNodes )
432444 {
433445 if (MtmRefereeClearWinner ())
434446 {
@@ -438,8 +450,10 @@ MtmRefreshClusterStatus()
438450 }
439451 }
440452
441- /* Do not check clique with referee grant */
442- if (Mtm -> refereeWinnerId )
453+ /*
454+ * Do not check clique with a referee grant, because we can disable ourselves.
455+ */
456+ if (Mtm -> refereeGrant )
443457 return ;
444458
445459 /*
0 commit comments