* Did some brief testing with Slony 1.2, 2.0 and 2.1; all appear to work as expected.
author glyn <glyn@8kb.co.uk>
Sun, 12 Oct 2014 20:11:22 +0000 (21:11 +0100)
committer glyn <glyn@8kb.co.uk>
Sun, 12 Oct 2014 20:11:22 +0000 (21:11 +0100)
* Added some extra logic to try to better handle multiple node failover in Slony versions pre 2.2. Seems to work well for Slony 1.2, however issues in 2.0 and 2.1 limit usefulness.
* Added some extra sanity checks to autofailover, with new parameters autofailover_perspective_sleep_time, autofailover_majority_only, autofailover_is_quorum. I still think the logic used to pick the best node to fail over to is flawed.

README.md
slony_failover.conf
slony_failover.pl

index 481ded8..dbb06ef 100644 (file)
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ $ sudo invoke-rc.d slony_failover start
 ##Command line parameters
 
 ```bash
-$ ./failover.pl [options]
+$ ./slony_failover.pl [options]
 ```
 
 |Switch    | Description
@@ -68,7 +68,7 @@ $ ./failover.pl [options]
 | General     |**separate_working_directory**                | boolean                       | *'true'*                        | Append a separate working directory to the prefix_directory for each run
 | General     |**slonik_path**                               | /full/path/to/bin/directory   | *null*                          | Slonik binary if not in current path
 | General     |**pid_filename**                              | /path/to/pidfile              | *'/var/run/slony_failover.pid'* | Pid file to use when running in autofailover mode
-| General     |**enable_try_blocks**                         | boolean                       | *false*                         |    Write slonik script with try blocks where possible to aid error handling
+| General     |**enable_try_blocks**                         | boolean                       | *false*                         | Write slonik script with try blocks where possible to aid error handling
 | General     |**lockset_method**                            | single/multiple               | *'multiple'*                    | Write slonik script that locks all sets
 | General     |**pull_aliases_from_comments**                | boolean                       | *false*                         | If true, script will pull text from comment fields and use to generate
 |             |                                              |                               |                                 | possibly meaningful aliases for nodes and sets.
@@ -102,9 +102,20 @@ $ ./failover.pl [options]
 | Autofailover|**autofailover_forwarding_providers**         | boolean                       | *'false'*                       | If true a failure of a pure forwarding provider will also trigger failover
 | Autofailover|**autofailover_config_any_node**              | boolean                       | *'true'*                        | After reading the initial cluster configuration, subsequent reads of the configuration 
 |             |                                              |                               |                                 | will use conninfo read from sl_subscribe to read from any node.
-| Autofailover|**autofailover_poll_interval**                | integer                       | 500                             | How often to check for failure of nodes (milliseconds)
-| Autofailover|**autofailover_node_retry**                   | integer                       | 2                               | When failure is detected, retry this many times before initiating failover
-| Autofailover|**autofailover_sleep_time**                   | integer                       | 1000                            | Interval between retries (milliseconds)
+| Autofailover|**autofailover_poll_interval**                | integer                       | *500*                           | How often to check for failure of nodes (milliseconds)
+| Autofailover|**autofailover_node_retry**                   | integer                       | *2*                             | When failure is detected, retry this many times before initiating failover
+| Autofailover|**autofailover_sleep_time**                   | integer                       | *1000*                          | Interval between retries (milliseconds)
+| Autofailover|**autofailover_perspective_sleep_time**       | integer                       | *20000*                         | Interval between lag reads for failed nodes from surviving nodes. If greater 
+|             |                                              |                               |                                 | than zero any observation that nodes have failed is checked from surviving nodes
+|             |                                              |                               |                                 | perspective by checking if lag times are extending.  This does not guarantee 100%
+|             |                                              |                               |                                 | the nodes are down but if set to a large enough interval can back up our observation.
+| Autofailover|**autofailover_majority_only**                | boolean                       | *false*                         | Only fail over if the quantity of surviving nodes is greater than the quantity of
+|             |                                              |                               |                                 | failed nodes.  Intended to be used to prevent a split-brain scenario in 
+|             |                                              |                               |                                 | conjunction with some other logic to monitor and fence off the old origin if it
+|             |                                              |                               |                                 | is in the minority.
+| Autofailover|**autofailover_is_quorum**                    | boolean                       | *false*                         | If this script is running on a separate host set to true to treat it as a quorum
+|             |                                              |                               |                                 | server. Effectively increments sum of surviving nodes when calculating the 
+|             |                                              |                               |                                 | majority above.
 
 Changes
 -------
@@ -113,6 +124,7 @@ Changes
 * 04/11/2012 - Experiment with different use of try blocks (currently can't use multiple lock sets indide try)
 * 13/04/2014 - Update to work differently for Slony 2.2+
 * 05/05/2014 - Experiment with autofailover ideas
+* 10/09/2014 - Add some logic to autofailover for doing extra checks from perspective of other nodes. Still a naive autofailover implementation imho.
 
 Licence
 -------
index f1be52a..194c20b 100644 (file)
@@ -87,3 +87,20 @@ autofailover_config_any_node = true         # After reading the initial cluster
 autofailover_poll_interval = 50             # Cluster state poll interval (milliseconds)
 autofailover_node_retry = 2                 # Retries before initiating failover
 autofailover_sleep_time = 5000              # Interval between retries (milliseconds)
+autofailover_perspective_sleep_time = 20000 # Interval between lag reads for failed nodes
+                                            # from surviving nodes. If greater than zero
+                                            # any observation that nodes have failed is
+                                            # checked from surviving nodes perspective by
+                                            # checking if lag times are extending.
+                                            # This does not guarantee 100% the nodes are down
+                                            # but if set to a large enough interval can back
+                                            # up our observation.
+autofailover_majority_only = true           # Only fail over if the quantity of surviving nodes
+                                            # is greater than the quantity of failed nodes.
+                                            # Intended to be used to prevent a split-brain scenario
+                                            # in conjunction with some other logic to monitor and 
+                                            # fence off the old origin if it is in the minority.
+autofailover_is_quorum = true               # If this script is running on a separate host set to 
+                                            # true to treat it as a quorum server. Effectively 
+                                            # increments sum of surviving nodes when calculating
+                                            # the majority above.
index 19bedcf..a6020fd 100755 (executable)
@@ -1,9 +1,9 @@
 #!/usr/bin/perl
 
 # Script:      failover.pl
-# Copyright:   08/04/2012: v1.0.1 Glyn Astill <glyn@8kb.co.uk>
+# Copyright:   08/04/2012: v1.0.2 Glyn Astill <glyn@8kb.co.uk>
 # Requires:    Perl 5.10.1+, Data::UUID, File::Slurp
-#               PostgreSQL 9.0+ Slony-I 2.0+
+#               PostgreSQL 9.0+ Slony-I 1.2+ / 2.0+
 #
 # This script is a command-line utility to manage switchover and failover
 # of replication sets in Slony-I clusters.
@@ -36,7 +36,7 @@ use Config qw/%Config/;
 use constant false => 0;
 use constant true  => 1;
 
-my $g_script_version = '1.0.1';
+my $g_script_version = '1.0.2';
 my $g_debug = false;
 my $g_pidfile = '/var/run/slony_failover.pid';
 my $g_pid_written = false;
@@ -55,6 +55,7 @@ my $g_use_try_blocks = false;
 my $g_lockset_method = 'multiple';
 my $g_logfile = 'failover.log';
 my $g_input;
+my $g_silence_notice = false;
 my $g_reason;
 my $g_script;
 my $g_node_from;
@@ -88,6 +89,9 @@ my $g_autofailover_retry = 2;
 my $g_autofailover_retry_sleep = 1000;
 my $g_autofailover_provs = false;
 my $g_autofailover_config_any = true;
+my $g_autofailover_perspective_sleep = 20000;
+my $g_autofailover_majority_only = false;
+my $g_autofailover_is_quorum = false;
 my @g_unresponsive;
 my %g_backups;
 my $g_pid = $$;
@@ -95,7 +99,7 @@ my $g_pid = $$;
 
 my %message = (
 'en' => {
-    'usage'                            => q{-h <host> -p <port> -db <database> -c <cluster name> -u <username> -P <password> (Password option not recommended; use pgpass instead)},
+    'usage'                            => q{-h <host> -p <port> -db <database> -cl <cluster name> -u <username> -P <password> -f <config file> (Password option not recommended; use pgpass instead)},
     'title'                            => q{Slony-I failover script version $1},
     'cluster_fixed'                    => q{Aborting failover action: all origin nodes now responsive},
     'cluster_failed'                   => q{Found $1 failed nodes, sleeping for $2ms before retry $3 of $4},
@@ -120,10 +124,20 @@ my %message = (
     'autofailover_promote_found'       => q{Using previously found most up to date subscriber to all sets ($1) on unresponsive node $2},
     'autofailover_promote_skip'        => q{No failover required for unresponsive node $1 as it is neither the origin or an active forwarder of any sets},
     'autofailover_promote_fail'        => q{Could not find suitable backup node for promotion},
-    'autofailover_node_detail'         => q{Node $1 is $2 and provides sets $3 at $4 lag},
-    'autofailover_promote_best'        => q{Best node for promotion is node $1 seq = $2},
+    'autofailover_node_detail'         => q{Node $1 is $2 subscribed to ($3) node $4 and provides sets $5 at $6 lag ($7 events)},
+    'autofailover_node_detail_subonly' => q{Node $1 is $2 subscribed to ($3) node $4 and is a subscriber only at $5 lag ($6 events)},
+    'autofailover_promote_best'        => q{Best node for promotion is node $1 seq = $2 ($3 events)},
     'autofailover_unresponsive'        => q{Detected unresponsive provider node: $1},
     'autofailover_unresponsive_subonly'=> q{Detected unresponsive subscriber only node: $1},
+    'autofailover_pspec_check_fail'    => q{Failed to connect to node $1: $2},
+    'autofailover_pspec_check'         => q{Getting objective judgement from other nodes, apparent unresponsive nodes are : $1 (Failed nodes = $2 of $3)},
+    'autofailover_pspec_check_sleep'   => q{Sleeping for $1 ms},
+    'autofailover_pspec_check_data'    => q{$1: Node $2 says lag from node $3 -> $4 is $5 seconds},
+    'autofailover_pspec_check_true'    => q{All detected failed nodes confirmed as lagging by other nodes},
+    'autofailover_pspec_check_false'   => q{Not all nodes confirmed as lagging},
+    'autofailover_pspec_check_unknown' => q{Unable to confirm lag status of all nodes},
+    'autofailover_split_check'         => q{Surviving nodes ($1 of $2) are the majority},
+    'autofailover_split_check_fail'    => q{Surviving nodes ($1) are not the majority},
     'interactive_head_id'              => q{ID},
     'interactive_head_name'            => q{Name},
     'interactive_head_status'          => q{Status},
@@ -139,6 +153,7 @@ my %message = (
     'interactive_write_script'         => q{Writing a script to $1 node $2 to $3},
     'interactive_check_nodes'          => q{Checking availability of database nodes...},
     'interactive_continue'             => q{Do you wish to continue [y/n]?},
+    'interactive_drop_nodes'           => q{Do you want to also drop the failed nodes from the slony configuration [y/n]?},
     'interactive_preserve'             => q{Preserve subscription paths to follow the origin node (choose no if unsure) [y/n]?},
     'interactive_aliases'              => q{Generate aliases based on sl_node/set comments in parentheses (choose no if unsure) [y/n]?},
     'interactive_summary'              => q{Summary of nodes to be passed to failover:},
@@ -155,6 +170,9 @@ my %message = (
     'wrn_node_unavailable'             => q{WARNING: Node $1 unavailable},
     'wrn_req_unavailable'              => q{WARNING: Old origin node ($1) is available, however $2 subscribers are unavailable},
     'wrn_not_tested'                   => q{WARNING: Script not tested with Slony-I v$1},
+    'wrn_failover_issues'              => q{WARNING: Slony-I v$1 may struggle to failover correctly with multiple failed nodes (affects v2.0-2.1)},
+    'note_autofail_fwd_only'           => q{NOTICE: Slony versions prior to 2.2 cannot initiate failover from only failed forwarding providers},
+    'note_fail_sub_only'               => q{NOTICE: Slony versions prior to 2.2 cannot failover subscriber only nodes, reverting to failover_offline_subscriber_only = false},
     'note_multiple_try'                => q{NOTICE: Cannot lock multiple sets within try blocks in version $1 dropping back to single sets},
     'note_reshape_cluster'             => q{NOTICE: Either drop the failed subscribers or bring them back up, then retry to MOVE SET},
     'dbg_generic'                      => q{DEBUG: $1},
@@ -197,7 +215,7 @@ my %message = (
     'exit'                             => q{Exited by $1}
     },
 'fr' => {
-    'usage'                            => q{-h <host> -p <port> -db <database> -c <cluster name> -u <username> -P <password> (Option mot de passe pas recommandé; utiliser pgpass place)},
+    'usage'                            => q{-h <host> -p <port> -db <database> -cl <cluster name> -u <username> -P <password> -f <config file> (Option mot de passe pas recommandé; utiliser pgpass place)},
     'title'                            => q{Slony-I failover (basculement) version de script $1},
     'cluster_fixed'                    => q{Abandon de l'action de basculement: tous les noeuds d'origine maintenant sensible},
     'cluster_failed'                   => q{Trouvé $1 échoué noeuds, couchage pour $2 ms avant réessayer $3 de $4},
@@ -222,10 +240,20 @@ my %message = (
     'autofailover_promote_found'       => q{Utilisation précédemment trouvé plus à jour abonné à tous les jeux ($1) sur le noeud ne répond pas $2},
     'autofailover_promote_skip'        => q{Pas de failover requis pour le noeud ne répond pas $1 car il n'est ni l'origine ou un transitaire active de tous les jeux},
     'autofailover_promote_fail'        => q{Impossible de trouver le noeud de sauvegarde approprié pour la promotion},
-    'autofailover_node_detail'         => q{Noeud $1 est $2 et fournit des ensembles $3 à $4 retard},
-    'autofailover_promote_best'        => q{Meilleur noeud pour la promotion est noeud $1 suivants = $2},
+    'autofailover_node_detail'         => q{Noeud $1 est souscrit à $2 ($3) noeud $4 et fournit des ensembles de $5 à retard $6 ($7  événements)},
+    'autofailover_node_detail_subonly' => q{Noeud $1 est souscrit à $2 ($3) et le noeud $4 est un abonné à retard $5 ($6 événements)},
+    'autofailover_promote_best'        => q{Meilleur noeud pour la promotion est noeud $1 suivants = $2 ($3 événements)},
     'autofailover_unresponsive'        => q{Noeud ne répond pas détecté: $1},
     'autofailover_unresponsive_subonly'=> q{Abonné ne répond pas détecté seulement de noeud: $1},
+    'autofailover_pspec_check_fail'    => q{Impossible de se connecter au noeud $1: $2},
+    'autofailover_pspec_check'         => q{Obtenir un jugement objectif à partir d'autres noeuds, les noeuds qui ne répondent pas apparentes sont : $1 (Noeuds défaillants = $2 de $3)},
+    'autofailover_pspec_check_sleep'   => q{Dormir pour $1 ms},
+    'autofailover_pspec_check_data'    => q{$1: Noeud $2 dit décalage de $3 -> $4 noeud est $5 secondes},
+    'autofailover_pspec_check_true'    => q{Tous les noeuds détectés pas confirmés comme à la traîne par d'autres noeuds},
+    'autofailover_pspec_check_false'   => q{Pas tous les noeuds confirmé retard},
+    'autofailover_pspec_check_unknown' => q{Impossible de confirmer le statut de latence de tous les noeuds},
+    'autofailover_split_check'         => q{Autres noeuds ($1 sur $2) sont la majorité},
+    'autofailover_split_check_fail'    => q{Autres noeuds ($1) ne sont pas la majorité},
     'interactive_head_name'            => q{Nom},
     'interactive_head_status'          => q{Statut},
     'interactive_head_providers'       => q{Fournisseur IDs},
@@ -234,6 +262,7 @@ my %message = (
     'interactive_detail_3'             => q{Abonnements: },
     'interactive_choose_node'          => q{S'il vous plaît choisissez le noeud à déplacer tous les ensembles $1:},
     'interactive_confirm'              => q{Vous avez choisi de passer ensembles $1 noeud $2 ($3). Est-ce correct [o/n]? },
+    'interactive_drop_nodes'           => q{Voulez-vous laisser tomber aussi les noeuds défaillants de la configuration de slony [o/n]?},
     'interactive_action'               => q{Meilleur plan d'action est le plus susceptible de faire une $1. Voulez-vous continuer [o/n]?},
     'interactive_surrender'            => q{Uable pour déterminer le meilleur plan d'action},
     'interactive_write_script'         => q{Rédaction d'un script à $1 $2 noeud à $3},
@@ -255,6 +284,9 @@ my %message = (
     'wrn_node_unavailable'             => q{ATTENTION: Noeud $1 disponible},
     'wrn_req_unavailable'              => q{ATTENTION: Noeud Old origine ($1) est disponible, mais $2 abonnés ne sont pas disponibles},
     'wrn_not_tested'                   => q{ATTENTION: Script pas testé avec Slony-I v$1},
+    'wrn_failover_issues'              => q{ATTENTION: Slony-I v$1 peut lutter pour basculer correctement avec plusieurs nœuds défaillants (affecte v2.0-2.1)},
+    'note_autofail_fwd_only'           => q{AVIS: Versions antérieures à la 2.2 Slony ne peuvent pas initier le basculement de seulement échoué transmettre fournisseurs},
+    'note_fail_sub_only'               => q{AVIS: Versions antérieures à la 2.2 Slony ne peuvent pas basculer abonnes seuls les noeuds, revenant à failover_offile_subscriber_only = false},
     'note_multiple_try'                => q{AVIS: Vous ne pouvez pas verrouiller plusieurs ensembles dans des blocs try dans la version $1 de retomber à des jeux simples},
     'note_reshape_cluster'             => q{AVIS: Vous devez supprimer les abonnés défaillants ou les ramener, puis réessayez à MOVE SET},
     'err_generic'                      => q{ERREUR: $1},
@@ -272,7 +304,7 @@ my %message = (
     'err_cluster_empty'                => q{ERREUR: Groupe chargé contient pas de noeuds},
     'err_cluster_offline'              => q{ERREUR: Groupe chargé contient pas de noeuds accessibles},
     'err_cluster_lone'                 => q{ERRRUE: Groupe chargé ne contient que 1 noeud},
-    'err_not_origin'                   => q{ERREUR: Nœud $1 n'est pas à l'origine de tous les jeux},
+    'err_not_origin'                   => q{ERREUR: Noeud $1 n'est pas à l'origine de tous les jeux},
     'err_not_provider'                 => q{ERREUR: Noeud $1 n'est pas un fournisseur de tous les jeux},
     'err_not_provider_sets'            => q{ERREUR: Noeud $1 ne fournit pas les ensembles nécessaires: le besoin ($2), mais fournit ($3)},
     'err_no_configuration'             => q{ERREUR: Impossible de lire la configuration pour le noeud $1},
@@ -413,9 +445,6 @@ if ($g_node_count <= 0) {
     printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_cluster_empty'));
     cleanExit(3, "system");
 }
-elsif (substr($g_version,0,1) < 2) {
-    printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('wrn_not_tested', $g_version));
-}
 else {
     printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('load_cluster_success', $g_version, $g_clname, $g_node_count, $g_dbhost, $g_dbport, $g_dbname));
     printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('script_settings', $g_lockset_method, $g_failover_method, uc($g_resubscribe_method)));
@@ -515,6 +544,12 @@ if ($g_failover) {
     printlogln($g_prefix,$g_logfile,$g_log_prefix,"\t" . lookupMsg('interactive_failover_detail_3'));
     printlogln($g_prefix,$g_logfile,$g_log_prefix,"\t" . lookupMsg('interactive_failover_detail_4'));
 
+    printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_drop_nodes'));
+    $g_input = <>;
+    if ($g_input ~~ /^[Y|O]$/i) {
+        $g_drop_failed = true;
+    }
+
     printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_reason'));
     $g_reason = <>;
     printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_continue'));
@@ -775,6 +810,20 @@ sub loadCluster {
             $g_failover_method = 'new';
             $g_resubscribe_method = 'resubscribe';
         }
+        else {
+            unless ($g_silence_notice) {
+                if ((substr($version,0,3) >= 2.0) && (substr($version,0,3) < 2.2)) {
+                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('wrn_failover_issues', $version));
+                }
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('note_autofail_fwd_only'));
+                $g_silence_notice = true;
+            }
+            if ($g_fail_subonly) {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('note_fail_sub_only'));
+                $g_fail_subonly = false;
+            }
+        }
+        
     }
 
     return (scalar(@g_cluster), $version);
@@ -1001,7 +1050,7 @@ sub writePreamble {
             }
             elsif (!$g_fail_subonly) {
                 foreach my $unresponsive (@g_unresponsive) {
-                    if (($_->[0] == $unresponsive->[0]) && !defined($_->[9])) {
+                    if (($_->[0] == $unresponsive->[0]) && !defined($_->[9]) && ($g_failover_method eq 'new')) {
                         $line_prefix = "# (Node $_->[0] unavailable subscriber only) ";
                     }
                 }
@@ -1189,7 +1238,7 @@ sub writeMoveSet {
                                 @subsets = (split(',', $setlist)) ;
 
                                 if ($g_debug) {
-                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_resubscribe', $_->[1], $_->[0]), $other_subs->[0], $other_subs->[4], $setlist, $setlist_name, $node, $node_name);
+                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_resubscribe', $_->[1], $_->[0], $other_subs->[0], $other_subs->[4], $setlist, $setlist_name, $node, $node_name));
                                 }    
 
                                 if ($_->[0] ~~ @subsets) {
@@ -1319,6 +1368,16 @@ sub writeFailover {
     my $event_node;
     my ($year, $month, $day, $hour, $min, $sec) = (localtime(time))[5,4,3,2,1,0];
     my $filetime = sprintf ("%02d_%02d_%04d_%02d:%02d:%02d", $day, $month+1, $year+1900, $hour, $min, $sec);
+    my $sets = false;
+
+    my $subprov_idx;
+    my @subprov_name;
+    my ($node, $setlist);
+    my ($node_name, $setlist_name);
+    my @subsets;
+    my @subsets_name;
+    my $set_idx;
+    my @dropped;
 
     if (defined($from) && defined($to)) {
         $filename = $prefix . "/" . $clname . "-failover_from_" . $from . "_to_" . $to . "_on_" . $filetime . ".scr";
@@ -1327,7 +1386,14 @@ sub writeFailover {
         $filename = $prefix . "/" . $clname . "-autofailover_on_" . $filetime . ".scr";
     }
 
-    unless (writePreamble($filename, $dbconninfo, $clname, $dbuser, $dbpass, false, $aliases, $prefix, $logfile, $log_prefix, false)) {
+    if ($g_failover_method ne 'new') {
+        # For pre 2.2 failover with multiple nodes, we attempt to resubscribe sets and drop other failed providers;
+        # This will never work as well as 2.2+ failover behaviour (in fact failover may not work at all in 2.0/2.1 with multiple failed nodes)
+        # We also need to define the sets in the preamble for this.
+        $sets = true;
+    }
+
+    unless (writePreamble($filename, $dbconninfo, $clname, $dbuser, $dbpass, $sets, $aliases, $prefix, $logfile, $log_prefix, false)) {
         printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_incomplete_preamble'));
     }
 
@@ -1338,17 +1404,85 @@ sub writeFailover {
             printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_failover_method',$g_failover_method));
         }
 
+        # If we are on pre 2.2 we need to drop failed subscriber nodes first regardless
+        if ($g_failover_method ne 'new') {
+            foreach (@g_failed) {
+                if (!defined($_->[3])) {
+                    foreach my $backup (@g_cluster) {
+                        if ($backup->[0] == $g_backups{$_->[0]}) {  # this backup node candidate is in the list of suitable nodes for {failed node}
+                            foreach my $subscriber (@g_cluster) {
+                                if (defined($subscriber->[1]) && $subscriber->[1] == $_->[0] && $subscriber->[0] != $backup->[0]) {
+                                    # mess here needs cleaning up
+                                    @subprov_name = (split(';', $subscriber->[8]));
+                                    $subprov_idx = 0;
+                                    foreach my $subprov (split(';', $subscriber->[5])) {
+                                        ($node, $setlist) = (split('->', $subprov)) ;
+                                        ($node_name, $setlist_name) = (split('->', $subprov_name[$subprov_idx])) ;
+                                        $subprov_idx++;
+                                        $node =~ s/n//g;
+    
+                                        if ($node == $_->[0]) {
+                                            if ($aliases) {
+                                                print SLONFILE ("ECHO 'Resubscribing all sets on receiver $subscriber->[4] provided by other failed node $_->[4] to backup node $backup->[4]';\n");
+                                            }
+                                            else {
+                                                print SLONFILE ("ECHO 'Resubscribing all sets on receiver $subscriber->[0]  provided by other failed node $_->[0] to backup node $backup->[0]';\n");
+                                            }
+                                            $setlist =~ s/(\)|\(|s)//g;
+                                            @subsets = (split(',', $setlist));
+                                            $setlist_name =~ s/(\)|\()//g;
+                                            @subsets_name = (split(',', $setlist_name));
+                                        
+                                            $set_idx = 0;
+                                            foreach my $subset (@subsets) {
+                                                if ($aliases) {
+                                                    print SLONFILE ("SUBSCRIBE SET (ID = \@$subsets_name[$set_idx], PROVIDER = \@$backup->[4], RECEIVER = \@$subscriber->[4], FORWARD = YES);\n");
+                                                    print SLONFILE ("WAIT FOR EVENT (ORIGIN = \@$backup->[4], CONFIRMED = \@$subscriber->[4], WAIT ON = \@$backup->[4]);\n");
+                                                }
+                                                else {
+                                                    print SLONFILE ("SUBSCRIBE SET (ID = $subset, PROVIDER = $backup->[0], RECEIVER = $subscriber->[0], FORWARD = YES);\n");
+                                                    print SLONFILE ("WAIT FOR EVENT (ORIGIN = $backup->[0], CONFIRMED = $subscriber->[0], WAIT ON = $backup->[0]);\n");
+                                                }
+                                                $set_idx++;
+                                            }
+                                            print SLONFILE ("\n");
+                                        }
+                                    }
+    
+                                    if ($aliases) {
+                                        print SLONFILE ("ECHO 'Dropping other failed node $_->[4] ($_->[0])';\n");
+                                         print SLONFILE ("DROP NODE (ID = \@$_->[4], EVENT NODE = \@$backup->[4]);\n\n");
+                                    }
+                                    else {
+                                        print SLONFILE ("ECHO 'Dropping other failed node $_->[0]';\n");
+                                        print SLONFILE ("DROP NODE (ID = $_->[0], EVENT NODE = $backup->[0]);\n\n");
+                                    }   
+                                    push(@dropped, $_->[0]);
+                                }
+                                else {
+                                    # The node is failed, but there are no downstream subscribers
+                                }
+                            }
+                            last;
+                        }
+                    }
+                }
+            }
+        }
+
         foreach (@g_failed) {
-            foreach my $backup (@g_cluster) {
-                if ($backup->[0] == $g_backups{$_->[0]}) {
-                    ## Here we have both details of the backup node and the failed node
-                    if ($aliases) {
-                        print SLONFILE ("ECHO 'Failing over slony cluster from $_->[4] (id $_->[0]) to $backup->[4] (id $backup->[0])';\n");
+            if (($g_failover_method eq 'new') || defined($_->[3])) {
+                foreach my $backup (@g_cluster) {
+                    if ($backup->[0] == $g_backups{$_->[0]}) {
+                        ## Here we have both details of the backup node and the failed node
+                        if ($aliases) {
+                            print SLONFILE ("ECHO 'Failing over slony cluster from $_->[4] (id $_->[0]) to $backup->[4] (id $backup->[0])';\n");
+                        }
+                        else {
+                            print SLONFILE ("ECHO 'Failing over slony cluster from node $_->[0] to node $backup->[0]';\n");
+                        }   
+                        last;
                     }
-                    else {
-                        print SLONFILE ("ECHO 'Failing over slony cluster from node $_->[0] to node $backup->[0]';\n");
-                    }   
-                    last;
                 }
             }
         }
@@ -1356,47 +1490,46 @@ sub writeFailover {
         print SLONFILE ("FAILOVER (\n\t");
         $written = 0;
         foreach (@g_failed) {
-            foreach my $backup (@g_cluster) {
-
-                if ($backup->[0] == $g_backups{$_->[0]}) {
-                    ## Here we have both details of the backup node and the failed node
-                    if ($g_failover_method eq 'new') {
-                        if( $written != 0 ) {
-                            print SLONFILE (",\n\t");
+            if (($g_failover_method eq 'new') || defined($_->[3])) {
+                foreach my $backup (@g_cluster) {
+                    if ($backup->[0] == $g_backups{$_->[0]}) {
+                        ## Here we have both details of the backup node and the failed node
+                        if ($g_failover_method eq 'new') {
+                            if( $written != 0 ) {
+                                print SLONFILE (",\n\t");
+                            }
+                            print SLONFILE ("NODE = (");
                         }
-                        print SLONFILE ("NODE = (");
-                    }
-                    else {
-                        if( $written != 0 ) {
-                            print SLONFILE ("\n);\nFAILOVER (\n\t");
+                        else {
+                            if( $written != 0 ) {
+                                print SLONFILE ("\n);\nFAILOVER (\n\t");
+                            }
                         }
+                        if ($aliases) {
+                            print SLONFILE ("ID = \@$_->[4], BACKUP NODE = \@$backup->[4]");
+                        }
+                        else {
+                            print SLONFILE ("ID = $_->[0], BACKUP NODE = $backup->[0]");
+                        }
+                        if ($g_failover_method eq 'new') {
+                            print SLONFILE (")");
+                        }
+                        last;
                     }
-                    if ($aliases) {
-                        print SLONFILE ("ID = \@$_->[4], BACKUP NODE = \@$backup->[4]");
-                    }
-                    else {
-                        print SLONFILE ("ID = $_->[0], BACKUP NODE = $backup->[0]");
-                    }
-                    if ($g_failover_method eq 'new') {
-                        print SLONFILE (")");
-                    }
-                    last;
                 }
+                $written++;
             }
-            $written++;
         }
-        print SLONFILE ("\n);\n");
+        print SLONFILE ("\n);\n\n");
 
         if ($g_drop_failed) {
-
-
             if (($g_failover_method eq 'new')  && (scalar(@g_failed) > 1)) {
                 foreach (@g_failed) {
                     if ($aliases) {
-                        print SLONFILE ("ECHO 'Dropping node $_->[4] ($_->[0])';\n");
+                        print SLONFILE ("ECHO 'Dropping failed node $_->[4] ($_->[0])';\n");
                     }
                     else {
-                        print SLONFILE ("ECHO 'Dropping node $_->[0]';\n");
+                        print SLONFILE ("ECHO 'Dropping failed node $_->[0]';\n");
                     }   
                 }
 
@@ -1419,7 +1552,7 @@ sub writeFailover {
                             if( $written != 0 ) {
                                 print SLONFILE (",");
                             }
-                            ## Don;t bother being pissy and trying to define array values 
+                            ## Don't bother trying to define array values 
                             #if ($aliases) {
                             #    print SLONFILE "\@$_->[4]";
                             #}
@@ -1428,14 +1561,14 @@ sub writeFailover {
                             #}
                             $written++;
                         }
-                        else {
+                        elsif (($g_failover_method eq 'new') || defined($_->[3]) || !($_->[0] ~~ @dropped)) {
                             if ($aliases) {
-                                print SLONFILE ("ECHO 'Dropping node $_->[4] ($_->[0])';\n");
-                                print SLONFILE ("DROP NODE (ID = \@$_->[4], EVENT NODE = \@$backup->[4]);\n");
+                                print SLONFILE ("ECHO 'Dropping failed node $_->[4] ($_->[0])';\n");
+                                print SLONFILE ("DROP NODE (ID = \@$_->[4], EVENT NODE = \@$backup->[4]);\n\n");
                             }
                             else {
-                                print SLONFILE ("ECHO 'Dropping node $_->[0]';\n");
-                                print SLONFILE ("DROP NODE (ID = $_->[0], EVENT NODE = $backup->[0]);\n");
+                                print SLONFILE ("ECHO 'Dropping failed node $_->[0]';\n");
+                                print SLONFILE ("DROP NODE (ID = $_->[0], EVENT NODE = $backup->[0]);\n\n");
                             }
                         }
                         last;
@@ -1818,6 +1951,15 @@ sub getConfig {
                     when(/\bautofailover_config_any_node\b/i) {
                         $g_autofailover_config_any = checkBoolean($value);
                     }
+                    when(/\bautofailover_perspective_sleep_time\b/i) {
+                        $g_autofailover_perspective_sleep = checkInteger($value);
+                    }
+                    when(/\bautofailover_majority_only\b/i) {
+                        $g_autofailover_majority_only = checkBoolean($value);
+                    }
+                    when(/\bautofailover_is_quorum\b/i) {
+                        $g_autofailover_is_quorum  = checkBoolean($value);
+                    }
                 }
             }
         }
@@ -1975,22 +2117,24 @@ sub autoFailover {
                 }
             }
             if ($failed > 0) {
-                $actions = findBackup($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
-                if ($actions > 0) {
-                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_proceed'));
-                    foreach my $failed ( keys %g_backups ) {
-                        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_detail', $failed, $g_backups{$failed}));
+                if ((!$g_autofailover_majority_only || checkSplit($prefix, $logfile, $log_prefix)) && (($g_autofailover_perspective_sleep <= 0) || checkPerspective($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix))) {
+                    $actions = findBackup($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
+                    if ($actions > 0) {
+                        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_proceed'));
+                        foreach my $failed ( keys %g_backups ) {
+                            printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_detail', $failed, $g_backups{$failed}));
+                        }
+                        $g_script = writeFailover($prefix, $dbconninfo, $clname, $dbuser, $dbpass, undef, undef, $g_subs_follow_origin, $g_use_comment_aliases, $logfile, $log_prefix);   
+                        unless (runSlonik($g_script, $prefix, $logfile, $log_prefix)) {
+                            printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_execute_fail', 'slonik script', $g_script));
+                        }
+                        $cluster_loaded = false;
+                        #print "SCRIPT: $g_script\n";
+                        #exit(0);
                     }
-                    $g_script = writeFailover($prefix, $dbconninfo, $clname, $dbuser, $dbpass, undef, undef, $g_subs_follow_origin, $g_use_comment_aliases, $logfile, $log_prefix);   
-                    unless (runSlonik($g_script, $prefix, $logfile, $log_prefix)) {
-                        printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_execute_fail', 'slonik script', $g_script));
+                    else {
+                        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_halt', $failed));
                     }
-                    $cluster_loaded = false;
-                    #print "SCRIPT: $g_script\n";
-                    #exit(0);
-                }
-                else {
-                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_halt', $failed));
                 }
             }
             usleep($g_autofailover_poll_interval * 1000);
@@ -2002,6 +2146,165 @@ sub autoFailover {
     }
 }
 
+# Decide whether the nodes that are still responding form a majority of the
+# cluster, so that a failover is only attempted from the larger side of a split.
+#
+# Parameters: $prefix, $logfile, $log_prefix - passed straight through to printlogln.
+# Reads:      @g_cluster, @g_unresponsive, $g_autofailover_is_quorum.
+# Returns:    true when survivors outnumber the unresponsive nodes, or when the
+#             counts are equal and $g_autofailover_is_quorum is set; false otherwise.
+sub checkSplit {
+    my ($prefix, $logfile, $log_prefix) = @_;
+
+    my $down_count = scalar(@g_unresponsive);
+    my $up_count = (scalar(@g_cluster) - scalar(@g_unresponsive));
+
+    # Clear majority of nodes still responding
+    if ($up_count > $down_count) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check', $up_count, ($up_count+$down_count)));
+        return true;
+    }
+
+    # Even split: the quorum setting acts as the tie-breaker
+    if (($up_count == $down_count) && $g_autofailover_is_quorum) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check', ($up_count . '+quorum'), ($up_count+$down_count)));
+        return true;
+    }
+
+    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check_fail', $up_count));
+    return false;
+}
+
+# Take one lag sample from every responsive node in @g_cluster.
+#
+# Each node whose id is not in @$unresponsive_ids is connected to and asked,
+# via the cluster's sl_status view, for the lag time (whole seconds) of every
+# row whose st_received is one of the unresponsive nodes. Each row is logged
+# under $label and appended to @$lag_info as [st_origin, st_received, lag].
+#
+# Returns the number of nodes that could not be sampled (connect or query error).
+sub samplePerspectiveLag {
+    my ($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix, $label, $unresponsive_ids, $lag_info) = @_;
+
+    my %is_unresponsive = map { $_ => 1 } @$unresponsive_ids;
+    my $placeholders = join(', ', ('?') x scalar(@$unresponsive_ids));
+    my $bad = 0;
+
+    foreach my $node (@g_cluster) {
+        next if ($is_unresponsive{$node->[0]});
+
+        my $dbh;
+        eval {
+            $dbh = DBI->connect("DBI:Pg:$node->[2];", $dbuser, $dbpass, {RaiseError => 1});
+
+            # Use the schema of the cluster being monitored rather than a fixed name
+            my $qw_clname = $dbh->quote_identifier("_" . $clname);
+
+            my $query = "SELECT a.st_origin, a.st_received, extract(epoch from a.st_lag_time)::integer
+                    FROM $qw_clname.sl_status a
+                    INNER JOIN $qw_clname.sl_node b on a.st_origin = b.no_id
+                    INNER JOIN $qw_clname.sl_node c on a.st_received = c.no_id
+                    WHERE a.st_received IN ($placeholders) ORDER BY a.st_origin, a.st_received;";
+
+            my $sth = $dbh->prepare($query);
+
+            my $param_on = 1;
+            foreach my $id (@$unresponsive_ids) {
+                $sth->bind_param($param_on, $id);
+                $param_on++;
+            }
+            $sth->execute();
+
+            while (my @node_lag = $sth->fetchrow) {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_data', $label, $node->[0], $node_lag[0], $node_lag[1], $node_lag[2]));
+                push(@$lag_info, [@node_lag]);
+            }
+
+            $sth->finish;
+            $dbh->disconnect();
+        };
+        if ($@) {
+            # Keep the error text before anything else can overwrite $@
+            my $err = $@;
+            printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_fail', $node->[0], $err));
+            $bad++;
+
+            # Don't leave a connection open if the failure came after connect
+            if (defined($dbh) && $dbh->{Active}) {
+                eval { $dbh->disconnect() };
+            }
+        }
+    }
+
+    return $bad;
+}
+
+# Check each node's perspective of the failure, to try to ensure the issue isn't
+# just that this script can't connect to the origin/provider.
+#
+# The idea is to sample the lag every surviving node reports for the unresponsive
+# nodes, wait for $g_autofailover_perspective_sleep, sample again, and see whether
+# the lag has grown by at least the time waited.
+#
+# Parameters: $clname                        - cluster name; the schema queried is "_" . $clname
+#             $dbuser, $dbpass               - credentials used for every node connection
+#             $prefix, $logfile, $log_prefix - passed straight through to printlogln
+# Reads:      @g_cluster, @g_unresponsive, $g_autofailover_perspective_sleep, $g_debug.
+# Returns:    true when every sampled lag grew by at least the sleep time;
+#             false when any did not, or when any surviving node could not be sampled.
+sub checkPerspective {
+    my $clname = shift;
+    my $dbuser = shift;
+    my $dbpass = shift;
+    my $prefix = shift;
+    my $logfile = shift;
+    my $log_prefix = shift;
+
+    my $agreed = false;
+    my @unresponsive_ids = map { $_->[0] } @g_unresponsive;
+    my $lag_idx;
+    my $lag_confirmed;
+    my @lag_info1;
+    my @lag_info2;
+    my $bad;
+
+    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check', join(", ", @unresponsive_ids), scalar(@g_unresponsive), scalar(@g_cluster)));
+
+    $bad = samplePerspectiveLag($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix, 'Check1', \@unresponsive_ids, \@lag_info1);
+
+    if ($bad == 0) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_sleep', $g_autofailover_perspective_sleep));
+        # The setting is scaled by 1000 for usleep, i.e. it is treated as milliseconds
+        usleep($g_autofailover_perspective_sleep * 1000);
+
+        $bad += samplePerspectiveLag($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix, 'Check2', \@unresponsive_ids, \@lag_info2);
+
+        # Both samples visit the same nodes in the same order and use the same
+        # ORDER BY, so rows are paired up by position.
+        # NOTE(review): if the first sample returned no rows at all, both counters
+        # stay at 0 and the check below passes - confirm that is intended.
+        $lag_idx = 0;
+        $lag_confirmed = 0;
+        foreach my $first (@lag_info1) {
+            my $second = $lag_info2[$lag_idx];
+
+            # A missing row or NULL lag counts as 0, as it would numerically
+            my $lag_before = (defined($first->[2]) ? $first->[2] : 0);
+            my $lag_after = ((defined($second) && defined($second->[2])) ? $second->[2] : 0);
+            my $lag_growth = ($lag_after - $lag_before);
+
+            if ($g_debug) {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', ("Node $first->[0] lag between checks on node $first->[1] is " . $lag_growth . " seconds")));
+            }
+
+            # Lag is in seconds, the sleep setting in milliseconds
+            if (($lag_growth*1000) >= $g_autofailover_perspective_sleep) {
+                $lag_confirmed++;
+            }
+            $lag_idx++;
+        }
+    }
+
+    if ($bad > 0) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_unknown'));
+    }
+    elsif ($lag_idx == $lag_confirmed) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_true'));
+        $agreed = true;
+    }
+    else {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_false'));
+    }
+
+    return $agreed;
+}
+
 sub checkFailed {
     my $clname = shift;
     my $dbuser = shift;
@@ -2055,7 +2358,9 @@ sub checkFailed {
                 push(@g_unresponsive, \@$_); 
                 if ((defined($_->[3])) || ($g_autofailover_provs && defined($_->[9]))) {
                     printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive', $_->[0]));
-                    $prov_failed++;
+                    unless ($g_failover_method ne 'new' && !defined($_->[3])) {
+                        $prov_failed++;
+                    }
                 }
                 else {
                     printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive_subonly', $_->[0]));
@@ -2093,7 +2398,8 @@ sub findBackup {
     my $query;
     my $qw_clname;
     my $result_count = 0;
-    my $lowest_lag;
+    my $lowest_lag_time;
+    my $lowest_lag_events;
     my $best_node_id;    
     my $best_node_is_direct;    
     my @sets_from;
@@ -2108,7 +2414,8 @@ sub findBackup {
             printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_find', ($_->[9] // "none"), $_->[0]));
 
             undef $best_node_id;
-            $lowest_lag = (1<<$Config{ivsize}*8-1)-1;
+            $lowest_lag_time = (1<<$Config{ivsize}*8-1)-1;
+            $lowest_lag_events = $lowest_lag_time;
 
             if (defined($_->[9]) && (exists $backup_for_set_chosen{$_->[9]})) {
                 $best_node_id = $backup_for_set_chosen{$_->[9]};
@@ -2127,11 +2434,11 @@ sub findBackup {
                             $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
                             $qw_clname = $dbh->quote_identifier("_" . $clname);
 
-                            $query = "SELECT extract(epoch from a.st_lag_time), (a.st_received = ?) AS direct 
+                            $query = "SELECT extract(epoch from a.st_lag_time), a.st_lag_num_events, (a.st_received = ?) AS direct
                                 FROM $qw_clname.sl_status a
                                 INNER JOIN $qw_clname.sl_subscribe b ON b.sub_provider = a.st_received AND b.sub_receiver = a.st_origin
                                 WHERE b.sub_active 
-                                GROUP BY a.st_lag_time, a.st_received;";
+                                GROUP BY a.st_lag_time, a.st_lag_num_events, a.st_received;";
 
                             $sth = $dbh->prepare($query);
                             $sth->bind_param(1, $_->[0]);
@@ -2141,22 +2448,23 @@ sub findBackup {
 
                                 undef @sets_from;
                                 if (defined($_->[9])) {
-                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail', $subscriber->[0], ($subinfo[1] ? "a direct subscriber" : "an indirect subscriber"), $subscriber->[7], $subinfo[0]));
+                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail', $subscriber->[0], ($subinfo[2]?"directly":"indirectly"), (defined($_->[3])?"origin":"provider"), $_->[0], $subscriber->[7], $subinfo[0], $subinfo[1]));
                                     @sets_from = split(',',$_->[9]);
                                     @sets_to = split(',',$subscriber->[7]);
                                 }
                                 elsif ($g_fail_subonly) {
                                     # Subscriber only node will have no active sets forwarding sets to check
-                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail', $subscriber->[0], "suitable backup for this subscriber only node"  , $subscriber->[7], $subinfo[0]));
+                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail_subonly', $subscriber->[0], ($subinfo[2]?"directly":"indirectly"), (defined($_->[3])?"origin":"provider"), $_->[0], $subinfo[0], $subinfo[1]));
                                     @sets_from = (0);
                                     @sets_to = (0);
                                 }
 
-                                if ((checkProvidesAllSets(\@sets_from, \@sets_to)) && (($subinfo[0] < $lowest_lag) || (!$best_node_is_direct && $subinfo[1]))) {
-                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_best', $subscriber->[0], $subinfo[0]));
+                                if ((checkProvidesAllSets(\@sets_from, \@sets_to)) && (($subinfo[0] < $lowest_lag_time && ($subinfo[2] || !defined($best_node_id))) || (!$best_node_is_direct && $subinfo[2]))) {
+                                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_best', $subscriber->[0], $subinfo[0], $subinfo[1]));
                                     $best_node_id = $subscriber->[0];
-                                    $lowest_lag = $subinfo[0];
-                                    $best_node_is_direct = $subinfo[1];
+                                    $lowest_lag_time = $subinfo[0];
+                                    $lowest_lag_events = $subinfo[1];
+                                   $best_node_is_direct = $subinfo[2];
                                 }
                             }
                         };