From 146c773e63a572450722aaa917dc96ced7319f7d Mon Sep 17 00:00:00 2001
From: glyn
Date: Fri, 28 Nov 2014 16:10:35 +0000
Subject: [PATCH 1/1] Changes:

- In autofailover mode lag/event times now read directly from
  sl_event/sl_confirm as reading sl_status on a subscriber can be misleading.
- Tried to clean up some of the printed messages whilst being ashamed of the
  lack of comments and unnecessary complexity. Hindsight is always 20/20.
---
 README.md           |   2 +-
 slony_failover.conf |   6 +-
 slony_failover.pl   | 180 ++++++++++++++++++++++++++++++++++----------
 3 files changed, 143 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 8e62acf..4b68f20 100644
--- a/README.md
+++ b/README.md
@@ -105,7 +105,7 @@ $ ./slony_failover.pl [options]
 | Autofailover|**autofailover_poll_interval** | integer | *500* | How often to check for failure of nodes (milliseconds)
 | Autofailover|**autofailover_node_retry** | integer | *2* | When failure is detected, retry this many times before initiating failover
 | Autofailover|**autofailover_sleep_time** | integer | *1000* | Interval between retries (milliseconds)
-| Autofailover|**autofailover_perspective_sleep_time** | integer | *20000* | Interval between lag reads for failed nodes from surviving nodes. If greater than zero any observation that nodes have failed is checked from surviving nodes perspective by checking if lag times are extending. This does not guarantee 100% the nodes are down but if set to a large enough interval can back up our observation.
+| Autofailover|**autofailover_perspective_sleep_time** | integer | *20000* | Interval between lag reads for failed nodes from surviving nodes. If greater than zero any observation that nodes have failed is checked from surviving nodes perspective by checking if lag times are extending. This does not guarantee 100% the nodes are down but if set to a large enough interval (at least sync_interval_timeout) can back up our observation.
 | Autofailover|**autofailover_majority_only** | boolean | *false* | Only fail over if the quantity of surviving nodes is greater than the quantity of failed nodes. Intended to be used to prevent a split-brain scenario in conjunction with some other logic to monitor and fence off the old origin if it is in the minority.
 | Autofailover|**autofailover_is_quorum** | boolean | *false* | If this script is running on a separate host set to true to treat it as a quorum server. Effectively increments sum of surviving nodes when calculating the majority above.
 
diff --git a/slony_failover.conf b/slony_failover.conf
index 194c20b..8ad164a 100644
--- a/slony_failover.conf
+++ b/slony_failover.conf
@@ -46,7 +46,7 @@ drop_failed_nodes = true                    # Drop all failed nodes immediately
 # Database node to read initial Slony-I configuration from
 #------------------------------------------------------------------------------
 slony_database_host = localhost
-slony_database_port = 5434
+slony_database_port = 5432
 slony_database_name = TEST
 slony_database_user = slony
 slony_database_password =                   # Recommended to leave blank and use .pgpass
@@ -93,8 +93,8 @@ autofailover_perspective_sleep_time = 20000 # Interval between lag reads for fai
                                             # checked from surviving nodes perspective by
                                             # checking if lag times are extending.
                                             # This does not guarantee 100% the nodes are down
-                                            # but if set to a large enough interval can back
-                                            # up our observation.
+                                            # but if set to a large enough interval (at least
+                                            # sync_interval_timeout) can back up our observation.
 autofailover_majority_only = true           # Only fail over if the quantity of surviving nodes
                                             # is greater than the quantity of failed nodes.
                                             # Intended to be used to prevent a split-brain scenario
diff --git a/slony_failover.pl b/slony_failover.pl
index 792095e..f1eab2c 100755
--- a/slony_failover.pl
+++ b/slony_failover.pl
@@ -36,7 +36,7 @@ use Config qw/%Config/;
 use constant false => 0;
 use constant true => 1;
 
-my $g_script_version = '1.0.2';
+my $g_script_version = '1.0.3';
 my $g_debug = false;
 my $g_pidfile = '/var/run/slony_failover.pid';
 my $g_pid_written = false;
@@ -93,6 +93,7 @@ my $g_autofailover_perspective_sleep = 20000;
 my $g_autofailover_majority_only = false;
 my $g_autofailover_is_quorum = false;
 my @g_unresponsive;
+my %g_unresponsive_subonly;
 my %g_backups;
 my $g_pid = $$;
 
@@ -101,11 +102,11 @@ my %message = (
 'en' => {
     'usage' => q{-h -p -db -cl -u -P -f (Password option not recommended; use pgpass instead)},
     'title' => q{Slony-I failover script version $1},
-    'cluster_fixed' => q{Aborting failover action: all origin nodes now responsive},
+    'cluster_fixed' => q{Aborting failover action: all origin/provider nodes now responsive},
     'cluster_failed' => q{Found $1 failed nodes, sleeping for $2ms before retry $3 of $4},
     'load_cluster' => q{Getting a list of database nodes...},
     'load_cluster_fail' => q{Unable to read cluster configuration $1},
     'load_cluster_success' => q{Loaded Slony-I v$1 cluster "$2" with $3 nodes read from node at $4:$5/$6},
     'lag_detail' => q{Current node lag information from configuration node:},
     'script_settings' => q{Using $1 batches of lock set, $2 FAILOVER and $3},
     'generated_script' => q{Generated script "$1"},
@@ -114,7 +115,7 @@
     'autofailover_init_pol' => q{Polling every $1ms},
     'autofailover_init_ret' => q{Failed nodes will be retried $1 times with $2ms sleep},
     'autofailover_init_set' => q{Failed forwarding providers $1 be failed over},
-    'autofailover_load_cluster' => q{$1 Slony-I v$2 cluster "$3" with $4 nodes},
+    'autofailover_load_cluster' => q{$1 Slony-I v$2 cluster "$3" with $4 nodes read from node $5},
     'autofailover_proceed' => q{Proceeding with failover:},
     'autofailover_detail' => q{Failed node: $1, Backup node: $2},
     'autofailover_halt' => q{Unable to perform any failover for $1 failed nodes},
@@ -124,11 +125,13 @@
     'autofailover_promote_found' => q{Using previously found most up to date subscriber to all sets ($1) on unresponsive node $2},
     'autofailover_promote_skip' => q{No failover required for unresponsive node $1 as it is neither the origin or an active forwarder of any sets},
     'autofailover_promote_fail' => q{Could not find suitable backup node for promotion},
-    'autofailover_node_detail' => q{Node $1 is $2 subscribed to ($3) node $4 and provides sets $5 at $6 lag ($7 events)},
-    'autofailover_node_detail_subonly' => q{Node $1 is $2 subscribed to ($3) node $4 and is a subscriber only at $5 lag ($6 events)},
-    'autofailover_promote_best' => q{Best node for promotion is node $1 seq = $2 ($3 events)},
-    'autofailover_unresponsive' => q{Detected unresponsive provider node: $1},
+    'autofailover_node_detail' => q{Node $1 is $2 subscribed to ($3) node $4 and provides sets $5 at $6 seconds lag (on event $7)},
+    'autofailover_promote_best' => q{Best node for promotion is node $1 lag = $2 seconds (event $3)},
+    'autofailover_promote_unsuitable' => q{Node $1 is unsuitable for promotion},
+    'autofailover_unresponsive' => q{Detected unresponsive origin node: $1},
+    'autofailover_unresponsive_prov' => q{Detected unresponsive provider node: $1},
     'autofailover_unresponsive_subonly' => q{Detected unresponsive subscriber only node: $1},
+    'autofailover_recovery_subonly' => q{Detected recovery of previously unresponsive subscriber only node: $1},
     'autofailover_pspec_check_fail' => q{Failed to connect to node $1: $2},
     'autofailover_pspec_check' => q{Getting objective judgement from other nodes, apparent unresponsive nodes are : $1 (Failed nodes = $2 of $3)},
     'autofailover_pspec_check_sleep' => q{Sleeping for $1 ms},
@@ -217,7 +220,7 @@
 'fr' => {
     'usage' => q{-h -p -db -cl -u -P -f (Option mot de passe pas recommandé; utiliser pgpass place)},
     'title' => q{Slony-I failover (basculement) version de script $1},
-    'cluster_fixed' => q{Abandon de l'action de basculement: tous les noeuds d'origine maintenant sensible},
+    'cluster_fixed' => q{Abandon de l'action de basculement: tous les noeuds d'origine / de fournisseurs maintenant sensible},
     'cluster_failed' => q{Trouvé $1 échoué noeuds, couchage pour $2 ms avant réessayer $3 de $4},
     'load_cluster' => q{Obtenir une liste de noeuds de base de donnees...},
     'load_cluster_fail' => q{Impossible de lire la configuration du cluster $1},
@@ -230,7 +233,7 @@
     'autofailover_init_pol' => q{Vérifier toutes les $1ms},
     'autofailover_init_ret' => q{Noeuds défaillants seront rejugés $1 fois avec $2 ms sommeil},
     'autofailover_init_set' => q{Fournisseurs d'expédition échoué $1 être échoué sur},
-    'autofailover_load_cluster' => q{$1 Slony-I v$2 grappe "$3" avec $4 noeuds},
+    'autofailover_load_cluster' => q{$1 Slony-I v$2 grappe "$3" avec $4 noeuds lire à noeud $5},
     'autofailover_proceed' => q{De procéder à failover:},
     'autofailover_detail' => q{Noeud défaillant: $1, noeud de sauvegarde: $2},
     'autofailover_halt' => q{Noeuds Impossible d'effectuer une failover pour $1 échoué},
@@ -240,11 +243,13 @@
     'autofailover_promote_found' => q{Utilisation précédemment trouvé plus à jour abonné à tous les jeux ($1) sur le noeud ne répond pas $2},
     'autofailover_promote_skip' => q{Pas de failover requis pour le noeud ne répond pas $1 car il n'est ni l'origine ou un transitaire active de tous les jeux},
     'autofailover_promote_fail' => q{Impossible de trouver le noeud de sauvegarde approprié pour la promotion},
-    'autofailover_node_detail' => q{Noeud $1 est souscrit à $2 ($3) noeud $4 et fournit des ensembles de $5 à retard $6 ($7 événements)},
-    'autofailover_node_detail_subonly' => q{Noeud $1 est souscrit à $2 ($3) et le noeud $4 est un abonné à retard $5 ($6 événements)},
-    'autofailover_promote_best' => q{Meilleur noeud pour la promotion est noeud $1 suivants = $2 ($3 événements)},
-    'autofailover_unresponsive' => q{Noeud ne répond pas détecté: $1},
+    'autofailover_node_detail' => q{Noeud $1 est souscrit à $2 ($3) noeud $4 et fournit des ensembles de $5 à retard $6 secondes (en cas d'événement $7)},
+    'autofailover_promote_best' => q{Meilleur noeud pour la promotion est noeud $1 décalage = $2 secondes (événement $3)},
+    'autofailover_promote_unsuitable' => q{Noeud $1 est inadapté pour la promotion},
+    'autofailover_unresponsive' => q{Noeud d'origine ne répond pas détecté: $1},
+    'autofailover_unresponsive_prov' => q{Noeud fournisseur ne répond pas détecté: $1},
     'autofailover_unresponsive_subonly' => q{Abonné ne répond pas détecté seulement de noeud: $1},
+    'autofailover_recovery_subonly' => q{Recouvrement détecté de l'abonné ne répond pas seulement auparavant de noeud: $1},
     'autofailover_pspec_check_fail' => q{Impossible de se connecter au noeud $1: $2},
     'autofailover_pspec_check' => q{Obtenir un jugement objectif à partir d'autres noeuds, les noeuds qui ne répondent pas apparentes sont : $1 (Noeuds défaillants = $2 de $3)},
     'autofailover_pspec_check_sleep' => q{Dormir pour $1 ms},
@@ -784,6 +789,7 @@ sub loadCluster {
     $sth->execute();
 
     while (my @node = $sth->fetchrow) {
+        #printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', join(' - ', @node)));
        push(@g_cluster, \@node);
     }
 
@@ -2035,6 +2041,7 @@ sub autoFailover {
     my $actions;
     my $current_retry;
     my $cluster_loaded;
+    my $cluster_loaded_from;
     my @cluster;
     my $node_count;
     my $version;
@@ -2057,6 +2064,7 @@
         @cluster = @g_cluster;
         die lookupMsg('err_cluster_lone') if ($node_count == 1);
         $cluster_loaded = true;
+        $cluster_loaded_from = 'conninfo specified in config';
     };
     if ($@) {
         printlogln($prefix,$logfile,$log_prefix, lookupMsg('load_cluster_fail', 'from supplied configuration'));
@@ -2075,6 +2083,7 @@
             @cluster = @g_cluster;
             die lookupMsg('err_cluster_lone') if ($node_count == 1);
             $cluster_loaded = true;
+            $cluster_loaded_from = $_->[0];
         };
         if ($@) {
             printlogln($prefix,$logfile,$log_prefix, lookupMsg('load_cluster_fail', 'from node ' . $_->[0] . ': trying next node'));
@@ -2088,7 +2097,7 @@
     }
 
     if ($cluster_loaded) {
-        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_load_cluster', (!defined($cluster_time) ? "Loaded" : "Reloaded"), $version, $clname, $node_count));
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_load_cluster', (!defined($cluster_time) ? "Loaded" : "Reloaded"), $version, $clname, $node_count, $cluster_loaded_from));
         $cluster_time = time();
     }
     else {
@@ -2129,6 +2138,7 @@
                 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_execute_fail', 'slonik script', $g_script));
             }
             $cluster_loaded = false;
+            #print "SCRIPT: $g_script\n";
             #exit(0);
         }
 
@@ -2350,6 +2360,11 @@
         $sth->finish;
         $dbh->disconnect();
+
+        if (exists($g_unresponsive_subonly{$_->[0]})) {
+            printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_recovery_subonly', $_->[0]));
+            delete $g_unresponsive_subonly{$_->[0]};
+        }
     };
     if ($@) {
         if ($g_debug) {
@@ -2357,13 +2372,21 @@
         }
         push(@g_unresponsive, \@$_);
         if ((defined($_->[3])) || ($g_autofailover_provs && defined($_->[9]))) {
-            printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive', $_->[0]));
+            if (defined($_->[3])) {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive', $_->[0]));
+            }
+            else {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive_prov', $_->[0]));
+            }
             unless ($g_failover_method ne 'new' && !defined($_->[3])) {
                 $prov_failed++;
            }
        }
        else {
-            printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive_subonly', $_->[0]));
+            if (!exists($g_unresponsive_subonly{$_->[0]})) {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive_subonly', $_->[0]));
+                $g_unresponsive_subonly{$_->[0]} = true;
+            }
             if ($g_fail_subonly) {
                 $subonly_failed++;
             }
@@ -2399,11 +2422,13 @@ sub findBackup {
     my $qw_clname;
     my $result_count = 0;
     my $lowest_lag_time;
-    my $lowest_lag_events;
+    my $latest_last_event;
     my $best_node_id;
     my $best_node_is_direct;
+    my $best_node_can_forward;
     my @sets_from;
     my @sets_to;
+    my @sets_to_prov;
     my %backup_for_set_chosen;
 
     undef %g_backups;
@@ -2415,7 +2440,9 @@ sub findBackup {
         undef $best_node_id;
         $lowest_lag_time = (1<<$Config{ivsize}*8-1)-1;
-        $lowest_lag_events = $lowest_lag_time;
+        $latest_last_event = 0;
+        $best_node_is_direct = false;
+        $best_node_can_forward = false;
 
         if (defined($_->[9]) && (exists $backup_for_set_chosen{$_->[9]})) {
             $best_node_id = $backup_for_set_chosen{$_->[9]};
@@ -2427,45 +2454,116 @@ sub findBackup {
             if ($g_debug) {
                 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_check_sub',$subscriber->[0]));
             }
-
+
+            # Here the strings containing the sets are converted to arrays
+            # Origin / Forwarded sets
+            undef @sets_to_prov;
+            if (defined($subscriber->[10])) {
+                @sets_to_prov = split(',',$subscriber->[10]);
+            }
+            if (defined($subscriber->[3])) {
+                if (@sets_to_prov) {
+                    @sets_to_prov = (@sets_to_prov, split(',',$subscriber->[3]));
+                }
+                else {
+                    @sets_to_prov = split(',',$subscriber->[3]);
+                }
+            }
+            if (!defined($subscriber->[10]) && !defined($subscriber->[3])) {
+                @sets_to_prov = (0);
+            }
+
+            # Origin / Subscribed sets.
+            undef @sets_to;
+            if (defined($subscriber->[7])) {
+                @sets_to = split(',',$subscriber->[7]);
+            }
+            if (defined($subscriber->[3])) {
+                if (@sets_to) {
+                    @sets_to = (@sets_to, split(',',$subscriber->[3]));
+                }
+                else {
+                    @sets_to = split(',',$subscriber->[3]);
+                }
+            }
+            if (!defined($subscriber->[7]) && !defined($subscriber->[3])) {
+                @sets_to = (0);
+            }
+
+            # Sets provided by the failed node.
+            undef @sets_from;
+            if (defined($_->[9])) {
+                @sets_from = split(',',$_->[9]);
+            }
+            else {
+                @sets_from = (0);
+                @sets_to = (0);
+            }
+
             $dsn = "DBI:Pg:$subscriber->[2]";
             eval {
                 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
                 $qw_clname = $dbh->quote_identifier("_" . $clname);
 
-                $query = "SELECT extract(epoch from a.st_lag_time), a.st_lag_num_events, (a.st_received = ?) AS direct
-                    FROM $qw_clname.sl_status a
-                    INNER JOIN $qw_clname.sl_subscribe b ON b.sub_provider = a.st_received AND b.sub_receiver = a.st_origin
-                    WHERE b.sub_active
-                    GROUP BY a.st_lag_time, a.st_lag_num_events, a.st_received;";
+                #print "\tNODE " . $subscriber->[0] . ") SETS TO = " . join(',', @sets_to) . " SETS FROM = " . join(',', @sets_from) . " SETS TO PROV = " . join(',', @sets_to_prov) . "\n";
+
+                if (defined($subscriber->[3]) && checkProvidesAllSets(\@sets_from, \@sets_to)) {
+                    $query = "SELECT 0, ev_seqno, (ev_origin = ?)
+                        FROM $qw_clname.sl_event
+                        WHERE ev_origin = $qw_clname.getlocalnodeid(?)
+                        ORDER BY ev_seqno DESC LIMIT 1";
+                }
+                else {
+                    $query = "SELECT extract(epoch from (current_timestamp-a.con_timestamp)), a.con_seqno, (a.con_origin = ?) AS direct
+                        FROM $qw_clname.sl_confirm a
+                        INNER JOIN $qw_clname.sl_event b on b.ev_seqno = a.con_seqno AND a.con_origin = b.ev_origin
+                        INNER JOIN $qw_clname.sl_subscribe c ON c.sub_provider = a.con_origin AND c.sub_receiver = a.con_received
+                        WHERE c.sub_active AND a.con_received = $qw_clname.getlocalnodeid(?)
+                        ORDER BY a.con_seqno DESC LIMIT 1;";
+                }
 
                 $sth = $dbh->prepare($query);
                 $sth->bind_param(1, $_->[0]);
+                $sth->bind_param(2, "_" . $clname);
                 $sth->execute();
 
                 while (my @subinfo = $sth->fetchrow) {
-
-                    undef @sets_from;
-                    if (defined($_->[9])) {
-                        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail', $subscriber->[0], ($subinfo[2]?"directly":"indirectly"), (defined($_->[3])?"origin":"provider"), $_->[0], $subscriber->[7], $subinfo[0], $subinfo[1]));
-                        @sets_from = split(',',$_->[9]);
-                        @sets_to = split(',',$subscriber->[7]);
-                    }
-                    elsif ($g_fail_subonly) {
-                        # Subscriber only node will have no active sets forwarding sets to check
-                        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail_subonly', $subscriber->[0], ($subinfo[2]?"directly":"indirectly"), (defined($_->[3])?"origin":"provider"), $_->[0], $subinfo[0], $subinfo[1]));
-                        @sets_from = (0);
-                        @sets_to = (0);
+                    # If the failed provider node isn't an origin for any sets, we classify any direct subscribers to it as indirect
+                    # because they are indirect to the origin.
+                    if ($subinfo[2] && defined($_->[9]) && !defined($_->[3])) {
+                        $subinfo[2] = false;
                     }
 
-                    if ((checkProvidesAllSets(\@sets_from, \@sets_to)) && (($subinfo[0] < $lowest_lag_time && ($subinfo[2] || !defined($best_node_id))) || (!$best_node_is_direct && $subinfo[2]))) {
+                    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail',
+                        $subscriber->[0],
+                        ($subinfo[2]?"directly":"indirectly"),
+                        (defined($_->[3])?"origin":(defined($_->[9])?"provider":"subscriber only")),
+                        $_->[0],
+                        (defined($subscriber->[10])?$subscriber->[10]:(defined($subscriber->[3])?$subscriber->[3]:"")),
+                        $subinfo[0], $subinfo[1]));
+
+                    # Select this node as the backup node if:
+                    #    1) The node is a subscriber to all sets on the failed node
+                    #    2) In order of preference:
+                    #        The node is one of the direct subscribers to the failed node on the most recent event and is a forwarding provider
+                    #        OR
+                    #        The node is one of the direct subscribers to the failed node on the most recent event and is not a forwarding provider
+                    #        OR
+                    #        The node is an indirect subscriber to the failed node with the lowest lag time
+                    if (!checkProvidesAllSets(\@sets_from, \@sets_to)) {
+                        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_unsuitable', $subscriber->[0]));
+                    }
+                    elsif (($subinfo[2] && (!$best_node_is_direct || $subinfo[1] > $latest_last_event || (!$best_node_can_forward && checkProvidesAllSets(\@sets_from, \@sets_to_prov) && $subinfo[1] == $latest_last_event)))
+                        || (!$best_node_is_direct && !$subinfo[2] && $subinfo[0] < $lowest_lag_time)) {
                         printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_best', $subscriber->[0], $subinfo[0], $subinfo[1]));
                         $best_node_id = $subscriber->[0];
                         $lowest_lag_time = $subinfo[0];
-                        $lowest_lag_events = $subinfo[1];
-                        $best_node_is_direct = $subinfo[2];
+                        $latest_last_event = $subinfo[1];
+                        $best_node_is_direct = $subinfo[2];
+                        $best_node_can_forward = checkProvidesAllSets(\@sets_from, \@sets_to_prov);
                     }
+
                 }
             };
             if ($@) {
@@ -2485,7 +2583,7 @@
                 }
             }
             else {
-                printlog($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_fail'));
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_fail'));
             }
         }
         else {
-- 
2.39.2
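
Reviewer sketch (not applied by git am): the query below is a standalone rendering of the sl_confirm/sl_event join that findBackup() now issues instead of reading sl_status, for eyeballing the lag seconds and last confirmed event a candidate backup node would report. The cluster name TEST (schema "_TEST") and the failed node id 1 are placeholders taken from the sample config; substitute your own values and run it on the candidate backup node.

    -- Assumed placeholders: cluster schema "_TEST", failed node id 1; run on the candidate backup node.
    SELECT extract(epoch FROM (current_timestamp - c.con_timestamp)) AS lag_seconds,          -- age of the most recently confirmed event
           c.con_seqno                                               AS last_confirmed_event, -- event sequence the local node has confirmed
           (c.con_origin = 1)                                        AS direct                -- true when that event originated on the failed node
    FROM "_TEST".sl_confirm c
    JOIN "_TEST".sl_event e
      ON e.ev_origin = c.con_origin AND e.ev_seqno = c.con_seqno
    JOIN "_TEST".sl_subscribe s
      ON s.sub_provider = c.con_origin AND s.sub_receiver = c.con_received
    WHERE s.sub_active
      AND c.con_received = "_TEST".getlocalnodeid('_TEST')
    ORDER BY c.con_seqno DESC
    LIMIT 1;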