+# Very basic split-brain check, based on a count of the live nodes versus the failed ones.
+# The live nodes hold a majority when they outnumber the failed nodes, or when the two
+# counts are level and $g_autofailover_is_quorum is set. The outcome is logged either way.
+#   $prefix, $logfile, $log_prefix - passed straight through to printlogln
+# Returns true when the live nodes hold a majority, false otherwise.
+sub checkSplit {
+    my ($prefix, $logfile, $log_prefix) = @_;
+
+    my $total = scalar(@g_cluster);
+    my $failed = scalar(@g_unresponsive);
+    my $alive = $total - $failed;
+    my $majority = false;
+    my $counted;
+
+    # Work out what (if anything) gives the live nodes their majority
+    if ($alive > $failed) {
+        $counted = $alive;
+    }
+    elsif ($g_autofailover_is_quorum && ($alive == $failed)) {
+        $counted = $alive . '+quorum';
+    }
+
+    if (defined($counted)) {
+        $majority = true;
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check', $counted, $total));
+    }
+    else {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check_fail', $alive));
+    }
+
+    return $majority;
+}
+
+# Check each node's perspective of the failure, to try to ensure the issue isn't just that this script can't connect to the origin/provider.
+# The idea is to sample the lag every surviving node reports for the unresponsive nodes, wait for a short period of time,
+# sample again, and see if the lag time has increased by at least the time waited.
+#
+#   $clname                        - cluster name; the schema queried is "_" . $clname (quoted as an identifier)
+#   $dbuser, $dbpass               - credentials handed to DBI->connect for every surviving node
+#   $prefix, $logfile, $log_prefix - passed straight through to printlogln
+#
+# Returns true only when every surviving node could be sampled twice, at least one lag reading was obtained, and
+# every reading grew by at least $g_autofailover_perspective_sleep between the checks (the value is multiplied by
+# 1000 for usleep and compared against seconds * 1000, i.e. it is treated as milliseconds). Returns false otherwise.
+sub checkPerspective {
+    my ($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix) = @_;
+
+    my $agreed = false;
+    my $bad = 0;
+    my $lag_checked = 0;
+    my $lag_confirmed = 0;
+    my @unresponsive_ids;
+    my %is_unresponsive;
+    my @survivors;
+    my %lag_after_for;
+    my $first;
+    my $second;
+    my $failures;
+
+    foreach my $node (@g_unresponsive) {
+        push(@unresponsive_ids, $node->[0]);
+        $is_unresponsive{$node->[0]} = 1;
+    }
+    printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check', join(", ", @unresponsive_ids), scalar(@g_unresponsive), scalar(@g_cluster)));
+
+    # A hash lookup picks out the surviving nodes without relying on the experimental smartmatch operator
+    @survivors = grep { !exists($is_unresponsive{$_->[0]}) } @g_cluster;
+
+    ($first, $failures) = samplePerspectiveLag('Check1', \@survivors, \@unresponsive_ids, $clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
+    $bad += $failures;
+
+    # Only bother waiting for the second sample if every surviving node answered the first time
+    if ($bad == 0) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_sleep', $g_autofailover_perspective_sleep));
+        usleep($g_autofailover_perspective_sleep * 1000);
+
+        ($second, $failures) = samplePerspectiveLag('Check2', \@survivors, \@unresponsive_ids, $clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
+        $bad += $failures;
+
+        # Index the second sample by observer/origin/received so that a missing or extra row
+        # can never cause readings from different nodes to be compared with each other
+        foreach my $reading (@{$second}) {
+            $lag_after_for{join(':', $reading->[0], $reading->[1], $reading->[2])} = $reading->[3];
+        }
+
+        foreach my $reading (@{$first}) {
+            my ($observer, $origin, $received, $lag_before) = @{$reading};
+            my $lag_after = $lag_after_for{join(':', $observer, $origin, $received)};
+
+            $lag_checked++;
+
+            # A reading with no counterpart in the second sample (or a NULL lag) cannot confirm anything
+            next unless (defined($lag_before) && defined($lag_after));
+
+            if ($g_debug) {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', ("Node $origin lag between checks on node $received is " . ($lag_after-$lag_before) . " seconds")));
+            }
+
+            if ((($lag_after-$lag_before)*1000) >= $g_autofailover_perspective_sleep) {
+                $lag_confirmed++;
+            }
+        }
+    }
+
+    # No readings at all is no evidence of a failure, so it is reported as unknown rather than agreed
+    if (($bad > 0) || ($lag_checked == 0)) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_unknown'));
+    }
+    elsif ($lag_checked == $lag_confirmed) {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_true'));
+        $agreed = true;
+    }
+    else {
+        printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_false'));
+    }
+
+    return $agreed;
+}
+
+# Helper for checkPerspective: connect to each surviving node in turn and read the lag it reports for the unresponsive nodes.
+#   $check_label      - tag written to the log with each row ('Check1' / 'Check2')
+#   $survivors        - array ref of cluster node entries to connect to (element 0 is used as the node id,
+#                       element 2 is appended to "DBI:Pg:" to build the DSN)
+#   $unresponsive_ids - array ref of node ids to filter st_received on
+#   remaining arguments are as for checkPerspective
+# Returns (\@readings, $failures): each reading is [observer node id, st_origin, st_received, lag in seconds],
+# and $failures is the number of nodes that could not be connected to or queried.
+sub samplePerspectiveLag {
+    my ($check_label, $survivors, $unresponsive_ids, $clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix) = @_;
+
+    my @readings;
+    my $failures = 0;
+    my $placeholders = join(', ', ('?') x scalar(@{$unresponsive_ids}));
+
+    foreach my $node (@{$survivors}) {
+        my $dsn = "DBI:Pg:$node->[2];";
+        my $dbh;
+
+        my $ok = eval {
+            $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
+            my $qw_clname = $dbh->quote_identifier("_" . $clname);
+
+            # Query the schema of the cluster we were asked about, not a hard-coded one
+            my $query = "SELECT a.st_origin, a.st_received, extract(epoch from a.st_lag_time)::integer
+                FROM $qw_clname.sl_status a
+                INNER JOIN $qw_clname.sl_node b on a.st_origin = b.no_id
+                INNER JOIN $qw_clname.sl_node c on a.st_received = c.no_id
+                WHERE a.st_received IN ($placeholders) ORDER BY a.st_origin, a.st_received;";
+
+            my $sth = $dbh->prepare($query);
+            $sth->execute(@{$unresponsive_ids});
+
+            while (my @node_lag = $sth->fetchrow_array) {
+                printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_data', $check_label, $node->[0], $node_lag[0], $node_lag[1], $node_lag[2]));
+                push(@readings, [$node->[0], @node_lag]);
+            }
+
+            $sth->finish;
+            $dbh->disconnect();
+            1;
+        };
+        if (!$ok) {
+            # Take a copy of the error before anything else gets a chance to clobber $@
+            my $error = $@;
+            printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_fail', $node->[0], $error));
+            $failures++;
+
+            # Don't leave the connection open if the failure happened after we connected
+            if (defined($dbh)) {
+                eval { $dbh->disconnect(); };
+            }
+        }
+    }
+
+    return (\@readings, $failures);
+}
+
+# Check if any nodes have failed by connecting and probing the Slony schema