4 # Copyright: 08/04/2012: v1.0.2 Glyn Astill <glyn@8kb.co.uk>
5 # Requires: Perl 5.10.1+, Data::UUID, File::Slurp
6 # PostgreSQL 9.0+ Slony-I 1.2+ / 2.0+
8 # This script is a command-line utility to manage switchover and failover
9 # of replication sets in Slony-I clusters.
11 # This script is free software: you can redistribute it and/or modify
12 # it under the terms of the GNU General Public License as published by
13 # the Free Software Foundation, either version 3 of the License, or
14 # (at your option) any later version.
16 # This script is distributed in the hope that it will be useful,
17 # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 # GNU General Public License for more details.
21 # You should have received a copy of the GNU General Public License
22 # along with this script. If not, see <http://www.gnu.org/licenses/>.
26 use experimental 'smartmatch';
28 use Getopt::Long qw/GetOptions/;
32 use sigtrap 'handler' => \&sigExit, 'HUP', 'INT','ABRT','QUIT','TERM';
33 use Time::HiRes qw/usleep gettimeofday/;
34 use POSIX qw/strftime/;
35 use Config qw/%Config/;
# Readable boolean constants; the settings below use these instead of bare 0/1.
37 use constant false => 0;
38 use constant true => 1;
# Script version; substituted into the 'title' banner message by the main flow.
# NOTE(review): the header comment at the top of the file still says v1.0.2 - confirm which is current.
40 my $g_script_version = '1.0.3';
# PID file path handed to writePID()/removePID(). $g_pid_written records whether
# this process wrote the file, so exit cleanup only removes a file it created.
42 my $g_pidfile = '/var/run/slony_failover.pid';
43 my $g_pid_written = false;
# Base directory for generated files; the main flow creates it when missing.
# With $g_separate_working set, a getUUID()-derived name appears to be appended
# so each run gets its own working directory.
45 my $g_prefix = '/tmp/slony_failovers';
46 my $g_separate_working = true;
# Prefix template passed to printlog()/printlogln() with every log call.
# NOTE(review): '%t' presumably expands to a timestamp - confirm in the logging helpers.
47 my $g_log_prefix = '%t';
# NOTE(review): presumably gates the logDB() call made during exit cleanup - confirm.
48 my $g_log_to_db = false;
55 my $g_use_try_blocks = false;
# Lock-set batching mode; reported in the 'script_settings' log line.
56 my $g_lockset_method = 'multiple';
# Log file name passed to the logging helpers together with $g_prefix.
57 my $g_logfile = 'failover.log';
59 my $g_silence_notice = false;
# Default database user; overridden by the -u / dbuser command line option.
70 my $g_dbuser = 'slony';
# Both counts are filled from the list returned by checkNodes() in the interactive flow.
72 my $g_available_node_count;
73 my $g_critical_node_count;
# Interactive answers: set to true when the user answers yes to the matching prompt.
74 my $g_subs_follow_origin = false;
75 my $g_use_comment_aliases = false;
76 my @g_cluster; # Array refs of node info. In hindsight this should have been a hash, should be fairly simple to switch.
# Failover and resubscribe strategy names; both are reported in the 'script_settings' log line.
82 my $g_failover_method = 'old';
83 my $g_resubscribe_method = 'subscribe';
84 my $g_failover = false;
85 my $g_fail_subonly = false;
# Set to true when the user agrees to also drop failed nodes from the Slony configuration.
86 my $g_drop_failed = false;
# Autofailover settings. $g_autofailover gates the call to autoFailover() in the main flow.
# NOTE(review): the interval/sleep values look like milliseconds, judging by the "ms"
# wording of the related log messages - confirm where they are consumed.
87 my $g_autofailover = false;
88 my $g_autofailover_poll_interval = 500;
89 my $g_autofailover_retry = 2;
90 my $g_autofailover_retry_sleep = 1000;
91 my $g_autofailover_provs = false;
92 my $g_autofailover_config_any = true;
93 my $g_autofailover_perspective_sleep = 20000;
94 my $g_autofailover_majority_only = false;
95 my $g_autofailover_is_quorum = false;
97 my %g_unresponsive_subonly;
101 # Hash containing messages used by lookupMsg()
# English message templates, keyed by message id (the enclosing hash header is
# outside this excerpt). q{} does not interpolate, so $1..$9 stay literal
# placeholders in the stored text; lookupMsg() presumably substitutes its extra
# arguments for them - confirm against that helper.
# Prompts ending in [y/n] are answered on the terminal and checked by the main flow.
104 'usage' => q{-h <host> -p <port> -db <database> -cl <cluster name> -u <username> -P <password> -f <config file> (Password option not recommended; use pgpass instead)},
105 'title' => q{Slony-I failover script version $1},
106 'cluster_fixed' => q{Aborting failover action: all origin/provider nodes now responsive},
107 'cluster_failed' => q{Found $1 failed nodes, sleeping for $2ms before retry $3 of $4},
108 'load_cluster' => q{Getting a list of database nodes...},
109 'load_cluster_fail' => q{Unable to read cluster configuration $1},
110 'load_cluster_success' => q{I Loaded Slony-I v$1 cluster "$2" with $3 nodes read from node at $4:$5/$6},
111 'lag_detail' => q{Current node lag information from configuration node:},
112 'script_settings' => q{Using $1 batches of lock set, $2 FAILOVER and $3},
113 'generated_script' => q{Generated script "$1"},
114 'autofailover_init' => q{Entering autofailover mode},
115 'autofailover_init_cnf' => q{Slony configuration will be read from $1 node},
116 'autofailover_init_pol' => q{Polling every $1ms},
117 'autofailover_init_ret' => q{Failed nodes will be retried $1 times with $2ms sleep},
118 'autofailover_init_set' => q{Failed forwarding providers $1 be failed over},
119 'autofailover_load_cluster' => q{$1 Slony-I v$2 cluster "$3" with $4 nodes read from node $5},
120 'autofailover_proceed' => q{Proceeding with failover:},
121 'autofailover_detail' => q{Failed node: $1, Backup node: $2},
122 'autofailover_halt' => q{Unable to perform any failover for $1 failed nodes},
123 'autofailover_check_sub' => q{Checking subscriber node $1},
124 'autofailover_check_sub_fail' => q{Unable to check subscriber node $1},
125 'autofailover_promote_find' => q{Finding most up to date subscriber to all sets ($1) on unresponsive node $2},
126 'autofailover_promote_found' => q{Using previously found most up to date subscriber to all sets ($1) on unresponsive node $2},
127 'autofailover_promote_skip' => q{No failover required for unresponsive node $1 as it is neither the origin nor an active forwarder of any sets},
128 'autofailover_promote_fail' => q{Could not find suitable backup node for promotion},
129 'autofailover_node_detail' => q{Node $1 is $2 subscribed to ($3) node $4 and provides sets $5 at $6 seconds lag (on event $7)},
130 'autofailover_promote_best' => q{Best node for promotion is node $1 lag = $2 seconds (event $3)},
131 'autofailover_promote_unsuitable' => q{Node $1 is unsuitable for promotion},
132 'autofailover_unresponsive' => q{Detected unresponsive origin node: $1},
133 'autofailover_unresponsive_prov' => q{Detected unresponsive provider node: $1},
134 'autofailover_unresponsive_subonly'=> q{Detected unresponsive subscriber only node: $1},
135 'autofailover_recovery_subonly' => q{Detected recovery of previously unresponsive subscriber only node: $1},
136 'autofailover_pspec_check_fail' => q{Failed to connect to node $1: $2},
137 'autofailover_pspec_check' => q{Getting objective judgement from other nodes, apparent unresponsive nodes are : $1 (Failed nodes = $2 of $3)},
138 'autofailover_pspec_check_sleep' => q{Sleeping for $1 ms},
139 'autofailover_pspec_check_data' => q{$1: Node $2 says lag from node $3 -> $4 is $5 seconds},
140 'autofailover_pspec_check_true' => q{All detected failed nodes confirmed as lagging by other nodes},
141 'autofailover_pspec_check_false' => q{Not all nodes confirmed as lagging},
142 'autofailover_pspec_check_unknown' => q{Unable to confirm lag status of all nodes},
143 'autofailover_split_check' => q{Surviving nodes ($1 of $2) are the majority},
144 'autofailover_split_check_fail' => q{Surviving nodes ($1) are not the majority},
145 'interactive_head_id' => q{ID},
146 'interactive_head_name' => q{Name},
147 'interactive_head_status' => q{Status},
148 'interactive_head_providers' => q{Provider IDs},
149 'interactive_head_config' => q{Configuration},
150 'interactive_detail_1' => q{Origin for sets: },
151 'interactive_detail_2' => q{Providing sets: },
152 'interactive_detail_3' => q{Subscriptions: },
153 'interactive_choose_node' => q{Please choose the node to move all sets $1:},
154 'interactive_confirm' => q{You chose to move sets $1 node $2 ($3). Is this correct [y/n]? },
155 'interactive_action' => q{Best course of action is most likely to do a "$1". Do you wish to continue [y/n]?},
156 'interactive_surrender' => q{Unable to determine best course of action},
157 'interactive_write_script' => q{Writing a script to $1 node $2 to $3},
158 'interactive_check_nodes' => q{Checking availability of database nodes...},
159 'interactive_continue' => q{Do you wish to continue [y/n]?},
160 'interactive_drop_nodes' => q{Do you want to also drop the failed nodes from the slony configuration [y/n]?},
161 'interactive_preserve' => q{Preserve subscription paths to follow the origin node (choose no if unsure) [y/n]?},
162 'interactive_aliases' => q{Generate aliases based on sl_node/set comments in parentheses (choose no if unsure) [y/n]?},
163 'interactive_summary' => q{Summary of nodes to be passed to failover:},
164 'interactive_node_info' => q{Node : $1 ($2) $3 (conninfo $4)},
165 'interactive_run_script' => q{Would you like to run this script now [y/n]?},
166 'interactive_running' => q{Running the script now. This may take some time; please be patient!},
167 'interactive_reason' => q{Please enter a brief reason for taking this action: },
168 'interactive_failover_detail_1' => q{Before you go any further please consider the impact of a full failover:},
169 'interactive_failover_detail_2' => q{The node you are failing over from will cease to participate in the cluster permanently until it is rebuilt & subscribed},
170 'interactive_failover_detail_3' => q{If the outage is temporary (i.e. network/power/easily replaceable hardware related) consider waiting it out},
171 'interactive_failover_detail_4' => q{This type of failover is likely to be more a business decision than a technical one},
172 'info_all_nodes_available' => q{INFO: All nodes are available},
173 'info_req_nodes_available' => q{INFO: $1 of $2 nodes are available. No unavailable nodes are subscribed to the old origin},
174 'wrn_node_unavailable' => q{WARNING: Node $1 unavailable},
175 'wrn_req_unavailable' => q{WARNING: Old origin node ($1) is available, however $2 subscribers are unavailable},
176 'wrn_not_tested' => q{WARNING: Script not tested with Slony-I v$1},
177 'wrn_failover_issues' => q{WARNING: Slony-I v$1 may struggle to failover correctly with multiple failed nodes (affects v2.0-2.1)},
178 'note_autofail_fwd_only' => q{NOTICE: Slony versions prior to 2.2 cannot initiate failover from only failed forwarding providers},
179 'note_fail_sub_only' => q{NOTICE: Slony versions prior to 2.2 cannot failover subscriber only nodes, reverting to failover_offline_subscriber_only = false},
180 'note_multiple_try' => q{NOTICE: Cannot lock multiple sets within try blocks in version $1 dropping back to single sets},
181 'note_reshape_cluster' => q{NOTICE: Either drop the failed subscribers or bring them back up, then retry to MOVE SET},
182 'dbg_generic' => q{DEBUG: $1},
183 'dbg_cluster' => q{DEBUG: NodeID $1/ProvIDs $2/Conninfo $3/OrigSets $4/NodeName $5/ProvTree $6/Active $7/FwdSets $8/ActSubSets $9},
184 'dbg_resubscribe' => q{DEBUG: Checking possibility to resubscribe set $1 ($2) to node $3 ($4) which pulls $5 ($6) from $7 ($8)},
185 'dbg_failover_method' => q{DEBUG: Failover method is $1},
186 'dbg_cluster_load' => q{DEBUG: Loading cluster configuration from $1},
187 'dbg_cluster_good' => q{DEBUG: Cluster state good},
188 'dbg_autofailover_check' => q{DEBUG: Checking node $1 ($2) role is $3 (conninfo: $4)},
189 'dbg_autofailover_active_check' => q{DEBUG: Initiate active check of $1 node $2},
190 'dbg_slonik_script' => q{DEBUG: Running slonik script $1},
191 'err_generic' => q{ERROR: $1},
192 'err_no_database' => q{ERROR: Please specify a database name},
193 'err_no_cluster' => q{ERROR: Please specify a slony cluster name},
194 'err_no_host' => q{ERROR: Please specify a host},
195 'err_no_config' => q{ERROR: No valid config found},
196 'err_fail_config' => q{ERROR: Failed to load configuration},
197 'err_write_fail' => q{ERROR: Could not write to $1 "$2"},
198 'err_read_fail' => q{ERROR: Could not read from $1 "$2"},
199 'err_unlink_fail' => q{ERROR: Could not delete $1 "$2"},
200 'err_mkdir_fail' => q{ERROR: Unable to create $1 directory "$2"},
201 'err_execute_fail' => q{ERROR: Could not execute $1 "$2"},
202 'err_inactive' => q{ERROR: Node $1 is not active (state = $2)},
203 'err_cluster_empty' => q{ERROR: Loaded cluster contains no nodes},
204 'err_cluster_offline' => q{ERROR: Loaded cluster contains no reachable nodes},
205 'err_cluster_lone' => q{ERROR: Loaded cluster contains only 1 node},
206 'err_not_origin' => q{ERROR: Node $1 is not the origin of any sets},
207 'err_not_provider' => q{ERROR: Node $1 is not a provider of any sets},
208 'err_not_provider_sets' => q{ERROR: Node $1 does not provide the sets required: need ($2) but provides ($3)},
209 'err_no_configuration' => q{ERROR: Could not read configuration for node $1},
210 'err_must_enter_node_id' => q{ERROR: You must enter a node id},
211 'err_not_a_node_id' => q{ERROR: I have no knowledge of a node $1},
212 'err_same_node' => q{ERROR: Cannot move from and to the same node},
213 'err_node_offline' => q{ERROR: $1 node ($2) is not available},
214 'err_incomplete_preamble' => q{ERROR: Incomplete preamble},
215 'err_running_slonik' => q{ERROR: Could not run slonik: $1},
216 'err_pgsql_connect' => q{ERROR: Cannot connect to postgres server},
217 'slonik_output' => q{SLONIK: $1},
218 'exit_noaction' => q{Exiting, no action has been taken},
219 'exit' => q{Exited by $1}
# French message templates, keyed by the same message ids and using the same
# $1..$n placeholders as the English table. A few ids (for example
# interactive_head_id, interactive_head_config and the dbg_* entries) have no
# French entry here. NOTE(review): presumably lookupMsg() falls back to the
# English text for those - confirm against that helper.
# Yes/no prompts advertise [o/n]; the interactive checks accept 'o' as well as 'y'.
222 'usage' => q{-h <host> -p <port> -db <database> -cl <cluster name> -u <username> -P <password> -f <config file> (Option mot de passe pas recommandé; utiliser pgpass place)},
223 'title' => q{Slony-I failover (basculement) version de script $1},
224 'cluster_fixed' => q{Abandon de l'action de basculement: tous les noeuds d'origine / de fournisseurs maintenant sensible},
225 'cluster_failed' => q{Trouvé $1 échoué noeuds, couchage pour $2 ms avant réessayer $3 de $4},
226 'load_cluster' => q{Obtenir une liste de noeuds de base de donnees...},
227 'load_cluster_fail' => q{Impossible de lire la configuration du cluster $1},
228 'load_cluster_success' => q{Chargé Slony-I v$1 groupe "$2" avec $3 noeuds lire à partir du noeud à $4:$5/$6},
229 'lag_detail' => q{Informations actuelles de décalage des noeuds à partir du noeud de configuration:},
230 'script_settings' => q{Utilisation de $1 lots de système de verrouillage, $2 FAILOVER et $3},
231 'generated_script' => q{Script généré "$1"},
232 'autofailover_init' => q{Entrer dans le mode de autofailover},
233 'autofailover_init_cnf' => q{Configuration Slony sera lu à partir de $1 noeud},
234 'autofailover_init_pol' => q{Vérifier toutes les $1ms},
235 'autofailover_init_ret' => q{Noeuds défaillants seront rejugés $1 fois avec $2 ms sommeil},
236 'autofailover_init_set' => q{Fournisseurs d'expédition échoué $1 être échoué sur},
237 'autofailover_load_cluster' => q{$1 Slony-I v$2 grappe "$3" avec $4 noeuds lire à noeud $5},
238 'autofailover_proceed' => q{De procéder à failover:},
239 'autofailover_detail' => q{Noeud défaillant: $1, noeud de sauvegarde: $2},
240 'autofailover_halt' => q{Noeuds Impossible d'effectuer une failover pour $1 échoué},
241 'autofailover_check_sub' => q{Vérification noeud abonné $1},
242 'autofailover_check_sub_fail' => q{Impossible de vérifier noeud abonné $1},
243 'autofailover_promote_find' => q{Trouver plus à jour abonné à tous les jeux ($1) sur le noeud ne répond pas $2},
244 'autofailover_promote_found' => q{Utilisation précédemment trouvé plus à jour abonné à tous les jeux ($1) sur le noeud ne répond pas $2},
245 'autofailover_promote_skip' => q{Pas de failover requis pour le noeud ne répond pas $1 car il n'est ni l'origine ou un transitaire active de tous les jeux},
246 'autofailover_promote_fail' => q{Impossible de trouver le noeud de sauvegarde approprié pour la promotion},
247 'autofailover_node_detail' => q{Noeud $1 est souscrit à $2 ($3) noeud $4 et fournit des ensembles de $5 à retard $6 secondes (en cas d'événement $7)},
248 'autofailover_promote_best' => q{Meilleur noeud pour la promotion est noeud $1 décalage = $2 secondes (événement $3)},
249 'autofailover_promote_unsuitable' => q{Noeud $1 est inadapté pour la promotion},
250 'autofailover_unresponsive' => q{Noeud d'origine ne répond pas détecté: $1},
251 'autofailover_unresponsive_prov' => q{Noeud fournisseur ne répond pas détecté: $1},
252 'autofailover_unresponsive_subonly'=> q{Abonné ne répond pas détecté seulement de noeud: $1},
253 'autofailover_recovery_subonly' => q{Recouvrement détecté de l'abonné ne répond pas seulement auparavant de noeud: $1},
254 'autofailover_pspec_check_fail' => q{Impossible de se connecter au noeud $1: $2},
255 'autofailover_pspec_check' => q{Obtenir un jugement objectif à partir d'autres noeuds, les noeuds qui ne répondent pas apparentes sont : $1 (Noeuds défaillants = $2 de $3)},
256 'autofailover_pspec_check_sleep' => q{Dormir pour $1 ms},
257 'autofailover_pspec_check_data' => q{$1: Noeud $2 dit décalage de $3 -> $4 noeud est $5 secondes},
258 'autofailover_pspec_check_true' => q{Tous les noeuds défaillants détectés sont confirmés comme à la traîne par d'autres noeuds},
259 'autofailover_pspec_check_false' => q{Pas tous les noeuds confirmé retard},
260 'autofailover_pspec_check_unknown' => q{Impossible de confirmer le statut de latence de tous les noeuds},
261 'autofailover_split_check' => q{Autres noeuds ($1 sur $2) sont la majorité},
262 'autofailover_split_check_fail' => q{Autres noeuds ($1) ne sont pas la majorité},
263 'interactive_head_name' => q{Nom},
264 'interactive_head_status' => q{Statut},
265 'interactive_head_providers' => q{Fournisseur IDs},
266 'interactive_detail_1' => q{Origine pour les jeux: },
267 'interactive_detail_2' => q{Fournir des ensembles: },
268 'interactive_detail_3' => q{Abonnements: },
269 'interactive_choose_node' => q{S'il vous plaît choisissez le noeud à déplacer tous les ensembles $1:},
270 'interactive_confirm' => q{Vous avez choisi de passer ensembles $1 noeud $2 ($3). Est-ce correct [o/n]? },
271 'interactive_drop_nodes' => q{Voulez-vous laisser tomber aussi les noeuds défaillants de la configuration de slony [o/n]?},
272 'interactive_action' => q{Meilleur plan d'action est le plus susceptible de faire une $1. Voulez-vous continuer [o/n]?},
273 'interactive_surrender' => q{Impossible de déterminer le meilleur plan d'action},
274 'interactive_write_script' => q{Rédaction d'un script à $1 $2 noeud à $3},
275 'interactive_check_nodes' => q{Vérification de la disponibilité des noeuds de base de donnees...},
276 'interactive_continue' => q{Voulez-vous continuer [o/n]?},
277 'interactive_preserve' => q{Préserver les chemins de souscription à suivre le noeud d'origine (ne pas choisir en cas de doute) [o/n]?},
278 'interactive_aliases' => q{Générer des alias sur la base de sl_node / set commentaires entre parenthèses (ne pas choisir en cas de doute) [o/n]?},
279 'interactive_summary' => q{Résumé des noeuds à passer à failover:},
280 'interactive_node_info' => q{Noeud : $1 ($2) $3 (conninfo $4)},
281 'interactive_run_script' => q{Voulez-vous exécuter ce script maintenant [o/n]?},
282 'interactive_running' => q{L'exécution du script maintenant. Cela peut prendre un certain temps; s'il vous plaît être patient!},
283 'interactive_reason' => q{S'il vous plaît entrer une brève raison pour cette action: },
284 'interactive_failover_detail_1' => q{Avant d'aller plus loin s'il vous plaît envisager l'impact d'un failover (basculement) complet:},
285 'interactive_failover_detail_2' => q{Le noeud vous ne parviennent pas au-dessus de cesse de participer au groupe de façon permanente jusqu'à ce qu'il soit à reconstruire et souscrit},
286 'interactive_failover_detail_3' => q{Si la panne est temporaire (c.-à-d. réseau / alimentation / facilement remplaçable matériel connexe) envisager d'attendre dehors},
287 'interactive_failover_detail_4' => q{Ce type de failover est susceptible d'être plus une décision d'affaires que technique},
288 'info_all_nodes_available' => q{INFO: Tous les noeuds sont disponibles},
289 'info_req_nodes_available' => q{INFO: $1 sur $2 noeuds sont disponibles. Pas de noeuds indisponibles sont souscrites à l'ancienne origine},
290 'wrn_node_unavailable' => q{ATTENTION: Noeud $1 indisponible},
291 'wrn_req_unavailable' => q{ATTENTION: Noeud Old origine ($1) est disponible, mais $2 abonnés ne sont pas disponibles},
292 'wrn_not_tested' => q{ATTENTION: Script pas testé avec Slony-I v$1},
293 'wrn_failover_issues' => q{ATTENTION: Slony-I v$1 peut lutter pour basculer correctement avec plusieurs nœuds défaillants (affecte v2.0-2.1)},
294 'note_autofail_fwd_only' => q{AVIS: Versions antérieures à la 2.2 Slony ne peuvent pas initier le basculement de seulement échoué transmettre fournisseurs},
295 'note_fail_sub_only' => q{AVIS: Versions antérieures à la 2.2 Slony ne peuvent pas basculer abonnes seuls les noeuds, revenant à failover_offline_subscriber_only = false},
296 'note_multiple_try' => q{AVIS: Vous ne pouvez pas verrouiller plusieurs ensembles dans des blocs try dans la version $1 de retomber à des jeux simples},
297 'note_reshape_cluster' => q{AVIS: Vous devez supprimer les abonnés défaillants ou les ramener, puis réessayez à MOVE SET},
298 'err_generic' => q{ERREUR: $1},
299 'err_no_database' => q{ERREUR: S'il vous plaît spécifier un base de donnees nom},
300 'err_no_cluster' => q{ERREUR: S'il vous plaît indiquez un nom de cluster slony},
301 'err_no_host' => q{ERREUR: S'il vous plaît spécifier un hôte},
302 'err_no_config' => q{ERREUR: Aucune configuration valide n'a été trouvée},
303 'err_fail_config' => q{ERREUR: Impossible de charger la configuration},
304 'err_write_fail' => q{ERREUR: Impossible d'écrire dans $1 "$2"},
305 'err_read_fail' => q{ERREUR: Impossible de lire $1 "$2"},
306 'err_unlink_fail' => q{ERREUR: Impossible de supprimer $1 "$2"},
307 'err_mkdir_fail' => q{ERREUR: Impossible de créer $1 répertoire "$2"},
308 'err_execute_fail' => q{ERREUR: Impossible d'exécuter $1 "$2"},
309 'err_inactive' => q{ERREUR: Noeud $1 n'est pas active (état = $2)},
310 'err_cluster_empty' => q{ERREUR: Groupe chargé contient pas de noeuds},
311 'err_cluster_offline' => q{ERREUR: Groupe chargé contient pas de noeuds accessibles},
312 'err_cluster_lone' => q{ERREUR: Groupe chargé ne contient que 1 noeud},
313 'err_not_origin' => q{ERREUR: Noeud $1 n'est pas à l'origine de tous les jeux},
314 'err_not_provider' => q{ERREUR: Noeud $1 n'est pas un fournisseur de tous les jeux},
315 'err_not_provider_sets' => q{ERREUR: Noeud $1 ne fournit pas les ensembles nécessaires: le besoin ($2), mais fournit ($3)},
316 'err_no_configuration' => q{ERREUR: Impossible de lire la configuration pour le noeud $1},
317 'err_must_enter_node_id' => q{ERREUR: Vous devez entrer un id de noeud},
318 'err_not_a_node_id' => q{ERREUR: Je n'ai pas connaissance d'un $1 de noeud},
319 'err_same_node' => q{ERREUR: Impossible de déplacer depuis et vers le même noeud},
320 'err_node_offline' => q{ERREUR: $1 noeud ($2) n'est pas disponible},
321 'err_incomplete_preamble' => q{ERREUR: Préambule incomplète},
322 'err_running_slonik' => q{ERREUR: Ne pouvait pas courir slonik: $1},
323 'err_pgsql_connect' => q{ERREUR: Impossible de se connecter au serveur postgres},
324 'slonik_output' => q{SLONIK: $1},
325 'exit_noaction' => q{Quitter, aucune action n'a été prise},
326 'exit' => q{Quitter par $1}
# Start of the main flow: capture the start time, parse the command line, apply
# any config file, then let explicit command line options override the settings.
331 # Setup date variables
# localtime() returns a 0-based month and a year offset from 1900; both are
# corrected in the sprintf below. $g_date is later handed to getUUID().
332 my ($g_year, $g_month, $g_day, $g_hour, $g_min, $g_sec) = (localtime(time))[5,4,3,2,1,0];
333 my $g_date = sprintf ("%02d:%02d:%02d on %02d/%02d/%04d", $g_hour, $g_min, $g_sec, $g_day, $g_month+1, $g_year+1900);
335 # Handle command line options
# Option names are case sensitive so that -p (port) and -P (password) stay distinct.
336 Getopt::Long::Configure('no_ignore_case');
# Die with the usage text unless parsing succeeded, at least one option was
# supplied, and no unparsed arguments are left over.
338 die lookupMsg('usage') unless GetOptions(\%opt, 'host|H=s', 'port|p=i', 'dbname|db=s', 'clname|cl=s', 'dbuser|u=s', 'dbpass|P=s', 'cfgfile|f=s', 'infoprint|I', ) and keys %opt and ! @ARGV;
# A config file, when given, is loaded first; a false return from getConfig()
# is reported as 'err_no_config'.
341 if (defined($opt{cfgfile})) {
342 unless (getConfig($opt{cfgfile})) {
343 println(lookupMsg('err_no_config'));
# Each option present on the command line replaces the corresponding setting.
348 if (defined($opt{dbname})) {
349 $g_dbname = $opt{dbname};
351 if (defined($opt{clname})) {
352 $g_clname = $opt{clname};
354 if (defined($opt{host})) {
355 $g_dbhost = $opt{host};
357 if (defined($opt{port})) {
358 $g_dbport = $opt{port};
360 if (defined($opt{dbuser})) {
361 $g_dbuser = $opt{dbuser};
363 if (defined($opt{dbpass})) {
364 $g_dbpass = $opt{dbpass};
368 # Display message and die if any of the required configuration variables are missing
369 if (!defined($g_dbname)) {
370 println(lookupMsg('err_no_database'));
371 die lookupMsg('usage');
373 if (!defined($g_clname)) {
374 println(lookupMsg('err_no_cluster'));
375 die lookupMsg('usage');
377 if (!defined($g_dbhost)) {
378 println(lookupMsg('err_no_host'));
379 die lookupMsg('usage');
383 # Build conninfo from supplied database name/host/port
# NOTE(review): unlike name/cluster/host, no definedness check for $g_dbport is
# visible here - presumably it has a default declared elsewhere; confirm.
384 $g_dbconninfo = "dbname=$g_dbname;host=$g_dbhost;port=$g_dbport";
# Working directory setup; skipped when only an information printout
# (the infoprint / -I option) was requested.
386 if (!defined($opt{infoprint})) {
387 # Check prefix directory and create if not present
# mkdir is only attempted when the path does not already exist; failure of
# both tests is reported as 'err_mkdir_fail'.
388 unless(-e $g_prefix or mkdir $g_prefix) {
389 println(lookupMsg('err_mkdir_fail', 'prefix', $g_prefix));
393 if ($g_separate_working) {
# True when the prefix does not already end in a '/'.
394 if ($g_prefix !~ m/\/$/) {
398 # Get a uuid for working directory
# From here on $g_prefix names the per-run directory, not the base directory.
399 $g_prefix .= getUUID($g_date);
401 # Create a working directory and setup log file
402 unless(-e $g_prefix or mkdir $g_prefix) {
403 println(lookupMsg('err_mkdir_fail', 'work', $g_prefix));
408 # Set postgres path if provided
# The configured directory is appended to PATH (so it is searched last).
# NOTE(review): presumably so that the slonik binary can be found - confirm.
409 if (defined($g_slonikpath) && ($g_slonikpath ne "")) {
410 $ENV{PATH} .= ":$g_slonikpath";
413 # Check if autofailover is enabled, if so check configuration and enter autofailover mode
# Autofailover is never entered for an information-only run.
414 if (($g_autofailover) && !defined($opt{infoprint})) {
416 # Write out a PID file
# Remember that the PID file is ours so that exit cleanup removes it.
417 if (writePID($g_prefix, $g_logfile, $g_log_prefix, $g_pidfile)) {
418 $g_pid_written = true;
# Exit code 1: NOTE(review): presumably the branch taken when writePID() fails - confirm.
421 cleanExit(1, "system");
424 # Go into endless loop for autofailover
425 autoFailover($g_dbconninfo, $g_clname, $g_dbuser, $g_dbpass, $g_prefix, $g_logfile, $g_log_prefix);
428 # Read slony configuration and output some basic information
# NOTE(review): the 'load_cluster' template has no placeholder, so the extra
# $g_prefix argument looks unused - confirm against lookupMsg().
431 println(lookupMsg('load_cluster', $g_prefix));
# loadCluster() returns the node count and the Slony-I version as a two-element list.
432 ($g_node_count, $g_version) = loadCluster($g_dbconninfo, $g_clname, $g_dbuser, $g_dbpass, $g_prefix, $g_logfile, $g_log_prefix);
# Exit code 2: the cluster configuration could not be read.
435 println(lookupMsg('load_cluster_fail', 'from supplied configuration'));
436 cleanExit(2, "system");
# Information-only mode: print the cluster summary to the terminal and call
# chooseNode() in "info" mode with no log destination.
439 if (defined($opt{infoprint})) {
440 println(lookupMsg('load_cluster_success', $g_version, $g_clname, $g_node_count, $g_dbhost, $g_dbport, $g_dbname) . ":");
441 chooseNode("info", undef, undef, undef, 0);
# Banner: a row of 68 asterisks, the script title, and a closing row.
445 printlog($g_prefix,$g_logfile,$g_log_prefix,"*"x68 . "\n* ");
446 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('title', $g_script_version));
447 printlogln($g_prefix,$g_logfile,$g_log_prefix,"*"x68);
# Exit code 3: the loaded cluster contains no nodes.
450 if ($g_node_count <= 0) {
451 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_cluster_empty'));
452 cleanExit(3, "system");
# Log what was loaded and which lock-set / failover / resubscribe methods are in effect.
455 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('load_cluster_success', $g_version, $g_clname, $g_node_count, $g_dbhost, $g_dbport, $g_dbname));
456 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('script_settings', $g_lockset_method, $g_failover_method, uc($g_resubscribe_method)));
459 # Output lag information between each node and node configuration was read from
# Lag detail is only printed when loadLag() returns a positive value.
460 if (loadLag($g_dbconninfo, $g_clname, $g_dbuser, $g_dbpass, $g_prefix, $g_logfile, $g_log_prefix) > 0) {
461 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('lag_detail'));
# One tab-indented line per item; the enclosing loop header is not shown in this excerpt.
463 printlogln($g_prefix,$g_logfile,$g_log_prefix,"\t$_");
465 printlog($g_prefix,$g_logfile,$g_log_prefix,"\n");
468 # Prompt user to choose nodes to move sets from / to
# chooseNode() result handling: 0 exits as a user action, -1 as a system error;
# any other value is used as the chosen node id.
469 $g_node_from = chooseNode("from", $g_prefix, $g_logfile, $g_log_prefix, 0);
470 if ($g_node_from == 0) {
471 cleanExit(4, "user");
473 elsif ($g_node_from == -1) {
474 cleanExit(5, "system");
# The chosen source node is passed in as the last argument when choosing the target.
477 $g_node_to = chooseNode("to", $g_prefix, $g_logfile, $g_log_prefix, $g_node_from);
478 if ($g_node_to == 0) {
479 cleanExit(6, "user");
481 elsif ($g_node_to == -1) {
482 cleanExit(7, "system");
# Moving sets from a node to itself is rejected (exit code 8).
484 elsif ($g_node_from == $g_node_to) {
485 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_same_node'));
486 cleanExit(8, "system");
489 # Check nodes are available and decide on action to take
490 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_check_nodes'));
# checkNodes() returns (available node count, critical count). Per the comment in
# checkNodes(), the critical count is -1 when the new origin is down, -2 when the
# old origin is down, and positive when subscribers to sets on the old origin are down.
491 ($g_available_node_count, $g_critical_node_count) = checkNodes($g_clname, $g_dbuser, $g_dbpass, $g_node_from, $g_node_to, $g_prefix, $g_logfile, $g_log_prefix);
# No reachable node at all: exit code 9.
493 if ($g_available_node_count <= 0) {
494 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_cluster_offline'));
495 cleanExit(9, "system");
# Target (new origin) unavailable: nothing can be done, exit code 10.
497 elsif ($g_critical_node_count == -1) {
498 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_node_offline', 'Target new origin', $g_node_to));
499 cleanExit(10, "system");
# Old origin unavailable: a FAILOVER is suggested to the user.
501 elsif ($g_critical_node_count == -2) {
502 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_node_offline', 'Old origin', $g_node_from));
503 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_action', 'FAILOVER'));
# No critical node is down: a MOVE SET is suggested. The message printed first
# depends on whether every node, or only some nodes, are reachable.
506 elsif ($g_critical_node_count == 0) {
507 if ($g_node_count == $g_available_node_count) {
508 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('info_all_nodes_available'));
511 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('info_req_nodes_available', $g_available_node_count, $g_node_count));
513 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_action', 'MOVE SET'));
# Old origin is up but some of its subscribers are down: advise reshaping the
# cluster and exit without acting (exit code 11).
515 elsif ($g_critical_node_count > 0) {
516 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('wrn_req_unavailable', $g_node_from, $g_critical_node_count));
517 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('note_reshape_cluster'));
518 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('exit_noaction'));
519 cleanExit(11, "user");
# Any other combination: no course of action could be determined (exit code 12).
522 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_surrender'));
523 cleanExit(12, "system");
# Interactive confirmation, slonik script generation and optional execution.
# Every yes/no answer in $g_input is tested with /^[YO]$/i: 'y' for the English
# prompts, 'o' for the French [o/n] prompts. The class is written [YO] because a
# '|' inside brackets is a literal pipe character, not alternation.
# Anything other than yes to the suggested action exits without acting (exit code 13).
527 if ($g_input !~ /^[YO]$/i) {
528 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('exit_noaction'));
529 cleanExit(13, "user");
# Only ask about comment-derived aliases when they are not already enabled.
532 if (!$g_use_comment_aliases) {
533 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_aliases'));
536 if ($g_input =~ /^[YO]$/i) {
537 $g_use_comment_aliases = true;
# Failover path: summarise the failed nodes and spell out the consequences.
542 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_summary'));
# Each @g_failed entry is an array ref. Going by the 'dbg_cluster' message layout,
# index 0 is the node id, 2 the conninfo and 4 the node name; index 9, when
# defined, lists the sets the node provides.
544 foreach (@g_failed) {
545 printlogln($g_prefix,$g_logfile,$g_log_prefix,"\t" . lookupMsg('interactive_node_info',$_->[0],($_->[4] // "unnamed"),(defined($_->[9]) ? "providing sets $_->[9]" : "sole subscriber"), $_->[2]));
548 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_failover_detail_1'));
549 printlogln($g_prefix,$g_logfile,$g_log_prefix,"\t" . lookupMsg('interactive_failover_detail_2'));
550 printlogln($g_prefix,$g_logfile,$g_log_prefix,"\t" . lookupMsg('interactive_failover_detail_3'));
551 printlogln($g_prefix,$g_logfile,$g_log_prefix,"\t" . lookupMsg('interactive_failover_detail_4'));
553 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_drop_nodes'));
# Plain regex match, as in every other yes/no check in this flow.
555 if ($g_input =~ /^[YO]$/i) {
556 $g_drop_failed = true;
559 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_reason'));
# Final confirmation before the failover script is written (exit code 14 on refusal).
561 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_continue'));
564 if ($g_input !~ /^[YO]$/i) {
565 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('exit_noaction'));
566 cleanExit(14, "user");
# writeFailover() returns the path of the generated script, kept in $g_script.
569 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_write_script', 'failover from', $g_node_from, $g_node_to));
570 $g_script = writeFailover($g_prefix, $g_dbconninfo, $g_clname, $g_dbuser, $g_dbpass, $g_node_from, $g_node_to, $g_subs_follow_origin, $g_use_comment_aliases, $g_logfile, $g_log_prefix);
# Move-set path: optionally keep subscription paths following the origin node.
573 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_preserve'));
576 if ($g_input =~ /^[YO]$/i) {
577 $g_subs_follow_origin = true;
580 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_reason'));
583 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_write_script', 'move all sets provided by', $g_node_from, $g_node_to));
584 $g_script = writeMoveSet($g_prefix, $g_dbconninfo, $g_clname, $g_dbuser, $g_dbpass, $g_node_from, $g_node_to, $g_subs_follow_origin, $g_use_comment_aliases, $g_logfile, $g_log_prefix);
587 # Complete and run script if required
589 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('generated_script', $g_script));
590 printlog($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_run_script', $g_script));
# The script is only executed on an explicit yes; a false return from runSlonik()
# is reported as 'err_execute_fail'.
593 if ($g_input =~ /^[YO]$/i) {
594 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('interactive_running'));
595 unless (runSlonik($g_script, $g_prefix, $g_logfile, $g_log_prefix)) {
596 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_execute_fail', 'slonik script', $g_script));
600 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('exit_noaction'));
# Exit code 15: the generated script could not be read.
604 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('err_read_fail', 'slonik script', $g_script));
605 cleanExit(15, "system");
# Normal end of the interactive run.
608 cleanExit(0, "script completion");
610 ###########################################################################################################################################
612 # Display exit message, insert log file into database if requested, delete any pid files and exit with the requested code
614 my $exit_code = shift;
617 printlogln($g_prefix,$g_logfile,$g_log_prefix,lookupMsg('exit', $type));
621 logDB("dbname=$g_logdb_name;host=$g_logdb_host;port=$g_logdb_port", $g_logdb_user, $g_logdb_pass, $exit_code, $g_reason, $g_prefix, $g_logfile, $g_log_prefix, $g_clname, $g_script);
625 if ($g_pid_written) {
626 removePID($g_prefix, $g_logfile, $g_log_prefix, $g_pidfile);
632 # Exit on caught signal
634 cleanExit(100,'signal');
637 # Check we can reach each node in the cluster and that it contains the Slony schema
646 my $log_prefix = shift;
652 my $result_count = 0;
653 my $critical_count = 0;
659 undef @g_unresponsive;
662 foreach (@g_cluster) {
663 if ($_->[0] == $from) {
664 @origsets = split(',', $_->[3]);
669 foreach (@g_cluster) {
671 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_cluster', $_->[0],($_->[1] // "<NONE>"),$_->[2],($_->[3] // "<NONE>"),$_->[4],($_->[5] // "<NONE>") . "(" . ($_->[8] // "<NONE>") . ")",$_->[6],($_->[7] // "<NONE>"),($_->[9] // "<NONE>") . " (" . ($_->[10] // "<NONE>") . ")"));
674 $dsn = "DBI:Pg:$_->[2];";
676 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
677 $query = "SELECT count(*) FROM pg_namespace WHERE nspname = ?";
678 $sth = $dbh->prepare($query);
679 $sth->bind_param(1, "_" . $clname);
682 $result_count = $result_count+$sth->rows;
689 # Critical count will be -1 if the new origin is down, -2 if the old origin is down or positive if subscribers to sets on old origin are down.
690 printlogln($prefix,$logfile,$log_prefix,lookupMsg('wrn_node_unavailable', $_->[0]));
692 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
694 if ($_->[0] == $to) {
695 $critical_count = -1;
697 elsif ($_->[0] == $from) {
698 $critical_count = -2;
701 foreach my $subprov (split(';', $_->[5])) {
702 my ($node, $setlist) = (split('->', $subprov)) ;
704 $setlist =~ s/(\)|\(|s)//g;
705 @subsets = (split(',', $setlist));
707 if (($critical_count >= 0) && (checkSubscribesAnySets(\@origsets, \@subsets))) {
712 # Only push nodes with active subscribers to sets into failed list unless explicitly told to
713 if (($g_fail_subonly) || (defined($_->[9]))) {
714 push(@g_failed, \@$_);
715 $g_backups{$_->[0]} = $to;
717 push(@g_unresponsive, \@$_);
721 return ($result_count, $critical_count);
724 # Load information on all nodes in the Slony cluster into global @g_cluster:
725 # 0) no_id = Node id of this node
726 # 1) no_provs = Comma separated list of all provider node ids
727 # 2) no_conninfo = Conninfo as recorded in sl_path
728 # 3) origin_sets = Comma separated list of set ids originating on this node
729 # 4) no_name = Node name; this is extracted from text between parentheses in sl_node.no_comment
730 # 5) no_sub_tree = Text representation of subscriptions in the form n<provider node id>->(s<set id>, ..)
731 # 6) no_status = Text representing the state of the node; either ACTIVE,INACTIVE or FAILED
732 # 7) sub_sets = Comma separated list of all set ids this node is subscribed to
733 # 8) no_sub_tree_name = As per no_sub_tree but holds textual names extracted from sl_node.no_comment
734 # 9) prov_sets_active = Comma separated list of all set ids this node is actively forwarding
735 # 10) prov_sets = Comma separated list of all set ids this node is subscribed to and able to forward
737 my $dbconninfo = shift;
743 my $log_prefix = shift;
754 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_cluster_load', $dbconninfo));
757 $dsn = "DBI:Pg:$dbconninfo;";
759 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
760 $qw_clname = $dbh->quote_identifier("_" . $clname);
762 $query = "SELECT $qw_clname.getModuleVersion()";
763 $sth = $dbh->prepare($query);
765 ($version) = $sth->fetchrow;
768 $query = "WITH z AS (
769 SELECT a.no_id, b.sub_provider AS no_prov,
770 COALESCE(c.pa_conninfo,(SELECT pa_conninfo FROM $qw_clname.sl_path WHERE pa_server = $qw_clname.getlocalnodeid(?) LIMIT 1)) AS no_conninfo,
771 array_to_string(array(SELECT set_id FROM $qw_clname.sl_set WHERE set_origin = a.no_id ORDER BY set_id),',') AS origin_sets,
772 string_agg(CASE WHEN b.sub_receiver = a.no_id AND b.sub_forward AND b.sub_active THEN b.sub_set::text END, ',' ORDER BY b.sub_set) AS sub_sets,
773 coalesce(trim(regexp_replace(substring(a.no_comment from E'\\\\((.+)\\\\)'), '[^0-9A-Za-z]','_','g')), 'node' || a.no_id) AS no_name,
774 'n' || b.sub_provider || '->(' || string_agg(CASE WHEN b.sub_receiver = a.no_id THEN 's' || b.sub_set END,',' ORDER BY b.sub_set,',') || ')' AS sub_tree,
775 coalesce(trim(regexp_replace(substring(d.no_comment from E'\\\\((.+)\\\\)'), '[^0-9A-Za-z]','_','g')), 'node' || b.sub_provider, '')
776 || '->(' || string_agg(CASE WHEN b.sub_receiver = a.no_id THEN coalesce(trim(regexp_replace(e.set_comment, '[^0-9A-Za-z]', '_', 'g')), 'set' || b.sub_set) END,',' ORDER BY b.sub_set) || ')' AS sub_tree_name,
777 CASE " . ((substr($version,0,3) >= 2.2) ? "WHEN a.no_failed THEN 'FAILED' " : "") . "WHEN a.no_active THEN 'ACTIVE' ELSE 'INACTIVE' END AS no_status,
778 array_to_string(array(SELECT DISTINCT sub_set::text FROM $qw_clname.sl_subscribe WHERE sub_provider = a.no_id AND sub_active ORDER BY sub_set),',') AS prov_sets_active,
779 string_agg(CASE WHEN b.sub_receiver = a.no_id THEN b.sub_set::text END,',' ORDER BY b.sub_set,',') AS prov_sets
780 FROM $qw_clname.sl_node a
781 LEFT OUTER JOIN $qw_clname.sl_subscribe b ON a.no_id = b.sub_receiver
782 LEFT OUTER JOIN $qw_clname.sl_path c ON c.pa_server = a.no_id AND c.pa_client = $qw_clname.getlocalnodeid(?)
783 LEFT OUTER JOIN $qw_clname.sl_node d ON b.sub_provider = d.no_id
784 LEFT OUTER JOIN $qw_clname.sl_set e ON b.sub_set = e.set_id
785 GROUP BY b.sub_provider, a.no_id, a.no_comment, c.pa_conninfo, d.no_comment, a.no_active
789 nullif(string_agg(no_prov::text, ',' ORDER BY no_prov),'') AS no_provs,
791 nullif(string_agg(origin_sets::text, ',' ORDER BY origin_sets),'') AS origin_sets,
793 nullif(string_agg(sub_tree, ';' ORDER BY sub_tree),'') AS no_sub_tree,
795 nullif(string_agg(sub_sets::text, ',' ORDER BY prov_sets),'') AS sub_sets,
796 nullif(string_agg(sub_tree_name, ';' ORDER BY sub_tree_name),'') AS no_sub_tree_name,
797 nullif(string_agg(prov_sets_active::text, ',' ORDER BY prov_sets_active),'') AS prov_sets_active,
798 nullif(string_agg(prov_sets::text, ',' ORDER BY sub_sets),'') AS prov_sets
799 FROM z GROUP BY no_id, no_conninfo, no_name, no_status";
800 $sth = $dbh->prepare($query);
802 $sth->bind_param(1, "_" . $clname);
803 $sth->bind_param(2, "_" . $clname);
807 while (my @node = $sth->fetchrow) {
808 #printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', join(' - ', @node)));
809 push(@g_cluster, \@node);
818 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
820 die lookupMsg('err_pgsql_connect');
823 #if (substr($version,0,1) < 2) {
824 # printlogln($prefix,$logfile,$log_prefix,lookupMsg('wrn_not_tested', $version));
826 if (($g_use_try_blocks) && ($g_lockset_method eq 'multiple') && (substr($version,0,3) <= 9.9)) {
827 # It's currently not possible to lock multiple sets at a time within a try block (v2.2.2), leave the logic in and set a high version number for now.
828 printlogln($prefix,$logfile,$log_prefix, lookupMsg('note_multiple_try', $version));
829 $g_lockset_method = 'single';
831 if (substr($version,0,3) >= 2.2) {
832 $g_failover_method = 'new';
833 $g_resubscribe_method = 'resubscribe';
836 unless ($g_silence_notice) {
837 if ((substr($version,0,3) >= 2.0) && (substr($version,0,3) < 2.2)) {
838 printlogln($prefix,$logfile,$log_prefix,lookupMsg('wrn_failover_issues', $version));
840 printlogln($prefix,$logfile,$log_prefix,lookupMsg('note_autofail_fwd_only'));
841 $g_silence_notice = true;
843 if ($g_fail_subonly) {
844 printlogln($prefix,$logfile,$log_prefix,lookupMsg('note_fail_sub_only'));
845 $g_fail_subonly = false;
851 return (scalar(@g_cluster), $version);
854 # Load all sets originating on a node into global @g_sets
856 my $dbconninfo = shift;
858 my $nodenumber = shift;
863 my $log_prefix = shift;
873 $dsn = "DBI:Pg:$dbconninfo;";
875 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
876 $qw_clname = $dbh->quote_identifier("_" . $clname);
877 $query = "SELECT set_id, trim(regexp_replace(set_comment,'[^0-9,A-Z,a-z]','_','g')) FROM $qw_clname.sl_set WHERE set_origin = ? ORDER BY set_id;";
879 $sth = $dbh->prepare($query);
880 $sth->bind_param(1, $nodenumber);
884 while (my @set = $sth->fetchrow) {
885 push(@g_sets, \@set);
893 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
895 die lookupMsg('err_pgsql_connect');
898 return scalar(@g_sets);
901 # Load information regarding replication lag from sl_status into @g_lags
902 # If loading from a node that is not the intended origin then this information might not be that accurate/useful
904 my $dbconninfo = shift;
910 my $log_prefix = shift;
920 $dsn = "DBI:Pg:$dbconninfo;";
922 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
923 $qw_clname = $dbh->quote_identifier("_" . $clname);
924 $query = "SELECT a.st_origin || ' (' || coalesce(trim(regexp_replace(substring(b.no_comment from E'\\\\((.+)\\\\)'), '[^0-9A-Za-z]','_', 'g')), 'node' || b.no_id) || ')<->'
925 || a.st_received || ' (' || coalesce(trim(regexp_replace(substring(c.no_comment from E'\\\\((.+)\\\\)'), '[^0-9A-Za-z]','_', 'g')), 'node' || c.no_id) || ') Events: '
926 || a.st_lag_num_events || ' Time: ' || a.st_lag_time
927 FROM $qw_clname.sl_status a
928 INNER JOIN $qw_clname.sl_node b on a.st_origin = b.no_id
929 INNER JOIN $qw_clname.sl_node c on a.st_received = c.no_id";
931 $sth = $dbh->prepare($query);
934 while (my $lag = $sth->fetchrow) {
943 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
945 die lookupMsg('err_pgsql_connect');
948 return scalar(@g_lags);
951 # Prompt user for nodes to and from in interactive mode and do some checking
956 my $log_prefix = shift;
957 my $last_choice = shift;
966 $line = sprintf "%-4s %-14s %-10s %-24s %-s\n", lookupMsg('interactive_head_id'), lookupMsg('interactive_head_name'), lookupMsg('interactive_head_status'), lookupMsg('interactive_head_providers'), lookupMsg('interactive_head_config');
967 printlog($prefix,$logfile,$log_prefix,"$line");
968 $line = sprintf "%-4s %-14s %-10s %-24s %-s\n", "="x(length(lookupMsg('interactive_head_id'))), "="x(length(lookupMsg('interactive_head_name'))), "="x(length(lookupMsg('interactive_head_status'))), "="x(length(lookupMsg('interactive_head_providers'))), "="x(length(lookupMsg('interactive_head_config')));
969 printlog($prefix,$logfile,$log_prefix,"$line");
971 foreach (@g_cluster) {
972 $line = sprintf "%-4s %-14s %-10s %-24s %-s\n", $_->[0], $_->[4], $_->[6], ($_->[1] // "<NONE>"), (lookupMsg('interactive_detail_1') . ($_->[3] // "<NONE>"));
973 printlog($prefix,$logfile,$log_prefix,"$line");
974 $line = sprintf "%-55s %-s\n", " ", (lookupMsg('interactive_detail_2') . ($_->[7] // "<NONE>"));
975 printlog($prefix,$logfile,$log_prefix,"$line");
976 $line = sprintf "%-55s %-s\n", " ", (lookupMsg('interactive_detail_3') . ($_->[5] // "<NONE>"));
977 printlogln($prefix,$logfile,$log_prefix,"$line");
978 $options{$_->[0]} = {name => $_->[4], sets => ($_->[3] // ""), status => $_->[6], provider => $_->[7]};
980 if ($type !~ m/info/i) {
981 printlog($prefix,$logfile,$log_prefix,lookupMsg('interactive_choose_node', $type));
985 if(exists($options{$choice})) {
986 if ($options{$choice}->{status} ne "ACTIVE") {
987 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_inactive', $choice, lc($options{$choice}->{status})));
990 elsif (($type =~ m/from/i) && (length(trim($options{$choice}->{sets})) <= 0)) {
991 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_not_origin', $choice));
994 elsif ($type =~ m/to/i) {
995 if (length(trim($options{$choice}->{provider})) <= 0) {
996 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_not_provider', $choice));
1000 foreach my $old_origin (@g_cluster) {
1001 if ($old_origin->[0] == $last_choice) {
1002 @sets_from = split(',', $old_origin->[3]);
1003 @sets_to = split(',', $options{$choice}->{provider});
1004 if (checkProvidesAllSets(\@sets_from, \@sets_to)) {
1008 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_not_provider_sets',$choice,$old_origin->[3],$options{$choice}->{provider})); # key is 'provider' (as stored in %options and used to build @sets_to); 'providers' is never set, so the message lost its last argument
1015 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_no_configuration', $last_choice));
1021 printlog($prefix,$logfile,$log_prefix,lookupMsg('interactive_confirm',$type,$choice,$options{$choice}->{name}));
1024 if ($ok !~ /^[YO]$/i) { # [YO], not [Y|O]: inside a character class '|' is a literal, so a lone "|" used to be accepted as confirmation
1025 printlogln($prefix,$logfile,$log_prefix,lookupMsg('exit_noaction'));
1030 elsif (!length($choice)) {
1031 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_must_enter_node_id'));
1035 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_not_a_node_id', $choice));
1043 # Write a slonik preamble section using information pulled into @g_cluster and @g_sets by loadCluster() and loadSets() functions
1045 my $filename = shift;
1046 my $dbconninfo = shift;
1051 my $aliases = shift;
1053 my $logfile = shift;
1054 my $log_prefix = shift;
1055 my $comment_all_failed = shift;
1058 my $success = false;
1060 my ($year, $month, $day, $hour, $min, $sec) = (localtime(time))[5,4,3,2,1,0];
1061 my $date = sprintf ("%02d:%02d:%02d on %02d/%02d/%04d", $hour, $min, $sec, $day, $month+1, $year+1900);
1063 if (open(SLONFILE, ">", $filename)) {
1064 print SLONFILE ("# Script autogenerated on $date\n\n");
1065 print SLONFILE ("######\n# Preamble (cluster structure)\n######\n\n# Cluster name\n");
1067 print SLONFILE ("DEFINE slony_cluster_name $clname;\n");
1068 print SLONFILE ("CLUSTER NAME = \@slony_cluster_name;\n\n");
1071 print SLONFILE ("CLUSTER NAME = $clname;\n\n");
1073 foreach (@g_cluster) {
1075 if (($comment_all_failed) && (exists $g_backups{$_->[0]})) {
1076 $line_prefix = "# (Node $_->[0] unavailable) ";
1078 elsif (!$g_fail_subonly) {
1079 foreach my $unresponsive (@g_unresponsive) {
1080 if (($_->[0] == $unresponsive->[0]) && !defined($_->[9]) && ($g_failover_method eq 'new')) {
1081 $line_prefix = "# (Node $_->[0] unavailable subscriber only) ";
1085 print SLONFILE ("# Preamble for node $_->[0] named $_->[4]\n");
1087 print SLONFILE ($line_prefix . "DEFINE $_->[4] $_->[0];\n");
1088 print SLONFILE ($line_prefix . "DEFINE $_->[4]_conninfo '$_->[2]';\n");
1089 print SLONFILE ($line_prefix . "NODE \@$_->[4] ADMIN CONNINFO = \@$_->[4]_conninfo;\n\n");
1092 print SLONFILE ($line_prefix . "NODE $_->[0] ADMIN CONNINFO = '$_->[2]';\n\n");
1094 if (($aliases) && ($sets)) {
1095 $set_count = loadSets($dbconninfo, $clname, $_->[0], $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
1096 if ($set_count > 0) {
1097 print SLONFILE ("# Sets provided (currently) by node $_->[0]\n");
1098 foreach my $set (@g_sets) {
1099 print SLONFILE ($line_prefix . "DEFINE $set->[1] $set->[0];\n");
1101 print SLONFILE ("\n");
1108 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_write_fail', "script", $filename));
1114 # Write slonik commands to move sets
1117 my $dbconninfo = shift;
1124 my $aliases = shift;
1125 my $logfile = shift;
1126 my $log_prefix = shift;
1131 my $try_prefix = "";
1132 my ($year, $month, $day, $hour, $min, $sec) = (localtime(time))[5,4,3,2,1,0];
1133 my $filetime = sprintf ("%02d_%02d_%04d_%02d:%02d:%02d", $day, $month+1, $year+1900, $hour, $min, $sec);
1134 my $filename = $prefix . "/" . $clname . "-move_sets_from_" . $from . "_to_" . $to . "_on_" . $filetime . ".scr";
1136 if ($g_use_try_blocks) {
1143 my ($node, $setlist);
1144 my ($node_name, $setlist_name);
1147 unless (writePreamble($filename, $dbconninfo, $clname, $dbuser, $dbpass, true, $aliases, $prefix, $logfile, $log_prefix, true)) {
1148 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_incomplete_preamble'));
1151 foreach (@g_cluster) {
1152 if ($_->[0] == $from) {
1153 $from_name = $_->[4];
1155 elsif ($_->[0] == $to) {
1160 if (open(SLONFILE, ">>", $filename)) {
1162 print SLONFILE ("######\n# Actions (changes to cluster structure)\n######\n");
1164 $set_count = loadSets($dbconninfo, $clname, $from, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
1165 if ($set_count > 0) {
1167 if ($g_lockset_method ne "single") {
1168 if ($g_use_try_blocks) {
1169 print SLONFILE ("TRY {\n");
1173 print SLONFILE ($try_prefix . "ECHO 'Locking set $_->[1] ($_->[0])';\n");
1174 print SLONFILE ($try_prefix . "LOCK SET ( ID = \@$_->[1], ORIGIN = \@$from_name);\n");
1177 print SLONFILE ($try_prefix . "ECHO 'Locking set $_->[0]';\n");
1178 print SLONFILE ($try_prefix . "LOCK SET ( ID = $_->[0], ORIGIN = $from);\n");
1182 print SLONFILE ("\n");
1185 print SLONFILE ($try_prefix . "ECHO 'Moving set $_->[1] ($_->[0])';\n");
1186 print SLONFILE ($try_prefix . "MOVE SET ( ID = \@$_->[1], OLD ORIGIN = \@$from_name, NEW ORIGIN = \@$to_name);\n");
1189 print SLONFILE ($try_prefix . "ECHO 'Moving set $_->[0]';\n");
1190 print SLONFILE ($try_prefix . "MOVE SET ( ID = $_->[0], OLD ORIGIN = $from, NEW ORIGIN = $to);\n");
1194 if ($g_use_try_blocks) {
1195 print SLONFILE ("}\nON ERROR {\n");
1198 print SLONFILE ($try_prefix . "ECHO 'Unlocking set $_->[1] ($_->[0])';\n");
1199 print SLONFILE ($try_prefix . "UNLOCK SET ( ID = \@$_->[1], ORIGIN = \@$from_name);\n");
1202 print SLONFILE ($try_prefix . "ECHO 'Unlocking set $_->[0]';\n");
1203 print SLONFILE ($try_prefix . "UNLOCK SET ( ID = $_->[0], ORIGIN = $from);\n");
1206 print SLONFILE ("\tEXIT 1;\n}\nON SUCCESS {\n");
1209 print SLONFILE ($try_prefix . "WAIT FOR EVENT (ORIGIN = \@$from_name, CONFIRMED = ALL, WAIT ON = \@$from_name, TIMEOUT = 0);\n");
1212 print SLONFILE ($try_prefix . "WAIT FOR EVENT (ORIGIN = $from, CONFIRMED = ALL, WAIT ON = $from, TIMEOUT = 0);\n");
1214 if ($g_use_try_blocks) {
1215 print SLONFILE ("}\n");
1219 if ($g_lockset_method eq "single") {
1221 print SLONFILE ("\nECHO 'Moving set $_->[1] ($_->[0])';\n");
1222 if ($g_use_try_blocks) {
1223 print SLONFILE ("TRY {\n");
1225 print SLONFILE ($try_prefix . "LOCK SET ( ID = \@$_->[1], ORIGIN = \@$from_name);\n");
1226 print SLONFILE ($try_prefix . "MOVE SET ( ID = \@$_->[1], OLD ORIGIN = \@$from_name, NEW ORIGIN = \@$to_name);\n");
1227 if ($g_use_try_blocks) {
1228 print SLONFILE ("}\nON ERROR {\n" . $try_prefix . "UNLOCK SET ( ID = \@$_->[1], ORIGIN = \@$from_name);\n" . $try_prefix . "EXIT 1;\n}\n");
1230 print SLONFILE ("WAIT FOR EVENT (ORIGIN = \@$from_name, CONFIRMED = ALL, WAIT ON = \@$from_name, TIMEOUT = 0);\n");
1233 print SLONFILE ("\nECHO 'Moving set $_->[0]';\n");
1234 if ($g_use_try_blocks) {
1235 print SLONFILE ("TRY {\n");
1237 print SLONFILE ($try_prefix . "LOCK SET ( ID = $_->[0], ORIGIN = $from);\n");
1238 print SLONFILE ($try_prefix . "MOVE SET ( ID = $_->[0], OLD ORIGIN = $from, NEW ORIGIN = $to);\n");
1239 if ($g_use_try_blocks) {
1240 print SLONFILE ("}\nON ERROR {\n" . $try_prefix . "UNLOCK SET ( ID = $_->[0], ORIGIN = $from);\n" . $try_prefix . "EXIT 1;\n}\n");
1242 print SLONFILE ("WAIT FOR EVENT (ORIGIN = $from, CONFIRMED = ALL, WAIT ON = $from, TIMEOUT = 0);\n");
1245 if (($subs) && ($g_resubscribe_method eq 'subscribe')) {
1247 foreach my $other_subs (@g_cluster) {
1248 if (($other_subs->[6] eq "ACTIVE") && ($other_subs->[0] != $from) && ($other_subs->[0] != $to)) {
1250 if (exists $g_backups{$other_subs->[0]}) {
1251 $line_prefix = "# (Node $other_subs->[0] unavailable) ";
1257 # mess here needs cleaning up
1258 @subprov_name = (split(';', $other_subs->[8]));
1260 foreach $subprov (split(';', $other_subs->[5])) {
1261 ($node, $setlist) = (split('->', $subprov)) ;
1262 ($node_name, $setlist_name) = (split('->', $subprov_name[$subprov_idx])) ;
1265 $setlist =~ s/(\)|\(|s)//g;
1266 @subsets = (split(',', $setlist)) ;
1269 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_resubscribe', $_->[1], $_->[0], $other_subs->[0], $other_subs->[4], $setlist, $setlist_name, $node, $node_name));
1272 if ($_->[0] ~~ @subsets) {
1273 if ($node == $from) {
1275 print SLONFILE ($line_prefix .
1276 "ECHO 'Issuing subscribe for set $_->[1] ($_->[0]) provider $to_name ($to) -> " .
1277 "receiver $other_subs->[4] ($other_subs->[0])';\n");
1278 print SLONFILE ($line_prefix .
1279 "SUBSCRIBE SET ( ID = \@$_->[1], PROVIDER = \@$to_name, " .
1280 "RECEIVER = \@$other_subs->[4], FORWARD = YES);\n");
1283 print SLONFILE ($line_prefix .
1284 "ECHO 'Issuing subscribe for set $_->[1] ($_->[0]) provider $to -> " .
1285 "receiver $other_subs->[0]';\n");
1286 print SLONFILE ($line_prefix . "SUBSCRIBE SET ( ID = $_->[0], PROVIDER = $to, " .
1287 "RECEIVER = $other_subs->[0], FORWARD = YES);\n");
1292 print SLONFILE ($line_prefix .
1293 "ECHO 'Issuing subscribe for set $_->[1] ($_->[0]) provider $node_name ($node) -> " .
1294 "receiver $other_subs->[4] ($other_subs->[0])';\n");
1295 print SLONFILE ($line_prefix . "SUBSCRIBE SET ( ID = \@$_->[1], PROVIDER = \@$node_name, " .
1296 "RECEIVER = \@$other_subs->[4], FORWARD = YES);\n");
1299 print SLONFILE ($line_prefix .
1300 "ECHO 'Issuing subscribe for set $_->[1] ($_->[0]) provider $node -> " .
1301 "receiver $other_subs->[0]';\n");
1302 print SLONFILE ($line_prefix . "SUBSCRIBE SET ( ID = $_->[0], PROVIDER = $node, " .
1303 "RECEIVER = $other_subs->[0], FORWARD = YES);\n");
1313 if (($subs) && ($g_resubscribe_method eq 'resubscribe')) {
1315 foreach my $other_subs (@g_cluster) {
1316 if (($other_subs->[6] eq "ACTIVE") && ($other_subs->[0] != $from) && ($other_subs->[0] != $to)) {
1317 if (exists $g_backups{$other_subs->[0]}) {
1318 $line_prefix = "# (Node $other_subs->[0] unavailable) ";
1324 @subprov_name = (split(';', $other_subs->[8]));
1326 foreach $subprov (split(';', $other_subs->[5])) {
1327 ($node, $setlist) = (split('->', $subprov)) ;
1328 ($node_name, $setlist_name) = (split('->', $subprov_name[$subprov_idx])) ;
1332 print SLONFILE ("\n");
1333 if ($node == $from) {
1335 print SLONFILE ($line_prefix .
1336 "ECHO 'Issuing resubscribe for provider $to_name ($to) -> receiver $other_subs->[4] ($other_subs->[0])';\n");
1337 print SLONFILE ($line_prefix .
1338 "RESUBSCRIBE NODE ( ORIGIN = \@$to_name, PROVIDER = \@$to_name, RECEIVER = \@$other_subs->[4]);\n");
1341 print SLONFILE ($line_prefix .
1342 "ECHO 'Issuing resubscribe for provider $to -> receiver $other_subs->[0]';\n");
1343 print SLONFILE ($line_prefix .
1344 "RESUBSCRIBE NODE ( ORIGIN = $to, PROVIDER = $to, RECEIVER = $other_subs->[0] );\n");
1349 print SLONFILE ($line_prefix .
1350 "ECHO 'Issuing resubscribe for provider $node_name ($node) -> receiver $other_subs->[4] ($other_subs->[0])';\n");
1351 print SLONFILE ($line_prefix .
1352 "RESUBSCRIBE NODE ( ORIGIN = \@$to_name, PROVIDER = \@$node_name, RECEIVER = \@$other_subs->[4]);\n");
1355 print SLONFILE ($line_prefix .
1356 "ECHO 'Issuing resubscribe for provider $node -> receiver $other_subs->[0]';\n");
1357 print SLONFILE ($line_prefix .
1358 "RESUBSCRIBE NODE ( ORIGIN = $to, PROVIDER = $node, RECEIVER = $other_subs->[0]);\n");
1369 print SLONFILE ("\nECHO 'All sets originating from $from_name (id $from) have been moved to $to_name (id $to), ensure you modify any existing slonik scripts to reflect the new origin';\n");
1372 print SLONFILE ("\nECHO 'All sets originating from node $from have been moved to node $to, ensure you modify any existing slonik scripts to reflect the new origin';\n"); # wording aligned with the alias variant of this message ("modify the any" was a typo)
1377 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_write_fail', "script", $filename));
1382 # Write slonik commands to failover sets
1385 my $dbconninfo = shift;
1392 my $aliases = shift;
1393 my $logfile = shift;
1394 my $log_prefix = shift;
1398 my ($year, $month, $day, $hour, $min, $sec) = (localtime(time))[5,4,3,2,1,0];
1399 my $filetime = sprintf ("%02d_%02d_%04d_%02d:%02d:%02d", $day, $month+1, $year+1900, $hour, $min, $sec);
1404 my ($node, $setlist);
1405 my ($node_name, $setlist_name);
1411 if (defined($from) && defined($to)) {
1412 $filename = $prefix . "/" . $clname . "-failover_from_" . $from . "_to_" . $to . "_on_" . $filetime . ".scr";
1415 $filename = $prefix . "/" . $clname . "-autofailover_on_" . $filetime . ".scr";
1418 if ($g_failover_method ne 'new') {
1419 # For pre 2.2 failover with multiple nodes, we attempt to resubscribe sets and drop other failed providers;
1420 # This will never work as well as 2.2+ failover behaviour (in fact failover may not work at all in 2.0/2.1 with multiple failed nodes)
1421 # We also need to define the sets in the preamble for this.
1425 unless (writePreamble($filename, $dbconninfo, $clname, $dbuser, $dbpass, $sets, $aliases, $prefix, $logfile, $log_prefix, false)) {
1426 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_incomplete_preamble'));
1429 if (open(SLONFILE, ">>", $filename)) {
1431 print SLONFILE ("######\n# Actions (changes to cluster structure)\n######\n\n");
1433 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_failover_method',$g_failover_method));
1436 # If we are on pre 2.2 we need to drop failed subscriber nodes first regardless
1437 if ($g_failover_method ne 'new') {
1438 foreach (@g_failed) {
1439 if (!defined($_->[3])) {
1440 foreach my $backup (@g_cluster) {
1441 if ($backup->[0] == $g_backups{$_->[0]}) { # this backup node candidate is in the list of suitable nodes for {failed node}
1442 foreach my $subscriber (@g_cluster) {
1443 if (defined($subscriber->[1]) && $subscriber->[1] == $_->[0] && $subscriber->[0] != $backup->[0]) {
1444 # mess here needs cleaning up
1445 @subprov_name = (split(';', $subscriber->[8]));
1447 foreach my $subprov (split(';', $subscriber->[5])) {
1448 ($node, $setlist) = (split('->', $subprov)) ;
1449 ($node_name, $setlist_name) = (split('->', $subprov_name[$subprov_idx])) ;
1453 if ($node == $_->[0]) {
1455 print SLONFILE ("ECHO 'Resubscribing all sets on receiver $subscriber->[4] provided by other failed node $_->[4] to backup node $backup->[4]';\n");
1458 print SLONFILE ("ECHO 'Resubscribing all sets on receiver $subscriber->[0] provided by other failed node $_->[0] to backup node $backup->[0]';\n");
1460 $setlist =~ s/(\)|\(|s)//g;
1461 @subsets = (split(',', $setlist));
1462 $setlist_name =~ s/(\)|\()//g;
1463 @subsets_name = (split(',', $setlist_name));
1466 foreach my $subset (@subsets) {
1468 print SLONFILE ("SUBSCRIBE SET (ID = \@$subsets_name[$set_idx], PROVIDER = \@$backup->[4], RECEIVER = \@$subscriber->[4], FORWARD = YES);\n");
1469 print SLONFILE ("WAIT FOR EVENT (ORIGIN = \@$backup->[4], CONFIRMED = \@$subscriber->[4], WAIT ON = \@$backup->[4]);\n");
1472 print SLONFILE ("SUBSCRIBE SET (ID = $subset, PROVIDER = $backup->[0], RECEIVER = $subscriber->[0], FORWARD = YES);\n");
1473 print SLONFILE ("WAIT FOR EVENT (ORIGIN = $backup->[0], CONFIRMED = $subscriber->[0], WAIT ON = $backup->[0]);\n");
1477 print SLONFILE ("\n");
1482 print SLONFILE ("ECHO 'Dropping other failed node $_->[4] ($_->[0])';\n");
1483 print SLONFILE ("DROP NODE (ID = \@$_->[4], EVENT NODE = \@$backup->[4]);\n\n");
1486 print SLONFILE ("ECHO 'Dropping other failed node $_->[0]';\n");
1487 print SLONFILE ("DROP NODE (ID = $_->[0], EVENT NODE = $backup->[0]);\n\n");
1489 push(@dropped, $_->[0]);
1492 # The node is failed, but there are no downstream subscribers
1502 foreach (@g_failed) {
1503 if (($g_failover_method eq 'new') || defined($_->[3])) {
1504 foreach my $backup (@g_cluster) {
1505 if ($backup->[0] == $g_backups{$_->[0]}) {
1506 ## Here we have both details of the backup node and the failed node
1508 print SLONFILE ("ECHO 'Failing over slony cluster from $_->[4] (id $_->[0]) to $backup->[4] (id $backup->[0])';\n");
1511 print SLONFILE ("ECHO 'Failing over slony cluster from node $_->[0] to node $backup->[0]';\n");
1519 print SLONFILE ("FAILOVER (\n\t");
1521 foreach (@g_failed) {
1522 if (($g_failover_method eq 'new') || defined($_->[3])) {
1523 foreach my $backup (@g_cluster) {
1524 if ($backup->[0] == $g_backups{$_->[0]}) {
1525 ## Here we have both details of the backup node and the failed node
1526 if ($g_failover_method eq 'new') {
1527 if( $written != 0 ) {
1528 print SLONFILE (",\n\t");
1530 print SLONFILE ("NODE = (");
1533 if( $written != 0 ) {
1534 print SLONFILE ("\n);\nFAILOVER (\n\t");
1538 print SLONFILE ("ID = \@$_->[4], BACKUP NODE = \@$backup->[4]");
1541 print SLONFILE ("ID = $_->[0], BACKUP NODE = $backup->[0]");
1543 if ($g_failover_method eq 'new') {
1544 print SLONFILE (")");
1552 print SLONFILE ("\n);\n\n");
1554 if ($g_drop_failed) {
1555 if (($g_failover_method eq 'new') && (scalar(@g_failed) > 1)) {
1556 foreach (@g_failed) {
1558 print SLONFILE ("ECHO 'Dropping failed node $_->[4] ($_->[0])';\n");
1561 print SLONFILE ("ECHO 'Dropping failed node $_->[0]';\n");
1565 print SLONFILE ("DROP NODE (ID = '");
1569 foreach (@g_failed) {
1570 foreach my $backup (@g_cluster) {
1571 if ($backup->[0] == $g_backups{$_->[0]}) {
1572 if (!defined($event_node)) {
1574 $event_node = $backup->[4];
1577 $event_node = $backup->[0];
1580 if (($g_failover_method eq 'new') && (scalar(@g_failed) > 1)) {
1581 if( $written != 0 ) {
1582 print SLONFILE (",");
1584 ## Don't bother trying to define array values
1586 # print SLONFILE "\@$_->[4]";
1589 print SLONFILE $_->[0];
1593 elsif (($g_failover_method eq 'new') || defined($_->[3]) || !($_->[0] ~~ @dropped)) {
1595 print SLONFILE ("ECHO 'Dropping failed node $_->[4] ($_->[0])';\n");
1596 print SLONFILE ("DROP NODE (ID = \@$_->[4], EVENT NODE = \@$backup->[4]);\n\n");
1599 print SLONFILE ("ECHO 'Dropping failed node $_->[0]';\n");
1600 print SLONFILE ("DROP NODE (ID = $_->[0], EVENT NODE = $backup->[0]);\n\n");
1607 if (($g_failover_method eq 'new') && (scalar(@g_failed) > 1)) {
1609 print SLONFILE ("', EVENT NODE = \@$event_node);\n");
1612 print SLONFILE ("', EVENT NODE = $event_node);\n");
1619 printlog($prefix,$logfile,$log_prefix,lookupMsg('err_write_fail', "script", $filename));
1625 # Used to return informational text from the %message hashes, pretty much entirely stolen from check_postgres (http://bucardo.org)
1627 my $name = shift || '?';
1631 if (exists $message{$g_lang}{$name}) {
1632 $text = $message{$g_lang}{$name};
1634 elsif (exists $message{'en'}{$name}) {
1635 $text = $message{'en'}{$name};
1638 $line_call = (caller)[2];
1639 $text = qq{Failed to lookup text "$name" at line $line_call};
1645 $val = '?' if ! defined $val;
1646 last unless $text =~ s/\$$x/$val/g;
1653 # Trim quotes off a string
1656 $string =~ s/^('|")+//;
1657 $string =~ s/('|")+$//;
1664 $string =~ s/^\s+//;
1665 $string =~ s/\s+$//;
1669 # Print command with a linefeed
1671 print ((@_ ? join($/, @_) : $_), $/);
1674 # Print to stdout and the logfile, doing some replacements along the way for logging
1677 my $logfile_name = shift;
1678 my $log_prefix = shift;
1679 my $message = shift;
1685 if (defined($logfile_name)) {
1687 # Do we have to do this all the time? Perhaps could check parameters first
1688 if ($logfile_name =~ /^\//i) {
1689 $logfile = strftime($logfile_name, localtime);
1692 $logfile = "$prefix/" . strftime($logfile_name, localtime);
1695 if ($log_prefix =~ m/(\%[mt])/) {
1696 my ($year, $month, $day, $hour, $min, $sec) = (localtime(time))[5,4,3,2,1,0];
1697 my ($h_sec, $h_msec) = gettimeofday;
1698 $date = sprintf ("%02d-%02d-%04d %02d:%02d:%02d.%03d", $day, $month+1, $year+1900, $hour, $min, $sec, $h_msec/1000);
1699 $log_prefix =~ s/\%m/$date/g;
1701 $date = sprintf ("%02d-%02d-%04d %02d:%02d:%02d", $day, $month+1, $year+1900, $hour, $min, $sec);
1702 $log_prefix =~ s/\%t/$date/g;
1704 if ($log_prefix =~ m/(\%p)/) {
1705 $log_prefix =~ s/\%p/$g_pid/g;
1708 if (open(LOGFILE, ">>", $logfile)) {
1709 print LOGFILE $log_prefix . " " . $message;
1713 println(lookupMsg('err_write_fail', "logfile", $logfile));
1718 # Printlog command with a linefeed
1720 printlog ($_[0], $_[1], $_[2], $_[3] . $/);
1723 # Insert details of any action into a database table
#
# Only some of the argument shifts are visible in this excerpt: the logging
# database conninfo, the exit code, the log file and the log line prefix.
# The remaining values used below ($dbuser, $dbpass, $reason, $script,
# $clname, $prefix) are assigned on lines that are not shown.
1725 my $dbconninfo = shift;
1728 my $exit_code = shift;
1731 my $logfile = shift;
1732 my $log_prefix = shift;
# Slurp the whole log file; it becomes the "results" column. A false return
# from read_file is logged as a read failure.
1744 unless($results = (read_file($logfile))) {
1745 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_read_fail', "logfile", $logfile));
# Attach the generated script when one exists on disk, otherwise store
# placeholder text.
1748 if (defined($script) && (-e $script)) {
1749 unless ($script_data = (read_file($script))) {
1750 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_read_fail', "script file", $script));
1754 $script_data = "No script data was generated.";
1755 $script = "No script generated.";
# Connect through DBD::Pg; RaiseError makes DBI failures die, which
# presumably lands in an eval whose opening line is not shown.
1758 $dsn = "DBI:Pg:$dbconninfo;";
1760 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
1761 $query = "INSERT INTO public.failovers (reason, exit_code, results, script, cluster_name)
1762 VALUES (?, ?, ?, ?, ?)";
1764 $sth = $dbh->prepare($query);
# All values are bound as placeholders; the script column holds the script
# path, a colon and newline, then the script contents.
1766 $sth->bind_param(1, $reason);
1767 $sth->bind_param(2, $exit_code);
1768 $sth->bind_param(3, $results);
1769 $sth->bind_param(4, $script . ":\n" . $script_data);
1770 $sth->bind_param(5, $clname);
# Error path: log the captured exception text, then abort with the generic
# connection error message.
1779 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
1781 die lookupMsg('err_pgsql_connect');
1787 # Returns a uuid used for the failover script directory
# Builds a name-based UUID with Data::UUID from the namespace string
# "failover_script" and the supplied date string, then converts it to its
# string form. The return statement itself is outside this excerpt.
1789 my $date_string = shift;
1790 my $g_ug = new Data::UUID;
1791 my $g_uuid = $g_ug->create_from_name("failover_script", $date_string);
1792 my $g_uuid_str = $g_ug->to_string($g_uuid);
1796 # Write out a PID file
# Visible arguments: log file, log line prefix and the PID file name
# ($prefix is assigned on a line not shown here).
1799 my $logfile = shift;
1800 my $log_prefix = shift;
1801 my $pidfile_name = shift;
# An absolute name (leading "/") is used verbatim; a relative name is placed
# below $prefix.
1805 if ($pidfile_name =~ /^\//i) {
1806 $pidfile = $pidfile_name;
1809 $pidfile = "$prefix/" . $pidfile_name;
# Opened for writing, which truncates any existing file.
# NOTE(review): the return value of open is not checked on this line; failure
# handling presumably happens on the lines not shown - confirm.
1812 open (PIDFILE, ">", $pidfile);
# Failure reporting: the OS error text first, then the generic write-failure
# message naming the PID file.
1818 printlogln($prefix,$logfile,$log_prefix, lookupMsg('dbg_generic', $!));
1820 printlogln($prefix,$logfile,$log_prefix, lookupMsg('err_write_fail', "pid file", $pidfile));
1826 # Remove the PID file
# Visible arguments: log file, log line prefix and the PID file name
# ($prefix is assigned on a line not shown here).
1829 my $logfile = shift;
1830 my $log_prefix = shift;
1831 my $pidfile_name = shift;
# Resolve the path the same way as when the PID file is written: an absolute
# name is used verbatim, a relative one lives below $prefix.
1835 if ($pidfile_name =~ /^\//i) {
1836 $pidfile = $pidfile_name;
1839 $pidfile = "$prefix/" . $pidfile_name;
# Logged when there is no PID file to remove.
1846 printlogln($prefix,$logfile,$log_prefix, lookupMsg('dbg_generic', 'PID file never existed to be removed'));
# Failure reporting: the OS error text first, then the generic unlink-failure
# message naming the PID file.
1851 printlogln($prefix,$logfile,$log_prefix, lookupMsg('dbg_generic', $!));
1853 printlogln($prefix,$logfile,$log_prefix, lookupMsg('err_unlink_fail', "pid file", $pidfile));
1859 # Check all sets from an originating node are contained in the list provided by another node
#
# Arguments: two array references, the sets of the origin and the sets offered
# by the candidate provider.
# Returns true when every element of @$originSets also appears in
# @$providerSets (an empty origin list therefore yields true), false otherwise.
1860 sub checkProvidesAllSets {
1861 my ($originSets, $providerSets) = @_;
1864 undef @test_hash{@$originSets}; # add a hash key for each element of @$originSets
1865 delete @test_hash{@$providerSets}; # remove all keys for elements of @$providerSets
1867 return !%test_hash; # return false if any keys are left in the hash
1870 # Check any sets from an originating node are contained in the list subscribed to by another node
#
# Arguments: two array references, the sets of the origin and the sets the
# other node subscribes to.
# Returns true when at least one element of @$originSets also appears in
# @$subscriberSets. This is detected by comparing the number of hash keys
# before and after deleting the subscriber's sets.
1871 sub checkSubscribesAnySets {
1872 my ($originSets, $subscriberSets) = @_;
1877 undef @test_hash{@$originSets}; # add a hash key for each element of @$originSets
1878 $before = scalar(keys %test_hash);
1879 delete @test_hash{@$subscriberSets}; # remove all keys for elements of @$subscriberSets
1880 $after = scalar(keys %test_hash);
1881 return ($before != $after); # return false if no keys were removed from the hash
1884 # Read configuration details from a configuration file
#
# Reads "key = value" lines from $cfgfile and stores each recognised setting
# in the matching $g_* global. Boolean settings go through checkBoolean and
# numeric ones through checkInteger; everything else is stored as the
# trimmed, unquoted string.
1886 my $cfgfile = shift;
1888 my $success = false;
1891 if (open(CFGFILE, "<", $cfgfile)) {
# The file handle is read in list context, i.e. all lines are loaded before
# the loop starts.
1892 foreach (<CFGFILE>) {
# Strip a "#" comment from the line. The lookahead is presumably meant to
# leave a "#" inside quotes alone - confirm against the intended quoting rules.
1897 s/#(?=(?:(?:[^']|[^"]*+'){2})*+[^']|[^"]*+\z).*//;
# Ignore lines that are empty once comments and whitespace are removed.
1899 if (length(trim($_))) {
# Split on the first "=" only, so values may themselves contain "=".
1900 @fields = split('=', $_, 2);
# Dispatch on the lower-cased key. The when() patterns match the option name
# as a whole word anywhere in the key text.
1901 given(lc($fields[0])) {
# The value has surrounding whitespace and then surrounding quotes removed.
1902 $value = qtrim(trim($fields[1]));
1906 when(/\bslony_database_host\b/i) {
1909 when(/\bslony_database_port\b/i) {
1910 $g_dbport = checkInteger($value);
1912 when(/\bslony_database_name\b/i) {
1915 when(/\bslony_database_user\b/i) {
1918 when(/\bslony_database_password\b/i) {
1921 when(/\bslony_cluster_name\b/i) {
1924 when(/\benable_debugging\b/i) {
1925 $g_debug = checkBoolean($value);
1927 when(/\bprefix_directory\b/i) {
1930 when(/\bseparate_working_directory\b/i) {
1931 $g_separate_working = checkBoolean($value);
1933 when(/\bpid_filename\b/i) {
1934 $g_pidfile = $value;
1936 when(/\bfailover_offline_subscriber_only\b/i) {
1937 $g_fail_subonly = checkBoolean($value);
1939 when(/\bdrop_failed_nodes\b/i) {
1940 $g_drop_failed = checkBoolean($value);
1942 when(/\blog_line_prefix\b/i) {
1943 $g_log_prefix = $value;
1945 when(/\blog_filename\b/i) {
1946 $g_logfile = $value;
1948 when(/\blog_to_postgresql\b/i) {
1949 $g_log_to_db = checkBoolean($value);
1951 when(/\blog_database_host\b/i) {
1952 $g_logdb_host = $value;
1954 when(/\blog_database_port\b/i) {
1955 $g_logdb_port = checkInteger($value);
1957 when(/\blog_database_name\b/i) {
1958 $g_logdb_name = $value;
1960 when(/\blog_database_user\b/i) {
1961 $g_logdb_user = $value;
1963 when(/\blog_database_password\b/i) {
1964 $g_logdb_pass = $value;
1966 when(/\benable_try_blocks\b/i) {
1967 $g_use_try_blocks = checkBoolean($value);
1969 when(/\bpull_aliases_from_comments\b/i) {
1970 $g_use_comment_aliases = checkBoolean($value);
1972 when(/\bslonik_path\b/i) {
1973 $g_slonikpath = $value;
1975 when(/\blockset_method\b/i) {
1976 $g_lockset_method = $value;
1978 when(/\benable_autofailover\b/i) {
1979 $g_autofailover = checkBoolean($value);
1981 when(/\bautofailover_poll_interval\b/i) {
1982 $g_autofailover_poll_interval = checkInteger($value);
1984 when(/\bautofailover_node_retry\b/i) {
1985 $g_autofailover_retry = checkInteger($value);
1987 when(/\bautofailover_sleep_time\b/i) {
1988 $g_autofailover_retry_sleep = checkInteger($value);
1990 when(/\bautofailover_forwarding_providers\b/i) {
1991 $g_autofailover_provs = checkBoolean($value);
1993 when(/\bautofailover_config_any_node\b/i) {
1994 $g_autofailover_config_any = checkBoolean($value);
1996 when(/\bautofailover_perspective_sleep_time\b/i) {
1997 $g_autofailover_perspective_sleep = checkInteger($value);
1999 when(/\bautofailover_majority_only\b/i) {
2000 $g_autofailover_majority_only = checkBoolean($value);
2002 when(/\bautofailover_is_quorum\b/i) {
2003 $g_autofailover_is_quorum = checkBoolean($value);
# Reported on stdout when the configuration could not be processed
# (presumably the branch taken when the open above fails - confirm).
2013 println(lookupMsg('err_fail_config'));
2019 # Interpret a textual representation of a boolean value
#
# $text is compared case-insensitively against the accepted spellings:
#   true  : y, yes, t, true, on
#   false : n, no, f, false, off
# The text comes straight from the configuration file, so it is wrapped in
# \Q...\E before being used as a pattern. Without the quoting, a value such as
# "." or "y.s" would be accepted as a boolean and a value such as "(" would
# abort the script with a regex compilation error.
2023 if ( grep /^\Q$text\E$/i, ("y","yes","t","true","on") ) {
2026 elsif ( grep /^\Q$text\E$/i, ("n","no","f","false","off") ) {
2032 # Check if a text value is a valid integer
2034 my $integer = shift;
# The text is accepted when converting it to a number and back to a string
# reproduces it exactly (e.g. "42"; "042" or "abc" do not round-trip).
# NOTE(review): a decimal such as "4.5" also round-trips and is then silently
# truncated by int() - confirm whether that is intended.
2037 if (($integer * 1) eq $integer) {
2038 $value = int($integer);
2043 # Run a slonik command and capture all output via autoflushing channel
#
# Callers in this file invoke this as runSlonik($script, $prefix, $logfile,
# $log_prefix) and treat a false return value as failure.
2047 my $logfile = shift;
2048 my $log_prefix = shift;
2052 printlogln($prefix,$logfile,$log_prefix, lookupMsg('dbg_slonik_script', $script));
# The command is a single string, so it is run through the shell with stderr
# merged into stdout; "slonik" is resolved via PATH on this line.
# NOTE(review): $script is interpolated unquoted - a path containing spaces or
# shell metacharacters would break the command; verify how $script is built.
2054 if (open(SLONIKSTATUS, "-|", "slonik $script 2>&1")) {
# Relay every line of slonik output to the log as it arrives.
2055 while (<SLONIKSTATUS>) {
2056 printlogln($prefix,$logfile,$log_prefix,lookupMsg('slonik_output', $_));
2058 close(SLONIKSTATUS);
# Logged with the OS error text when slonik could not be run.
2062 printlogln($prefix,$logfile,$log_prefix, lookupMsg('err_running_slonik', $!));
2068 # Experimental logic to watch the cluster status and perform an automatic failover
#
# Outline of the visible logic: (1) refresh the cluster configuration at most
# once a minute, (2) probe all nodes, retrying up to $g_autofailover_retry
# times, (3) if nodes are still failed, optionally run the split-brain and
# perspective checks, pick backup nodes, write a failover script and run it
# through slonik, (4) sleep for the poll interval. The enclosing loop, eval
# and else lines are not part of this excerpt.
2070 my $dbconninfo = shift;
2075 my $logfile = shift;
2076 my $log_prefix = shift;
2083 my $cluster_loaded_from;
# Log the effective autofailover settings once at start-up.
2088 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_init'));
2089 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_init_cnf', ($g_autofailover_config_any ? 'any' : 'specified target')));
2090 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_init_pol', $g_autofailover_poll_interval));
2091 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_init_ret', $g_autofailover_retry, $g_autofailover_retry_sleep));
2092 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_init_set', ($g_autofailover_provs ? 'will' : 'will not')));
2095 # Probe current cluster configuration every minute
2096 if (!defined($cluster_time) || (time()-$cluster_time > 60)) {
2098 $cluster_loaded = false;
# On the very first pass, or whenever loading from any node is disabled, read
# the cluster layout through the conninfo given in the configuration.
2099 if (!defined($cluster_time) || !$g_autofailover_config_any) {
2101 ($node_count, $version) = loadCluster($dbconninfo, $clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
2102 die lookupMsg('err_cluster_empty') if ($node_count == 0);
2103 @cluster = @g_cluster;
2104 die lookupMsg('err_cluster_lone') if ($node_count == 1);
2105 $cluster_loaded = true;
2106 $cluster_loaded_from = 'conninfo specified in config';
2109 printlogln($prefix,$logfile,$log_prefix, lookupMsg('load_cluster_fail', 'from supplied configuration'));
2111 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
# Otherwise walk the previously known nodes and load the layout from the first
# ACTIVE node that answers; element [2] of a node entry is its conninfo and
# element [0] is what gets reported as the node it was loaded from.
2116 foreach (@cluster) {
2117 if ($_->[6] eq "ACTIVE") {
2118 unless ($cluster_loaded) {
2120 ($node_count, $version) = loadCluster($_->[2], $clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
2121 die lookupMsg('err_cluster_empty') if ($node_count == 0);
2122 @cluster = @g_cluster;
2123 die lookupMsg('err_cluster_lone') if ($node_count == 1);
2124 $cluster_loaded = true;
2125 $cluster_loaded_from = $_->[0];
2128 printlogln($prefix,$logfile,$log_prefix, lookupMsg('load_cluster_fail', 'from node ' . $_->[0] . ': trying next node'));
2130 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
# Record the refresh time only after a successful load.
2138 if ($cluster_loaded) {
2139 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_load_cluster', (!defined($cluster_time) ? "Loaded" : "Reloaded"), $version, $clname, $node_count, $cluster_loaded_from));
2140 $cluster_time = time();
2143 printlogln($prefix,$logfile,$log_prefix, lookupMsg('load_cluster_fail', 'from any node'));
2147 if ($cluster_loaded) {
# Probe the nodes until they all respond or the retry budget is used up.
2150 while(($current_retry <= $g_autofailover_retry) && ((!defined($failed)) || ($failed > 0))) {
2151 # Check status of cluster
2152 $failed = checkFailed($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
2155 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_cluster_good'));
2157 if ($current_retry > 0) {
2158 printlogln($prefix,$logfile,$log_prefix,lookupMsg('cluster_fixed'));
2162 if (($failed > 0) && ($current_retry <= $g_autofailover_retry)) {
2163 printlogln($prefix,$logfile,$log_prefix,lookupMsg('cluster_failed', $failed,$g_autofailover_retry_sleep,$current_retry,$g_autofailover_retry));
# usleep takes microseconds, so the configured sleep time is in milliseconds.
2164 usleep($g_autofailover_retry_sleep * 1000);
# Safety gates before failing over: the majority check is applied only when
# majority-only mode is enabled, and the perspective check only when a
# perspective sleep time greater than zero is configured.
2168 if ((!$g_autofailover_majority_only || checkSplit($prefix, $logfile, $log_prefix)) && (($g_autofailover_perspective_sleep <= 0) || checkPerspective($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix))) {
2169 $actions = findBackup($clname, $dbuser, $dbpass, $prefix, $logfile, $log_prefix);
2171 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_proceed'));
2172 foreach my $failed ( keys %g_backups ) {
2173 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_detail', $failed, $g_backups{$failed}));
2175 $g_script = writeFailover($prefix, $dbconninfo, $clname, $dbuser, $dbpass, undef, undef, $g_subs_follow_origin, $g_use_comment_aliases, $logfile, $log_prefix);
2176 unless (runSlonik($g_script, $prefix, $logfile, $log_prefix)) {
2177 printlogln($prefix,$logfile,$log_prefix,lookupMsg('err_execute_fail', 'slonik script', $g_script));
# Mark the cached cluster layout as not loaded (presumably so that it is
# re-read after the failover - confirm against the surrounding control flow).
2179 $cluster_loaded = false;
2181 #print "SCRIPT: $g_script\n";
2185 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_halt', $failed));
# Wait for the poll interval (configured in milliseconds) before the next pass.
2189 usleep($g_autofailover_poll_interval * 1000);
2198 # Count of failed and live nodes to perform very basic split-brain check
#
# Compares the number of unresponsive nodes (@g_unresponsive) with the number
# of remaining nodes in @g_cluster. The caller uses the result as a go/no-go
# flag; the assignments to $majority and the return statement are outside
# this excerpt.
2201 my $logfile = shift;
2202 my $log_prefix = shift;
2204 my $majority = false;
2205 my $failed = scalar(@g_unresponsive);
2206 my $survivers = (scalar(@g_cluster) - scalar(@g_unresponsive));
# Strict majority of the cluster is still reachable.
2208 if ($survivers > $failed) {
2210 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check', $survivers, ($survivers+$failed)));
# Exact tie: counted as a majority only when this host is configured as the
# quorum tie-breaker.
2212 elsif (($survivers == $failed) && $g_autofailover_is_quorum) {
2214 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check', ($survivers . '+quorum'), ($survivers+$failed)));
# Logged when no majority could be established.
2217 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_split_check_fail', $survivers));
2223 # Check each node's perspective of the failure to try to ensure the issue isn't that this script just can't connect to the origin/provider
2224 # The idea here is just to wait for a short period of time and see if the lag time for the nodes has increased by the same amount
#
# Called as checkPerspective($clname, $dbuser, $dbpass, $prefix, $logfile,
# $log_prefix); the caller uses the result as a go/no-go flag for failover.
#
# Steps:
#   1. Collect the ids of all nodes in @g_unresponsive.
#   2. On every other node in @g_cluster, read the lag that node reports for
#      events received from the unresponsive nodes ("Check1").
#   3. Sleep for $g_autofailover_perspective_sleep milliseconds.
#   4. Read the same lag values again ("Check2").
#   5. A row confirms the failure when its lag grew by at least the time
#      slept; the final log line reports whether every row did.
#
# The status queries address the Slony schema through $qw_clname, the quoted
# "_<cluster name>" identifier built just before each query. They previously
# hard-coded the schema "_test_replication", so the check could only work for
# a cluster with that exact name.
2225 sub checkPerspective {
2230 my $logfile = shift;
2231 my $log_prefix = shift;
2240 my @unresponsive_ids;
2247 foreach (@g_unresponsive) {
2248 push(@unresponsive_ids, $_->[0]);
2250 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check', join(", ", @unresponsive_ids), scalar(@g_unresponsive), scalar(@g_cluster)));
# First sample: ask every responsive node for its view of the failed nodes.
2252 foreach (@g_cluster) {
2253 unless ($_->[0] ~~ @unresponsive_ids) {
2254 $dsn = "DBI:Pg:$_->[2];";
2256 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
2257 $qw_clname = $dbh->quote_identifier("_" . $clname);
2259 $query = "SELECT a.st_origin, a.st_received, extract(epoch from a.st_lag_time)::integer
2260 FROM $qw_clname.sl_status a
2261 INNER JOIN $qw_clname.sl_node b on a.st_origin = b.no_id
2262 INNER JOIN $qw_clname.sl_node c on a.st_received = c.no_id
2263 WHERE a.st_received IN (" . substr('?, ' x scalar(@unresponsive_ids), 0, -2) . ") ORDER BY a.st_origin, a.st_received;";
2265 $sth = $dbh->prepare($query);
# One placeholder was generated per unresponsive node id; bind them in order.
2268 foreach (@unresponsive_ids) {
2269 $sth->bind_param($param_on, $_);
2274 while (my @node_lag = $sth->fetchrow) {
2275 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_data', 'Check1', $_->[0], $node_lag[0], $node_lag[1], $node_lag[2]));
2276 push(@lag_info1, \@node_lag);
2283 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_fail', $_->[0], $@));
# usleep takes microseconds, so the configured sleep time is in milliseconds.
2290 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_sleep', $g_autofailover_perspective_sleep));
2291 usleep($g_autofailover_perspective_sleep * 1000);
# Second sample: same query, taken after the sleep.
2293 foreach (@g_cluster) {
2294 unless ($_->[0] ~~ @unresponsive_ids) {
2295 $dsn = "DBI:Pg:$_->[2];";
2297 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
2298 $qw_clname = $dbh->quote_identifier("_" . $clname);
2300 $query = "SELECT a.st_origin, a.st_received, extract(epoch from a.st_lag_time)::integer
2301 FROM $qw_clname.sl_status a
2302 INNER JOIN $qw_clname.sl_node b on a.st_origin = b.no_id
2303 INNER JOIN $qw_clname.sl_node c on a.st_received = c.no_id
2304 WHERE a.st_received IN (" . substr('?, ' x scalar(@unresponsive_ids), 0, -2) . ") ORDER BY a.st_origin, a.st_received;";
2306 $sth = $dbh->prepare($query);
2309 foreach (@unresponsive_ids) {
2310 $sth->bind_param($param_on, $_);
2315 while (my @node_lag = $sth->fetchrow) {
2316 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_data', 'Check2', $_->[0], $node_lag[0], $node_lag[1], $node_lag[2]));
2317 push(@lag_info2, \@node_lag);
2324 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_fail', $_->[0], $@));
# Compare the samples row by row; rows are paired purely by position, which
# relies on both queries returning the same rows in the same order.
2332 foreach (@lag_info1) {
2334 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', ("Node $_->[0] lag between checks on node $_->[1] is " . ($lag_info2[$lag_idx]->[2]-$_->[2]) . " seconds")));
# Lag is in seconds and the sleep time in milliseconds, hence the factor 1000.
2337 if ((($lag_info2[$lag_idx]->[2]-$_->[2])*1000) >= $g_autofailover_perspective_sleep) {
# Outcome: unknown, confirmed by every compared row, or not confirmed.
2345 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_unknown'));
2347 elsif ($lag_idx == $lag_confirmed) {
2348 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_true'));
2352 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_pspec_check_false'));
2358 # Check if any nodes have failed by connecting and probing the Slony schema
#
# Called as checkFailed($clname, $dbuser, $dbpass, $prefix, $logfile,
# $log_prefix). Rebuilds @g_unresponsive from scratch and returns a count of
# failed nodes (see the return statements at the end).
#
# Node entries from @g_cluster are array references; as used here, [0] is the
# node id, [2] the conninfo, [4] an optional name, [6] the node status and
# [9] the sets the node provides. [3] being defined is treated as "node is an
# origin" - presumably the list of sets it originates; confirm against
# loadCluster.
2364 my $logfile = shift;
2365 my $log_prefix = shift;
2371 my $result_count = 0;
2372 my $prov_failed = 0;
2373 my $subonly_failed = 0;
# Start every probe round with an empty list of unresponsive nodes.
2375 undef @g_unresponsive;
2377 foreach (@g_cluster) {
# Only nodes marked ACTIVE are probed; others are merely logged (last
# printlogln before the return statements).
2378 if ($_->[6] eq "ACTIVE") {
2380 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_autofailover_check',$_->[0], ($_->[4] // "unnamed"),(defined($_->[9]) ? "provider of sets $_->[9]" : "sole subscriber"),$_->[2]));
# A node is treated as a provider when it is an origin, or when it forwards
# sets and forwarding providers are included in autofailover.
2384 if ((defined($_->[3])) || ($g_autofailover_provs && defined($_->[9]))) {
2385 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_autofailover_active_check', 'provider', $_->[0]));
2388 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_autofailover_active_check', 'subscriber only', $_->[0]));
# Probe: connect and look up the cluster schema "_<clname>" in pg_namespace.
# RaiseError turns any DBI failure into a die.
2392 $dsn = "DBI:Pg:$_->[2];";
2394 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
2395 $query = "SELECT count(*) FROM pg_namespace WHERE nspname = ?";
2396 $sth = $dbh->prepare($query);
2397 $sth->bind_param(1, "_" . $clname);
2400 $result_count = $result_count+$sth->rows;
# A subscriber-only node that was flagged unresponsive earlier has recovered.
2405 if (exists($g_unresponsive_subonly{$_->[0]})) {
2406 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_recovery_subonly', $_->[0]));
2407 delete $g_unresponsive_subonly{$_->[0]};
# Probe failed: log the error and remember the node as unresponsive.
2412 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
2414 push(@g_unresponsive, \@$_);
2415 if ((defined($_->[3])) || ($g_autofailover_provs && defined($_->[9]))) {
2416 if (defined($_->[3])) {
2417 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive', $_->[0]));
2420 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive_prov', $_->[0]));
# Skips non-origin nodes unless the failover method is 'new'; the guarded
# statement is not shown (presumably the provider failure counter - confirm).
2422 unless ($g_failover_method ne 'new' && !defined($_->[3])) {
# Subscriber-only node: report its failure once, the first time it is seen.
2427 if (!exists($g_unresponsive_subonly{$_->[0]})) {
2428 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_unresponsive_subonly', $_->[0]));
2429 $g_unresponsive_subonly{$_->[0]} = true;
2431 if ($g_fail_subonly) {
2439 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_autofailover_check',$_->[0], ($_->[4] // "unnamed"), lc($_->[6] // "unknown") . ' node', $_->[2]));
# Subscriber-only failures are added to the result only when at least one
# provider has failed as well; otherwise the provider count is returned on
# its own.
2443 if ($prov_failed > 0) {
2444 return ($prov_failed+$subonly_failed);
2447 return $prov_failed;
2451 # Attempt to try and find the most suitable backup node for a failed node
#
# Called as findBackup($clname, $dbuser, $dbpass, $prefix, $logfile,
# $log_prefix). For every eligible entry in @g_unresponsive, the other nodes
# in @g_cluster are examined and the best candidate is recorded in %g_backups
# (failed node id => backup node id); the failed node is also pushed onto
# @g_failed. Returns the keys of %g_backups.
#
# Node entry fields as used here: [0] node id, [2] conninfo, [9] sets the
# node provides. [3] defined is treated as "node is an origin", and [7] and
# [10] are further comma-separated set lists (the comments below call them
# subscribed and forwarded sets) - confirm exact meanings against loadCluster.
2457 my $logfile = shift;
2458 my $log_prefix = shift;
2465 my $result_count = 0;
2466 my $lowest_lag_time;
2467 my $latest_last_event;
2469 my $best_node_is_direct;
2470 my $best_node_can_forward;
2474 my %backup_for_set_chosen;
2479 foreach (@g_unresponsive) {
# Eligible for failover: any failed node when subscriber-only failover is
# enabled, otherwise only origins and (optionally) forwarding providers.
2480 if ($g_fail_subonly || (defined($_->[3])) || ($g_autofailover_provs && defined($_->[9]))) {
2481 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_find', ($_->[9] // "none"), $_->[0]));
# Reset the "best so far" state. The lag starts at the largest signed integer
# this perl can hold: (1 << (ivsize*8 - 1)) - 1.
2483 undef $best_node_id;
2484 $lowest_lag_time = (1<<$Config{ivsize}*8-1)-1;
2485 $latest_last_event = 0;
2486 $best_node_is_direct = false;
2487 $best_node_can_forward = false;
# Reuse the backup already chosen for an identical set list, if there is one.
2489 if (defined($_->[9]) && (exists $backup_for_set_chosen{$_->[9]})) {
2490 $best_node_id = $backup_for_set_chosen{$_->[9]};
2491 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_found', $_->[9], $_->[0]));
# Otherwise evaluate every other node in the cluster as a candidate.
2494 foreach my $subscriber (@g_cluster) {
2495 if ($subscriber->[0] != $_->[0]) {
2497 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_check_sub',$subscriber->[0]));
2500 # Here the strings containing the sets are converted to arrays
2501 # Origin / Forwarded sets
2502 undef @sets_to_prov;
2503 if (defined($subscriber->[10])) {
2504 @sets_to_prov = split(',',$subscriber->[10]);
2506 if (defined($subscriber->[3])) {
2507 if (@sets_to_prov) {
2508 @sets_to_prov = (@sets_to_prov, split(',',$subscriber->[3]));
2511 @sets_to_prov = split(',',$subscriber->[3]);
# Neither list is defined: use the single placeholder set 0.
2514 if (!defined($subscriber->[10]) && !defined($subscriber->[3])) {
2515 @sets_to_prov = (0);
2518 # Origin / Subscribed sets.
2520 if (defined($subscriber->[7])) {
2521 @sets_to = split(',',$subscriber->[7]);
2523 if (defined($subscriber->[3])) {
2525 @sets_to = (@sets_to, split(',',$subscriber->[3]));
2528 @sets_to = split(',',$subscriber->[3]);
2531 if (!defined($subscriber->[7]) && !defined($subscriber->[3])) {
2535 # Sets provided by the failed node.
2537 if (defined($_->[9])) {
2538 @sets_from = split(',',$_->[9]);
2545 $dsn = "DBI:Pg:$subscriber->[2]";
2548 $dbh = DBI->connect($dsn, $dbuser, $dbpass, {RaiseError => 1});
2549 $qw_clname = $dbh->quote_identifier("_" . $clname);
2551 #print "\tNODE " . $subscriber->[0] . ") SETS TO = " . join(',', @sets_to) . " SETS FROM = " . join(',', @sets_from) . " SETS TO PROV = " . join(',', @sets_to_prov) . "\n";
# Two query shapes, both returning (lag, event sequence number, direct flag).
# A candidate that is itself an origin and covers all failed sets reports a
# fixed lag of 0 with its own latest event; any other candidate reports its
# most recent confirmation on an active subscription.
2553 if (defined($subscriber->[3]) && checkProvidesAllSets(\@sets_from, \@sets_to)) {
2554 $query = "SELECT 0, ev_seqno, (ev_origin = ?)
2555 FROM $qw_clname.sl_event
2556 WHERE ev_origin = $qw_clname.getlocalnodeid(?)
2557 ORDER BY ev_seqno DESC LIMIT 1";
2560 $query = "SELECT extract(epoch from (current_timestamp-a.con_timestamp)), a.con_seqno, (a.con_origin = ?) AS direct
2561 FROM $qw_clname.sl_confirm a
2562 INNER JOIN $qw_clname.sl_event b on b.ev_seqno = a.con_seqno AND a.con_origin = b.ev_origin
2563 INNER JOIN $qw_clname.sl_subscribe c ON c.sub_provider = a.con_origin AND c.sub_receiver = a.con_received
2564 WHERE c.sub_active AND a.con_received = $qw_clname.getlocalnodeid(?)
2565 ORDER BY a.con_seqno DESC LIMIT 1;";
# Placeholder 1 is the failed node's id, placeholder 2 the cluster schema name
# passed to getlocalnodeid().
2568 $sth = $dbh->prepare($query);
2569 $sth->bind_param(1, $_->[0]);
2570 $sth->bind_param(2, "_" . $clname);
2573 while (my @subinfo = $sth->fetchrow) {
2574 # If the failed provider node isn't an origin for any sets, we classify any direct subscribers to it as indirect
2575 # because they are indirect to the origin.
2576 if ($subinfo[2] && defined($_->[9]) && !defined($_->[3])) {
2577 $subinfo[2] = false;
2580 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_node_detail',
2582 ($subinfo[2]?"directly":"indirectly"),
2583 (defined($_->[3])?"origin":(defined($_->[9])?"provider":"subscriber only")),
2585 (defined($subscriber->[10])?$subscriber->[10]:(defined($subscriber->[3])?$subscriber->[3]:"<NONE>")),
2586 $subinfo[0], $subinfo[1]));
2588 # Select this node as the backup node if:
2589 # 1) The node is a subscriber to all sets on the failed node
2590 # 2) In order of preference:
2591 # The node is one of the direct subscribers to the failed node on the most recent event and is a forwarding provider
2593 # The node is one of the direct subscribers to the failed node on the most recent event and is not a forwarding provider
2595 # The node is an indirect subscriber to the failed node with the lowest lag time
2596 if (!checkProvidesAllSets(\@sets_from, \@sets_to)) {
2597 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_unsuitable', $subscriber->[0]));
2599 elsif (($subinfo[2] && (!$best_node_is_direct || $subinfo[1] > $latest_last_event || (!$best_node_can_forward && checkProvidesAllSets(\@sets_from, \@sets_to_prov) && $subinfo[1] == $latest_last_event)))
2600 || (!$best_node_is_direct && !$subinfo[2] && $subinfo[0] < $lowest_lag_time)) {
2601 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_best', $subscriber->[0], $subinfo[0], $subinfo[1]));
2602 $best_node_id = $subscriber->[0];
2603 $lowest_lag_time = $subinfo[0];
2604 $latest_last_event = $subinfo[1];
2605 $best_node_is_direct = $subinfo[2];
2606 $best_node_can_forward = checkProvidesAllSets(\@sets_from, \@sets_to_prov);
# The candidate could not be queried: log it (and the error) and move on.
2612 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_check_sub_fail', $subscriber->[0]));
2614 printlogln($prefix,$logfile,$log_prefix,lookupMsg('dbg_generic', $@));
# Record the outcome for this failed node.
2620 if (defined($best_node_id)) {
2621 push(@g_failed, \@$_);
2622 $g_backups{$_->[0]} = $best_node_id;
# NOTE(review): %g_backups is keyed by node id, yet the test below looks up a
# set list ($_->[9]) in it. %backup_for_set_chosen looks like the intended
# hash - confirm.
2623 if (defined($_->[9]) && !(exists $g_backups{$_->[9]})) {
2624 $backup_for_set_chosen{$_->[9]} = $best_node_id;
2628 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_fail'));
# Logged for failed nodes that are not eligible for failover.
2632 printlogln($prefix,$logfile,$log_prefix,lookupMsg('autofailover_promote_skip', $_->[0]));
2635 return keys(%g_backups);