3 * $Header: /cvsroot/pgpool/pgpool-II/recovery.c,v 1.12.2.1 2009/08/22 04:19:49 t-ishii Exp $
5 * pgpool: a language independent connection pool server for PostgreSQL
6 * written by Tatsuo Ishii
8 * Copyright (c) 2003-2008 PgPool Global Development Group
10 * Permission to use, copy, modify, and distribute this software and
11 * its documentation for any purpose and without fee is hereby
12 * granted, provided that the above copyright notice appear in all
13 * copies and that both that copyright notice and this permission
14 * notice appear in supporting documentation, and that the name of the
15 * author not be used in advertising or publicity pertaining to
16 * distribution of the software without specific, written prior
17 * permission. The author makes no representations about the
18 * suitability of this software for any purpose. It is provided "as
19 * is" without express or implied warranty.
21 * recovery.c: online recovery process
/*
 * Upper bound on polling iterations used by check_postmaster_started() and
 * wait_connection_closed(): the configured recovery_timeout divided by 3.
 * NOTE(review): the divisor presumably matches a 3 second pause per retry;
 * the pause itself is not visible in this excerpt -- confirm.
 */
33 #define WAIT_RETRY_COUNT (pool_config->recovery_timeout / 3)
/* Stage selector passed as the stage argument of exec_recovery(). */
36 #define SECOND_STAGE 1
/* Prototypes of the file-local helpers defined further down. */
38 static int exec_checkpoint(PGconn *conn);
39 static int exec_recovery(PGconn *conn, BackendInfo *backend, char stage);
40 static int exec_remote_start(PGconn *conn, BackendInfo *backend);
41 static PGconn *connect_backend_libpq(BackendInfo *backend);
42 static int wait_connection_closed(void);
43 static int check_postmaster_started(BackendInfo *backend);
/*
 * Scratch buffer for the SQL text built with snprintf() in exec_recovery()
 * and exec_remote_start(). It is file-scope static, so both helpers share it.
 */
45 static char recovery_command[1024];
/*
 * Flag defined in another translation unit. start_recovery() clears it and
 * then polls it while waiting for failback to complete.
 */
47 extern volatile sig_atomic_t pcp_wakeup_request;
/*
 * Run online recovery for backend node recovery_node.
 *
 * Visible sequence: refuse to run if the node is already valid, mark the
 * request kind as NODE_RECOVERY_REQUEST, connect to the master node, then
 *   1st stage: CHECKPOINT + exec_recovery(FIRST_STAGE)
 *   2nd stage: wait for client connections to close, CHECKPOINT,
 *              exec_recovery(SECOND_STAGE)
 * followed by a remote start of the recovered node, a start-up check, a
 * failback request, and a wait until pcp_wakeup_request becomes non-zero.
 *
 * The helpers are treated as failing when they return non-zero.
 * NOTE(review): the return statements and connection cleanup of this
 * function are not visible in this excerpt -- confirm the return contract
 * against the full file.
 */
49 int start_recovery(int recovery_node)
52 BackendInfo *recovery_backend;
55 pool_log("starting recovering node %d", recovery_node);
/* Recovery only makes sense for a node that is currently not valid. */
57 if (VALID_BACKEND(recovery_node))
59 pool_error("start_recovery: backend node %d is alive", recovery_node);
63 Req_info->kind = NODE_RECOVERY_REQUEST;
/* backend is the master node (source side); recovery_backend is the target. */
65 backend = &pool_config->backend_desc->backend_info[MASTER_NODE_ID];
66 recovery_backend = &pool_config->backend_desc->backend_info[recovery_node];
68 conn = connect_backend_libpq(backend);
/*
 * NOTE(review): the log prefix below is spelled start_recover, unlike the
 * start_recovery prefix used by every other message in this function.
 */
72 pool_error("start_recover: could not connect master node.");
/* ---- 1st stage: checkpoint on the master, then the 1st stage command ---- */
77 if (exec_checkpoint(conn) != 0)
80 pool_error("start_recovery: CHECKPOINT failed");
84 pool_log("CHECKPOINT in the 1st stage done");
86 if (exec_recovery(conn, recovery_backend, FIRST_STAGE) != 0)
92 pool_log("1st stage is done");
/* ---- 2nd stage: starts only after wait_connection_closed() succeeds ---- */
94 pool_log("starting 2nd stage");
98 if (wait_connection_closed() != 0)
101 pool_error("start_recovery: timeover for waiting connection closed");
105 pool_log("all connections from clients have been closed");
107 if (exec_checkpoint(conn) != 0)
110 pool_error("start_recovery: CHECKPOINT failed");
114 pool_log("CHECKPOINT in the 2nd stage done");
116 if (exec_recovery(conn, recovery_backend, SECOND_STAGE) != 0)
/* Ask the master to start the recovered node, then verify it accepts connections. */
121 if (exec_remote_start(conn, recovery_backend) != 0)
124 pool_error("start_recovery: remote start failed");
128 if (check_postmaster_started(recovery_backend))
131 pool_error("start_recovery: check start failed");
135 pool_log("%d node restarted", recovery_node);
138 * reset failover completion flag. this is necessary since
139 * previous failover/failback will set the flag to 1.
141 pcp_wakeup_request = 0;
143 /* send failback request to pgpool parent */
144 send_failback_request(recovery_node);
146 /* wait for failback */
147 while (!pcp_wakeup_request)
/* select() with no descriptors and a {1, 0} timeout acts as a 1 second sleep. */
149 struct timeval t = {1, 0};
150 /* polling SIGUSR2 signal every 1 sec */
151 select(0, NULL, NULL, NULL, &t);
/* Clear the flag again so a later wait starts from zero. */
153 pcp_wakeup_request = 0;
157 pool_log("recovery done");
163 * Notify all children that recovery has finished.
/*
 * Signal the end of recovery by sending SIGUSR2 to the parent process.
 * The return value of kill() is not checked in the visible code.
 */
165 void finish_recovery(void)
/* getppid() is the parent of the calling process; it receives SIGUSR2. */
168 kill(getppid(), SIGUSR2);
/*
 * Issue the SQL command CHECKPOINT on the given libpq connection.
 *
 * conn: open libpq connection on which the command is executed.
 *
 * r is 0 when the result status is PGRES_COMMAND_OK and 1 otherwise.
 * NOTE(review): presumably r is the return value and the result is released
 * with PQclear(); neither line is visible in this excerpt -- confirm.
 */
174 static int exec_checkpoint(PGconn *conn)
179 pool_debug("exec_checkpoint: start checkpoint");
180 result = PQexec(conn, "CHECKPOINT");
/* CHECKPOINT returns no rows, so success is PGRES_COMMAND_OK. */
181 r = (PQresultStatus(result) != PGRES_COMMAND_OK);
183 pool_debug("exec_checkpoint: finish checkpoint");
188 * Call the pgpool_recovery() function.
/*
 * Run one stage of recovery by calling the SQL function pgpool_recovery()
 * over the given connection.
 *
 * conn:    libpq connection on which the SELECT is executed.
 * backend: description of the node being recovered; its hostname and data
 *          directory are read here.
 * stage:   FIRST_STAGE selects recovery_1st_stage_command; any other value
 *          selects recovery_2nd_stage_command.
 *
 * r is 0 when the result status is PGRES_TUPLES_OK and 1 otherwise.
 * NOTE(review): the return statements are not visible in this excerpt;
 * the caller treats a non-zero return as failure.
 */
190 static int exec_recovery(PGconn *conn, BackendInfo *backend, char stage)
/* An empty configured hostname falls back to localhost. */
197 if (strlen(backend->backend_hostname) == 0)
198 hostname = "localhost";
200 hostname = backend->backend_hostname;
202 script = (stage == FIRST_STAGE) ?
203 pool_config->recovery_1st_stage_command : pool_config->recovery_2nd_stage_command;
/* A NULL or empty script means there is nothing to run for this stage. */
205 if (script == NULL || strlen(script) == 0)
207 /* do not execute script */
/*
 * Build the SQL text in the shared recovery_command buffer.
 * NOTE(review): the three values are placed between single-quote characters
 * with no escaping visible in this excerpt, and the snprintf() result is not
 * checked for truncation -- confirm the configured values cannot contain a
 * quote character or overflow 1024 bytes.
 */
211 snprintf(recovery_command,
212 sizeof(recovery_command),
213 "SELECT pgpool_recovery('%s', '%s', '%s')",
216 backend->backend_data_directory);
218 pool_log("starting recovery command: \"%s\"", recovery_command);
220 pool_debug("exec_recovery: start recovery");
221 result = PQexec(conn, recovery_command);
/* A SELECT returns rows, so success is PGRES_TUPLES_OK. */
222 r = (PQresultStatus(result) != PGRES_TUPLES_OK);
225 pool_error("exec_recovery: %s command failed at %s",
227 (stage == FIRST_STAGE) ? "1st stage" : "2nd stage");
230 pool_debug("exec_recovery: finish recovery");
235 * Call the pgpool_remote_start() function.
/*
 * Ask the server behind conn to start the recovered node by calling the SQL
 * function pgpool_remote_start().
 *
 * conn:    libpq connection on which the SELECT is executed.
 * backend: description of the node to start; its hostname and data
 *          directory are read here.
 *
 * r is 0 when the result status is PGRES_TUPLES_OK and 1 otherwise.
 * NOTE(review): the return statement is not visible in this excerpt;
 * the caller treats a non-zero return as failure.
 */
237 static int exec_remote_start(PGconn *conn, BackendInfo *backend)
/* An empty configured hostname falls back to localhost. */
243 if (strlen(backend->backend_hostname) == 0)
244 hostname = "localhost";
246 hostname = backend->backend_hostname;
/*
 * The SQL text is built in the shared recovery_command buffer.
 * NOTE(review): values are placed between single-quote characters with no
 * escaping visible in this excerpt -- confirm they cannot contain a quote.
 */
248 snprintf(recovery_command, sizeof(recovery_command),
249 "SELECT pgpool_remote_start('%s', '%s')",
251 backend->backend_data_directory);
253 pool_debug("exec_remote_start: start pgpool_remote_start");
254 result = PQexec(conn, recovery_command);
255 r = (PQresultStatus(result) != PGRES_TUPLES_OK);
/* The server-side error text is included in the log message. */
257 pool_error("exec_remote_start: pgpool_remote_start failed: %s", PQresultErrorMessage(result));
259 pool_debug("exec_remote_start: finish pgpool_remote_start");
264 * Check whether the postmaster has started.
/*
 * Poll the recovered node until a libpq connection to it succeeds.
 *
 * backend: description of the node to probe; its hostname and port are used
 *          together with the configured recovery_user / recovery_password.
 *
 * The loop body runs until r equals CONNECTION_OK or the counter i exceeds
 * WAIT_RETRY_COUNT; on exhaustion an error naming recovery_timeout is logged.
 * NOTE(review): r presumably holds PQstatus(conn), and the return values and
 * the pause between attempts are not visible in this excerpt -- confirm.
 * The caller treats a non-zero return as failure.
 */
266 static int check_postmaster_started(BackendInfo *backend)
/* libpq takes the port as a string, so format the integer port first. */
272 snprintf(port_str, sizeof(port_str),
273 "%d", backend->backend_port);
276 conn = PQsetdbLogin(backend->backend_hostname,
281 pool_config->recovery_user,
282 pool_config->recovery_password);
285 if (r == CONNECTION_OK)
/* The guarded statement is not visible here; it is skipped when the retry count is 0. */
288 if (WAIT_RETRY_COUNT != 0)
/* Post-increment: the body executes at least once, even when the retry count is 0. */
290 } while (i++ < WAIT_RETRY_COUNT);
292 pool_error("check_postmaster_started: remote host start up did not finish in %d sec.", pool_config->recovery_timeout);
/*
 * Open a libpq connection to the given backend using the configured
 * recovery_user and recovery_password.
 *
 * backend: description of the node to connect to; hostname and port are
 *          read from it.
 *
 * NOTE(review): the handling of a failed connection (the branch taken when
 * the status is not CONNECTION_OK) and the return statements are not visible
 * in this excerpt -- confirm what callers receive on failure.
 */
296 static PGconn *connect_backend_libpq(BackendInfo *backend)
/* libpq takes the port as a string, so format the integer port first. */
301 snprintf(port_str, sizeof(port_str),
302 "%d", backend->backend_port);
303 conn = PQsetdbLogin(backend->backend_hostname,
308 pool_config->recovery_user,
309 pool_config->recovery_password);
/* PQsetdbLogin can hand back a connection object that failed to connect, so the status is checked. */
311 if (PQstatus(conn) != CONNECTION_OK)
320 * Wait until all connections are closed.
/*
 * Poll the shared connection counter until it reaches zero.
 *
 * The loop body runs until Req_info->conn_counter equals 0 or the counter i
 * exceeds WAIT_RETRY_COUNT; on exhaustion an error naming recovery_timeout
 * is logged.
 * NOTE(review): the return values and the pause between polls are not
 * visible in this excerpt -- confirm. The caller treats a non-zero return
 * as a timeout.
 */
322 static int wait_connection_closed(void)
328 if (Req_info->conn_counter == 0)
/* The guarded statement is not visible here; it is skipped when the retry count is 0. */
331 if (WAIT_RETRY_COUNT != 0)
/* Post-increment: the body executes at least once, even when the retry count is 0. */
333 } while (i++ < WAIT_RETRY_COUNT);
335 pool_error("wait_connection_closed: existing connections did not close in %d sec.", pool_config->recovery_timeout);