3 * $Header: /cvsroot/pgpool/pgpool-II/main.c,v 1.45.2.6 2009/09/25 07:35:16 t-ishii Exp $
5 * pgpool: a language independent connection pool server for PostgreSQL
6 * written by Tatsuo Ishii
8 * Copyright (c) 2003-2009 PgPool Global Development Group
10 * Permission to use, copy, modify, and distribute this software and
11 * its documentation for any purpose and without fee is hereby
12 * granted, provided that the above copyright notice appear in all
13 * copies and that both that copyright notice and this permission
14 * notice appear in supporting documentation, and that the name of the
15 * author not be used in advertising or publicity pertaining to
16 * distribution of the software without specific, written prior
17 * permission. The author makes no representations about the
18 * suitability of this software for any purpose. It is provided "as
19 * is" without express or implied warranty.
24 #include <sys/types.h>
25 #include <sys/socket.h>
26 #include <netinet/in.h>
29 #include <arpa/inet.h>
31 #ifdef HAVE_SYS_SELECT_H
32 #include <sys/select.h>
53 #include "parser/pool_memory.h"
54 #include "parser/pool_string.h"
57 * Process pending signal actions.
59 #define CHECK_REQUEST \
66 if (failover_request) \
69 failover_request = 0; \
71 if (sigchld_request) \
75 if (reload_config_request) \
78 reload_config_request = 0; \
83 #define PGPOOLMAXLITSENQUEUELENGTH 10000 /* backlog value handed to listen() for the inet and unix domain sockets */
84 static void daemonize(void);
85 static int read_pid_file(void);
86 static void write_pid_file(void);
87 static pid_t pcp_fork_a_child(int unix_fd, int inet_fd, char *pcp_conf_file);
88 static pid_t fork_a_child(int unix_fd, int inet_fd, int id);
89 static int create_unix_domain_socket(struct sockaddr_un un_addr_tmp);
90 static int create_inet_domain_socket(const char *hostname, const int port);
91 static void myexit(int code);
92 static void failover(void);
93 static void reaper(void);
94 static void wakeup_children(void);
95 static void reload_config(void);
96 static int pool_pause(struct timeval *timeout);
97 static void pool_sleep(unsigned int second);
98 static void kill_all_children(int sig);
99 static int get_next_master_node(void);
101 static RETSIGTYPE exit_handler(int sig); /* installed by main() for SIGTERM, SIGINT and SIGQUIT */
102 static RETSIGTYPE reap_handler(int sig); /* installed by main() for SIGCHLD */
103 static RETSIGTYPE failover_handler(int sig); /* installed by main() for SIGUSR1 */
104 static RETSIGTYPE reload_config_handler(int sig); /* installed by main() for SIGHUP */
105 static RETSIGTYPE health_check_timer_handler(int sig); /* installed for SIGALRM around each health check */
106 static RETSIGTYPE wakeup_handler(int sig); /* installed by main() for SIGUSR2 */
108 static void usage(void);
109 static void show_version(void);
110 static void stop_me(void);
112 static int trigger_failover_command(int node, const char *command_line);
114 static struct sockaddr_un un_addr; /* unix domain socket path */
115 static struct sockaddr_un pcp_un_addr; /* unix domain socket path for PCP */
117 ProcessInfo *pids; /* child pid table in shared memory, one entry per num_init_children */
120 * shmem connection info table
121 * this is a two dimension array. i.e.:
122 * con_info[pool_config->num_init_children][pool_config->max_pool]
124 ConnectionInfo *con_info;
126 static int unix_fd; /* unix domain socket fd */
127 static int inet_fd; /* inet domain socket fd */
129 static int pcp_pid; /* pid for child process handling PCP */
130 static int pcp_unix_fd; /* unix domain socket fd for PCP; handed to pcp_fork_a_child() */
131 static int pcp_inet_fd; /* inet domain socket fd for PCP */
132 static char pcp_conf_file[POOLMAXPATHLEN+1]; /* path for pcp.conf (default or -F) */
133 static char conf_file[POOLMAXPATHLEN+1]; /* path for the pgpool configuration file (default or -f) */
134 static char hba_file[POOLMAXPATHLEN+1]; /* path for the HBA configuration file (default or -a) */
136 static int exiting = 0; /* non 0 if I'm exiting */
137 static int switching = 0; /* non 0 while a failover or degeneration is in progress */
140 static int degenerated = 0; /* set non 0 if already degenerated */
143 static int clear_cache = 0; /* non 0 if clear cache option (-c) is given */
144 static int not_detach = 0; /* non 0 if non detach option (-n) is given */
145 int debug = 0; /* non 0 if debug option is given (-d) */
147 pid_t mypid; /* pgpool parent process id */
149 long int weight_master; /* normalized weight of master (0-RAND_MAX range) */
151 static int stop_sig = SIGTERM; /* signal sent by stop_me(); default SIGTERM, changed by -m */
153 static volatile sig_atomic_t health_check_timer_expired; /* non 0 if health check timer expired */
155 POOL_REQUEST_INFO *Req_info; /* request info area in shared memory */
156 volatile sig_atomic_t *InRecovery; /* non 0 if recovery is started */
157 volatile sig_atomic_t reload_config_request = 0; /* cleared in each freshly forked child */
158 static volatile sig_atomic_t failover_request = 0; /* set by failover_handler() */
159 static volatile sig_atomic_t sigchld_request = 0; /* set by reap_handler(), cleared by reaper() */
160 static volatile sig_atomic_t wakeup_request = 0;
162 static int pipe_fds[2]; /* for delivering signals: the handlers write one byte to pipe_fds[1] */
170 * pgpool main program
172 int main(int argc, char **argv)
184 snprintf(conf_file, sizeof(conf_file), "%s/%s", DEFAULT_CONFIGDIR, POOL_CONF_FILE_NAME);
185 snprintf(pcp_conf_file, sizeof(pcp_conf_file), "%s/%s", DEFAULT_CONFIGDIR, PCP_PASSWD_FILE_NAME);
186 snprintf(hba_file, sizeof(hba_file), "%s/%s", DEFAULT_CONFIGDIR, HBA_CONF_FILE_NAME);
188 while ((opt = getopt(argc, argv, "a:cdf:F:hm:nv")) != -1)
192 case 'a': /* specify hba configuration file */
198 strncpy(hba_file, optarg, sizeof(hba_file));
201 case 'c': /* clear cache option */
205 case 'd': /* debug option */
209 case 'f': /* specify configuration file */
215 strncpy(conf_file, optarg, sizeof(conf_file));
218 case 'F': /* specify PCP password file */
224 strncpy(pcp_conf_file, optarg, sizeof(pcp_conf_file));
232 case 'm': /* stop mode */
238 if (*optarg == 's' || !strcmp("smart", optarg))
239 stop_sig = SIGTERM; /* smart shutdown */
240 else if (*optarg == 'f' || !strcmp("fast", optarg))
241 stop_sig = SIGINT; /* fast shutdown */
242 else if (*optarg == 'i' || !strcmp("immediate", optarg))
243 stop_sig = SIGQUIT; /* immediate shutdown */
251 case 'n': /* no detaching control ttys */
267 if (pool_init_config())
270 if (pool_get_config(conf_file, INIT_CONFIG))
272 pool_error("Unable to get configuration. Exiting...");
276 if (pool_config->enable_pool_hba)
280 * if a non-switch argument remains, then it should be either "reload", "stop" or "switch"
282 if (optind == (argc - 1))
284 if (!strcmp(argv[optind], "reload"))
288 pid = read_pid_file();
291 pool_error("could not read pid file");
296 if (kill(pid, SIGHUP) == -1)
298 pool_error("could not reload configuration file pid: %d. reason: %s", pid, strerror(errno));
305 if (!strcmp(argv[optind], "stop"))
319 * else if no non-switch argument remains, then it should be a start request
321 else if (optind == argc)
323 pid = read_pid_file();
326 if (kill(pid, 0) == 0)
328 fprintf(stderr, "pid file found. is another pgpool(%d) is running?\n", pid);
332 fprintf(stderr, "pid file found but it seems bogus. Trying to start pgpool anyway...\n");
336 * otherwise an error...
344 /* set signal masks */
352 if (pool_semaphore_create(MAX_NUM_SEMAPHORES))
354 pool_error("Unable to create semaphores. Exiting...");
360 if (clear_cache && pool_config->enable_query_cache && SYSDB_STATUS == CON_UP)
362 Interval interval[1];
364 interval[0].quantity = 0;
365 interval[0].unit = second;
367 pool_clear_cache_by_time(interval, 1);
370 /* set unix domain socket path */
371 snprintf(un_addr.sun_path, sizeof(un_addr.sun_path), "%s/.s.PGSQL.%d",
372 pool_config->socket_dir,
375 /* set up signal handlers */
376 pool_signal(SIGPIPE, SIG_IGN);
378 /* create unix domain socket */
379 unix_fd = create_unix_domain_socket(un_addr);
381 /* create inet domain socket if any */
382 if (pool_config->listen_addresses[0])
384 inet_fd = create_inet_domain_socket(pool_config->listen_addresses, pool_config->port);
387 size = pool_config->num_init_children * pool_config->max_pool * sizeof(ConnectionInfo);
388 con_info = pool_shared_memory_create(size);
389 if (con_info == NULL)
391 pool_error("failed to allocate connection informations");
394 memset(con_info, 0, size);
396 size = pool_config->num_init_children * (sizeof(ProcessInfo));
397 pids = pool_shared_memory_create(size);
400 pool_error("failed to allocate pids");
403 memset(pids, 0, size);
404 for (i = 0; i < pool_config->num_init_children; i++)
406 pids[i].connection_info = &con_info[i * pool_config->max_pool];
409 /* create fail over/switch over event area */
410 Req_info = pool_shared_memory_create(sizeof(POOL_REQUEST_INFO));
411 if (Req_info == NULL)
413 pool_error("failed to allocate Req_info");
417 /* initialize Req_info */
418 Req_info->kind = NODE_UP_REQUEST;
419 memset(Req_info->node_id, -1, sizeof(int) * MAX_NUM_BACKENDS);
420 Req_info->master_node_id = get_next_master_node();
421 Req_info->conn_counter = 0;
423 InRecovery = pool_shared_memory_create(sizeof(int));
424 if (InRecovery == NULL)
426 pool_error("failed to allocate InRecovery");
432 * We need to block signal here. Otherwise child might send some
433 * signals, for example SIGUSR1(fail over). Children will inherit
434 * signal blocking but they do unblock signals at the very beginning
435 * of process. So this is harmless.
437 POOL_SETMASK(&BlockSig);
439 /* fork the children */
440 for (i=0;i<pool_config->num_init_children;i++)
442 pids[i].pid = fork_a_child(unix_fd, inet_fd, i);
443 pids[i].start_time = time(NULL);
446 /* set up signal handlers */
448 pool_signal(SIGTERM, exit_handler);
449 pool_signal(SIGINT, exit_handler);
450 pool_signal(SIGQUIT, exit_handler);
451 pool_signal(SIGCHLD, reap_handler);
452 pool_signal(SIGUSR1, failover_handler);
453 pool_signal(SIGUSR2, wakeup_handler);
454 pool_signal(SIGHUP, reload_config_handler);
456 /* create pipe for delivering event */
457 if (pipe(pipe_fds) < 0)
459 pool_error("failed to create pipe");
463 pool_log("pgpool successfully started");
465 /* fork a child for PCP handling */
466 snprintf(pcp_un_addr.sun_path, sizeof(pcp_un_addr.sun_path), "%s/.s.PGSQL.%d",
467 pool_config->pcp_socket_dir,
468 pool_config->pcp_port);
469 pcp_unix_fd = create_unix_domain_socket(pcp_un_addr);
470 /* maybe change "*" to pool_config->pcp_listen_addresses */
471 pcp_inet_fd = create_inet_domain_socket("*", pool_config->pcp_port);
472 pcp_pid = pcp_fork_a_child(pcp_unix_fd, pcp_inet_fd, pcp_conf_file);
474 retrycnt = 0; /* reset health check retry counter */
475 sys_retrycnt = 0; /* reset SystemDB health check retry counter */
478 * This is the main loop
484 /* do we need health checking for PostgreSQL? */
485 if (pool_config->health_check_period > 0)
489 unsigned int sleep_time;
493 pool_debug("starting health checking");
497 pool_debug("retrying %d th health checking", retrycnt);
500 if (pool_config->health_check_timeout > 0)
503 * set health checker timeout. we want to detect
504 * communication path failure much earlier before
505 * TCP/IP stack detects it.
507 pool_signal(SIGALRM, health_check_timer_handler);
508 alarm(pool_config->health_check_timeout);
512 * do actual health check. trying to connect to the backend
515 health_check_timer_expired = 0;
516 POOL_SETMASK(&UnBlockSig);
517 sts = health_check();
518 POOL_SETMASK(&BlockSig);
519 if (pool_config->parallel_mode || pool_config->enable_query_cache)
520 sys_sts = system_db_health_check();
522 if ((sts > 0 || sys_sts < 0) && (errno != EINTR || (errno == EINTR && health_check_timer_expired)))
528 if (!pool_config->parallel_mode)
530 pool_log("set %d th backend down status", sts);
531 Req_info->kind = NODE_DOWN_REQUEST;
532 Req_info->node_id[0] = sts;
534 /* need to distribute this info to children */
539 pool_signal(SIGALRM, SIG_IGN); /* Cancel timer */
541 if (retrycnt > NUM_BACKENDS)
543 /* retry count over */
544 pool_log("set %d th backend down status", sts);
545 Req_info->kind = NODE_DOWN_REQUEST;
546 Req_info->node_id[0] = sts;
552 /* continue to retry */
553 sleep_time = pool_config->health_check_period/NUM_BACKENDS;
554 pool_debug("retry sleep time: %d seconds", sleep_time);
555 pool_sleep(sleep_time);
563 pool_signal(SIGALRM, SIG_IGN);
565 if (sys_retrycnt > NUM_BACKENDS)
567 pool_log("set SystemDB down status");
568 SYSDB_STATUS = CON_DOWN;
571 else if (sts == 0) /* goes to sleep only when SystemDB alone was down */
573 sleep_time = pool_config->health_check_period/NUM_BACKENDS;
574 pool_debug("retry sleep time: %d seconds", sleep_time);
575 pool_sleep(sleep_time);
581 if (pool_config->health_check_timeout > 0)
583 /* seems ok. cancel health check timer */
584 pool_signal(SIGALRM, SIG_IGN);
587 sleep_time = pool_config->health_check_period;
588 pool_sleep(sleep_time);
595 struct timeval t = {3, 0};
597 POOL_SETMASK(&UnBlockSig);
599 POOL_SETMASK(&BlockSig);
609 static void show_version(void) /* Print the one-line "PACKAGE version VERSION (PGPOOLVERSION)" banner to stderr. */
611 fprintf(stderr, "%s version %s (%s)\n", PACKAGE, VERSION, PGPOOLVERSION);
614 static void usage(void) /* Print the version banner and the command line help to stderr. */
616 fprintf(stderr, "%s version %s (%s),\n", PACKAGE, VERSION, PGPOOLVERSION);
617 fprintf(stderr, " a generic connection pool/replication/load balance server for PostgreSQL\n\n");
618 fprintf(stderr, "Usage:\n"); /* three invocation forms follow: start, stop, reload */
619 fprintf(stderr, " pgpool [ -c] [ -f CONFIG_FILE ] [ -F PCP_CONFIG_FILE ] [ -a HBA_CONFIG_FILE ]\n");
620 fprintf(stderr, " [ -n ] [ -d ]\n");
621 fprintf(stderr, " pgpool [ -f CONFIG_FILE ] [ -F PCP_CONFIG_FILE ] [ -a HBA_CONFIG_FILE ]\n");
622 fprintf(stderr, " [ -m SHUTDOWN-MODE ] stop\n");
623 fprintf(stderr, " pgpool [ -f CONFIG_FILE ] [ -F PCP_CONFIG_FILE ] [ -a HBA_CONFIG_FILE ] reload\n\n");
624 fprintf(stderr, "Common options:\n");
625 fprintf(stderr, " -a HBA_CONFIG_FILE Sets the path to the pool_hba.conf configuration file\n");
626 fprintf(stderr, " (default: %s/%s)\n",DEFAULT_CONFIGDIR, HBA_CONF_FILE_NAME); /* same default that main() builds into hba_file */
627 fprintf(stderr, " -f CONFIG_FILE Sets the path to the pgpool.conf configuration file\n");
628 fprintf(stderr, " (default: %s/%s)\n",DEFAULT_CONFIGDIR, POOL_CONF_FILE_NAME); /* same default that main() builds into conf_file */
629 fprintf(stderr, " -F PCP_CONFIG_FILE Sets the path to the pcp.conf configuration file\n");
630 fprintf(stderr, " (default: %s/%s)\n",DEFAULT_CONFIGDIR, PCP_PASSWD_FILE_NAME); /* same default that main() builds into pcp_conf_file */
631 fprintf(stderr, " -h Prints this help\n\n");
632 fprintf(stderr, "Start options:\n");
633 fprintf(stderr, " -c Clears query cache (enable_query_cache must be on)\n");
634 fprintf(stderr, " -n Don't run in daemon mode, does not detach control tty\n");
635 fprintf(stderr, " -d Debug mode\n\n");
636 fprintf(stderr, "Stop options:\n");
637 fprintf(stderr, " -m SHUTDOWN-MODE Can be \"smart\", \"fast\", or \"immediate\"\n\n");
638 fprintf(stderr, "Shutdown modes are:\n"); /* main() maps these to SIGTERM, SIGINT and SIGQUIT respectively */
639 fprintf(stderr, " smart quit after all clients have disconnected\n");
640 fprintf(stderr, " fast quit directly, with proper shutdown\n");
641 fprintf(stderr, " immediate quit without complete shutdown; will lead to recovery on restart\n");
645 * detach control ttys
647 static void daemonize(void) /* Detach from the controlling tty; the visible steps are a fork, setsid(), opening /dev/null and a sweep over descriptors 3 and up. */
654 if (pid == (pid_t) -1)
656 pool_error("fork() failed. reason: %s", strerror(errno));
659 return; /* not reached */
670 pool_error("setsid() failed. reason:%s", strerror(errno));
680 i = open("/dev/null", O_RDWR); /* NOTE(review): presumably used to redirect stdin/stdout/stderr; the redirection itself is not visible here */
685 fdlimit = sysconf(_SC_OPEN_MAX);
686 for (i = 3; i < fdlimit; i++) /* starts at 3, i.e. just above stdin/stdout/stderr */
696 static void stop_me(void) /* Send stop_sig to the pgpool named in the pid file and wait until that process no longer exists. */
700 pid = read_pid_file();
703 pool_error("could not read pid file");
708 if (kill(pid, stop_sig) == -1) /* stop_sig is SIGTERM unless -m selected SIGINT or SIGQUIT */
710 pool_error("could not stop pid: %d. reason: %s", pid, strerror(errno));
715 fprintf(stderr, "stop request sent to pgpool. waiting for termination...");
717 while (kill(pid, 0) == 0) /* signal 0 probes for existence; the loop ends once the pid is gone */
719 fprintf(stderr, ".");
722 fprintf(stderr, "done.\n");
728 static int read_pid_file(void) /* Read pool_config->pid_file_name and return its contents converted with atoi(). */
733 fd = fopen(pool_config->pid_file_name, "r"); /* despite the name, fd is a FILE stream */
738 if (fread(pidbuf, 1, sizeof(pidbuf), fd) <= 0) /* NOTE(review): fread() returns size_t, so this only detects a zero-byte read */
740 pool_error("could not read pid file as %s. reason: %s",
741 pool_config->pid_file_name, strerror(errno));
746 return(atoi(pidbuf)); /* NOTE(review): relies on pidbuf being NUL-terminated; write_pid_file() stores the NUL in the file - confirm for foreign pid files */
752 static void write_pid_file(void) /* Store the current process id as decimal text in pool_config->pid_file_name. */
757 fd = fopen(pool_config->pid_file_name, "w"); /* despite the name, fd is a FILE stream */
760 pool_error("could not open pid file as %s. reason: %s",
761 pool_config->pid_file_name, strerror(errno));
765 snprintf(pidbuf, sizeof(pidbuf), "%d", (int)getpid());
766 fwrite(pidbuf, strlen(pidbuf)+1, 1, fd); /* +1: the terminating NUL is written to the file as well */
769 pool_error("could not write pid file as %s. reason: %s",
770 pool_config->pid_file_name, strerror(errno));
777 * fork a child for PCP
779 pid_t pcp_fork_a_child(int unix_fd, int inet_fd, char *pcp_conf_file) /* Start the PCP child; callers store the returned pid in pcp_pid. */
790 myargv = save_ps_display_args(myargc, myargv);
792 /* call PCP child main */
793 POOL_SETMASK(&UnBlockSig); /* main() explains that children inherit the blocked signal mask and unblock at start-up */
794 reload_config_request = 0; /* drop any reload request inherited from the parent */
795 pcp_do_child(unix_fd, inet_fd, pcp_conf_file);
799 pool_error("fork() failed. reason: %s", strerror(errno));
808 pid_t fork_a_child(int unix_fd, int inet_fd, int id) /* Start one connection-handling child; callers store the returned pid in pids[id].pid. */
816 /* Before we unconditionally closed pipe_fds[0] and pipe_fds[1]
817 * here, which is apparently wrong since in the start up of
818 * pgpool, pipe(2) is not called yet and it mistakenly closes
819 * fd 0. Now we check the fd > 0 before close(), expecting
820 * pipe returns fds greater than 0. Note that we cannot
821 * unconditionally remove close(2) calls since fork_a_child()
822 * may be called *after* pgpool starting up.
830 myargv = save_ps_display_args(myargc, myargv);
832 /* call child main */
833 POOL_SETMASK(&UnBlockSig); /* main() explains that children inherit the blocked signal mask and unblock at start-up */
834 reload_config_request = 0; /* drop any reload request inherited from the parent */
836 do_child(unix_fd, inet_fd);
840 pool_error("fork() failed. reason: %s", strerror(errno));
847 * create inet domain socket
849 static int create_inet_domain_socket(const char *hostname, const int port) /* Create an AF_INET stream socket, bind it to hostname:port ("*" means any address) and start listening. */
851 struct sockaddr_in addr;
857 fd = socket(AF_INET, SOCK_STREAM, 0);
860 pool_error("Failed to create INET domain socket. reason: %s", strerror(errno));
863 if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one,
866 pool_error("setsockopt() failed. reason: %s", strerror(errno));
870 memset((char *) &addr, 0, sizeof(addr));
871 ((struct sockaddr *)&addr)->sa_family = AF_INET;
873 if (strcmp(hostname, "*")==0)
875 addr.sin_addr.s_addr = htonl(INADDR_ANY);
879 struct hostent *hostinfo;
881 hostinfo = gethostbyname(hostname); /* IPv4 lookup; only the first address (h_addr) is used below */
884 pool_error("could not resolve host name \"%s\": %s", hostname, hstrerror(h_errno));
887 addr.sin_addr = *(struct in_addr *) hostinfo->h_addr;
890 addr.sin_port = htons(port);
891 len = sizeof(struct sockaddr_in);
892 status = bind(fd, (struct sockaddr *)&addr, len);
895 char *host = "", *serv = "";
896 char hostname[NI_MAXHOST], servname[NI_MAXSERV]; /* NOTE(review): this local array shadows the hostname parameter */
897 if (getnameinfo((struct sockaddr *) &addr, len, hostname, sizeof(hostname), servname, sizeof(servname), 0) == 0) {
901 pool_error("bind(%s:%s) failed. reason: %s", host, serv, strerror(errno));
905 status = listen(fd, PGPOOLMAXLITSENQUEUELENGTH);
908 pool_error("listen() failed. reason: %s", strerror(errno));
915 * create UNIX domain socket
917 static int create_unix_domain_socket(struct sockaddr_un un_addr_tmp) /* Create an AF_UNIX stream socket bound to un_addr_tmp.sun_path, make the socket file mode 0777 and start listening. */
919 struct sockaddr_un addr;
924 fd = socket(AF_UNIX, SOCK_STREAM, 0);
927 pool_error("Failed to create UNIX domain socket. reason: %s", strerror(errno));
930 memset((char *) &addr, 0, sizeof(addr));
931 ((struct sockaddr *)&addr)->sa_family = AF_UNIX;
932 snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", un_addr_tmp.sun_path); /* the path is data, not a format: a '%' in the configured socket directory must be copied verbatim */
933 len = sizeof(struct sockaddr_un);
934 status = bind(fd, (struct sockaddr *)&addr, len);
937 pool_error("bind(%s) failed. reason: %s", addr.sun_path, strerror(errno));
941 if (chmod(un_addr_tmp.sun_path, 0777) == -1) /* open the socket file to every local user */
943 pool_error("chmod() failed. reason: %s", strerror(errno));
947 status = listen(fd, PGPOOLMAXLITSENQUEUELENGTH);
950 pool_error("listen() failed. reason: %s", strerror(errno));
956 static void myunlink(const char* path) /* Remove path; a failure is logged with pool_error() and otherwise ignored. */
958 if (unlink(path) == 0) return;
959 pool_error("unlink(%s) failed: %s", path, strerror(errno));
962 static void myexit(int code) /* Shut pgpool down: walk the child table, wait for children, remove the socket and pid files, then leave through pool_shmem_exit(code). */
966 if (getpid() != mypid) /* true in any process other than the pgpool parent */
970 POOL_SETMASK(&AuthBlockSig);
972 for (i = 0; i < pool_config->num_init_children; i++)
974 pid_t pid = pids[i].pid;
980 while (wait(NULL) > 0) /* collect children until wait() stops returning a pid */
983 pool_error("wait() failed. reason:%s", strerror(errno));
984 POOL_SETMASK(&UnBlockSig);
987 myunlink(un_addr.sun_path);
988 myunlink(pcp_un_addr.sun_path);
989 myunlink(pool_config->pid_file_name);
991 pool_shmem_exit(code);
995 void notice_backend_error(int node_id) /* Report one failed backend by forwarding a single-element set to degenerate_backend_set(). */
999 degenerate_backend_set(&n, 1); /* NOTE(review): n is presumably a local copy of node_id; its declaration is not visible here */
1002 /* notice backend connection error using SIGUSR1 */
1003 void degenerate_backend_set(int *node_id_set, int count) /* Post a NODE_DOWN_REQUEST for the count backend ids in node_id_set and signal the parent with SIGUSR1. */
1005 pid_t parent = getppid();
1008 if (pool_config->parallel_mode)
1013 pool_semaphore_lock(REQUEST_INFO_SEM); /* Req_info lives in shared memory; serialise writers */
1014 Req_info->kind = NODE_DOWN_REQUEST;
1015 for (i = 0; i < count; i++)
1017 if (node_id_set[i] < 0 || node_id_set[i] >= MAX_NUM_BACKENDS ||
1018 !VALID_BACKEND(node_id_set[i]))
1020 pool_log("notice_backend_error: node %d is not valid backend.", node_id_set[i]); /* log the rejected node id, not the loop index */
1024 pool_log("notice_backend_error: %d fail over request from pid %d", node_id_set[i], getpid());
1025 Req_info->node_id[i] = node_id_set[i];
1027 kill(parent, SIGUSR1); /* SIGUSR1 is what the parent maps to failover_handler */
1028 pool_semaphore_unlock(REQUEST_INFO_SEM);
1031 /* send failback request using SIGUSR1 */
1032 void send_failback_request(int node_id) /* Post a NODE_UP_REQUEST for node_id in Req_info and signal the parent with SIGUSR1. */
1034 pid_t parent = getppid();
1036 pool_log("send_failback_request: fail back %d th node request from pid %d", node_id, getpid());
1037 Req_info->kind = NODE_UP_REQUEST;
1038 Req_info->node_id[0] = node_id; /* NOTE(review): written before node_id is validated below, and without taking REQUEST_INFO_SEM as degenerate_backend_set() does */
1040 if (node_id < 0 || node_id >= MAX_NUM_BACKENDS || VALID_BACKEND(node_id)) /* out of range, or the node is already a valid backend */
1042 pool_error("send_failback_request: node %d is alive.", node_id);
1046 kill(parent, SIGUSR1); /* SIGUSR1 is what the parent maps to failover_handler */
1049 static RETSIGTYPE exit_handler(int sig) /* Handler for SIGTERM/SIGINT/SIGQUIT in the parent: log the shutdown mode, walk the child table and wait for the children. */
1053 POOL_SETMASK(&AuthBlockSig);
1056 * this could happen in a child process if a signal has been sent
1057 * before resetting signal handler
1059 if (getpid() != mypid)
1061 pool_debug("exit_handler: I am not parent");
1062 POOL_SETMASK(&UnBlockSig);
1068 pool_log("received smart shutdown request"); /* NOTE(review): presumably the sig == SIGTERM branch; its condition is not visible here */
1069 else if (sig == SIGINT)
1070 pool_log("received fast shutdown request");
1071 else if (sig == SIGQUIT)
1072 pool_log("received immediate shutdown request");
1075 pool_error("exit_handler: unknown signal received %d", sig);
1076 POOL_SETMASK(&UnBlockSig);
1082 for (i = 0; i < pool_config->num_init_children; i++)
1084 pid_t pid = pids[i].pid;
1093 POOL_SETMASK(&UnBlockSig);
1095 while (wait(NULL) > 0) /* collect children until wait() stops returning a pid */
1098 if (errno != ECHILD) /* ECHILD just means there are no children left */
1099 pool_error("wait() failed. reason:%s", strerror(errno));
1106 * calculate next master node id
1108 static int get_next_master_node(void) /* Scan the configured backends for the next master; failover() treats a result equal to num_backends as "no valid DB node found". */
1111 for (i=0;i<pool_config->backend_desc->num_backends;i++)
1114 * Do not use VALID_BACKEND macro in raw mode.
1115 * VALID_BACKEND return true only if the argument is master
1116 * node id. In other words, standby nodes are false. So need
1117 * to check backend status without VALID_BACKEND.
1121 if (BACKEND_INFO(i).backend_status == CON_CONNECT_WAIT) /* raw-mode test, per the comment above */
1124 else if (VALID_BACKEND(i))
1134 static RETSIGTYPE failover_handler(int sig) /* SIGUSR1 handler: only records the request; the work is done later by failover(). */
1136 POOL_SETMASK(&BlockSig);
1137 failover_request = 1;
1138 write(pipe_fds[1], "\0", 1); /* one byte on the event pipe; the return value is ignored */
1139 POOL_SETMASK(&UnBlockSig);
1143 * backend connection error, failover/failback request, if possible
1144 * failover() must be called under protecting signals.
1146 static void failover(void) /* Act on the request stored in Req_info: close idle connections, fail a node back, or mark nodes down; then pick the master and, unless it is unchanged, restart all children. Every exit path signals the PCP child with SIGUSR2. */
1151 int nodes[MAX_NUM_BACKENDS];
1153 pool_debug("failover_handler called");
1155 memset(nodes, 0, sizeof(int) * MAX_NUM_BACKENDS);
1158 * this could happen in a child process if a signal has been sent
1159 * before resetting signal handler
1161 if (getpid() != mypid)
1163 pool_debug("failover_handler: I am not parent");
1164 kill(pcp_pid, SIGUSR2);
1169 * processing SIGTERM, SIGINT or SIGQUIT
1173 pool_debug("failover_handler called while exiting");
1174 kill(pcp_pid, SIGUSR2);
1179 * processing fail over or switch over
1183 pool_debug("failover_handler called while switching");
1184 kill(pcp_pid, SIGUSR2);
1188 pool_semaphore_lock(REQUEST_INFO_SEM); /* held while Req_info is read and updated */
1190 if (Req_info->kind == CLOSE_IDLE_REQUEST)
1192 pool_semaphore_unlock(REQUEST_INFO_SEM);
1193 kill_all_children(SIGUSR1);
1194 kill(pcp_pid, SIGUSR2);
1199 * if not in replication mode/master slave mode, we treat this a restart request.
1200 * otherwise we need to check if we have already failovered.
1202 pool_debug("failover_handler: starting to select new master node");
1204 node_id = Req_info->node_id[0];
1206 /* failback request? */
1207 if (Req_info->kind == NODE_UP_REQUEST)
1209 if (node_id < 0 || node_id >= MAX_NUM_BACKENDS || /* Req_info->node_id[] is initialised to -1, so reject negative ids before they index the backend table */
1210 (Req_info->kind == NODE_UP_REQUEST && VALID_BACKEND(node_id)) ||
1211 (Req_info->kind == NODE_DOWN_REQUEST && !VALID_BACKEND(node_id)))
1213 pool_semaphore_unlock(REQUEST_INFO_SEM);
1214 pool_error("failover_handler: invalid node_id %d status:%d MAX_NUM_BACKENDS: %d", node_id,
1215 (node_id >= 0 && node_id < MAX_NUM_BACKENDS) ? BACKEND_INFO(node_id).backend_status : -1, MAX_NUM_BACKENDS); /* report -1 instead of reading the table with an out-of-range id */
1216 kill(pcp_pid, SIGUSR2);
1221 pool_log("starting fail back. reconnect host %s(%d)",
1222 BACKEND_INFO(node_id).backend_hostname,
1223 BACKEND_INFO(node_id).backend_port);
1224 BACKEND_INFO(node_id).backend_status = CON_CONNECT_WAIT; /* unset down status */
1225 trigger_failover_command(node_id, pool_config->failback_command);
1231 for (i = 0; i < MAX_NUM_BACKENDS; i++)
1233 if (Req_info->node_id[i] != -1 &&
1234 VALID_BACKEND(Req_info->node_id[i]))
1236 pool_log("starting degeneration. shutdown host %s(%d)",
1237 BACKEND_INFO(Req_info->node_id[i]).backend_hostname,
1238 BACKEND_INFO(Req_info->node_id[i]).backend_port);
1241 BACKEND_INFO(Req_info->node_id[i]).backend_status = CON_DOWN; /* set down status */
1242 /* save down node */
1243 nodes[Req_info->node_id[i]] = 1;
1250 pool_log("failover: no backends are degenerated");
1251 pool_semaphore_unlock(REQUEST_INFO_SEM);
1252 kill(pcp_pid, SIGUSR2);
1258 new_master = get_next_master_node();
1260 if (new_master == pool_config->backend_desc->num_backends) /* get_next_master_node() found no usable backend */
1262 pool_error("failover_handler: no valid DB node found");
1266 * Before we tried to minimize restarting pgpool to protect existing
1267 * connections from clients to pgpool children. What we did here was,
1268 * if children other than master went down, we did not fail over.
1269 * This is wrong. Think about following scenario. If someone
1270 * accidentally plugs out the network cable, the TCP/IP stack keeps
1271 * retrying for long time (typically 2 hours). The only way to stop
1272 * the retry is restarting the process. Bottom line is, we need to
1273 * restart all children in any case. See pgpool-general list posting
1274 * "TCP connections are *not* closed when a backend timeout" on Jul 13
1275 * 2008 for more details.
1281 if (Req_info->master_node_id == new_master && *InRecovery == 0)
1283 pool_log("failover_handler: do not restart pgpool. same master node %d was selected", new_master);
1284 if (Req_info->kind == NODE_UP_REQUEST)
1286 pool_log("failback done. reconnect host %s(%d)",
1287 BACKEND_INFO(node_id).backend_hostname,
1288 BACKEND_INFO(node_id).backend_port);
1292 pool_log("failover done. shutdown host %s(%d)",
1293 BACKEND_INFO(node_id).backend_hostname,
1294 BACKEND_INFO(node_id).backend_port);
1297 /* exec failover_command */
1298 for (i = 0; i < pool_config->backend_desc->num_backends; i++)
1301 trigger_failover_command(i, pool_config->failover_command);
1304 pool_semaphore_unlock(REQUEST_INFO_SEM);
1306 kill(pcp_pid, SIGUSR2);
1312 /* kill all children */
1313 for (i = 0; i < pool_config->num_init_children; i++)
1315 pid_t pid = pids[i].pid;
1319 pool_debug("failover_handler: kill %d", pid);
1323 /* exec failover_command */
1324 for (i = 0; i < pool_config->backend_desc->num_backends; i++)
1327 trigger_failover_command(i, pool_config->failover_command);
1330 pool_log("failover_handler: set new master node: %d", new_master);
1331 Req_info->master_node_id = new_master;
1333 /* no need to wait since it will be done in reap_handler */
1335 while (wait(NULL) > 0)
1338 if (errno != ECHILD)
1339 pool_error("failover_handler: wait() failed. reason:%s", strerror(errno));
1342 memset(Req_info->node_id, -1, sizeof(int) * MAX_NUM_BACKENDS); /* reset every slot to -1 (no request) */
1343 pool_semaphore_unlock(REQUEST_INFO_SEM);
1345 /* fork the children */
1346 for (i=0;i<pool_config->num_init_children;i++)
1348 pids[i].pid = fork_a_child(unix_fd, inet_fd, i);
1349 pids[i].start_time = time(NULL);
1352 if (Req_info->kind == NODE_UP_REQUEST)
1354 pool_log("failback done. reconnect host %s(%d)",
1355 BACKEND_INFO(node_id).backend_hostname,
1356 BACKEND_INFO(node_id).backend_port);
1360 pool_log("failover done. shutdown host %s(%d)",
1361 BACKEND_INFO(node_id).backend_hostname,
1362 BACKEND_INFO(node_id).backend_port);
1367 /* kick wakeup_handler in pcp_child to notice that
1368 * failover/failback done
1370 kill(pcp_pid, SIGUSR2);
1374 * health check timer handler
1376 static RETSIGTYPE health_check_timer_handler(int sig) /* SIGALRM handler: flags that the health check timeout fired. */
1378 POOL_SETMASK(&BlockSig);
1379 health_check_timer_expired = 1; /* main() reads this to tell its own timeout apart from other EINTR causes */
1380 POOL_SETMASK(&UnBlockSig);
1385 * check if we can connect to the backend
1386 * returns 0 for ok. otherwise returns backend id + 1
1388 int health_check(void) /* Probe every backend that is not unused/down: connect, send a V2 startup packet, read one reply byte and, unless the reply is 'E', send 'X'. */
1393 /* V2 startup packet */
1395 int len; /* startup packet length */
1396 StartupPacket_v2 sp;
1405 memset(&mysp, 0, sizeof(mysp));
1406 mysp.len = htonl(296); /* NOTE(review): 296 is presumably the size of the V2 startup packet - confirm against StartupPacket_v2 */
1407 mysp.sp.protoVersion = htonl(PROTO_MAJOR_V2 << 16);
1408 strcpy(mysp.sp.database, "template1");
1409 strncpy(mysp.sp.user, pool_config->health_check_user, sizeof(mysp.sp.user) - 1); /* size - 1 plus the memset above keeps user NUL-terminated */
1410 *mysp.sp.options = '\0';
1411 *mysp.sp.unused = '\0';
1412 *mysp.sp.tty = '\0';
1414 for (i=0;i<pool_config->backend_desc->num_backends;i++)
1416 pool_debug("health_check: %d th DB node status: %d", i, BACKEND_INFO(i).backend_status);
1418 if (BACKEND_INFO(i).backend_status == CON_UNUSED ||
1419 BACKEND_INFO(i).backend_status == CON_DOWN)
1422 if (*(BACKEND_INFO(i).backend_hostname) == '\0') /* an empty host name selects the unix domain socket */
1423 fd = connect_unix_domain_socket(i);
1425 fd = connect_inet_domain_socket(i);
1429 pool_error("health check failed. %d th host %s at port %d is down",
1431 BACKEND_INFO(i).backend_hostname,
1432 BACKEND_INFO(i).backend_port);
1437 if (write(fd, &mysp, sizeof(mysp)) < 0)
1439 pool_error("health check failed during write. host %s at port %d is down. reason: %s",
1440 BACKEND_INFO(i).backend_hostname,
1441 BACKEND_INFO(i).backend_port,
1448 * Don't bother to be blocked by read(2). It will be
1449 * interrupted by ALRAM anyway.
1451 sts = read(fd, &kind, 1); /* only the first byte of the reply is consumed */
1454 pool_error("health check failed during read. host %s at port %d is down. reason: %s",
1455 BACKEND_INFO(i).backend_hostname,
1456 BACKEND_INFO(i).backend_port,
1463 pool_error("health check failed. EOF encountered. host %s at port %d is down",
1464 BACKEND_INFO(i).backend_hostname,
1465 BACKEND_INFO(i).backend_port);
1471 * If a backend raised a FATAL error(max connections error or
1472 * starting up error?), do not send a Terminate message.
1474 if ((kind != 'E') && (write(fd, "X", 1) < 0))
1476 pool_error("health check failed during write. host %s at port %d is down. reason: %s. Perhaps wrong health check user?",
1477 BACKEND_INFO(i).backend_hostname,
1478 BACKEND_INFO(i).backend_port,
1491 * check if we can connect to the SystemDB
1492 * returns 0 for ok. otherwise returns -1
1495 system_db_health_check(void) /* Probe the SystemDB unless it is unused/down: connect, send a V2 startup packet, then send 'X'. */
1499 /* V2 startup packet */
1501 int len; /* startup packet length */
1502 StartupPacket_v2 sp;
1507 memset(&mysp, 0, sizeof(mysp));
1508 mysp.len = htonl(296); /* NOTE(review): 296 is presumably the size of the V2 startup packet - confirm against StartupPacket_v2 */
1509 mysp.sp.protoVersion = htonl(PROTO_MAJOR_V2 << 16);
1510 strcpy(mysp.sp.database, "template1");
1511 strncpy(mysp.sp.user, SYSDB_INFO->user, sizeof(mysp.sp.user) - 1); /* size - 1 plus the memset above keeps user NUL-terminated */
1512 *mysp.sp.options = '\0';
1513 *mysp.sp.unused = '\0';
1514 *mysp.sp.tty = '\0';
1516 pool_debug("health_check: SystemDB status: %d", SYSDB_STATUS);
1518 /* if SystemDB is already down, ignore */
1519 if (SYSDB_STATUS == CON_UNUSED || SYSDB_STATUS == CON_DOWN)
1522 if (*SYSDB_INFO->hostname == '\0') /* an empty host name selects the unix domain socket */
1523 fd = connect_unix_domain_socket_by_port(SYSDB_INFO->port, pool_config->backend_socket_dir);
1525 fd = connect_inet_domain_socket_by_port(SYSDB_INFO->hostname, SYSDB_INFO->port);
1529 pool_error("health check failed. SystemDB host %s at port %d is down",
1530 SYSDB_INFO->hostname,
1536 if (write(fd, &mysp, sizeof(mysp)) < 0)
1538 pool_error("health check failed during write. SystemDB host %s at port %d is down",
1539 SYSDB_INFO->hostname,
1547 if (write(fd, "X", 1) < 0)
1549 pool_error("health check failed during write. SystemDB host %s at port %d is down",
1550 SYSDB_INFO->hostname,
1563 static RETSIGTYPE reap_handler(int sig) /* SIGCHLD handler: only records the request; the children are collected later by reaper(). */
1565 POOL_SETMASK(&BlockSig);
1566 sigchld_request = 1;
1567 write(pipe_fds[1], "\0", 1); /* one byte on the event pipe; the return value is ignored */
1568 POOL_SETMASK(&UnBlockSig);
1572 * Reap zombie processes and restart child processes.
1573 * reaper() must be called under protecting signals.
1575 static void reaper(void) /* Collect exited children without blocking and fork replacements for the PCP child and for pool children that ended with a non-zero status. */
1581 pool_debug("reap_handler called");
1585 pool_debug("reap_handler: exited due to exiting");
1591 pool_debug("reap_handler: exited due to switching");
1595 /* clear SIGCHLD request */
1596 sigchld_request = 0;
1599 pool_debug("reap_handler: call waitpid");
1600 while ((pid = waitpid(-1, &status, WNOHANG)) > 0) /* NOTE(review): waitpid and wait3 look like build-time alternatives; the selecting conditional is not visible here */
1602 pool_debug("reap_handler: call wait3");
1603 while ((pid = wait3(&status, WNOHANG, NULL)) > 0)
1606 /* if exiting child process was PCP handler */
1609 pool_debug("PCP child %d exits with status %d by signal %d", pid, status, WTERMSIG(status));
1611 pcp_pid = pcp_fork_a_child(pcp_unix_fd, pcp_inet_fd, pcp_conf_file);
1612 pool_debug("fork a new PCP child pid %d", pcp_pid);
1615 pool_debug("child %d exits with status %d by signal %d", pid, status, WTERMSIG(status));
1617 /* look for exiting child's pid */
1618 for (i=0;i<pool_config->num_init_children;i++)
1620 if (pid == pids[i].pid)
1622 /* if found, fork a new child */
1623 if (!switching && !exiting && status) /* no replacement while failing over or shutting down, nor when status is 0 */
1625 pids[i].pid = fork_a_child(unix_fd, inet_fd, i);
1626 pids[i].start_time = time(NULL);
1627 pool_debug("fork a new child pid %d", pids[i].pid);
1634 pool_debug("reap_handler: normally exited");
1638 * get node information specified by node_number
1641 pool_get_node_info(int node_number)
1643 if (node_number >= NUM_BACKENDS)
1646 return &BACKEND_INFO(node_number);
1650 * get number of nodes
/*
 * pool_get_node_count: report the value of NUM_BACKENDS.
 * NOTE(review): the return type line is not visible in this excerpt.
 */
1653 pool_get_node_count(void)
1655 return NUM_BACKENDS;
1662 pool_get_process_list(int *array_size)
1667 *array_size = pool_config->num_init_children;
1668 array = calloc(*array_size, sizeof(int));
1669 for (i = 0; i < *array_size; i++)
1670 array[i] = pids[i].pid;
1676 * get process information specified by pid
/*
 * pool_get_process_info: linear search of the pids[] table
 * (num_init_children entries) for the slot whose pid equals the
 * argument.
 * NOTE(review): the return type and both return statements are not
 * visible in this excerpt; presumably the matching slot is returned and
 * a miss yields a null pointer -- confirm.
 */
1679 pool_get_process_info(pid_t pid)
1683 for (i = 0; i < pool_config->num_init_children; i++)
1684 if (pids[i].pid == pid)
1691 * get System DB information
/*
 * pool_get_system_db_info: hand out the info member of system_db_info.
 * NOTE(review): the return type is not visible in this excerpt.
 */
1694 pool_get_system_db_info(void)
/*
 * system_db_info may be unset.
 * NOTE(review): the value returned in that case is not visible here
 * (presumably a null pointer) -- confirm.
 */
1696 if (system_db_info == NULL)
1699 return system_db_info->info;
/*
 * Wake up all processes: deliver SIGUSR2 to everything that
 * kill_all_children() signals.
 */
static void
wakeup_children(void)
{
	kill_all_children(SIGUSR2);
}
/*
 * wakeup_handler: signal handler that pokes the reader of pipe_fds[0]
 * by writing one byte to pipe_fds[1]. Signals are blocked for the
 * duration of the handler and unblocked again before it returns.
 * NOTE(review): one statement between the mask change and the write is
 * not visible in this excerpt; presumably it raises a request flag the
 * way the sibling handlers do -- confirm.
 */
1713 static RETSIGTYPE wakeup_handler(int sig)
1715 POOL_SETMASK(&BlockSig);
1717 write(pipe_fds[1], "\0", 1);
1718 POOL_SETMASK(&UnBlockSig);
1725 static RETSIGTYPE reload_config_handler(int sig)
1727 POOL_SETMASK(&BlockSig);
1728 reload_config_request = 1;
1729 write(pipe_fds[1], "\0", 1);
1730 POOL_SETMASK(&UnBlockSig);
/*
 * reload_config: re-read the configuration file named by conf_file in
 * RELOAD_CONFIG mode, then send SIGHUP through kill_all_children().
 */
1733 static void reload_config(void)
1735 pool_log("reload config files.");
1736 pool_get_config(conf_file, RELOAD_CONFIG);
/*
 * NOTE(review): the statement guarded by enable_pool_hba is not visible
 * in this excerpt.
 */
1737 if (pool_config->enable_pool_hba)
/* in parallel mode pool_memset_system_db_info() is applied to the SystemDB info */
1739 if (pool_config->parallel_mode)
1740 pool_memset_system_db_info(system_db_info->info);
1741 kill_all_children(SIGHUP);
/*
 * kill_all_children: walk the pids[] table (num_init_children entries)
 * on behalf of signal number sig.
 * NOTE(review): the statements that actually deliver the signal, both
 * inside the loop and for the PCP process, are not visible in this
 * excerpt; the surviving comments indicate that every child and the PCP
 * process are signalled -- confirm.
 */
1744 static void kill_all_children(int sig)
1748 /* kill all children */
1749 for (i = 0; i < pool_config->num_init_children; i++)
1751 pid_t pid = pids[i].pid;
1758 /* make PCP process reload as well */
1764 * Pause for the period specified by timeout. If any data arrives
1765 * through pipe_fds[0], that means one of: failover request (SIGUSR1),
1766 * SIGCHLD received, children wake up request (SIGUSR2, used in online
1767 * recovery processing) or config file reload request (SIGHUP) has
1768 * occurred. In this case this function returns 1.
1769 * Otherwise it returns 0 (no signal event occurred) or -1 (error).
1770 * XXX: is it ok that select(2) error is ignored here?
/*
 * pool_pause: wait with select(2), for at most the given timeout, until
 * pipe_fds[0] becomes readable.
 * NOTE(review): the local declarations, the initialisation of rfds, the
 * condition guarding the read and the return statement are not visible
 * in this excerpt.
 */
1772 static int pool_pause(struct timeval *timeout)
/* pipe_fds[0] is the only descriptor watched, so nfds is that fd plus one */
1779 FD_SET(pipe_fds[0], &rfds);
1780 n = select(pipe_fds[0]+1, &rfds, NULL, NULL, timeout);
/* consume one byte from the pipe; the result of read() is not checked */
1782 read(pipe_fds[0], &dummy, 1);
1787 * sleep for seconds specified by "second". Unlike pool_pause(), this
1788 * function guarantees that it will sleep for specified seconds. This
1789 * function uses pool_pause() internally. If it informs that there is
1790 * a pending signal event, they are processed using CHECK_REQUEST
1791 * macro. Note that most of these processes are done while all signals
1794 static void pool_sleep(unsigned int second)
1796 struct timeval current_time, sleep_time;
1798 gettimeofday(¤t_time, NULL);
1799 sleep_time.tv_sec = second + current_time.tv_sec;
1800 sleep_time.tv_usec = current_time.tv_usec;
1802 POOL_SETMASK(&UnBlockSig);
1803 while (sleep_time.tv_sec > current_time.tv_sec)
1805 struct timeval timeout;
1808 timeout.tv_sec = sleep_time.tv_sec - current_time.tv_sec;
1809 timeout.tv_usec = sleep_time.tv_usec - current_time.tv_usec;
1810 if (timeout.tv_usec < 0)
1813 timeout.tv_usec += 1000000;
1816 r = pool_pause(&timeout);
1817 POOL_SETMASK(&BlockSig);
1820 POOL_SETMASK(&UnBlockSig);
1821 gettimeofday(¤t_time, NULL);
1823 POOL_SETMASK(&BlockSig);
1827 * get_config_file_name: return full path of pgpool.conf.
1829 char *get_config_file_name(void)
1835 * get_hba_file_name: return full path of pool_hba.conf.
1837 char *get_hba_file_name(void)
1843 * trigger_failover_command: execute specified command at failover.
1844 * command_line is null-terminated string.
1846 static int trigger_failover_command(int node, const char *command_line)
1854 if (command_line == NULL || (strlen(command_line) == 0))
1858 if (node < 0 || node > NUM_BACKENDS)
1861 info = pool_get_node_info(node);
1866 pool_memory = pool_memory_create(PREPARE_BLOCK_SIZE);
1869 pool_error("trigger_failover_command: pool_memory_create() failed");
1872 exec_cmd = init_string("");
1874 while (*command_line)
1876 if (*command_line == '%')
1878 if (*(command_line + 1))
1880 char val = *(command_line + 1);
1883 case 'p': /* port */
1884 snprintf(port_buf, sizeof(port_buf), "%d", info->backend_port);
1885 string_append_char(exec_cmd, port_buf);
1888 case 'D': /* database directory */
1889 string_append_char(exec_cmd, info->backend_data_directory);
1892 case 'd': /* node id */
1893 snprintf(port_buf, sizeof(port_buf), "%d", node);
1894 string_append_char(exec_cmd, port_buf);
1897 case 'h': /* host name */
1898 string_append_char(exec_cmd, info->backend_hostname);
1901 case 'm': /* new master node id */
1902 snprintf(port_buf, sizeof(port_buf), "%d", get_next_master_node());
1903 string_append_char(exec_cmd, port_buf);
1906 case 'M': /* old master node id */
1907 snprintf(port_buf, sizeof(port_buf), "%d", MASTER_NODE_ID);
1908 string_append_char(exec_cmd, port_buf);
1911 case '%': /* escape */
1912 string_append_char(exec_cmd, "%");
1915 default: /* ignore */
1921 buf[0] = *command_line;
1922 string_append_char(exec_cmd, buf);
1927 if (strlen(exec_cmd->data) != 0)
1929 pool_log("execute command: %s", exec_cmd->data);
1930 r = system(exec_cmd->data);
1933 pool_memory_delete(pool_memory, 0);