]> git.8kb.co.uk Git - pgpool-ii/pgpool-ii_2.2.5/blob - main.c
Attempt to send a proper failure message to frontend when authentication
[pgpool-ii/pgpool-ii_2.2.5] / main.c
1 /* -*-pgsql-c-*- */
2 /*
3  * $Header: /cvsroot/pgpool/pgpool-II/main.c,v 1.45.2.6 2009/09/25 07:35:16 t-ishii Exp $
4  *
5  * pgpool: a language independent connection pool server for PostgreSQL
6  * written by Tatsuo Ishii
7  *
8  * Copyright (c) 2003-2009      PgPool Global Development Group
9  *
10  * Permission to use, copy, modify, and distribute this software and
11  * its documentation for any purpose and without fee is hereby
12  * granted, provided that the above copyright notice appear in all
13  * copies and that both that copyright notice and this permission
14  * notice appear in supporting documentation, and that the name of the
15  * author not be used in advertising or publicity pertaining to
16  * distribution of the software without specific, written prior
17  * permission. The author makes no representations about the
18  * suitability of this software for any purpose.  It is provided "as
19  * is" without express or implied warranty.
20  */
21 #include "pool.h"
22
23 #include <ctype.h>
24 #include <sys/types.h>
25 #include <sys/socket.h>
26 #include <netinet/in.h>
27 #include <sys/un.h>
28 #include <netdb.h>
29 #include <arpa/inet.h>
30 #include <sys/time.h>
31 #ifdef HAVE_SYS_SELECT_H
32 #include <sys/select.h>
33 #endif
34
35 #include <sys/stat.h>
36 #include <fcntl.h>
37
38 #include <sys/wait.h>
39
40 #include <stdio.h>
41 #include <errno.h>
42 #include <unistd.h>
43 #include <stdlib.h>
44 #include <string.h>
45
46 #include <signal.h>
47
48 #ifdef HAVE_GETOPT_H
49 #include <getopt.h>
50 #endif
51
52 #include "version.h"
53 #include "parser/pool_memory.h"
54 #include "parser/pool_string.h"
55
/*
 * Process pending signal actions.
 *
 * Tests each of the *_request flags in turn and, when a flag is
 * non-zero, calls the corresponding routine.  wakeup_request,
 * failover_request and reload_config_request are reset to 0 by this
 * macro after the routine returns.  sigchld_request is NOT reset here
 * (presumably reaper() clears it itself -- TODO confirm).
 *
 * Wrapped in do { ... } while (0) so that it can be used as a single
 * statement.
 */
#define CHECK_REQUEST \
	do { \
		if (wakeup_request) \
		{ \
			wakeup_children(); \
			wakeup_request = 0; \
		} \
		if (failover_request) \
		{ \
			failover(); \
			failover_request = 0; \
		} \
		if (sigchld_request) \
		{ \
			reaper(); \
		} \
		if (reload_config_request) \
		{ \
			reload_config(); \
			reload_config_request = 0; \
		} \
    } while (0)
81
82
/*
 * NOTE(review): the name reads "LITSEN" (sic); presumably an upper bound
 * for the listen(2) backlog -- confirm at its point of use.
 */
#define PGPOOLMAXLITSENQUEUELENGTH 10000

/* process start up, pid file handling, child management and helpers */
static void daemonize(void);
static int read_pid_file(void);
static void write_pid_file(void);
static pid_t pcp_fork_a_child(int unix_fd, int inet_fd, char *pcp_conf_file);
static pid_t fork_a_child(int unix_fd, int inet_fd, int id);
static int create_unix_domain_socket(struct sockaddr_un un_addr_tmp);
static int create_inet_domain_socket(const char *hostname, const int port);
static void myexit(int code);
static void failover(void);
static void reaper(void);
static void wakeup_children(void);
static void reload_config(void);
static int pool_pause(struct timeval *timeout);
static void pool_sleep(unsigned int second);
static void kill_all_children(int sig);
static int get_next_master_node(void);

/* signal handlers */
static RETSIGTYPE exit_handler(int sig);
static RETSIGTYPE reap_handler(int sig);
static RETSIGTYPE failover_handler(int sig);
static RETSIGTYPE reload_config_handler(int sig);
static RETSIGTYPE health_check_timer_handler(int sig);
static RETSIGTYPE wakeup_handler(int sig);

/* command line support */
static void usage(void);
static void show_version(void);
static void stop_me(void);

static int trigger_failover_command(int node, const char *command_line);
113
static struct sockaddr_un un_addr;		/* unix domain socket path */
static struct sockaddr_un pcp_un_addr;	/* unix domain socket path for PCP */

ProcessInfo *pids;	/* shmem child pid table */

/*
 * shmem connection info table
 * this is a two dimensional array. i.e.:
 * con_info[pool_config->num_init_children][pool_config->max_pool]
 */
ConnectionInfo *con_info;

static int unix_fd;	/* unix domain socket fd */
static int inet_fd;	/* inet domain socket fd */

static int pcp_pid; /* pid for child process handling PCP */
static int pcp_unix_fd; /* unix domain socket fd for PCP (not used) */
static int pcp_inet_fd; /* inet domain socket fd for PCP */
static char pcp_conf_file[POOLMAXPATHLEN+1]; /* path for pcp.conf */
static char conf_file[POOLMAXPATHLEN+1];	/* path for the pgpool configuration file */
static char hba_file[POOLMAXPATHLEN+1];		/* path for the hba configuration file */

static int exiting = 0;		/* non 0 if I'm exiting */
static int switching = 0;		/* non 0 if I'm failing over or degenerating */

#ifdef NOT_USED
static int degenerated = 0;	/* set non 0 if already degenerated */
#endif

static int clear_cache = 0;		/* non 0 if clear cache option (-c) is given */
static int not_detach = 0;		/* non 0 if non detach option (-n) is given */
int debug = 0;	/* non 0 if debug option is given (-d) */

pid_t mypid;	/* pgpool parent process id */

long int weight_master;	/* normalized weight of master (0-RAND_MAX range) */

static int stop_sig = SIGTERM;	/* stopping signal default value */

static volatile sig_atomic_t health_check_timer_expired;		/* non 0 if health check timer expired */

POOL_REQUEST_INFO *Req_info;		/* request info area in shared memory */
volatile sig_atomic_t *InRecovery; /* non 0 if recovery is started */

/* pending request flags, examined by the CHECK_REQUEST macro */
volatile sig_atomic_t reload_config_request = 0;
static volatile sig_atomic_t failover_request = 0;
static volatile sig_atomic_t sigchld_request = 0;
static volatile sig_atomic_t wakeup_request = 0;

static int pipe_fds[2]; /* for delivering signals */

int my_proc_id;		/* set to the child's id in fork_a_child() */

/* copies of main()'s argc/argv */
int myargc;
char **myargv;
168
169 /*
170 * pgpool main program
171 */
172 int main(int argc, char **argv)
173 {
174         int opt;
175         int i;
176         int pid;
177         int size;
178         int retrycnt;
179         int sys_retrycnt;
180
181         myargc = argc;
182         myargv = argv;
183
184         snprintf(conf_file, sizeof(conf_file), "%s/%s", DEFAULT_CONFIGDIR, POOL_CONF_FILE_NAME);
185         snprintf(pcp_conf_file, sizeof(pcp_conf_file), "%s/%s", DEFAULT_CONFIGDIR, PCP_PASSWD_FILE_NAME);
186         snprintf(hba_file, sizeof(hba_file), "%s/%s", DEFAULT_CONFIGDIR, HBA_CONF_FILE_NAME);
187
188         while ((opt = getopt(argc, argv, "a:cdf:F:hm:nv")) != -1)
189         {
190                 switch (opt)
191                 {
192                         case 'a':    /* specify hba configuration file */
193                                 if (!optarg)
194                                 {
195                                         usage();
196                                         exit(1);
197                                 }
198                                 strncpy(hba_file, optarg, sizeof(hba_file));
199                                 break;
200
201                         case 'c':                       /* clear cache option */
202                                 clear_cache = 1;
203                                 break;
204
205                         case 'd':       /* debug option */
206                                 debug = 1;
207                                 break;
208
209                         case 'f':       /* specify configuration file */
210                                 if (!optarg)
211                                 {
212                                         usage();
213                                         exit(1);
214                                 }
215                                 strncpy(conf_file, optarg, sizeof(conf_file));
216                                 break;
217
218                         case 'F':   /* specify PCP password file */
219                                 if (!optarg)
220                                 {
221                                         usage();
222                                         exit(1);
223                                 }
224                                 strncpy(pcp_conf_file, optarg, sizeof(pcp_conf_file));
225                                 break;
226
227                         case 'h':
228                                 usage();
229                                 exit(0);
230                                 break;
231
232                         case 'm':       /* stop mode */
233                                 if (!optarg)
234                                 {
235                                         usage();
236                                         exit(1);
237                                 }
238                                 if (*optarg == 's' || !strcmp("smart", optarg))
239                                         stop_sig = SIGTERM;             /* smart shutdown */
240                                 else if (*optarg == 'f' || !strcmp("fast", optarg))
241                                         stop_sig = SIGINT;              /* fast shutdown */
242                                 else if (*optarg == 'i' || !strcmp("immediate", optarg))
243                                         stop_sig = SIGQUIT;             /* immediate shutdown */
244                                 else
245                                 {
246                                         usage();
247                                         exit(1);
248                                 }
249                                 break;
250
251                         case 'n':       /* no detaching control ttys */
252                                 not_detach = 1;
253                                 break;
254
255                         case 'v':
256                                 show_version();
257                                 exit(0);
258
259                         default:
260                                 usage();
261                                 exit(1);
262                 }
263         }
264
265         mypid = getpid();
266
267         if (pool_init_config())
268                 exit(1);
269
270         if (pool_get_config(conf_file, INIT_CONFIG))
271         {
272                 pool_error("Unable to get configuration. Exiting...");
273                 exit(1);
274         }
275
276         if (pool_config->enable_pool_hba)
277                 load_hba(hba_file);
278
279         /*
280          * if a non-switch argument remains, then it should be either "reload", "stop" or "switch"
281          */
282         if (optind == (argc - 1))
283         {
284                 if (!strcmp(argv[optind], "reload"))
285                 {
286                                 pid_t pid;
287
288                                 pid = read_pid_file();
289                                 if (pid < 0)
290                                 {
291                                         pool_error("could not read pid file");
292                                         pool_shmem_exit(1);
293                                         exit(1);
294                                 }
295
296                                 if (kill(pid, SIGHUP) == -1)
297                                 {
298                                         pool_error("could not reload configuration file pid: %d. reason: %s", pid, strerror(errno));
299                                         pool_shmem_exit(1);
300                                         exit(1);
301                                 }
302                                 pool_shmem_exit(0);
303                                 exit(0);
304                 }
305                 if (!strcmp(argv[optind], "stop"))
306                 {
307                         stop_me();
308                         pool_shmem_exit(0);
309                         exit(0);
310                 }
311                 else
312                 {
313                         usage();
314                         pool_shmem_exit(1);
315                         exit(1);
316                 }
317         }
318         /*
319          * else if no non-switch argument remains, then it should be a start request
320          */
321         else if (optind == argc)
322         {
323                 pid = read_pid_file();
324                 if (pid > 0)
325                 {
326                         if (kill(pid, 0) == 0)
327                         {
328                                 fprintf(stderr, "pid file found. is another pgpool(%d) is running?\n", pid);
329                                 exit(1);
330                         }
331                         else
332                                 fprintf(stderr, "pid file found but it seems bogus. Trying to start pgpool anyway...\n");
333                 }
334         }
335         /*
336          * otherwise an error...
337          */
338         else
339         {
340                 usage();
341                 exit(1);
342         }
343
344         /* set signal masks */
345         poolinitmask();
346
347         if (not_detach)
348                 write_pid_file();
349         else
350                 daemonize();
351
352         if (pool_semaphore_create(MAX_NUM_SEMAPHORES))
353         {
354                 pool_error("Unable to create semaphores. Exiting...");
355                 pool_shmem_exit(1);
356                 exit(1);
357         }
358
359         /* clear cache */
360         if (clear_cache && pool_config->enable_query_cache && SYSDB_STATUS == CON_UP)
361         {
362                 Interval interval[1];
363
364                 interval[0].quantity = 0;
365                 interval[0].unit = second;
366
367                 pool_clear_cache_by_time(interval, 1);
368         }
369
370         /* set unix domain socket path */
371         snprintf(un_addr.sun_path, sizeof(un_addr.sun_path), "%s/.s.PGSQL.%d",
372                          pool_config->socket_dir,
373                          pool_config->port);
374
375         /* set up signal handlers */
376         pool_signal(SIGPIPE, SIG_IGN);
377
378         /* create unix domain socket */
379         unix_fd = create_unix_domain_socket(un_addr);
380
381         /* create inet domain socket if any */
382         if (pool_config->listen_addresses[0])
383         {
384                 inet_fd = create_inet_domain_socket(pool_config->listen_addresses, pool_config->port);
385         }
386
387         size = pool_config->num_init_children * pool_config->max_pool * sizeof(ConnectionInfo);
388         con_info = pool_shared_memory_create(size);
389         if (con_info == NULL)
390         {
391                 pool_error("failed to allocate connection informations");
392                 myexit(1);
393         }
394         memset(con_info, 0, size);
395
396         size = pool_config->num_init_children * (sizeof(ProcessInfo));
397         pids = pool_shared_memory_create(size);
398         if (pids == NULL)
399         {
400                 pool_error("failed to allocate pids");
401                 myexit(1);
402         }
403         memset(pids, 0, size);
404         for (i = 0; i < pool_config->num_init_children; i++)
405         {
406                 pids[i].connection_info = &con_info[i * pool_config->max_pool];
407         }
408
409         /* create fail over/switch over event area */
410         Req_info = pool_shared_memory_create(sizeof(POOL_REQUEST_INFO));
411         if (Req_info == NULL)
412         {
413                 pool_error("failed to allocate Req_info");
414                 myexit(1);
415         }
416
417         /* initialize Req_info */
418         Req_info->kind = NODE_UP_REQUEST;
419         memset(Req_info->node_id, -1, sizeof(int) * MAX_NUM_BACKENDS);
420         Req_info->master_node_id = get_next_master_node();
421         Req_info->conn_counter = 0;
422
423         InRecovery = pool_shared_memory_create(sizeof(int));
424         if (InRecovery == NULL)
425         {
426                 pool_error("failed to allocate InRecovery");
427                 myexit(1);
428         }
429         *InRecovery = 0;
430
431         /*
432          * We need to block signal here. Otherwise child might send some
433          * signals, for example SIGUSR1(fail over).  Children will inherit
434          * signal blocking but they do unblock signals at the very beginning
435          * of process.  So this is harmless.
436          */
437         POOL_SETMASK(&BlockSig);
438
439         /* fork the children */
440         for (i=0;i<pool_config->num_init_children;i++)
441         {
442                 pids[i].pid = fork_a_child(unix_fd, inet_fd, i);
443                 pids[i].start_time = time(NULL);
444         }
445
446         /* set up signal handlers */
447
448         pool_signal(SIGTERM, exit_handler);
449         pool_signal(SIGINT, exit_handler);
450         pool_signal(SIGQUIT, exit_handler);
451         pool_signal(SIGCHLD, reap_handler);
452         pool_signal(SIGUSR1, failover_handler);
453         pool_signal(SIGUSR2, wakeup_handler);
454         pool_signal(SIGHUP, reload_config_handler);
455
456         /* create pipe for delivering event */
457         if (pipe(pipe_fds) < 0)
458         {
459                 pool_error("failed to create pipe");
460                 myexit(1);
461         }
462
463         pool_log("pgpool successfully started");
464
465         /* fork a child for PCP handling */
466         snprintf(pcp_un_addr.sun_path, sizeof(pcp_un_addr.sun_path), "%s/.s.PGSQL.%d",
467                          pool_config->pcp_socket_dir,
468                          pool_config->pcp_port);
469         pcp_unix_fd = create_unix_domain_socket(pcp_un_addr);
470     /* maybe change "*" to pool_config->pcp_listen_addresses */
471         pcp_inet_fd = create_inet_domain_socket("*", pool_config->pcp_port);
472         pcp_pid = pcp_fork_a_child(pcp_unix_fd, pcp_inet_fd, pcp_conf_file);
473
474         retrycnt = 0;           /* reset health check retry counter */
475         sys_retrycnt = 0;       /* reset SystemDB health check retry counter */
476
477         /*
478          * This is the main loop
479          */
480         for (;;)
481         {
482                 CHECK_REQUEST;
483
484                 /* do we need health checking for PostgreSQL? */
485                 if (pool_config->health_check_period > 0)
486                 {
487                         int sts;
488                         int sys_sts = 0;
489                         unsigned int sleep_time;
490
491                         if (retrycnt == 0)
492                         {
493                                 pool_debug("starting health checking");
494                         }
495                         else
496                         {
497                                 pool_debug("retrying %d th health checking", retrycnt);
498                         }
499
500                         if (pool_config->health_check_timeout > 0)
501                         {
502                                 /*
503                                  * set health checker timeout. we want to detect
504                                  * communication path failure much earlier before
505                                  * TCP/IP stack detects it.
506                                  */
507                                 pool_signal(SIGALRM, health_check_timer_handler);
508                                 alarm(pool_config->health_check_timeout);
509                         }
510
511                         /*
512                          * do actual health check. trying to connect to the backend
513                          */
514                         errno = 0;
515                         health_check_timer_expired = 0;
516                         POOL_SETMASK(&UnBlockSig);
517                         sts = health_check();
518                         POOL_SETMASK(&BlockSig);
519                         if (pool_config->parallel_mode || pool_config->enable_query_cache)
520                                 sys_sts = system_db_health_check();
521
522                         if ((sts > 0 || sys_sts < 0) && (errno != EINTR || (errno == EINTR && health_check_timer_expired)))
523                         {
524                                 if (sts > 0)
525                                 {
526                                         sts--;
527
528                                         if (!pool_config->parallel_mode)
529                                         {
530                                                 pool_log("set %d th backend down status", sts);
531                                                 Req_info->kind = NODE_DOWN_REQUEST;
532                                                 Req_info->node_id[0] = sts;
533                                                 failover();
534                                                 /* need to distribute this info to children */
535                                         }
536                                         else
537                                         {
538                                                 retrycnt++;
539                                                 pool_signal(SIGALRM, SIG_IGN);  /* Cancel timer */
540
541                                                 if (retrycnt > NUM_BACKENDS)
542                                                 {
543                                                         /* retry count over */
544                                                         pool_log("set %d th backend down status", sts);
545                                                         Req_info->kind = NODE_DOWN_REQUEST;
546                                                         Req_info->node_id[0] = sts;
547                                                         failover();
548                                                         retrycnt = 0;
549                                                 }
550                                                 else
551                                                 {
552                                                         /* continue to retry */
553                                                         sleep_time = pool_config->health_check_period/NUM_BACKENDS;
554                                                         pool_debug("retry sleep time: %d seconds", sleep_time);
555                                                         pool_sleep(sleep_time);
556                                                         continue;
557                                                 }
558                                         }
559                                 }
560                                 if (sys_sts < 0)
561                                 {
562                                         sys_retrycnt++;
563                                         pool_signal(SIGALRM, SIG_IGN);
564
565                                         if (sys_retrycnt > NUM_BACKENDS)
566                                         {
567                                                 pool_log("set SystemDB down status");
568                                                 SYSDB_STATUS = CON_DOWN;
569                                                 sys_retrycnt = 0;
570                                         }
571                                         else if (sts == 0) /* goes to sleep only when SystemDB alone was down */
572                                         {
573                                                 sleep_time = pool_config->health_check_period/NUM_BACKENDS;
574                                                 pool_debug("retry sleep time: %d seconds", sleep_time);
575                                                 pool_sleep(sleep_time);
576                                                 continue;
577                                         }
578                                 }
579                         }
580
581                         if (pool_config->health_check_timeout > 0)
582                         {
583                                 /* seems ok. cancel health check timer */
584                                 pool_signal(SIGALRM, SIG_IGN);
585                         }
586
587                         sleep_time = pool_config->health_check_period;
588                         pool_sleep(sleep_time);
589                 }
590                 else
591                 {
592                         for (;;)
593                         {
594                                 int r;
595                                 struct timeval t = {3, 0};
596
597                                 POOL_SETMASK(&UnBlockSig);
598                                 r = pool_pause(&t);
599                                 POOL_SETMASK(&BlockSig);
600                                 if (r > 0)
601                                         break;
602                         }
603                 }
604         }
605
606         pool_shmem_exit(0);
607 }
608
609 static void show_version(void)
610 {
611         fprintf(stderr, "%s version %s (%s)\n", PACKAGE, VERSION, PGPOOLVERSION);
612 }
613
614 static void usage(void)
615 {
616         fprintf(stderr, "%s version %s (%s),\n",        PACKAGE, VERSION, PGPOOLVERSION);
617         fprintf(stderr, "  a generic connection pool/replication/load balance server for PostgreSQL\n\n");
618         fprintf(stderr, "Usage:\n");
619         fprintf(stderr, "  pgpool [ -c] [ -f CONFIG_FILE ] [ -F PCP_CONFIG_FILE ] [ -a HBA_CONFIG_FILE ]\n");
620         fprintf(stderr, "         [ -n ] [ -d ]\n");
621         fprintf(stderr, "  pgpool [ -f CONFIG_FILE ] [ -F PCP_CONFIG_FILE ] [ -a HBA_CONFIG_FILE ]\n");
622         fprintf(stderr, "         [ -m SHUTDOWN-MODE ] stop\n");
623         fprintf(stderr, "  pgpool [ -f CONFIG_FILE ] [ -F PCP_CONFIG_FILE ] [ -a HBA_CONFIG_FILE ] reload\n\n");
624         fprintf(stderr, "Common options:\n");
625         fprintf(stderr, "  -a HBA_CONFIG_FILE  Sets the path to the pool_hba.conf configuration file\n");
626         fprintf(stderr, "                      (default: %s/%s)\n",DEFAULT_CONFIGDIR, HBA_CONF_FILE_NAME);
627         fprintf(stderr, "  -f CONFIG_FILE      Sets the path to the pgpool.conf configuration file\n");
628         fprintf(stderr, "                      (default: %s/%s)\n",DEFAULT_CONFIGDIR, POOL_CONF_FILE_NAME);
629         fprintf(stderr, "  -F PCP_CONFIG_FILE  Sets the path to the pcp.conf configuration file\n");
630         fprintf(stderr, "                      (default: %s/%s)\n",DEFAULT_CONFIGDIR, PCP_PASSWD_FILE_NAME);
631         fprintf(stderr, "  -h                  Prints this help\n\n");
632         fprintf(stderr, "Start options:\n");
633         fprintf(stderr, "  -c                  Clears query cache (enable_query_cache must be on)\n");
634         fprintf(stderr, "  -n                  Don't run in daemon mode, does not detach control tty\n");
635         fprintf(stderr, "  -d                  Debug mode\n\n");
636         fprintf(stderr, "Stop options:\n");
637         fprintf(stderr, "  -m SHUTDOWN-MODE    Can be \"smart\", \"fast\", or \"immediate\"\n\n");
638         fprintf(stderr, "Shutdown modes are:\n");
639         fprintf(stderr, "  smart       quit after all clients have disconnected\n");
640         fprintf(stderr, "  fast        quit directly, with proper shutdown\n");
641         fprintf(stderr, "  immediate   quit without complete shutdown; will lead to recovery on restart\n");
642 }
643
644 /*
645 * detach control ttys
646 */
647 static void daemonize(void)
648 {
649         int                     i;
650         pid_t           pid;
651         int                     fdlimit;
652
653         pid = fork();
654         if (pid == (pid_t) -1)
655         {
656                 pool_error("fork() failed. reason: %s", strerror(errno));
657                 pool_shmem_exit(1);
658                 exit(1);
659                 return;                                 /* not reached */
660         }
661         else if (pid > 0)
662         {                       /* parent */
663                 pool_shmem_exit(0);
664                 exit(0);
665         }
666
667 #ifdef HAVE_SETSID
668         if (setsid() < 0)
669         {
670                 pool_error("setsid() failed. reason:%s", strerror(errno));
671                 pool_shmem_exit(1);
672                 exit(1);
673         }
674 #endif
675
676         mypid = getpid();
677
678         chdir("/");
679
680         i = open("/dev/null", O_RDWR);
681         dup2(i, 0);
682         dup2(i, 1);
683         dup2(i, 2);
684
685     fdlimit = sysconf(_SC_OPEN_MAX);
686     for (i = 3; i < fdlimit; i++)
687                 close(i);
688
689         write_pid_file();
690 }
691
692
693 /*
694 * stop myself
695 */
696 static void stop_me(void)
697 {
698         pid_t pid;
699
700         pid = read_pid_file();
701         if (pid < 0)
702         {
703                 pool_error("could not read pid file");
704                 pool_shmem_exit(1);
705                 exit(1);
706         }
707
708         if (kill(pid, stop_sig) == -1)
709         {
710                 pool_error("could not stop pid: %d. reason: %s", pid, strerror(errno));
711                 pool_shmem_exit(1);
712                 exit(1);
713         }
714
715         fprintf(stderr, "stop request sent to pgpool. waiting for termination...");
716
717         while (kill(pid, 0) == 0)
718         {
719                 fprintf(stderr, ".");
720                 sleep(1);
721         }
722         fprintf(stderr, "done.\n");
723 }
724
725 /*
726 * read the pid file
727 */
728 static int read_pid_file(void)
729 {
730         FILE *fd;
731         char pidbuf[128];
732
733         fd = fopen(pool_config->pid_file_name, "r");
734         if (!fd)
735         {
736                 return -1;
737         }
738         if (fread(pidbuf, 1, sizeof(pidbuf), fd) <= 0)
739         {
740                 pool_error("could not read pid file as %s. reason: %s",
741                                    pool_config->pid_file_name, strerror(errno));
742                 fclose(fd);
743                 return -1;
744         }
745         fclose(fd);
746         return(atoi(pidbuf));
747 }
748
749 /*
750 * write the pid file
751 */
752 static void write_pid_file(void)
753 {
754         FILE *fd;
755         char pidbuf[128];
756
757         fd = fopen(pool_config->pid_file_name, "w");
758         if (!fd)
759         {
760                 pool_error("could not open pid file as %s. reason: %s",
761                                    pool_config->pid_file_name, strerror(errno));
762                 pool_shmem_exit(1);
763                 exit(1);
764         }
765         snprintf(pidbuf, sizeof(pidbuf), "%d", (int)getpid());
766         fwrite(pidbuf, strlen(pidbuf)+1, 1, fd);
767         if (fclose(fd))
768         {
769                 pool_error("could not write pid file as %s. reason: %s",
770                                    pool_config->pid_file_name, strerror(errno));
771                 pool_shmem_exit(1);
772                 exit(1);
773         }
774 }
775
776 /*
777  * fork a child for PCP
778  */
779 pid_t pcp_fork_a_child(int unix_fd, int inet_fd, char *pcp_conf_file)
780 {
781         pid_t pid;
782
783         pid = fork();
784
785         if (pid == 0)
786         {
787                 close(pipe_fds[0]);
788                 close(pipe_fds[1]);
789
790                 myargv = save_ps_display_args(myargc, myargv);
791
792                 /* call PCP child main */
793                 POOL_SETMASK(&UnBlockSig);
794                 reload_config_request = 0;
795                 pcp_do_child(unix_fd, inet_fd, pcp_conf_file);
796         }
797         else if (pid == -1)
798         {
799                 pool_error("fork() failed. reason: %s", strerror(errno));
800                 myexit(1);
801         }
802         return pid;
803 }
804
805 /*
806 * fork a child
807 */
808 pid_t fork_a_child(int unix_fd, int inet_fd, int id)
809 {
810         pid_t pid;
811
812         pid = fork();
813
814         if (pid == 0)
815         {
816                 /* Before we unconditionally closed pipe_fds[0] and pipe_fds[1]
817                  * here, which is apparently wrong since in the start up of
818                  * pgpool, pipe(2) is not called yet and it mistakenly closes
819                  * fd 0. Now we check the fd > 0 before close(), expecting
820                  * pipe returns fds greater than 0.  Note that we cannot
821                  * unconditionally remove close(2) calls since fork_a_child()
822                  * may be called *after* pgpool starting up.
823                  */
824                 if (pipe_fds[0] > 0)
825                 {
826                         close(pipe_fds[0]);
827                         close(pipe_fds[1]);
828                 }
829
830                 myargv = save_ps_display_args(myargc, myargv);
831
832                 /* call child main */
833                 POOL_SETMASK(&UnBlockSig);
834                 reload_config_request = 0;
835                 my_proc_id = id;
836                 do_child(unix_fd, inet_fd);
837         }
838         else if (pid == -1)
839         {
840                 pool_error("fork() failed. reason: %s", strerror(errno));
841                 myexit(1);
842         }
843         return pid;
844 }
845
846 /*
847 * create inet domain socket
848 */
849 static int create_inet_domain_socket(const char *hostname, const int port)
850 {
851         struct sockaddr_in addr;
852         int fd;
853         int status;
854         int one = 1;
855         int len;
856
857         fd = socket(AF_INET, SOCK_STREAM, 0);
858         if (fd == -1)
859         {
860                 pool_error("Failed to create INET domain socket. reason: %s", strerror(errno));
861                 myexit(1);
862         }
863         if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one,
864                                         sizeof(one))) == -1)
865         {
866                 pool_error("setsockopt() failed. reason: %s", strerror(errno));
867                 myexit(1);
868         }
869
870         memset((char *) &addr, 0, sizeof(addr));
871         ((struct sockaddr *)&addr)->sa_family = AF_INET;
872
873         if (strcmp(hostname, "*")==0)
874         {
875                 addr.sin_addr.s_addr = htonl(INADDR_ANY);
876         }
877         else
878         {
879                 struct hostent *hostinfo;
880
881                 hostinfo = gethostbyname(hostname);
882                 if (!hostinfo)
883                 {
884                         pool_error("could not resolve host name \"%s\": %s", hostname, hstrerror(h_errno));
885                         myexit(1);
886                 }
887                 addr.sin_addr = *(struct in_addr *) hostinfo->h_addr;
888         }
889
890         addr.sin_port = htons(port);
891         len = sizeof(struct sockaddr_in);
892         status = bind(fd, (struct sockaddr *)&addr, len);
893         if (status == -1)
894         {
895                 char *host = "", *serv = "";
896                 char hostname[NI_MAXHOST], servname[NI_MAXSERV];
897                 if (getnameinfo((struct sockaddr *) &addr, len, hostname, sizeof(hostname), servname, sizeof(servname), 0) == 0) {
898                         host = hostname;
899                         serv = servname;
900                 }
901                 pool_error("bind(%s:%s) failed. reason: %s", host, serv, strerror(errno));
902                 myexit(1);
903         }
904
905         status = listen(fd, PGPOOLMAXLITSENQUEUELENGTH);
906         if (status < 0)
907         {
908                 pool_error("listen() failed. reason: %s", strerror(errno));
909                 myexit(1);
910         }
911         return fd;
912 }
913
914 /*
915 * create UNIX domain socket
916 */
917 static int create_unix_domain_socket(struct sockaddr_un un_addr_tmp)
918 {
919         struct sockaddr_un addr;
920         int fd;
921         int status;
922         int len;
923
924         fd = socket(AF_UNIX, SOCK_STREAM, 0);
925         if (fd == -1)
926         {
927                 pool_error("Failed to create UNIX domain socket. reason: %s", strerror(errno));
928                 myexit(1);
929         }
930         memset((char *) &addr, 0, sizeof(addr));
931         ((struct sockaddr *)&addr)->sa_family = AF_UNIX;
932         snprintf(addr.sun_path, sizeof(addr.sun_path), un_addr_tmp.sun_path);
933         len = sizeof(struct sockaddr_un);
934         status = bind(fd, (struct sockaddr *)&addr, len);
935         if (status == -1)
936         {
937                 pool_error("bind(%s) failed. reason: %s", addr.sun_path, strerror(errno));
938                 myexit(1);
939         }
940
941         if (chmod(un_addr_tmp.sun_path, 0777) == -1)
942         {
943                 pool_error("chmod() failed. reason: %s", strerror(errno));
944                 myexit(1);
945         }
946
947         status = listen(fd, PGPOOLMAXLITSENQUEUELENGTH);
948         if (status < 0)
949         {
950                 pool_error("listen() failed. reason: %s", strerror(errno));
951                 myexit(1);
952         }
953         return fd;
954 }
955
/*
 * Remove "path" with unlink(2); a failure is only logged.
 */
static void myunlink(const char* path)
{
	if (unlink(path) != 0)
		pool_error("unlink(%s) failed: %s", path, strerror(errno));
}
961
962 static void myexit(int code)
963 {
964         int i;
965
966         if (getpid() != mypid)
967                 return;
968
969         if (pids != NULL) {
970                 POOL_SETMASK(&AuthBlockSig);
971                 exiting = 1;
972                 for (i = 0; i < pool_config->num_init_children; i++)
973                 {
974                         pid_t pid = pids[i].pid;
975                         if (pid)
976                         {
977                                 kill(pid, SIGTERM);
978                         }
979                 }
980                 while (wait(NULL) > 0)
981                         ;
982                 if (errno != ECHILD)
983                         pool_error("wait() failed. reason:%s", strerror(errno));
984                 POOL_SETMASK(&UnBlockSig);
985         }
986
987         myunlink(un_addr.sun_path);
988         myunlink(pcp_un_addr.sun_path);
989         myunlink(pool_config->pid_file_name);
990
991         pool_shmem_exit(code);
992         exit(code);
993 }
994
/*
 * Request degeneration of the single backend "node_id".
 * Convenience wrapper around degenerate_backend_set() with a
 * one-element set.
 */
void notice_backend_error(int node_id)
{
	/* node_id is this function's own copy, so its address can be passed */
	degenerate_backend_set(&node_id, 1);
}
1001
1002 /* notice backend connection error using SIGUSR1 */
1003 void degenerate_backend_set(int *node_id_set, int count)
1004 {
1005         pid_t parent = getppid();
1006         int i;
1007
1008         if (pool_config->parallel_mode)
1009         {
1010                 return;
1011         }
1012
1013         pool_semaphore_lock(REQUEST_INFO_SEM);
1014         Req_info->kind = NODE_DOWN_REQUEST;
1015         for (i = 0; i < count; i++)
1016         {
1017                 if (node_id_set[i] < 0 || node_id_set[i] >= MAX_NUM_BACKENDS ||
1018                         !VALID_BACKEND(node_id_set[i]))
1019                 {
1020                         pool_log("notice_backend_error: node %d is not valid backend.", i);
1021                         continue;
1022                 }
1023
1024                 pool_log("notice_backend_error: %d fail over request from pid %d", node_id_set[i], getpid());
1025                 Req_info->node_id[i] = node_id_set[i];
1026         }
1027         kill(parent, SIGUSR1);
1028         pool_semaphore_unlock(REQUEST_INFO_SEM);
1029 }
1030
1031 /* send failback request using SIGUSR1 */
1032 void send_failback_request(int node_id)
1033 {
1034         pid_t parent = getppid();
1035
1036         pool_log("send_failback_request: fail back %d th node request from pid %d", node_id, getpid());
1037         Req_info->kind = NODE_UP_REQUEST;
1038         Req_info->node_id[0] = node_id;
1039
1040         if (node_id < 0 || node_id >= MAX_NUM_BACKENDS || VALID_BACKEND(node_id))
1041         {
1042                 pool_error("send_failback_request: node %d is alive.", node_id);
1043                 return;
1044         }
1045
1046         kill(parent, SIGUSR1);
1047 }
1048
1049 static RETSIGTYPE exit_handler(int sig)
1050 {
1051         int i;
1052
1053         POOL_SETMASK(&AuthBlockSig);
1054
1055         /*
1056          * this could happen in a child process if a signal has been sent
1057          * before resetting signal handler
1058          */
1059         if (getpid() != mypid)
1060         {
1061                 pool_debug("exit_handler: I am not parent");
1062                 POOL_SETMASK(&UnBlockSig);
1063                 pool_shmem_exit(0);
1064                 exit(0);
1065         }
1066
1067         if (sig == SIGTERM)
1068                 pool_log("received smart shutdown request");
1069         else if (sig == SIGINT)
1070                 pool_log("received fast shutdown request");
1071         else if (sig == SIGQUIT)
1072                 pool_log("received immediate shutdown request");
1073         else
1074         {
1075                 pool_error("exit_handler: unknown signal received %d", sig);
1076                 POOL_SETMASK(&UnBlockSig);
1077                 return;
1078         }
1079
1080         exiting = 1;
1081
1082         for (i = 0; i < pool_config->num_init_children; i++)
1083         {
1084                 pid_t pid = pids[i].pid;
1085                 if (pid)
1086                 {
1087                         kill(pid, sig);
1088                 }
1089         }
1090
1091         kill(pcp_pid, sig);
1092
1093         POOL_SETMASK(&UnBlockSig);
1094
1095         while (wait(NULL) > 0)
1096                 ;
1097
1098         if (errno != ECHILD)
1099                 pool_error("wait() failed. reason:%s", strerror(errno));
1100
1101         pids = NULL;
1102         myexit(0);
1103 }
1104
1105 /*
1106  * calculate next master node id
1107  */
1108 static int get_next_master_node(void)
1109 {
1110         int i;
1111         for (i=0;i<pool_config->backend_desc->num_backends;i++)
1112         {
1113                 /*
1114                  * Do not use VALID_BACKEND macro in raw mode.
1115                  * VALID_BACKEND return true only if the argument is master
1116                  * node id. In other words, standby nodes are false. So need
1117                  * to check backend status without VALID_BACKEND.
1118                  */
1119                 if (RAW_MODE)
1120                 {
1121                         if (BACKEND_INFO(i).backend_status == CON_CONNECT_WAIT)
1122                                 break;
1123                 }
1124                 else if (VALID_BACKEND(i))
1125                         break;
1126         }
1127         return i;
1128 }
1129
1130 /*
1131  * handle SIGUSR1
1132  *
1133  */
1134 static RETSIGTYPE failover_handler(int sig)
1135 {
1136         POOL_SETMASK(&BlockSig);
1137         failover_request = 1;
1138         write(pipe_fds[1], "\0", 1);
1139         POOL_SETMASK(&UnBlockSig);
1140 }
1141
1142 /*
1143  * backend connection error, failover/failback request, if possible
1144  * failover() must be called under protecting signals.
1145  */
1146 static void failover(void)
1147 {
1148         int i;
1149         int node_id;
1150         int new_master;
1151         int nodes[MAX_NUM_BACKENDS];
1152
1153         pool_debug("failover_handler called");
1154
1155         memset(nodes, 0, sizeof(int) * MAX_NUM_BACKENDS);
1156
1157         /*
1158          * this could happen in a child process if a signal has been sent
1159          * before resetting signal handler
1160          */
1161         if (getpid() != mypid)
1162         {
1163                 pool_debug("failover_handler: I am not parent");
1164                 kill(pcp_pid, SIGUSR2);
1165                 return;
1166         }
1167
1168         /*
1169          * processing SIGTERM, SIGINT or SIGQUIT
1170          */
1171         if (exiting)
1172         {
1173                 pool_debug("failover_handler called while exiting");
1174                 kill(pcp_pid, SIGUSR2);
1175                 return;
1176         }
1177
1178         /*
1179          * processing fail over or switch over
1180          */
1181         if (switching)
1182         {
1183                 pool_debug("failover_handler called while switching");
1184                 kill(pcp_pid, SIGUSR2);
1185                 return;
1186         }
1187
1188         pool_semaphore_lock(REQUEST_INFO_SEM);
1189
1190         if (Req_info->kind == CLOSE_IDLE_REQUEST)
1191         {
1192                 pool_semaphore_unlock(REQUEST_INFO_SEM);
1193                 kill_all_children(SIGUSR1);
1194                 kill(pcp_pid, SIGUSR2);
1195                 return;
1196         }
1197
1198         /*
1199          * if not in replication mode/master slave mode, we treat this a restart request.
1200          * otherwise we need to check if we have already failovered.
1201          */
1202         pool_debug("failover_handler: starting to select new master node");
1203         switching = 1;
1204         node_id = Req_info->node_id[0];
1205
1206         /* failback request? */
1207         if (Req_info->kind == NODE_UP_REQUEST)
1208         {
1209                 if (node_id >= MAX_NUM_BACKENDS ||
1210                         (Req_info->kind == NODE_UP_REQUEST && VALID_BACKEND(node_id)) ||
1211                         (Req_info->kind == NODE_DOWN_REQUEST && !VALID_BACKEND(node_id)))
1212                 {
1213                         pool_semaphore_unlock(REQUEST_INFO_SEM);
1214                         pool_error("failover_handler: invalid node_id %d status:%d MAX_NUM_BACKENDS: %d", node_id,
1215                                            BACKEND_INFO(node_id).backend_status, MAX_NUM_BACKENDS);
1216                         kill(pcp_pid, SIGUSR2);
1217                         switching = 0;
1218                         return;
1219                 }
1220
1221                 pool_log("starting fail back. reconnect host %s(%d)",
1222                                  BACKEND_INFO(node_id).backend_hostname,
1223                                  BACKEND_INFO(node_id).backend_port);
1224                 BACKEND_INFO(node_id).backend_status = CON_CONNECT_WAIT;        /* unset down status */
1225                 trigger_failover_command(node_id, pool_config->failback_command);
1226         }
1227         else
1228         {
1229                 int cnt = 0;
1230
1231                 for (i = 0; i < MAX_NUM_BACKENDS; i++)
1232                 {
1233                         if (Req_info->node_id[i] != -1 &&
1234                                 VALID_BACKEND(Req_info->node_id[i]))
1235                         {
1236                                 pool_log("starting degeneration. shutdown host %s(%d)",
1237                                                  BACKEND_INFO(Req_info->node_id[i]).backend_hostname,
1238                                                  BACKEND_INFO(Req_info->node_id[i]).backend_port);
1239
1240
1241                                 BACKEND_INFO(Req_info->node_id[i]).backend_status = CON_DOWN;   /* set down status */
1242                                 /* save down node */
1243                                 nodes[Req_info->node_id[i]] = 1;
1244                                 cnt++;
1245                         }
1246                 }
1247
1248                 if (cnt == 0)
1249                 {
1250                         pool_log("failover: no backends are degenerated");
1251                         pool_semaphore_unlock(REQUEST_INFO_SEM);
1252                         kill(pcp_pid, SIGUSR2);
1253                         switching = 0;
1254                         return;
1255                 }
1256         }
1257
1258         new_master = get_next_master_node();
1259
1260         if (new_master == pool_config->backend_desc->num_backends)
1261         {
1262                 pool_error("failover_handler: no valid DB node found");
1263         }
1264
1265 /*
1266  * Before we tried to minimize restarting pgpool to protect existing
1267  * connections from clients to pgpool children. What we did here was,
1268  * if children other than master went down, we did not fail over.
1269  * This is wrong. Think about following scenario. If someone
1270  * accidentally plugs out the network cable, the TCP/IP stack keeps
1271  * retrying for long time (typically 2 hours). The only way to stop
1272  * the retry is restarting the process.  Bottom line is, we need to
1273  * restart all children in any case.  See pgpool-general list posting
1274  * "TCP connections are *not* closed when a backend timeout" on Jul 13
1275  * 2008 for more details.
1276  */
1277
1278 #ifdef NOT_USED
1279         else
1280         {
1281                 if (Req_info->master_node_id == new_master && *InRecovery == 0)
1282                 {
1283                         pool_log("failover_handler: do not restart pgpool. same master node %d was selected", new_master);
1284                         if (Req_info->kind == NODE_UP_REQUEST)
1285                         {
1286                                 pool_log("failback done. reconnect host %s(%d)",
1287                                                  BACKEND_INFO(node_id).backend_hostname,
1288                                                  BACKEND_INFO(node_id).backend_port);
1289                         }
1290                         else
1291                         {
1292                                 pool_log("failover done. shutdown host %s(%d)",
1293                                                  BACKEND_INFO(node_id).backend_hostname,
1294                                                  BACKEND_INFO(node_id).backend_port);
1295                         }
1296
1297                         /* exec failover_command */
1298                         for (i = 0; i < pool_config->backend_desc->num_backends; i++)
1299                         {
1300                                 if (nodes[i])
1301                                         trigger_failover_command(i, pool_config->failover_command);
1302                         }
1303
1304                         pool_semaphore_unlock(REQUEST_INFO_SEM);
1305                         switching = 0;
1306                         kill(pcp_pid, SIGUSR2);
1307                         switching = 0;
1308                         return;
1309                 }
1310         }
1311 #endif
1312         /* kill all children */
1313         for (i = 0; i < pool_config->num_init_children; i++)
1314         {
1315                 pid_t pid = pids[i].pid;
1316                 if (pid)
1317                 {
1318                         kill(pid, SIGQUIT);
1319                         pool_debug("failover_handler: kill %d", pid);
1320                 }
1321         }
1322
1323         /* exec failover_command */
1324         for (i = 0; i < pool_config->backend_desc->num_backends; i++)
1325         {
1326                 if (nodes[i])
1327                         trigger_failover_command(i, pool_config->failover_command);
1328         }
1329
1330         pool_log("failover_handler: set new master node: %d", new_master);
1331         Req_info->master_node_id = new_master;
1332
1333 /* no need to wait since it will be done in reap_handler */
1334 #ifdef NOT_USED
1335         while (wait(NULL) > 0)
1336                 ;
1337
1338         if (errno != ECHILD)
1339                 pool_error("failover_handler: wait() failed. reason:%s", strerror(errno));
1340 #endif
1341
1342         memset(Req_info->node_id, -1, sizeof(int) * MAX_NUM_BACKENDS);
1343         pool_semaphore_unlock(REQUEST_INFO_SEM);
1344
1345         /* fork the children */
1346         for (i=0;i<pool_config->num_init_children;i++)
1347         {
1348                 pids[i].pid = fork_a_child(unix_fd, inet_fd, i);
1349                 pids[i].start_time = time(NULL);
1350         }
1351
1352         if (Req_info->kind == NODE_UP_REQUEST)
1353         {
1354                 pool_log("failback done. reconnect host %s(%d)",
1355                                  BACKEND_INFO(node_id).backend_hostname,
1356                                  BACKEND_INFO(node_id).backend_port);
1357         }
1358         else
1359         {
1360                 pool_log("failover done. shutdown host %s(%d)",
1361                                  BACKEND_INFO(node_id).backend_hostname,
1362                                  BACKEND_INFO(node_id).backend_port);
1363         }
1364
1365         switching = 0;
1366
1367         /* kick wakeup_handler in pcp_child to notice that
1368          * faiover/failback done
1369          */
1370         kill(pcp_pid, SIGUSR2);
1371 }
1372
1373 /*
1374  * health check timer handler
1375  */
1376 static RETSIGTYPE health_check_timer_handler(int sig)
1377 {
1378         POOL_SETMASK(&BlockSig);
1379         health_check_timer_expired = 1;
1380         POOL_SETMASK(&UnBlockSig);
1381 }
1382
1383
1384 /*
1385  * check if we can connect to the backend
1386  * returns 0 for ok. otherwise returns backend id + 1
1387  */
1388 int health_check(void)
1389 {
1390         int fd;
1391         int sts;
1392
1393         /* V2 startup packet */
1394         typedef struct {
1395                 int len;                /* startup packet length */
1396                 StartupPacket_v2 sp;
1397         } MySp;
1398         MySp mysp;
1399         char kind;
1400         int i;
1401
1402         if (*InRecovery)
1403                 return 0;
1404
1405         memset(&mysp, 0, sizeof(mysp));
1406         mysp.len = htonl(296);
1407         mysp.sp.protoVersion = htonl(PROTO_MAJOR_V2 << 16);
1408         strcpy(mysp.sp.database, "template1");
1409         strncpy(mysp.sp.user, pool_config->health_check_user, sizeof(mysp.sp.user) - 1);
1410         *mysp.sp.options = '\0';
1411         *mysp.sp.unused = '\0';
1412         *mysp.sp.tty = '\0';
1413
1414         for (i=0;i<pool_config->backend_desc->num_backends;i++)
1415         {
1416                 pool_debug("health_check: %d th DB node status: %d", i, BACKEND_INFO(i).backend_status);
1417
1418                 if (BACKEND_INFO(i).backend_status == CON_UNUSED ||
1419                         BACKEND_INFO(i).backend_status == CON_DOWN)
1420                         continue;
1421
1422                 if (*(BACKEND_INFO(i).backend_hostname) == '\0')
1423                         fd = connect_unix_domain_socket(i);
1424                 else
1425                         fd = connect_inet_domain_socket(i);
1426
1427                 if (fd < 0)
1428                 {
1429                         pool_error("health check failed. %d th host %s at port %d is down",
1430                                            i,
1431                                            BACKEND_INFO(i).backend_hostname,
1432                                            BACKEND_INFO(i).backend_port);
1433
1434                         return i+1;
1435                 }
1436
1437                 if (write(fd, &mysp, sizeof(mysp)) < 0)
1438                 {
1439                         pool_error("health check failed during write. host %s at port %d is down. reason: %s",
1440                                            BACKEND_INFO(i).backend_hostname,
1441                                            BACKEND_INFO(i).backend_port,
1442                                            strerror(errno));
1443                         close(fd);
1444                         return i+1;
1445                 }
1446
1447                 /*
1448                  * Don't bother to be blocked by read(2). It will be
1449                  * interrupted by ALRAM anyway.
1450                  */
1451                 sts = read(fd, &kind, 1);
1452                 if (sts == -1)
1453                 {
1454                         pool_error("health check failed during read. host %s at port %d is down. reason: %s",
1455                                            BACKEND_INFO(i).backend_hostname,
1456                                            BACKEND_INFO(i).backend_port,
1457                                            strerror(errno));
1458                         close(fd);
1459                         return i+1;
1460                 }
1461                 else if (sts == 0)
1462                 {
1463                         pool_error("health check failed. EOF encountered. host %s at port %d is down",
1464                                            BACKEND_INFO(i).backend_hostname,
1465                                            BACKEND_INFO(i).backend_port);
1466                         close(fd);
1467                         return i+1;
1468                 }
1469
1470                 /*
1471                  * If a backend raised a FATAL error(max connections error or
1472                  * starting up error?), do not send a Terminate message.
1473                  */
1474                 if ((kind != 'E') && (write(fd, "X", 1) < 0))
1475                 {
1476                         pool_error("health check failed during write. host %s at port %d is down. reason: %s. Perhaps wrong health check user?",
1477                                            BACKEND_INFO(i).backend_hostname,
1478                                            BACKEND_INFO(i).backend_port,
1479                                            strerror(errno));
1480                         close(fd);
1481                         return i+1;
1482                 }
1483
1484                 close(fd);
1485         }
1486
1487         return 0;
1488 }
1489
1490 /*
1491  * check if we can connect to the SystemDB
1492  * returns 0 for ok. otherwise returns -1
1493  */
1494 int
1495 system_db_health_check(void)
1496 {
1497         int fd;
1498
1499         /* V2 startup packet */
1500         typedef struct {
1501                 int len;                /* startup packet length */
1502                 StartupPacket_v2 sp;
1503         } MySp;
1504         MySp mysp;
1505         char kind;
1506
1507         memset(&mysp, 0, sizeof(mysp));
1508         mysp.len = htonl(296);
1509         mysp.sp.protoVersion = htonl(PROTO_MAJOR_V2 << 16);
1510         strcpy(mysp.sp.database, "template1");
1511         strncpy(mysp.sp.user, SYSDB_INFO->user, sizeof(mysp.sp.user) - 1);
1512         *mysp.sp.options = '\0';
1513         *mysp.sp.unused = '\0';
1514         *mysp.sp.tty = '\0';
1515
1516         pool_debug("health_check: SystemDB status: %d", SYSDB_STATUS);
1517
1518         /* if SystemDB is already down, ignore */
1519         if (SYSDB_STATUS == CON_UNUSED || SYSDB_STATUS == CON_DOWN)
1520                 return 0;
1521
1522         if (*SYSDB_INFO->hostname == '\0')
1523                 fd = connect_unix_domain_socket_by_port(SYSDB_INFO->port, pool_config->backend_socket_dir);
1524         else
1525                 fd = connect_inet_domain_socket_by_port(SYSDB_INFO->hostname, SYSDB_INFO->port);
1526
1527         if (fd < 0)
1528         {
1529                 pool_error("health check failed. SystemDB host %s at port %d is down",
1530                                    SYSDB_INFO->hostname,
1531                                    SYSDB_INFO->port);
1532
1533                 return -1;
1534         }
1535
1536         if (write(fd, &mysp, sizeof(mysp)) < 0)
1537         {
1538                 pool_error("health check failed during write. SystemDB host %s at port %d is down",
1539                                    SYSDB_INFO->hostname,
1540                                    SYSDB_INFO->port);
1541                 close(fd);
1542                 return -1;
1543         }
1544
1545         read(fd, &kind, 1);
1546
1547         if (write(fd, "X", 1) < 0)
1548         {
1549                 pool_error("health check failed during write. SystemDB host %s at port %d is down",
1550                                    SYSDB_INFO->hostname,
1551                                    SYSDB_INFO->port);
1552                 close(fd);
1553                 return -1;
1554         }
1555
1556         close(fd);
1557         return 0;
1558 }
1559
1560 /*
1561  * handle SIGCHLD
1562  */
1563 static RETSIGTYPE reap_handler(int sig)
1564 {
1565         POOL_SETMASK(&BlockSig);
1566         sigchld_request = 1;
1567         write(pipe_fds[1], "\0", 1);
1568         POOL_SETMASK(&UnBlockSig);
1569 }
1570
1571 /*
1572  * Attach zombie processes and restart child processes.
1573  * reaper() must be called under protecting signals.
1574  */
1575 static void reaper(void)
1576 {
1577         pid_t pid;
1578         int status;
1579         int i;
1580
1581         pool_debug("reap_handler called");
1582
1583         if (exiting)
1584         {
1585                 pool_debug("reap_handler: exited due to exiting");
1586                 return;
1587         }
1588
1589         if (switching)
1590         {
1591                 pool_debug("reap_handler: exited due to switching");
1592                 return;
1593         }
1594
1595         /* clear SIGCHLD request */
1596         sigchld_request = 0;
1597
1598 #ifdef HAVE_WAITPID
1599         pool_debug("reap_handler: call waitpid");
1600         while ((pid = waitpid(-1, &status, WNOHANG)) > 0)
1601 #else
1602         pool_debug("reap_handler: call wait3");
1603         while ((pid = wait3(&status, WNOHANG, NULL)) > 0)
1604 #endif
1605         {
1606                 /* if exiting child process was PCP handler */
1607                 if (pid == pcp_pid)
1608                 {
1609                         pool_debug("PCP child %d exits with status %d by signal %d", pid, status, WTERMSIG(status));
1610
1611                         pcp_pid = pcp_fork_a_child(pcp_unix_fd, pcp_inet_fd, pcp_conf_file);
1612                         pool_debug("fork a new PCP child pid %d", pcp_pid);
1613                         break;
1614                 } else {
1615                         pool_debug("child %d exits with status %d by signal %d", pid, status, WTERMSIG(status));
1616
1617                         /* look for exiting child's pid */
1618                         for (i=0;i<pool_config->num_init_children;i++)
1619                         {
1620                                 if (pid == pids[i].pid)
1621                                 {
1622                                         /* if found, fork a new child */
1623                                         if (!switching && !exiting && status)
1624                                         {
1625                                                 pids[i].pid = fork_a_child(unix_fd, inet_fd, i);
1626                                                 pids[i].start_time = time(NULL);
1627                                                 pool_debug("fork a new child pid %d", pids[i].pid);
1628                                                 break;
1629                                         }
1630                                 }
1631                         }
1632                 }
1633         }
1634         pool_debug("reap_handler: normally exited");
1635 }
1636
1637 /*
1638  * get node information specified by node_number
1639  */
1640 BackendInfo *
1641 pool_get_node_info(int node_number)
1642 {
1643         if (node_number >= NUM_BACKENDS)
1644                 return NULL;
1645
1646         return &BACKEND_INFO(node_number);
1647 }
1648
1649 /*
1650  * get number of nodes
1651  */
1652 int
1653 pool_get_node_count(void)
1654 {
1655         return NUM_BACKENDS;
1656 }
1657
1658 /*
1659  * get process ids
1660  */
1661 int *
1662 pool_get_process_list(int *array_size)
1663 {
1664         int        *array;
1665         int             i;
1666
1667         *array_size = pool_config->num_init_children;
1668         array = calloc(*array_size, sizeof(int));
1669         for (i = 0; i < *array_size; i++)
1670                 array[i] = pids[i].pid;
1671
1672         return array;
1673 }
1674
1675 /*
1676  * get process information specified by pid
1677  */
1678 ProcessInfo *
1679 pool_get_process_info(pid_t pid)
1680 {
1681         int             i;
1682
1683         for (i = 0; i < pool_config->num_init_children; i++)
1684                 if (pids[i].pid == pid)
1685                         return &pids[i];
1686
1687         return NULL;
1688 }
1689
1690 /*
1691  * get System DB information
1692  */
1693 SystemDBInfo *
1694 pool_get_system_db_info(void)
1695 {
1696         if (system_db_info == NULL)
1697                 return NULL;
1698
1699         return system_db_info->info;
1700 }
1701
1702
/*
 * Wake up all child processes by sending them SIGUSR2 through
 * kill_all_children().
 */
static void wakeup_children(void)
{
	const int	wakeup_signal = SIGUSR2;

	kill_all_children(wakeup_signal);
}
1711
1712
1713 static RETSIGTYPE wakeup_handler(int sig)
1714 {
1715         POOL_SETMASK(&BlockSig);
1716         wakeup_request = 1;
1717         write(pipe_fds[1], "\0", 1);
1718         POOL_SETMASK(&UnBlockSig);
1719 }
1720
1721 /*
1722  * handle SIGHUP
1723  *
1724  */
1725 static RETSIGTYPE reload_config_handler(int sig)
1726 {
1727         POOL_SETMASK(&BlockSig);
1728         reload_config_request = 1;
1729         write(pipe_fds[1], "\0", 1);
1730         POOL_SETMASK(&UnBlockSig);
1731 }
1732
1733 static void reload_config(void)
1734 {
1735         pool_log("reload config files.");
1736         pool_get_config(conf_file, RELOAD_CONFIG);
1737         if (pool_config->enable_pool_hba)
1738                 load_hba(hba_file);
1739         if (pool_config->parallel_mode)
1740                 pool_memset_system_db_info(system_db_info->info);
1741         kill_all_children(SIGHUP);
1742 }
1743
1744 static void kill_all_children(int sig)
1745 {
1746         int i;
1747
1748         /* kill all children */
1749         for (i = 0; i < pool_config->num_init_children; i++)
1750         {
1751                 pid_t pid = pids[i].pid;
1752                 if (pid)
1753                 {
1754                         kill(pid, sig);
1755                 }
1756         }
1757
1758         /* make PCP process reload as well */
1759         if (sig == SIGHUP)
1760                 kill(pcp_pid, sig);
1761 }
1762
1763 /*
1764  * pause in a period specified by timeout. If any data is coming
1765  * through pipe_fds[0], that means one of: failover request(SIGUSR1),
1766  * SIGCHLD received, children wake up request(SIGUSR2 used in on line
1767  * recovery processing) or config file reload request(SIGHUP) has been
1768  * occurred.  In this case this function returns 1.
1769  * otherwise 0: (no signal event occurred), -1: (error)
1770  * XXX: is it ok that select(2) error is ignored here?
1771  */
1772 static int pool_pause(struct timeval *timeout)
1773 {
1774         fd_set rfds;
1775         int n;
1776         char dummy;
1777
1778         FD_ZERO(&rfds);
1779         FD_SET(pipe_fds[0], &rfds);
1780         n = select(pipe_fds[0]+1, &rfds, NULL, NULL, timeout);
1781         if (n == 1)
1782                 read(pipe_fds[0], &dummy, 1);
1783         return n;
1784 }
1785
1786 /*
1787  * sleep for seconds specified by "second".  Unlike pool_pause(), this
1788  * function guarantees that it will sleep for specified seconds.  This
1789  * function uses pool_pause() internally. If it informs that there is
1790  * a pending signal event, they are processed using CHECK_REQUEST
1791  * macro. Note that most of these processes are done while all signals
1792  * are blocked.
1793  */
1794 static void pool_sleep(unsigned int second)
1795 {
1796         struct timeval current_time, sleep_time;
1797
1798         gettimeofday(&current_time, NULL);
1799         sleep_time.tv_sec = second + current_time.tv_sec;
1800         sleep_time.tv_usec = current_time.tv_usec;
1801
1802         POOL_SETMASK(&UnBlockSig);
1803         while (sleep_time.tv_sec > current_time.tv_sec)
1804         {
1805                 struct timeval timeout;
1806                 int r;
1807
1808                 timeout.tv_sec = sleep_time.tv_sec - current_time.tv_sec;
1809                 timeout.tv_usec = sleep_time.tv_usec - current_time.tv_usec;
1810                 if (timeout.tv_usec < 0)
1811                 {
1812                         timeout.tv_sec--;
1813                         timeout.tv_usec += 1000000;
1814                 }
1815
1816                 r = pool_pause(&timeout);
1817                 POOL_SETMASK(&BlockSig);
1818                 if (r > 0)
1819                         CHECK_REQUEST;
1820                 POOL_SETMASK(&UnBlockSig);
1821                 gettimeofday(&current_time, NULL);
1822         }
1823         POOL_SETMASK(&BlockSig);
1824 }
1825
1826 /*
1827  * get_config_file_name: return full path of pgpool.conf.
1828  */
1829 char *get_config_file_name(void)
1830 {
1831         return conf_file;
1832 }
1833
1834 /*
1835  * get_config_file_name: return full path of pool_hba.conf.
1836  */
1837 char *get_hba_file_name(void)
1838 {
1839         return hba_file;
1840 }
1841
1842 /*
1843  * trigger_failover_command: execute specified command at failover.
1844  *                           command_line is null-terminated string.
1845  */
1846 static int trigger_failover_command(int node, const char *command_line)
1847 {
1848         int r = 0;
1849         String *exec_cmd;
1850         char port_buf[6];
1851         char buf[2];
1852         BackendInfo *info;
1853
1854         if (command_line == NULL || (strlen(command_line) == 0))
1855                 return 0;
1856
1857         /* check nodeID */
1858         if (node < 0 || node > NUM_BACKENDS)
1859                 return -1;
1860
1861         info = pool_get_node_info(node);
1862         if (!info)
1863                 return -1;
1864
1865         buf[1] = '\0';
1866         pool_memory = pool_memory_create(PREPARE_BLOCK_SIZE);
1867         if (!pool_memory)
1868         {
1869                 pool_error("trigger_failover_command: pool_memory_create() failed");
1870                 return -1;
1871         }
1872         exec_cmd = init_string("");
1873
1874         while (*command_line)
1875         {
1876                 if (*command_line == '%')
1877                 {
1878                         if (*(command_line + 1))
1879                         {
1880                                 char val = *(command_line + 1);
1881                                 switch (val)
1882                                 {
1883                                         case 'p': /* port */
1884                                                 snprintf(port_buf, sizeof(port_buf), "%d", info->backend_port);
1885                                                 string_append_char(exec_cmd, port_buf);
1886                                                 break;
1887
1888                                         case 'D': /* database directory */
1889                                                 string_append_char(exec_cmd, info->backend_data_directory);
1890                                                 break;
1891
1892                                         case 'd': /* node id */
1893                                                 snprintf(port_buf, sizeof(port_buf), "%d", node);
1894                                                 string_append_char(exec_cmd, port_buf);
1895                                                 break;
1896
1897                                         case 'h': /* host name */
1898                                                 string_append_char(exec_cmd, info->backend_hostname);
1899                                                 break;
1900
1901                                         case 'm': /* new master node id */
1902                                                 snprintf(port_buf, sizeof(port_buf), "%d", get_next_master_node());
1903                                                 string_append_char(exec_cmd, port_buf);
1904                                                 break;
1905
1906                                         case 'M': /* old master node id */
1907                                                 snprintf(port_buf, sizeof(port_buf), "%d", MASTER_NODE_ID);
1908                                                 string_append_char(exec_cmd, port_buf);
1909                                                 break;
1910
1911                                         case '%': /* escape */
1912                                                 string_append_char(exec_cmd, "%");
1913                                                 break;
1914
1915                                         default: /* ignore */
1916                                                 break;
1917                                 }
1918                                 command_line++;
1919                         }
1920                 } else {
1921                         buf[0] = *command_line;
1922                         string_append_char(exec_cmd, buf);
1923                 }
1924                 command_line++;
1925         }
1926
1927         if (strlen(exec_cmd->data) != 0)
1928         {
1929                 pool_log("execute command: %s", exec_cmd->data);
1930                 r = system(exec_cmd->data);
1931         }
1932
1933         pool_memory_delete(pool_memory, 0);
1934         pool_memory = NULL;
1935
1936         return r;
1937 }