pacemaker  1.1.14-70404b0
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
3  * 2014 Andrew Beekhof <andrew@beekhof.net>
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public
7  * License as published by the Free Software Foundation; either
8  * version 2.1 of the License, or (at your option) any later version.
9  *
10  * This software is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13  * General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public
16  * License along with this library; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  */
19 
20 #include <crm_internal.h>
21 
22 #include <sched.h>
23 #include <sys/ioctl.h>
24 #include <sys/reboot.h>
25 
26 #include <sys/types.h>
27 #include <sys/stat.h>
28 #include <unistd.h>
29 #include <ctype.h>
30 #include <dirent.h>
31 
32 #ifdef _POSIX_MEMLOCK
33 # include <sys/mman.h>
34 #endif
35 
36 static int sbd_pid = 0;
37 
39 {
44 };
45 
46 #define SYSRQ "/proc/sys/kernel/sysrq"
47 
48 void
50 {
51  static bool need_init = true;
52  FILE* procf;
53  int c;
54 
55  if(need_init) {
56  need_init = false;
57  } else {
58  return;
59  }
60 
61  procf = fopen(SYSRQ, "r");
62  if (!procf) {
63  crm_perror(LOG_ERR, "Cannot open "SYSRQ" for read");
64  return;
65  }
66  if (fscanf(procf, "%d", &c) != 1) {
67  crm_perror(LOG_ERR, "Parsing "SYSRQ" failed");
68  c = 0;
69  }
70  fclose(procf);
71  if (c == 1)
72  return;
73 
74  /* 8 for debugging dumps of processes, 128 for reboot/poweroff */
75  c |= 136;
76  procf = fopen(SYSRQ, "w");
77  if (!procf) {
78  crm_perror(LOG_ERR, "Cannot write to "SYSRQ);
79  return;
80  }
81  fprintf(procf, "%d", c);
82  fclose(procf);
83  return;
84 }
85 
86 static void
87 sysrq_trigger(char t)
88 {
89  FILE *procf;
90 
91  sysrq_init();
92 
93  procf = fopen("/proc/sysrq-trigger", "a");
94  if (!procf) {
95  crm_perror(LOG_ERR, "Opening sysrq-trigger failed");
96  return;
97  }
98  crm_info("sysrq-trigger: %c\n", t);
99  fprintf(procf, "%c\n", t);
100  fclose(procf);
101  return;
102 }
103 
104 
105 static void
106 pcmk_panic_local(void)
107 {
108  int rc = pcmk_ok;
109  uid_t uid = geteuid();
110  pid_t ppid = getppid();
111 
112  if(uid != 0 && ppid > 1) {
113  /* We're a non-root pacemaker daemon (cib, crmd, pengine,
114  * attrd, etc) with the original pacemakerd parent
115  *
116  * Of these, only crmd is likely to be initiating resets
117  */
118  do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
120  return;
121 
122  } else if (uid != 0) {
123  /*
124  * No permissions and no pacemakerd parent to escalate to
125  * Track down the new pacakerd process and send a signal instead
126  */
127  union sigval signal_value;
128 
129  memset(&signal_value, 0, sizeof(signal_value));
130  ppid = crm_procfs_pid_of("pacemakerd");
131  do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);
132 
133  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
134  crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
135  }
136  /* The best we can do now is die */
138  return;
139  }
140 
141  /* We're either pacemakerd, or a pacemaker daemon running as root */
142 
143  sysrq_trigger('b');
144  /* reboot(RB_HALT_SYSTEM); rc = errno; */
145  reboot(RB_AUTOBOOT);
146  rc = errno;
147 
148  do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);
149 
150  if(ppid > 1) {
151  /* child daemon */
152  exit(pcmk_err_panic);
153  } else {
154  /* pacemakerd or orphan child */
155  exit(DAEMON_RESPAWN_STOP);
156  }
157 }
158 
159 static void
160 pcmk_panic_sbd(void)
161 {
162  union sigval signal_value;
163  pid_t ppid = getppid();
164 
165  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);
166 
167  memset(&signal_value, 0, sizeof(signal_value));
168  /* TODO: Arrange for a slightly less brutal option? */
169  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
170  crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
171  pcmk_panic_local();
172  }
173 
174  if(ppid > 1) {
175  /* child daemon */
176  exit(pcmk_err_panic);
177  } else {
178  /* pacemakerd or orphan child */
179  exit(DAEMON_RESPAWN_STOP);
180  }
181 }
182 
183 void
184 pcmk_panic(const char *origin)
185 {
186  static struct qb_log_callsite *panic_cs = NULL;
187 
188  if (panic_cs == NULL) {
189  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
190  }
191 
192  pcmk_locate_sbd();
193 
194  if (panic_cs && panic_cs->targets) {
195  /* getppid() == 1 means our original parent no longer exists */
196  do_crm_log_always(LOG_EMERG,
197  "Shutting down instead of panicing the node: origin=%s, sbd=%d, parent=%d",
198  origin, sbd_pid, getppid());
200  return;
201  }
202 
203  if(sbd_pid > 1) {
204  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
205  pcmk_panic_sbd();
206 
207  } else {
208  do_crm_log_always(LOG_EMERG, "Panicing the system directly: %s", origin);
209  pcmk_panic_local();
210  }
211 }
212 
213 pid_t
215 {
216  char *pidfile = NULL;
217  char *sbd_path = NULL;
218 
219  if(sbd_pid > 1) {
220  return sbd_pid;
221  }
222 
223  /* Look for the pid file */
224  pidfile = crm_strdup_printf("%s/sbd.pid", HA_STATE_DIR);
225  sbd_path = crm_strdup_printf("%s/sbd", SBINDIR);
226 
227  /* Read the pid file */
228  if(pidfile) {
229  int rc = crm_pidfile_inuse(pidfile, 1, sbd_path);
230  if(rc < pcmk_ok && rc != -ENOENT) {
231  sbd_pid = crm_read_pidfile(pidfile);
232  crm_trace("SBD detected at pid=%d (file)");
233  }
234  }
235 
236  if(sbd_pid < 0) {
237  /* Fall back to /proc for systems that support it */
238  sbd_pid = crm_procfs_pid_of("sbd");
239  crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
240  }
241 
242  if(sbd_pid < 0) {
243  sbd_pid = 0;
244  }
245 
246  free(pidfile);
247  free(sbd_path);
248 
249  return sbd_pid;
250 }
#define LOG_TRACE
Definition: logging.h:29
int crm_procfs_pid_of(const char *name)
Definition: procfs.c:117
const char * pcmk_strerror(int rc)
Definition: logging.c:1113
void sysrq_init(void)
Definition: watchdog.c:49
pid_t pcmk_locate_sbd(void)
Definition: watchdog.c:214
#define pcmk_ok
Definition: error.h:42
unsigned int crm_trace_nonlog
Definition: logging.c:48
int crm_read_pidfile(const char *filename)
Definition: utils.c:1247
pcmk_panic_flags
Definition: watchdog.c:38
int crm_pidfile_inuse(const char *filename, long mypid, const char *daemon)
Definition: utils.c:1275
#define crm_trace(fmt, args...)
Definition: logging.h:254
#define HA_STATE_DIR
Definition: config.h:481
#define do_crm_log_always(level, fmt, args...)
Log a message using constant severity.
Definition: logging.h:213
#define pcmk_err_panic
Definition: error.h:57
#define DAEMON_RESPAWN_STOP
Definition: crm.h:67
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:226
int crm_exit(int rc)
Definition: utils.c:87
#define SYSRQ
Definition: watchdog.c:46
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
void pcmk_panic(const char *origin)
Definition: watchdog.c:184
#define crm_info(fmt, args...)
Definition: logging.h:251