root/daemons/controld/controld_remote_ra.c


DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. should_purge_attributes
  6. section_to_delete
  7. purge_remote_node_attrs
  8. remote_node_up
  9. remote_node_down
  10. check_remote_node_state
  11. report_remote_ra_result
  12. update_remaining_timeout
  13. retry_start_cmd_cb
  14. connection_takeover_timeout_cb
  15. monitor_timeout_cb
  16. synthesize_lrmd_success
  17. remote_lrm_op_callback
  18. handle_remote_ra_stop
  19. handle_remote_ra_start
  20. handle_remote_ra_exec
  21. remote_ra_data_init
  22. remote_ra_cleanup
  23. is_remote_lrmd_ra
  24. remote_ra_get_rsc_info
  25. is_remote_ra_supported_action
  26. fail_all_monitor_cmds
  27. remove_cmd
  28. remote_ra_cancel
  29. handle_dup_monitor
  30. controld_execute_remote_agent
  31. remote_ra_fail
  32. remote_ra_process_pseudo
  33. remote_ra_maintenance
  34. remote_ra_process_maintenance_nodes
  35. remote_ra_is_in_maintenance
  36. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <crm/crm.h>
  13 #include <crm/msg_xml.h>
  14 #include <crm/common/xml_internal.h>
  15 #include <crm/lrmd.h>
  16 #include <crm/lrmd_internal.h>
  17 #include <crm/services.h>
  18 
  19 #include <pacemaker-controld.h>
  20 
  21 #define REMOTE_LRMD_RA "remote"
  22 
  23 /* The max start timeout before cmd retry */
  24 #define MAX_START_TIMEOUT_MS 10000
  25 
  26 #define cmd_set_flags(cmd, flags_to_set) do { \
  27     (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
  28                                        "Remote command", (cmd)->rsc_id, (cmd)->status, \
  29                                        (flags_to_set), #flags_to_set); \
  30         } while (0)
  31 
  32 #define cmd_clear_flags(cmd, flags_to_clear) do { \
  33     (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
  34                                          "Remote command", (cmd)->rsc_id, (cmd)->status, \
  35                                          (flags_to_clear), #flags_to_clear); \
  36         } while (0)
  37 
  38 enum remote_cmd_status {
  39     cmd_reported_success    = (1 << 0),
  40     cmd_cancel              = (1 << 1),
  41 };
  42 
  43 typedef struct remote_ra_cmd_s {
  44     /*! the local node the cmd is issued from */
  45     char *owner;
  46     /*! the remote node the cmd is executed on */
  47     char *rsc_id;
  48     /*! the action to execute */
  49     char *action;
   50     /*! an opaque string the client wants passed back to it */
  51     char *userdata;
  52     /*! start delay in ms */
  53     int start_delay;
  54     /*! timer id used for start delay. */
  55     int delay_id;
  56     /*! timeout in ms for cmd */
  57     int timeout;
  58     int remaining_timeout;
  59     /*! recurring interval in ms */
  60     guint interval_ms;
  61     /*! interval timer id */
  62     int interval_id;
  63     int monitor_timeout_id;
  64     int takeover_timeout_id;
  65     /*! action parameters */
  66     lrmd_key_value_t *params;
  67     pcmk__action_result_t result;
  68     int call_id;
  69     time_t start_time;
  70     uint32_t status;
  71 } remote_ra_cmd_t;
  72 
  73 #define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
  74     lrm_state_t *lrm = (lrm_state); \
  75     remote_ra_data_t *ra = lrm->remote_ra_data; \
  76     ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
  77                                     lrm->node_name, ra->status, \
  78                                     (flags_to_set), #flags_to_set); \
  79         } while (0)
  80 
  81 #define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
  82     lrm_state_t *lrm = (lrm_state); \
  83     remote_ra_data_t *ra = lrm->remote_ra_data; \
  84     ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
  85                                       lrm->node_name, ra->status, \
  86                                       (flags_to_clear), #flags_to_clear); \
  87         } while (0)
  88 
  89 enum remote_status {
  90     expect_takeover     = (1 << 0),
  91     takeover_complete   = (1 << 1),
  92     remote_active       = (1 << 2),
  93     /* Maintenance mode is difficult to determine from the controller's context,
  94      * so we have it signalled back with the transition from the scheduler.
  95      */
  96     remote_in_maint     = (1 << 3),
   97     /* Similarly, it is hard to tell whether we are controlling a guest node
   98      * or a remote node. Fortunately the transition already carries a
   99      * meta-attribute for this, and since the situation doesn't change over
  100      * time, we can note it down at resource start for later use when the
  101      * attributes aren't at hand.
  102      */
 103     controlling_guest   = (1 << 4),
 104 };
 105 
 106 typedef struct remote_ra_data_s {
 107     crm_trigger_t *work;
 108     remote_ra_cmd_t *cur_cmd;
 109     GList *cmds;
 110     GList *recurring_cmds;
 111     uint32_t status;
 112 } remote_ra_data_t;
 113 
 114 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
 115 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
 116 static GList *fail_all_monitor_cmds(GList * list);
 117 
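      /*!
       * \internal
       * \brief Free a remote RA command, removing any timers it still has running
       *
       * \param[in,out] user_data  Command (remote_ra_cmd_t *) to free
       */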
 118 static void
 119 free_cmd(gpointer user_data)
 120 {
 121     remote_ra_cmd_t *cmd = user_data;
 122 
 123     if (!cmd) {
 124         return;
 125     }
 126     if (cmd->delay_id) {
 127         g_source_remove(cmd->delay_id);
 128     }
 129     if (cmd->interval_id) {
 130         g_source_remove(cmd->interval_id);
 131     }
 132     if (cmd->monitor_timeout_id) {
 133         g_source_remove(cmd->monitor_timeout_id);
 134     }
 135     if (cmd->takeover_timeout_id) {
 136         g_source_remove(cmd->takeover_timeout_id);
 137     }
 138     free(cmd->owner);
 139     free(cmd->rsc_id);
 140     free(cmd->action);
 141     free(cmd->userdata);
 142     pcmk__reset_result(&(cmd->result));
 143     lrmd_key_value_freeall(cmd->params);
 144     free(cmd);
 145 }
 146 
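      /*!
       * \internal
       * \brief Generate a call ID for a remote RA command
       *
       * \return New call ID (always positive, wrapping back to 1 on overflow)
       */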
 147 static int
 148 generate_callid(void)
 149 {
 150     static int remote_ra_callid = 0;
 151 
 152     remote_ra_callid++;
 153     if (remote_ra_callid <= 0) {
 154         remote_ra_callid = 1;
 155     }
 156 
 157     return remote_ra_callid;
 158 }
 159 
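      /*!
       * \internal
       * \brief Move a recurring command back to the execution queue (timer callback)
       *
       * \param[in,out] data  Recurring command (remote_ra_cmd_t *) whose interval expired
       *
       * \return FALSE (to tell glib not to fire the timer again)
       */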
 160 static gboolean
 161 recurring_helper(gpointer data)
 162 {
 163     remote_ra_cmd_t *cmd = data;
 164     lrm_state_t *connection_rsc = NULL;
 165 
 166     cmd->interval_id = 0;
 167     connection_rsc = lrm_state_find(cmd->rsc_id);
 168     if (connection_rsc && connection_rsc->remote_ra_data) {
 169         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 170 
 171         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 172 
 173         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 174         mainloop_set_trigger(ra_data->work);
 175     }
 176     return FALSE;
 177 }
 178 
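      /*!
       * \internal
       * \brief Trigger command processing once a start delay expires (timer callback)
       *
       * \param[in,out] data  Delayed command (remote_ra_cmd_t *)
       *
       * \return FALSE (to tell glib not to fire the timer again)
       */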
 179 static gboolean
 180 start_delay_helper(gpointer data)
 181 {
 182     remote_ra_cmd_t *cmd = data;
 183     lrm_state_t *connection_rsc = NULL;
 184 
 185     cmd->delay_id = 0;
 186     connection_rsc = lrm_state_find(cmd->rsc_id);
 187     if (connection_rsc && connection_rsc->remote_ra_data) {
 188         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 189 
 190         mainloop_set_trigger(ra_data->work);
 191     }
 192     return FALSE;
 193 }
 194 
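      /*!
       * \internal
       * \brief Check whether a remote node's transient attributes should be purged
       *
       * \param[in] node  Remote node that went away
       *
       * \return true if the node's attributes should be purged, otherwise false
       */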
 195 static bool
 196 should_purge_attributes(crm_node_t *node)
 197 {
 198     bool purge = true;
 199     crm_node_t *conn_node = NULL;
 200     lrm_state_t *connection_rsc = NULL;
 201 
 202     if (!node->conn_host) {
 203         return purge;
 204     }
 205 
 206     /* Get the node that was hosting the remote connection resource from the
 207      * peer cache.  That's the one we really care about here.
 208      */
 209     conn_node = crm_get_peer(0, node->conn_host);
 210     if (conn_node == NULL) {
 211         return purge;
 212     }
 213 
 214     /* Check the uptime of connection_rsc.  If it hasn't been running long
 215      * enough, set purge=true.  "Long enough" means it started running earlier
 216      * than the timestamp when we noticed it went away in the first place.
 217      */
 218     connection_rsc = lrm_state_find(node->uname);
 219 
 220     if (connection_rsc != NULL) {
 221         lrmd_t *lrm = connection_rsc->conn;
 222         time_t uptime = lrmd__uptime(lrm);
 223         time_t now = time(NULL);
 224 
 225         /* Add 20s of fuzziness to give corosync a while to notice the remote
 226          * host is gone.  On various error conditions (failure to get uptime,
 227          * peer_lost isn't set) we default to purging.
 228          */
 229         if (uptime > 0 &&
 230             conn_node->peer_lost > 0 &&
 231             uptime + 20 >= now - conn_node->peer_lost) {
 232             purge = false;
 233         }
 234     }
 235 
 236     return purge;
 237 }
 238 
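      /*!
       * \internal
       * \brief Choose which node state section(s) to delete
       *
       * \param[in] purge  Whether node attributes should be purged as well
       *
       * \return Node state section(s) to delete, honoring any enabled shutdown locks
       */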
 239 static enum controld_section_e
 240 section_to_delete(bool purge)
 241 {
 242     if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
 243         if (purge) {
 244             return controld_section_all_unlocked;
 245         } else {
 246             return controld_section_lrm_unlocked;
 247         }
 248     } else {
 249         if (purge) {
 250             return controld_section_all;
 251         } else {
 252             return controld_section_lrm;
 253         }
 254     }
 255 }
 256 
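      /*!
       * \internal
       * \brief Purge a remote node's attributes (and possibly more state) as needed
       *
       * \param[in]     call_opt  CIB call options to use
       * \param[in,out] node      Remote node to clean up
       */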
 257 static void
 258 purge_remote_node_attrs(int call_opt, crm_node_t *node)
 259 {
 260     bool purge = should_purge_attributes(node);
 261     enum controld_section_e section = section_to_delete(purge);
 262 
 263     /* Purge node from attrd's memory */
 264     if (purge) {
 265         update_attrd_remote_node_removed(node->uname, NULL);
 266     }
 267 
 268     controld_delete_node_state(node->uname, section, call_opt);
 269 }
 270 
 271 /*!
 272  * \internal
 273  * \brief Handle cluster communication related to pacemaker_remote node joining
 274  *
 275  * \param[in] node_name  Name of newly integrated pacemaker_remote node
 276  */
 277 static void
 278 remote_node_up(const char *node_name)
 279 {
 280     int call_opt;
 281     xmlNode *update, *state;
 282     crm_node_t *node;
 283 
 284     CRM_CHECK(node_name != NULL, return);
 285     crm_info("Announcing Pacemaker Remote node %s", node_name);
 286 
 287     call_opt = crmd_cib_smart_opt();
 288 
 289     /* Delete node's probe_complete attribute. This serves two purposes:
 290      *
 291      * - @COMPAT DCs < 1.1.14 in a rolling upgrade might use it
 292      * - deleting it (or any attribute for that matter) here ensures the
 293      *   attribute manager learns the node is remote
 294      */
 295     update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
 296 
 297     /* Ensure node is in the remote peer cache with member status */
 298     node = crm_remote_peer_get(node_name);
 299     CRM_CHECK(node != NULL, return);
 300 
 301     purge_remote_node_attrs(call_opt, node);
 302     pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 303 
 304     /* pacemaker_remote nodes don't participate in the membership layer,
 305      * so cluster nodes don't automatically get notified when they come and go.
 306      * We send a cluster message to the DC, and update the CIB node state entry,
 307      * so the DC will get it sooner (via message) or later (via CIB refresh),
 308      * and any other interested parties can query the CIB.
 309      */
 310     broadcast_remote_state_message(node_name, true);
 311 
 312     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 313     state = create_node_state_update(node, node_update_cluster, update,
 314                                      __func__);
 315 
 316     /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
 317      * needs to be fenced, this flag will allow various actions to determine
 318      * whether the fencing has happened yet.
 319      */
 320     crm_xml_add(state, XML_NODE_IS_FENCED, "0");
 321 
 322     /* TODO: If the remote connection drops, and this (async) CIB update either
 323      * failed or has not yet completed, later actions could mistakenly think the
 324      * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
 325      * previously set, because it won't have been cleared). This could prevent
 326      * actual fencing or allow recurring monitor failures to be cleared too
 327      * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
 328      */
 329     controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt, NULL);
 330     free_xml(update);
 331 }
 332 
 333 enum down_opts {
 334     DOWN_KEEP_LRM,
 335     DOWN_ERASE_LRM
 336 };
 337 
 338 /*!
 339  * \internal
 340  * \brief Handle cluster communication related to pacemaker_remote node leaving
 341  *
 342  * \param[in] node_name  Name of lost node
 343  * \param[in] opts       Whether to keep or erase LRM history
 344  */
 345 static void
 346 remote_node_down(const char *node_name, const enum down_opts opts)
 347 {
 348     xmlNode *update;
 349     int call_opt = crmd_cib_smart_opt();
 350     crm_node_t *node;
 351 
 352     /* Purge node from attrd's memory */
 353     update_attrd_remote_node_removed(node_name, NULL);
 354 
 355     /* Normally, only node attributes should be erased, and the resource history
 356      * should be kept until the node comes back up. However, after a successful
 357      * fence, we want to clear the history as well, so we don't think resources
 358      * are still running on the node.
 359      */
 360     if (opts == DOWN_ERASE_LRM) {
 361         controld_delete_node_state(node_name, controld_section_all, call_opt);
 362     } else {
 363         controld_delete_node_state(node_name, controld_section_attrs, call_opt);
 364     }
 365 
 366     /* Ensure node is in the remote peer cache with lost state */
 367     node = crm_remote_peer_get(node_name);
 368     CRM_CHECK(node != NULL, return);
 369     pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0);
 370 
 371     /* Notify DC */
 372     broadcast_remote_state_message(node_name, false);
 373 
 374     /* Update CIB node state */
 375     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 376     create_node_state_update(node, node_update_cluster, update, __func__);
 377     controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt, NULL);
 378     free_xml(update);
 379 }
 380 
 381 /*!
 382  * \internal
 383  * \brief Handle effects of a remote RA command on node state
 384  *
 385  * \param[in] cmd  Completed remote RA command
 386  */
 387 static void
 388 check_remote_node_state(const remote_ra_cmd_t *cmd)
 389 {
 390     /* Only successful actions can change node state */
 391     if (!pcmk__result_ok(&(cmd->result))) {
 392         return;
 393     }
 394 
 395     if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
 396         remote_node_up(cmd->rsc_id);
 397 
 398     } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) {
 399         /* After a successful migration, we don't need to do remote_node_up()
 400          * because the DC already knows the node is up, and we don't want to
 401          * clear LRM history etc. We do need to add the remote node to this
 402          * host's remote peer cache, because (unless it happens to be DC)
 403          * it hasn't been tracking the remote node, and other code relies on
 404          * the cache to distinguish remote nodes from unseen cluster nodes.
 405          */
 406         crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);
 407 
 408         CRM_CHECK(node != NULL, return);
 409         pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 410 
 411     } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
 412         lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
 413         remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
 414 
 415         if (ra_data) {
 416             if (!pcmk_is_set(ra_data->status, takeover_complete)) {
 417                 /* Stop means down if we didn't successfully migrate elsewhere */
 418                 remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
 419             } else if (AM_I_DC == FALSE) {
 420                 /* Only the connection host and DC track node state,
 421                  * so if the connection migrated elsewhere and we aren't DC,
 422                  * un-cache the node, so we don't have stale info
 423                  */
 424                 crm_remote_peer_cache_remove(cmd->rsc_id);
 425             }
 426         }
 427     }
 428 
 429     /* We don't do anything for successful monitors, which is correct for
 430      * routine recurring monitors, and for monitors on nodes where the
 431      * connection isn't supposed to be (the cluster will stop the connection in
 432      * that case). However, if the initial probe finds the connection already
 433      * active on the node where we want it, we probably should do
 434      * remote_node_up(). Unfortunately, we can't distinguish that case here.
 435      * Given that connections have to be initiated by the cluster, the chance of
 436      * that should be close to zero.
 437      */
 438 }
 439 
 440 static void
 441 report_remote_ra_result(remote_ra_cmd_t * cmd)
 442 {
 443     lrmd_event_data_t op = { 0, };
 444 
 445     check_remote_node_state(cmd);
 446 
 447     op.type = lrmd_event_exec_complete;
 448     op.rsc_id = cmd->rsc_id;
 449     op.op_type = cmd->action;
 450     op.user_data = cmd->userdata;
 451     op.timeout = cmd->timeout;
 452     op.interval_ms = cmd->interval_ms;
 453     op.t_run = (unsigned int) cmd->start_time;
 454     op.t_rcchange = (unsigned int) cmd->start_time;
 455 
 456     lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
 457                      cmd->result.exit_reason);
 458 
 459     if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
 460         op.t_rcchange = (unsigned int) time(NULL);
  461         /* This edge case will likely never occur, but if it does, the result is
  462          * that a failure will not be processed correctly. It is only remotely
  463          * possible because we can detect that a connection resource's TCP
  464          * connection has failed at any moment after start has completed; the
  465          * actual recurring operation is just a connectivity ping.
  466          *
  467          * Basically, we are not guaranteed that the first successful monitor op
  468          * and a subsequent failed monitor op won't share the same timestamp, but
  469          * we have to make it look like the operations occurred at separate times. */
 470         if (op.t_rcchange == op.t_run) {
 471             op.t_rcchange++;
 472         }
 473     }
 474 
 475     if (cmd->params) {
 476         lrmd_key_value_t *tmp;
 477 
 478         op.params = pcmk__strkey_table(free, free);
 479         for (tmp = cmd->params; tmp; tmp = tmp->next) {
 480             g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
 481         }
 482 
 483     }
 484     op.call_id = cmd->call_id;
 485     op.remote_nodename = cmd->owner;
 486 
 487     lrm_op_callback(&op);
 488 
 489     if (op.params) {
 490         g_hash_table_destroy(op.params);
 491     }
 492     lrmd__reset_result(&op);
 493 }
 494 
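      /*!
       * \internal
       * \brief Recalculate how much of a command's timeout remains (in milliseconds)
       *
       * \param[in,out] cmd  Command to update
       */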
 495 static void
 496 update_remaining_timeout(remote_ra_cmd_t * cmd)
 497 {
 498     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 499 }
 500 
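      /*!
       * \internal
       * \brief Retry a remote connection start or migrate_from (timer callback)
       *
       * \param[in,out] data  Executor state (lrm_state_t *) for the connection
       *
       * \return FALSE (to tell glib not to fire the timer again)
       */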
 501 static gboolean
 502 retry_start_cmd_cb(gpointer data)
 503 {
 504     lrm_state_t *lrm_state = data;
 505     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 506     remote_ra_cmd_t *cmd = NULL;
 507     int rc = ETIME;
 508 
 509     if (!ra_data || !ra_data->cur_cmd) {
 510         return FALSE;
 511     }
 512     cmd = ra_data->cur_cmd;
 513     if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) {
 514         return FALSE;
 515     }
 516     update_remaining_timeout(cmd);
 517 
 518     if (cmd->remaining_timeout > 0) {
 519         rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
 520     } else {
 521         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 522                          PCMK_EXEC_TIMEOUT,
 523                          "Not enough time remains to retry remote connection");
 524     }
 525 
 526     if (rc != pcmk_rc_ok) {
 527         report_remote_ra_result(cmd);
 528 
 529         if (ra_data->cmds) {
 530             mainloop_set_trigger(ra_data->work);
 531         }
 532         ra_data->cur_cmd = NULL;
 533         free_cmd(cmd);
 534     } else {
 535         /* wait for connection event */
 536     }
 537 
 538     return FALSE;
 539 }
 540 
 541 
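      /*!
       * \internal
       * \brief Finish a stop after waiting in vain for a takeover (timer callback)
       *
       * \param[in,out] data  Stop command (remote_ra_cmd_t *) that was waiting
       *
       * \return FALSE (to tell glib not to fire the timer again)
       */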
 542 static gboolean
 543 connection_takeover_timeout_cb(gpointer data)
 544 {
 545     lrm_state_t *lrm_state = NULL;
 546     remote_ra_cmd_t *cmd = data;
 547 
 548     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 549     cmd->takeover_timeout_id = 0;
 550 
 551     lrm_state = lrm_state_find(cmd->rsc_id);
 552 
 553     handle_remote_ra_stop(lrm_state, cmd);
 554     free_cmd(cmd);
 555 
 556     return FALSE;
 557 }
 558 
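      /*!
       * \internal
       * \brief Fail a monitor whose poke response never arrived (timer callback)
       *
       * \param[in,out] data  Monitor command (remote_ra_cmd_t *) that timed out
       *
       * \return FALSE (to tell glib not to fire the timer again)
       */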
 559 static gboolean
 560 monitor_timeout_cb(gpointer data)
 561 {
 562     lrm_state_t *lrm_state = NULL;
 563     remote_ra_cmd_t *cmd = data;
 564 
 565     lrm_state = lrm_state_find(cmd->rsc_id);
 566 
 567     crm_info("Timed out waiting for remote poke response from %s%s",
 568              cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
 569     cmd->monitor_timeout_id = 0;
 570     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
 571                      "Remote executor did not respond");
 572 
 573     if (lrm_state && lrm_state->remote_ra_data) {
 574         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 575 
 576         if (ra_data->cur_cmd == cmd) {
 577             ra_data->cur_cmd = NULL;
 578         }
 579         if (ra_data->cmds) {
 580             mainloop_set_trigger(ra_data->work);
 581         }
 582     }
 583 
 584     report_remote_ra_result(cmd);
 585     free_cmd(cmd);
 586 
 587     if(lrm_state) {
 588         lrm_state_disconnect(lrm_state);
 589     }
 590     return FALSE;
 591 }
 592 
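      /*!
       * \internal
       * \brief Synthesize and process a successful executor event for an action
       *
       * \param[in,out] lrm_state  Executor state to use (NULL to use the local node's)
       * \param[in]     rsc_id     ID of the resource the event is for
       * \param[in]     op_type    Name of the action to fake a result for
       */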
 593 static void
 594 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
 595 {
 596     lrmd_event_data_t op = { 0, };
 597 
 598     if (lrm_state == NULL) {
 599         /* if lrm_state not given assume local */
 600         lrm_state = lrm_state_find(controld_globals.our_nodename);
 601     }
 602     CRM_ASSERT(lrm_state != NULL);
 603 
 604     op.type = lrmd_event_exec_complete;
 605     op.rsc_id = rsc_id;
 606     op.op_type = op_type;
 607     op.t_run = (unsigned int) time(NULL);
 608     op.t_rcchange = op.t_run;
 609     op.call_id = generate_callid();
 610     lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 611     process_lrm_event(lrm_state, &op, NULL, NULL);
 612 }
 613 
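      /*!
       * \internal
       * \brief Process an event on a Pacemaker Remote connection
       *
       * \param[in,out] op  Event to process
       */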
 614 void
 615 remote_lrm_op_callback(lrmd_event_data_t * op)
 616 {
 617     gboolean cmd_handled = FALSE;
 618     lrm_state_t *lrm_state = NULL;
 619     remote_ra_data_t *ra_data = NULL;
 620     remote_ra_cmd_t *cmd = NULL;
 621 
 622     crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
 623               "(%d) status=%s (%d)",
 624               (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
 625               lrmd_event_type2str(op->type), op->remote_nodename,
 626               services_ocf_exitcode_str(op->rc), op->rc,
 627               pcmk_exec_status_str(op->op_status), op->op_status);
 628 
 629     lrm_state = lrm_state_find(op->remote_nodename);
 630     if (!lrm_state || !lrm_state->remote_ra_data) {
 631         crm_debug("No state information found for remote connection event");
 632         return;
 633     }
 634     ra_data = lrm_state->remote_ra_data;
 635 
 636     if (op->type == lrmd_event_new_client) {
 637         // Another client has connected to the remote daemon
 638 
 639         if (pcmk_is_set(ra_data->status, expect_takeover)) {
 640             // Great, we knew this was coming
 641             lrm_remote_clear_flags(lrm_state, expect_takeover);
 642             lrm_remote_set_flags(lrm_state, takeover_complete);
 643 
 644         } else {
 645             crm_err("Disconnecting from Pacemaker Remote node %s due to "
 646                     "unexpected client takeover", op->remote_nodename);
 647             /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
 648             /* Do not free lrm_state->conn yet. */
 649             /* It'll be freed in the following stop action. */
 650             lrm_state_disconnect_only(lrm_state);
 651         }
 652         return;
 653     }
 654 
 655     /* filter all EXEC events up */
 656     if (op->type == lrmd_event_exec_complete) {
 657         if (pcmk_is_set(ra_data->status, takeover_complete)) {
 658             crm_debug("ignoring event, this connection is taken over by another node");
 659         } else {
 660             lrm_op_callback(op);
 661         }
 662         return;
 663     }
 664 
 665     if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {
 666 
 667         if (!pcmk_is_set(ra_data->status, remote_active)) {
 668             crm_debug("Disconnection from Pacemaker Remote node %s complete",
 669                       lrm_state->node_name);
 670 
 671         } else if (!remote_ra_is_in_maintenance(lrm_state)) {
 672             crm_err("Lost connection to Pacemaker Remote node %s",
 673                     lrm_state->node_name);
 674             ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
 675             ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
 676 
 677         } else {
 678             crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
 679                        lrm_state->node_name);
 680             /* Do roughly what a 'stop' on the remote-resource would do */
 681             handle_remote_ra_stop(lrm_state, NULL);
 682             remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
 683             /* now fake the reply of a successful 'stop' */
 684             synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
 685         }
 686         return;
 687     }
 688 
 689     if (!ra_data->cur_cmd) {
 690         crm_debug("no event to match");
 691         return;
 692     }
 693 
 694     cmd = ra_data->cur_cmd;
 695 
 696     /* Start actions and migrate from actions complete after connection
 697      * comes back to us. */
 698     if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start",
 699                                                                "migrate_from", NULL)) {
 700         if (op->connection_rc < 0) {
 701             update_remaining_timeout(cmd);
 702 
 703             if ((op->connection_rc == -ENOKEY)
 704                 || (op->connection_rc == -EKEYREJECTED)) {
 705                 // Hard error, don't retry
 706                 pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
 707                                  PCMK_EXEC_ERROR,
 708                                  pcmk_strerror(op->connection_rc));
 709 
 710             } else if (cmd->remaining_timeout > 3000) {
 711                 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
 712                 g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
 713                 return;
 714 
 715             } else {
 716                 crm_trace("can't reschedule start, remaining timeout too small %d",
 717                           cmd->remaining_timeout);
 718                 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 719                                     PCMK_EXEC_TIMEOUT,
 720                                     "%s without enough time to retry",
 721                                     pcmk_strerror(op->connection_rc));
 722             }
 723 
 724         } else {
 725             lrm_state_reset_tables(lrm_state, TRUE);
 726             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 727             lrm_remote_set_flags(lrm_state, remote_active);
 728         }
 729 
 730         crm_debug("Remote connection event matched %s action", cmd->action);
 731         report_remote_ra_result(cmd);
 732         cmd_handled = TRUE;
 733 
 734     } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
 735 
 736         if (cmd->monitor_timeout_id) {
 737             g_source_remove(cmd->monitor_timeout_id);
 738             cmd->monitor_timeout_id = 0;
 739         }
 740 
  741         /* Only report success the first time; after that, only worry about failures.
  742          * For this function, if we get the poke back, it is always a success. Pokes
  743          * only fail if the send fails or the response times out. */
 744         if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
 745             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 746             report_remote_ra_result(cmd);
 747             cmd_set_flags(cmd, cmd_reported_success);
 748         }
 749 
 750         crm_debug("Remote poke event matched %s action", cmd->action);
 751 
 752         /* success, keep rescheduling if interval is present. */
 753         if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
 754             ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
 755             cmd->interval_id = g_timeout_add(cmd->interval_ms,
 756                                              recurring_helper, cmd);
 757             cmd = NULL;         /* prevent free */
 758         }
 759         cmd_handled = TRUE;
 760 
 761     } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
 762         if (pcmk_is_set(ra_data->status, remote_active) &&
 763             !pcmk_is_set(cmd->status, cmd_cancel)) {
 764             pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 765                              PCMK_EXEC_ERROR,
 766                              "Remote connection unexpectedly dropped "
 767                              "during monitor");
 768             report_remote_ra_result(cmd);
 769             crm_err("Remote connection to %s unexpectedly dropped during monitor",
 770                     lrm_state->node_name);
 771         }
 772         cmd_handled = TRUE;
 773 
 774     } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
 775 
 776         handle_remote_ra_stop(lrm_state, cmd);
 777         cmd_handled = TRUE;
 778 
 779     } else {
 780         crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
 781     }
 782 
 783     if (cmd_handled) {
 784         ra_data->cur_cmd = NULL;
 785         if (ra_data->cmds) {
 786             mainloop_set_trigger(ra_data->work);
 787         }
 788         free_cmd(cmd);
 789     }
 790 }
 791 
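      /*!
       * \internal
       * \brief Handle an intentional stop of a remote connection
       *
       * \param[in,out] lrm_state  Executor state for the connection
       * \param[in,out] cmd        Stop command to report on (may be NULL)
       */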
 792 static void
 793 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
 794 {
 795     remote_ra_data_t *ra_data = NULL;
 796 
 797     CRM_ASSERT(lrm_state);
 798     ra_data = lrm_state->remote_ra_data;
 799 
 800     if (!pcmk_is_set(ra_data->status, takeover_complete)) {
  801         /* delete pending ops whenever the remote connection is intentionally stopped */
 802         g_hash_table_remove_all(lrm_state->active_ops);
 803     } else {
  804         /* we no longer hold the history if this connection has been migrated;
  805          * however, we keep the metadata cache for future use */
 806         lrm_state_reset_tables(lrm_state, FALSE);
 807     }
 808 
 809     lrm_remote_clear_flags(lrm_state, remote_active);
 810     lrm_state_disconnect(lrm_state);
 811 
 812     if (ra_data->cmds) {
 813         g_list_free_full(ra_data->cmds, free_cmd);
 814     }
 815     if (ra_data->recurring_cmds) {
 816         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 817     }
 818     ra_data->cmds = NULL;
 819     ra_data->recurring_cmds = NULL;
 820     ra_data->cur_cmd = NULL;
 821 
 822     if (cmd) {
 823         pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 824         report_remote_ra_result(cmd);
 825     }
 826 }
 827 
 828 // \return Standard Pacemaker return code
 829 static int
 830 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
 831 {
 832     const char *server = NULL;
 833     lrmd_key_value_t *tmp = NULL;
 834     int port = 0;
 835     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 836     int rc = pcmk_rc_ok;
 837 
 838     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 839         if (pcmk__strcase_any_of(tmp->key, XML_RSC_ATTR_REMOTE_RA_ADDR,
 840                                  XML_RSC_ATTR_REMOTE_RA_SERVER, NULL)) {
 841             server = tmp->value;
 842         } else if (pcmk__str_eq(tmp->key, XML_RSC_ATTR_REMOTE_RA_PORT, pcmk__str_casei)) {
 843             port = atoi(tmp->value);
 844         } else if (pcmk__str_eq(tmp->key, CRM_META "_" XML_RSC_ATTR_CONTAINER, pcmk__str_casei)) {
 845             lrm_remote_set_flags(lrm_state, controlling_guest);
 846         }
 847     }
 848 
 849     rc = controld_connect_remote_executor(lrm_state, server, port,
 850                                           timeout_used);
 851     if (rc != pcmk_rc_ok) {
 852         pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 853                             PCMK_EXEC_ERROR,
 854                             "Could not connect to Pacemaker Remote node %s: %s",
 855                             lrm_state->node_name, pcmk_rc_str(rc));
 856     }
 857     return rc;
 858 }
 859 
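      /*!
       * \internal
       * \brief Process the queue of remote RA commands (mainloop trigger callback)
       *
       * \param[in,out] user_data  Executor state (lrm_state_t *) for the connection
       *
       * \return TRUE (to keep the trigger source)
       */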
 860 static gboolean
 861 handle_remote_ra_exec(gpointer user_data)
 862 {
 863     int rc = 0;
 864     lrm_state_t *lrm_state = user_data;
 865     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 866     remote_ra_cmd_t *cmd;
 867     GList *first = NULL;
 868 
 869     if (ra_data->cur_cmd) {
 870         /* still waiting on previous cmd */
 871         return TRUE;
 872     }
 873 
 874     while (ra_data->cmds) {
 875         first = ra_data->cmds;
 876         cmd = first->data;
 877         if (cmd->delay_id) {
 878             /* still waiting for start delay timer to trip */
 879             return TRUE;
 880         }
 881 
 882         ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
 883         g_list_free_1(first);
 884 
 885         if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
 886             lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
 887             if (handle_remote_ra_start(lrm_state, cmd,
 888                                        cmd->timeout) == pcmk_rc_ok) {
 889                 /* take care of this later when we get async connection result */
 890                 crm_debug("Initiated async remote connection, %s action will complete after connect event",
 891                           cmd->action);
 892                 ra_data->cur_cmd = cmd;
 893                 return TRUE;
 894             }
 895             report_remote_ra_result(cmd);
 896 
 897         } else if (!strcmp(cmd->action, "monitor")) {
 898 
 899             if (lrm_state_is_connected(lrm_state) == TRUE) {
 900                 rc = lrm_state_poke_connection(lrm_state);
 901                 if (rc < 0) {
 902                     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 903                                      PCMK_EXEC_ERROR, pcmk_strerror(rc));
 904                 }
 905             } else {
 906                 rc = -1;
 907                 pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
 908                                  PCMK_EXEC_DONE, "Remote connection inactive");
 909             }
 910 
 911             if (rc == 0) {
 912                 crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
 913                           cmd->rsc_id);
 914                 ra_data->cur_cmd = cmd;
 915                 cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
 916                 return TRUE;
 917             }
 918             report_remote_ra_result(cmd);
 919 
 920         } else if (!strcmp(cmd->action, "stop")) {
 921 
 922             if (pcmk_is_set(ra_data->status, expect_takeover)) {
  923                 /* Briefly wait on stop for the takeover event to occur. If the
  924                  * takeover event does not occur during the wait period, that's fine;
  925                  * it just means the remote node's lrm_status section is going to be
  926                  * cleared, which will require all resources running on the remote node
  927                  * to be explicitly re-detected via probe actions. If the takeover does
  928                  * occur successfully, then we can leave the status section intact. */
 929                 cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
 930                 ra_data->cur_cmd = cmd;
 931                 return TRUE;
 932             }
 933 
 934             handle_remote_ra_stop(lrm_state, cmd);
 935 
 936         } else if (!strcmp(cmd->action, "migrate_to")) {
 937             lrm_remote_clear_flags(lrm_state, takeover_complete);
 938             lrm_remote_set_flags(lrm_state, expect_takeover);
 939             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 940             report_remote_ra_result(cmd);
 941         } else if (pcmk__str_any_of(cmd->action, CRMD_ACTION_RELOAD,
 942                                     CRMD_ACTION_RELOAD_AGENT, NULL))  {
 943             /* Currently the only reloadable parameter is reconnect_interval,
 944              * which is only used by the scheduler via the CIB, so reloads are a
 945              * no-op.
 946              *
 947              * @COMPAT DC <2.1.0: We only need to check for "reload" in case
 948              * we're in a rolling upgrade with a DC scheduling "reload" instead
 949              * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
 950              * so this would work for that purpose as well.
 951              */
 952             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 953             report_remote_ra_result(cmd);
 954         }
 955 
 956         free_cmd(cmd);
 957     }
 958 
 959     return TRUE;
 960 }
 961 
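      /*!
       * \internal
       * \brief Create remote RA data for a connection, if not already present
       *
       * \param[in,out] lrm_state  Executor state for the connection
       */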
 962 static void
 963 remote_ra_data_init(lrm_state_t * lrm_state)
 964 {
 965     remote_ra_data_t *ra_data = NULL;
 966 
 967     if (lrm_state->remote_ra_data) {
 968         return;
 969     }
 970 
 971     ra_data = calloc(1, sizeof(remote_ra_data_t));
 972     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
 973     lrm_state->remote_ra_data = ra_data;
 974 }
 975 
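      /*!
       * \internal
       * \brief Free all remote RA data associated with a connection
       *
       * \param[in,out] lrm_state  Executor state for the connection
       */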
 976 void
 977 remote_ra_cleanup(lrm_state_t * lrm_state)
 978 {
 979     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 980 
 981     if (!ra_data) {
 982         return;
 983     }
 984 
 985     if (ra_data->cmds) {
 986         g_list_free_full(ra_data->cmds, free_cmd);
 987     }
 988 
 989     if (ra_data->recurring_cmds) {
 990         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 991     }
 992     mainloop_destroy_trigger(ra_data->work);
 993     free(ra_data);
 994     lrm_state->remote_ra_data = NULL;
 995 }
 996 
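      /*!
       * \internal
       * \brief Check whether an agent or resource ID refers to a remote connection
       *
       * \param[in] agent     Resource agent name (or NULL)
       * \param[in] provider  Resource agent provider (or NULL)
       * \param[in] id        Resource ID (or NULL)
       *
       * \return TRUE for the ocf:pacemaker:remote agent or a known connection ID
       */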
 997 gboolean
 998 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
 999 {
1000     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
1001         return TRUE;
1002     }
1003     if ((id != NULL) && (lrm_state_find(id) != NULL)
1004         && !pcmk__str_eq(id, controld_globals.our_nodename, pcmk__str_casei)) {
1005         return TRUE;
1006     }
1007 
1008     return FALSE;
1009 }
1010 
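      /*!
       * \internal
       * \brief Build executor-style resource information for a connection resource
       *
       * \param[in] lrm_state  Executor state (not used by this function)
       * \param[in] rsc_id     ID of connection resource
       *
       * \return Newly allocated resource info if \p rsc_id is a known connection, else NULL
       */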
1011 lrmd_rsc_info_t *
1012 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
1013 {
1014     lrmd_rsc_info_t *info = NULL;
1015 
1016     if ((lrm_state_find(rsc_id))) {
1017         info = calloc(1, sizeof(lrmd_rsc_info_t));
1018 
1019         info->id = strdup(rsc_id);
1020         info->type = strdup(REMOTE_LRMD_RA);
1021         info->standard = strdup(PCMK_RESOURCE_CLASS_OCF);
1022         info->provider = strdup("pacemaker");
1023     }
1024 
1025     return info;
1026 }
1027 
1028 static gboolean
1029 is_remote_ra_supported_action(const char *action)
1030 {
1031     return pcmk__str_any_of(action,
1032                             CRMD_ACTION_START,
1033                             CRMD_ACTION_STOP,
1034                             CRMD_ACTION_STATUS,
1035                             CRMD_ACTION_MIGRATE,
1036                             CRMD_ACTION_MIGRATED,
1037                             CRMD_ACTION_RELOAD_AGENT,
1038                             CRMD_ACTION_RELOAD,
1039                             NULL);
1040 }
1041 
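      /*!
       * \internal
       * \brief Fail and remove every recurring monitor command in a list
       *
       * \param[in,out] list  List of remote RA commands to scan
       *
       * \return Updated list, with the failed monitor commands removed and freed
       */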
1042 static GList *
1043 fail_all_monitor_cmds(GList * list)
1044 {
1045     GList *rm_list = NULL;
1046     remote_ra_cmd_t *cmd = NULL;
1047     GList *gIter = NULL;
1048 
1049     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1050         cmd = gIter->data;
1051         if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
1052             rm_list = g_list_append(rm_list, cmd);
1053         }
1054     }
1055 
1056     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1057         cmd = gIter->data;
1058 
1059         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1060                          PCMK_EXEC_ERROR, "Lost connection to remote executor");
1061         crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
1062                   cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1063         report_remote_ra_result(cmd);
1064 
1065         list = g_list_remove(list, cmd);
1066         free_cmd(cmd);
1067     }
1068 
1069     /* frees only the list data, not the cmds */
1070     g_list_free(rm_list);
1071     return list;
1072 }
1073 
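      /*!
       * \internal
       * \brief Remove the command matching an action and interval from a list
       *
       * \param[in,out] list         List of remote RA commands to scan
       * \param[in]     action       Action name to match
       * \param[in]     interval_ms  Action interval to match
       *
       * \return Updated list, with the matching command (if any) removed and freed
       */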
1074 static GList *
1075 remove_cmd(GList * list, const char *action, guint interval_ms)
1076 {
1077     remote_ra_cmd_t *cmd = NULL;
1078     GList *gIter = NULL;
1079 
1080     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1081         cmd = gIter->data;
1082         if ((cmd->interval_ms == interval_ms)
1083             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1084             break;
1085         }
1086         cmd = NULL;
1087     }
1088     if (cmd) {
1089         list = g_list_remove(list, cmd);
1090         free_cmd(cmd);
1091     }
1092     return list;
1093 }
1094 
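      /*!
       * \internal
       * \brief Cancel a queued, recurring, or in-flight remote RA command
       *
       * \param[in,out] lrm_state    Executor state (the connection is looked up by \p rsc_id)
       * \param[in]     rsc_id       ID of connection resource
       * \param[in]     action       Action name to cancel
       * \param[in]     interval_ms  Interval of action to cancel
       *
       * \return 0 on success, -EINVAL if the connection resource is unknown
       */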
1095 int
1096 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
1097                  const char *action, guint interval_ms)
1098 {
1099     lrm_state_t *connection_rsc = NULL;
1100     remote_ra_data_t *ra_data = NULL;
1101 
1102     connection_rsc = lrm_state_find(rsc_id);
1103     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1104         return -EINVAL;
1105     }
1106 
1107     ra_data = connection_rsc->remote_ra_data;
1108     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1109     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1110                                          interval_ms);
1111     if (ra_data->cur_cmd &&
1112         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1113         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1114 
1115         cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1116     }
1117 
1118     return 0;
1119 }
1120 
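      /*!
       * \internal
       * \brief Merge a new recurring monitor request into an existing one, if any
       *
       * \param[in,out] ra_data      Remote RA data to search for a duplicate monitor
       * \param[in]     interval_ms  Interval of the requested monitor
       * \param[in]     userdata     User data of the requested monitor
       *
       * \return Existing monitor command that absorbed the request, or NULL if none
       */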
1121 static remote_ra_cmd_t *
1122 handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
1123                    const char *userdata)
1124 {
1125     GList *gIter = NULL;
1126     remote_ra_cmd_t *cmd = NULL;
1127 
1128     /* there are 3 places a potential duplicate monitor operation
1129      * could exist.
1130      * 1. recurring_cmds list. where the op is waiting for its next interval
1131      * 2. cmds list, where the op is queued to get executed immediately
1132      * 3. cur_cmd, which means the monitor op is in flight right now.
1133      */
1134     if (interval_ms == 0) {
1135         return NULL;
1136     }
1137 
1138     if (ra_data->cur_cmd &&
1139         !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
1140         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1141         pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) {
1142 
1143         cmd = ra_data->cur_cmd;
1144         goto handle_dup;
1145     }
1146 
1147     for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1148         cmd = gIter->data;
1149         if ((cmd->interval_ms == interval_ms)
1150             && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
1151             goto handle_dup;
1152         }
1153     }
1154 
1155     for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1156         cmd = gIter->data;
1157         if ((cmd->interval_ms == interval_ms)
1158             && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
1159             goto handle_dup;
1160         }
1161     }
1162 
1163     return NULL;
1164 
1165 handle_dup:
1166 
1167     crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
1168               cmd->rsc_id, "monitor", interval_ms);
1169 
1170     /* update the userdata */
1171     if (userdata) {
1172        free(cmd->userdata);
1173        cmd->userdata = strdup(userdata);
1174     }
1175 
1176     /* if we've already reported success, generate a new call id */
1177     if (pcmk_is_set(cmd->status, cmd_reported_success)) {
1178         cmd->start_time = time(NULL);
1179         cmd->call_id = generate_callid();
1180         cmd_clear_flags(cmd, cmd_reported_success);
1181     }
1182 
1183     /* if we have an interval_id set, that means we are in the process of
1184      * waiting for this cmd's next interval. instead of waiting, cancel
1185      * the timer and execute the action immediately */
1186     if (cmd->interval_id) {
1187         g_source_remove(cmd->interval_id);
1188         cmd->interval_id = 0;
1189         recurring_helper(cmd);
1190     }
1191 
1192     return cmd;
1193 }
1194 
1195 /*!
1196  * \internal
1197  * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1198  *
1199  * \param[in]     lrm_state      Executor state object for remote connection
1200  * \param[in]     rsc_id         Connection resource ID
1201  * \param[in]     action         Action to execute
1202  * \param[in]     userdata       String to copy and pass to execution callback
1203  * \param[in]     interval_ms    Action interval (in milliseconds)
1204  * \param[in]     timeout_ms     Action timeout (in milliseconds)
1205  * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
1206  * \param[in,out] params         Connection resource parameters
1207  * \param[out]    call_id        Where to store call ID on success
1208  *
1209  * \return Standard Pacemaker return code
1210  * \note This takes ownership of \p params, which should not be used or freed
1211  *       after calling this function.
1212  */
1213 int
1214 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
1215                               const char *action, const char *userdata,
1216                               guint interval_ms, int timeout_ms,
1217                               int start_delay_ms, lrmd_key_value_t *params,
1218                               int *call_id)
1219 {
1220     lrm_state_t *connection_rsc = NULL;
1221     remote_ra_cmd_t *cmd = NULL;
1222     remote_ra_data_t *ra_data = NULL;
1223 
1224     *call_id = 0;
1225 
1226     CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1227               && (userdata != NULL) && (call_id != NULL),
1228               lrmd_key_value_freeall(params); return EINVAL);
1229 
1230     if (!is_remote_ra_supported_action(action)) {
1231         lrmd_key_value_freeall(params);
1232         return EOPNOTSUPP;
1233     }
1234 
1235     connection_rsc = lrm_state_find(rsc_id);
1236     if (connection_rsc == NULL) {
1237         lrmd_key_value_freeall(params);
1238         return ENOTCONN;
1239     }
1240 
1241     remote_ra_data_init(connection_rsc);
1242     ra_data = connection_rsc->remote_ra_data;
1243 
1244     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1245     if (cmd) {
1246         *call_id = cmd->call_id;
1247         lrmd_key_value_freeall(params);
1248         return pcmk_rc_ok;
1249     }
1250 
1251     cmd = calloc(1, sizeof(remote_ra_cmd_t));
1252     if (cmd == NULL) {
1253         lrmd_key_value_freeall(params);
1254         return ENOMEM;
1255     }
1256 
1257     cmd->owner = strdup(lrm_state->node_name);
1258     cmd->rsc_id = strdup(rsc_id);
1259     cmd->action = strdup(action);
1260     cmd->userdata = strdup(userdata);
1261     if ((cmd->owner == NULL) || (cmd->rsc_id == NULL) || (cmd->action == NULL)
1262         || (cmd->userdata == NULL)) {
1263         free_cmd(cmd);
1264         lrmd_key_value_freeall(params);
1265         return ENOMEM;
1266     }
1267 
1268     cmd->interval_ms = interval_ms;
1269     cmd->timeout = timeout_ms;
1270     cmd->start_delay = start_delay_ms;
1271     cmd->params = params;
1272     cmd->start_time = time(NULL);
1273 
1274     cmd->call_id = generate_callid();
1275 
1276     if (cmd->start_delay) {
1277         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1278     }
1279 
1280     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1281     mainloop_set_trigger(ra_data->work);
1282 
1283     *call_id = cmd->call_id;
1284     return pcmk_rc_ok;
1285 }
1286 
1287 /*!
1288  * \internal
1289  * \brief Immediately fail all monitors of a remote node, if proxied here
1290  *
1291  * \param[in] node_name  Name of pacemaker_remote node
1292  */
1293 void
1294 remote_ra_fail(const char *node_name)
1295 {
1296     lrm_state_t *lrm_state = lrm_state_find(node_name);
1297 
1298     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1299         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1300 
1301         crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1302         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1303         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1304     }
1305 }
1306 
1307 /* A guest node fencing implied by host fencing looks like:
1308  *
1309  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1310  *                on_node="lxc1" on_node_uuid="lxc1">
1311  *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1312  *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1313  *     <downed>
1314  *       <node id="lxc1"/>
1315  *     </downed>
1316  *  </pseudo_event>
1317  */
1318 #define XPATH_PSEUDO_FENCE "/" XML_GRAPH_TAG_PSEUDO_EVENT \
1319     "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
1320     "/" XML_CIB_TAG_NODE
1321 
1322 /*!
1323  * \internal
1324  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1325  *
1326  * \param[in,out] xml  XML of pseudo-action to check
1327  */
1328 void
1329 remote_ra_process_pseudo(xmlNode *xml)
1330 {
1331     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1332 
1333     if (numXpathResults(search) == 1) {
1334         xmlNode *result = getXpathResult(search, 0);
1335 
1336         /* Normally, we handle the necessary side effects of a guest node stop
1337          * action when reporting the remote agent's result. However, if the stop
1338          * is implied due to fencing, it will be a fencing pseudo-event, and
1339          * there won't be a result to report. Handle that case here.
1340          *
1341          * This will result in a duplicate call to remote_node_down() if the
1342          * guest stop was real instead of implied, but that shouldn't hurt.
1343          *
1344          * There is still one corner case that isn't handled: if a guest node
1345          * isn't running any resources when its host is fenced, it will appear
1346          * to be cleanly stopped, so there will be no pseudo-fence, and our
1347          * peer cache state will be incorrect unless and until the guest is
1348          * recovered.
1349          */
1350         if (result) {
1351             const char *remote = ID(result);
1352 
1353             if (remote) {
1354                 remote_node_down(remote, DOWN_ERASE_LRM);
1355             }
1356         }
1357     }
1358     freeXpathObject(search);
1359 }
1360 
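      /*!
       * \internal
       * \brief Record a remote node's maintenance state in the CIB and locally
       *
       * \param[in,out] lrm_state    Executor state for the remote connection
       * \param[in]     maintenance  Whether the node is entering maintenance mode
       */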
1361 static void
1362 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
1363 {
1364     xmlNode *update, *state;
1365     int call_opt;
1366     crm_node_t *node;
1367 
1368     call_opt = crmd_cib_smart_opt();
1369     node = crm_remote_peer_get(lrm_state->node_name);
1370     CRM_CHECK(node != NULL, return);
1371     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
1372     state = create_node_state_update(node, node_update_none, update,
1373                                      __func__);
1374     crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
1375     if (controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt,
1376                             NULL) == pcmk_rc_ok) {
1377         /* TODO: still not 100% sure that async update will succeed ... */
1378         if (maintenance) {
1379             lrm_remote_set_flags(lrm_state, remote_in_maint);
1380         } else {
1381             lrm_remote_clear_flags(lrm_state, remote_in_maint);
1382         }
1383     }
1384     free_xml(update);
1385 }
1386 
1387 #define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
1388     "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
1389     XML_GRAPH_TAG_MAINTENANCE
1390 
1391 /*!
1392  * \internal
1393  * \brief Check a pseudo-action holding updates for maintenance state
1394  *
1395  * \param[in,out] xml  XML of pseudo-action to check
1396  */
1397 void
1398 remote_ra_process_maintenance_nodes(xmlNode *xml)
1399 {
1400     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);
1401 
1402     if (numXpathResults(search) == 1) {
1403         xmlNode *node;
1404         int cnt = 0, cnt_remote = 0;
1405 
1406         for (node =
1407                 first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
1408             node != NULL; node = pcmk__xml_next(node)) {
1409             lrm_state_t *lrm_state = lrm_state_find(ID(node));
1410 
1411             cnt++;
1412             if (lrm_state && lrm_state->remote_ra_data &&
1413                 pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {
1414                 int is_maint;
1415 
1416                 cnt_remote++;
1417                 pcmk__scan_min_int(crm_element_value(node, XML_NODE_IS_MAINTENANCE),
1418                                    &is_maint, 0);
1419                 remote_ra_maintenance(lrm_state, is_maint);
1420             }
1421         }
1422         crm_trace("Action holds %d nodes (%d remotes found) "
1423                     "adjusting maintenance-mode", cnt, cnt_remote);
1424     }
1425     freeXpathObject(search);
1426 }
1427 
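      /*!
       * \internal
       * \brief Check whether a remote node is in maintenance mode
       *
       * \param[in] lrm_state  Executor state for the remote connection
       *
       * \return TRUE if the node is in maintenance mode, otherwise FALSE
       */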
1428 gboolean
1429 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
1430 {
1431     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1432     return pcmk_is_set(ra_data->status, remote_in_maint);
1433 }
1434 
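      /*!
       * \internal
       * \brief Check whether this connection resource controls a guest node
       *
       * \param[in] lrm_state  Executor state for the connection
       *
       * \return TRUE if the connection is for a guest node, otherwise FALSE
       */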
1435 gboolean
1436 remote_ra_controlling_guest(lrm_state_t * lrm_state)
1437 {
1438     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1439     return pcmk_is_set(ra_data->status, controlling_guest);
1440 }
