summaryrefslogtreecommitdiff
path: root/net/bird3/files/patch-09-graceful-recovery
diff options
context:
space:
mode:
Diffstat (limited to 'net/bird3/files/patch-09-graceful-recovery')
-rw-r--r--net/bird3/files/patch-09-graceful-recovery311
1 files changed, 311 insertions, 0 deletions
diff --git a/net/bird3/files/patch-09-graceful-recovery b/net/bird3/files/patch-09-graceful-recovery
new file mode 100644
index 000000000000..d576f80ebc42
--- /dev/null
+++ b/net/bird3/files/patch-09-graceful-recovery
@@ -0,0 +1,311 @@
+From f7639a9fafa7411ebd1f2af56c270b970ac09f3d Mon Sep 17 00:00:00 2001
+From: Maria Matejka <mq@ucw.cz>
+Date: Mon, 23 Dec 2024 21:06:26 +0100
+Subject: [PATCH] Graceful recovery: converted to obstacles
+
+Yet another refcounting mechanism had a locking collision.
+---
+ nest/proto.c | 178 ++++++++++++++++++++++++++----------------------
+ nest/protocol.h | 14 +++-
+ 2 files changed, 110 insertions(+), 82 deletions(-)
+
+diff --git a/nest/proto.c b/nest/proto.c
+index 6fa74e9f1..caf99829b 100644
+--- nest/proto.c
++++ nest/proto.c
+@@ -31,15 +31,8 @@ static list STATIC_LIST_INIT(protocol_list);
+ #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); })
+ #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); })
+
+-static timer *gr_wait_timer;
+-
+-#define GRS_NONE 0
+-#define GRS_INIT 1
+-#define GRS_ACTIVE 2
+-#define GRS_DONE 3
+-
+-static int graceful_restart_state;
+-static u32 graceful_restart_locks;
++static struct graceful_recovery_context _graceful_recovery_context;
++OBSREF(struct graceful_recovery_context) graceful_recovery_context;
+
+ static char *p_states[] = { "DOWN", "START", "UP", "STOP" };
+ static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" };
+@@ -912,7 +905,7 @@ channel_do_stop(struct channel *c)
+ ev_postpone(&c->reimport_event);
+
+ c->gr_wait = 0;
+- if (c->gr_lock)
++ if (OBSREF_GET(c->gr_lock))
+ channel_graceful_restart_unlock(c);
+
+ CALL(c->class->shutdown, c);
+@@ -1407,7 +1400,7 @@ proto_start(struct proto *p)
+ DBG("Kicking %s up\n", p->name);
+ PD(p, "Starting");
+
+- if (graceful_restart_state == GRS_INIT)
++ if (OBSREF_GET(graceful_recovery_context))
+ p->gr_recovery = 1;
+
+ if (p->cf->loop_order != DOMAIN_ORDER(the_bird))
+@@ -1921,7 +1914,45 @@ proto_enable(struct proto *p)
+ *
+ */
+
+-static void graceful_restart_done(timer *t);
++/**
++ * graceful_restart_done - finalize graceful restart
++ * @t: unused
++ *
++ * When there are no locks on graceful restart, the functions finalizes the
++ * graceful restart recovery. Protocols postponing route export until the end of
++ * the recovery are awakened and the export to them is enabled.
++ */
++static void
++graceful_recovery_done(struct callback *_ UNUSED)
++{
++ ASSERT_DIE(birdloop_inside(&main_birdloop));
++ ASSERT_DIE(_graceful_recovery_context.grc_state == GRS_ACTIVE);
++
++ tm_stop(&_graceful_recovery_context.wait_timer);
++ log(L_INFO "Graceful recovery done");
++
++ WALK_TLIST(proto, p, &global_proto_list)
++ PROTO_LOCKED_FROM_MAIN(p)
++ {
++ p->gr_recovery = 0;
++
++ struct channel *c;
++ WALK_LIST(c, p->channels)
++ {
++ ASSERT_DIE(!OBSREF_GET(c->gr_lock));
++
++ /* Resume postponed export of routes */
++ if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify)
++ channel_start_export(c);
++
++ /* Cleanup */
++ c->gr_wait = 0;
++ }
++ }
++
++ _graceful_recovery_context.grc_state = GRS_DONE;
++}
++
+
+ /**
+ * graceful_restart_recovery - request initial graceful restart recovery
+@@ -1933,7 +1964,30 @@ static void graceful_restart_done(timer *t);
+ void
+ graceful_restart_recovery(void)
+ {
+- graceful_restart_state = GRS_INIT;
++ obstacle_target_init(
++ &_graceful_recovery_context.obstacles,
++ &_graceful_recovery_context.obstacles_cleared,
++ &root_pool, "Graceful recovery");
++
++ OBSREF_SET(graceful_recovery_context, &_graceful_recovery_context);
++ _graceful_recovery_context.grc_state = GRS_INIT;
++}
++
++static void
++graceful_recovery_timeout(timer *t UNUSED)
++{
++ log(L_INFO "Graceful recovery timeout");
++ WALK_TLIST(proto, p, &global_proto_list)
++ PROTO_LOCKED_FROM_MAIN(p)
++ {
++ struct channel *c;
++ WALK_LIST(c, p->channels)
++ if (OBSREF_GET(c->gr_lock))
++ {
++ log(L_INFO "Graceful recovery: Not waiting for %s.%s", p->name, c->name);
++ OBSREF_CLEAR(c->gr_lock);
++ }
++ }
+ }
+
+ /**
+@@ -1946,73 +2000,35 @@ graceful_restart_recovery(void)
+ void
+ graceful_restart_init(void)
+ {
+- if (!graceful_restart_state)
++ if (!OBSREF_GET(graceful_recovery_context))
+ return;
+
+- log(L_INFO "Graceful restart started");
++ log(L_INFO "Graceful recovery started");
+
+- if (!graceful_restart_locks)
+- {
+- graceful_restart_done(NULL);
+- return;
+- }
++ _graceful_recovery_context.grc_state = GRS_ACTIVE;
+
+- graceful_restart_state = GRS_ACTIVE;
+- gr_wait_timer = tm_new_init(proto_pool, graceful_restart_done, NULL, 0, 0);
++ _graceful_recovery_context.wait_timer = (timer) { .hook = graceful_recovery_timeout };
+ u32 gr_wait = atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait;
+- tm_start(gr_wait_timer, gr_wait S);
+-}
+-
+-/**
+- * graceful_restart_done - finalize graceful restart
+- * @t: unused
+- *
+- * When there are no locks on graceful restart, the functions finalizes the
+- * graceful restart recovery. Protocols postponing route export until the end of
+- * the recovery are awakened and the export to them is enabled. All other
+- * related state is cleared. The function is also called when the graceful
+- * restart wait timer fires (but there are still some locks).
+- */
+-static void
+-graceful_restart_done(timer *t)
+-{
+- log(L_INFO "Graceful restart done");
+- graceful_restart_state = GRS_DONE;
+-
+- WALK_TLIST(proto, p, &global_proto_list)
+- {
+- if (!p->gr_recovery)
+- continue;
+-
+- struct channel *c;
+- WALK_LIST(c, p->channels)
+- {
+- /* Resume postponed export of routes */
+- if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify)
+- channel_start_export(c);
++ tm_start(&_graceful_recovery_context.wait_timer, gr_wait S);
+
+- /* Cleanup */
+- c->gr_wait = 0;
+- c->gr_lock = 0;
+- }
+-
+- p->gr_recovery = 0;
+- }
++ callback_init(&_graceful_recovery_context.obstacles_cleared, graceful_recovery_done, &main_birdloop);
+
+- graceful_restart_locks = 0;
+-
+- rfree(t);
++ /* The last clearing of obstacle reference will cause
++ * the graceful recovery finish immediately. */
++ OBSREF_CLEAR(graceful_recovery_context);
+ }
+
+ void
+ graceful_restart_show_status(void)
+ {
+- if (graceful_restart_state != GRS_ACTIVE)
++ if (_graceful_recovery_context.grc_state != GRS_ACTIVE)
+ return;
+
+ cli_msg(-24, "Graceful restart recovery in progress");
+- cli_msg(-24, " Waiting for %d channels to recover", graceful_restart_locks);
+- cli_msg(-24, " Wait timer is %t/%u", tm_remains(gr_wait_timer),
++ cli_msg(-24, " Waiting for %u channels to recover",
++ obstacle_target_count(&_graceful_recovery_context.obstacles));
++ cli_msg(-24, " Wait timer is %t/%u",
++ tm_remains(&_graceful_recovery_context.wait_timer),
+ atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait);
+ }
+
+@@ -2032,14 +2048,22 @@ graceful_restart_show_status(void)
+ void
+ channel_graceful_restart_lock(struct channel *c)
+ {
+- ASSERT(graceful_restart_state == GRS_INIT);
+- ASSERT(c->proto->gr_recovery);
++ ASSERT_DIE(birdloop_inside(&main_birdloop));
+
+- if (c->gr_lock)
++ if (OBSREF_GET(c->gr_lock))
+ return;
+
+- c->gr_lock = 1;
+- graceful_restart_locks++;
++ switch (_graceful_recovery_context.grc_state)
++ {
++ case GRS_INIT:
++ case GRS_ACTIVE:
++ OBSREF_SET(c->gr_lock, &_graceful_recovery_context);
++ break;
++
++ case GRS_NONE:
++ case GRS_DONE:
++ break;
++ }
+ }
+
+ /**
+@@ -2052,18 +2076,10 @@ channel_graceful_restart_lock(struct channel *c)
+ void
+ channel_graceful_restart_unlock(struct channel *c)
+ {
+- if (!c->gr_lock)
+- return;
+-
+- c->gr_lock = 0;
+- graceful_restart_locks--;
+-
+- if ((graceful_restart_state == GRS_ACTIVE) && !graceful_restart_locks)
+- tm_start(gr_wait_timer, 0);
++ OBSREF_CLEAR(c->gr_lock);
+ }
+
+
+-
+ /**
+ * protos_dump_all - dump status of all protocols
+ *
+@@ -2615,9 +2631,9 @@ channel_show_info(struct channel *c)
+ cli_msg(-1006, " Input filter: %s", filter_name(c->in_filter));
+ cli_msg(-1006, " Output filter: %s", filter_name(c->out_filter));
+
+- if (graceful_restart_state == GRS_ACTIVE)
++ if (_graceful_recovery_context.grc_state == GRS_ACTIVE)
+ cli_msg(-1006, " GR recovery: %s%s",
+- c->gr_lock ? " pending" : "",
++ OBSREF_GET(c->gr_lock) ? " pending" : "",
+ c->gr_wait ? " waiting" : "");
+
+ channel_show_limit(&c->rx_limit, "Receive limit:", c->limit_active & (1 << PLD_RX), c->limit_actions[PLD_RX]);
+diff --git a/nest/protocol.h b/nest/protocol.h
+index 2bfa1628a..ec561b263 100644
+--- nest/protocol.h
++++ nest/protocol.h
+@@ -659,7 +659,7 @@ struct channel {
+
+ u8 channel_state;
+ u8 reloadable; /* Hook reload_routes() is allowed on the channel */
+- u8 gr_lock; /* Graceful restart mechanism should wait for this channel */
++ OBSREF(struct graceful_recovery_context) gr_lock; /* Graceful restart mechanism should wait for this channel */
+ u8 gr_wait; /* Route export to channel is postponed until graceful restart */
+
+ u32 obstacles; /* External obstacles remaining before cleanup */
+@@ -763,4 +763,16 @@ void *channel_config_new(const struct channel_class *cc, const char *name, uint
+ void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto);
+ int channel_reconfigure(struct channel *c, struct channel_config *cf);
+
++struct graceful_recovery_context {
++ struct obstacle_target obstacles;
++ struct callback obstacles_cleared;
++ enum {
++ GRS_NONE,
++ GRS_INIT,
++ GRS_ACTIVE,
++ GRS_DONE,
++ } grc_state;
++ timer wait_timer;
++};
++
+ #endif
+--
+GitLab
+