diff options
Diffstat (limited to 'net/bird3/files/patch-09-graceful-recovery')
-rw-r--r-- | net/bird3/files/patch-09-graceful-recovery | 311 |
1 files changed, 311 insertions, 0 deletions
diff --git a/net/bird3/files/patch-09-graceful-recovery b/net/bird3/files/patch-09-graceful-recovery new file mode 100644 index 000000000000..d576f80ebc42 --- /dev/null +++ b/net/bird3/files/patch-09-graceful-recovery @@ -0,0 +1,311 @@ +From f7639a9fafa7411ebd1f2af56c270b970ac09f3d Mon Sep 17 00:00:00 2001 +From: Maria Matejka <mq@ucw.cz> +Date: Mon, 23 Dec 2024 21:06:26 +0100 +Subject: [PATCH] Graceful recovery: converted to obstacles + +Yet another refcounting mechanism had a locking collision. +--- + nest/proto.c | 178 ++++++++++++++++++++++++++---------------------- + nest/protocol.h | 14 +++- + 2 files changed, 110 insertions(+), 82 deletions(-) + +diff --git a/nest/proto.c b/nest/proto.c +index 6fa74e9f1..caf99829b 100644 +--- nest/proto.c ++++ nest/proto.c +@@ -31,15 +31,8 @@ static list STATIC_LIST_INIT(protocol_list); + #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); }) + #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); }) + +-static timer *gr_wait_timer; +- +-#define GRS_NONE 0 +-#define GRS_INIT 1 +-#define GRS_ACTIVE 2 +-#define GRS_DONE 3 +- +-static int graceful_restart_state; +-static u32 graceful_restart_locks; ++static struct graceful_recovery_context _graceful_recovery_context; ++OBSREF(struct graceful_recovery_context) graceful_recovery_context; + + static char *p_states[] = { "DOWN", "START", "UP", "STOP" }; + static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" }; +@@ -912,7 +905,7 @@ channel_do_stop(struct channel *c) + ev_postpone(&c->reimport_event); + + c->gr_wait = 0; +- if (c->gr_lock) ++ if (OBSREF_GET(c->gr_lock)) + channel_graceful_restart_unlock(c); + + CALL(c->class->shutdown, c); +@@ -1407,7 +1400,7 @@ proto_start(struct proto *p) + DBG("Kicking %s up\n", p->name); + PD(p, "Starting"); + +- if (graceful_restart_state == GRS_INIT) ++ if (OBSREF_GET(graceful_recovery_context)) + p->gr_recovery = 1; + + if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) +@@ -1921,7 +1914,45 @@ proto_enable(struct proto *p) + * + */ + +-static void graceful_restart_done(timer *t); ++/** ++ * graceful_restart_done - finalize graceful restart ++ * @t: unused ++ * ++ * When there are no locks on graceful restart, the functions finalizes the ++ * graceful restart recovery. Protocols postponing route export until the end of ++ * the recovery are awakened and the export to them is enabled. ++ */ ++static void ++graceful_recovery_done(struct callback *_ UNUSED) ++{ ++ ASSERT_DIE(birdloop_inside(&main_birdloop)); ++ ASSERT_DIE(_graceful_recovery_context.grc_state == GRS_ACTIVE); ++ ++ tm_stop(&_graceful_recovery_context.wait_timer); ++ log(L_INFO "Graceful recovery done"); ++ ++ WALK_TLIST(proto, p, &global_proto_list) ++ PROTO_LOCKED_FROM_MAIN(p) ++ { ++ p->gr_recovery = 0; ++ ++ struct channel *c; ++ WALK_LIST(c, p->channels) ++ { ++ ASSERT_DIE(!OBSREF_GET(c->gr_lock)); ++ ++ /* Resume postponed export of routes */ ++ if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify) ++ channel_start_export(c); ++ ++ /* Cleanup */ ++ c->gr_wait = 0; ++ } ++ } ++ ++ _graceful_recovery_context.grc_state = GRS_DONE; ++} ++ + + /** + * graceful_restart_recovery - request initial graceful restart recovery +@@ -1933,7 +1964,30 @@ static void graceful_restart_done(timer *t); + void + graceful_restart_recovery(void) + { +- graceful_restart_state = GRS_INIT; ++ obstacle_target_init( ++ &_graceful_recovery_context.obstacles, ++ &_graceful_recovery_context.obstacles_cleared, ++ &root_pool, "Graceful recovery"); ++ ++ OBSREF_SET(graceful_recovery_context, &_graceful_recovery_context); ++ _graceful_recovery_context.grc_state = GRS_INIT; ++} ++ ++static void ++graceful_recovery_timeout(timer *t UNUSED) ++{ ++ log(L_INFO "Graceful recovery timeout"); ++ WALK_TLIST(proto, p, &global_proto_list) ++ PROTO_LOCKED_FROM_MAIN(p) ++ { ++ struct channel *c; ++ WALK_LIST(c, p->channels) ++ if (OBSREF_GET(c->gr_lock)) ++ { ++ log(L_INFO "Graceful recovery: Not waiting for %s.%s", p->name, c->name); ++ OBSREF_CLEAR(c->gr_lock); ++ } ++ } + } + + /** +@@ -1946,73 +2000,35 @@ graceful_restart_recovery(void) + void + graceful_restart_init(void) + { +- if (!graceful_restart_state) ++ if (!OBSREF_GET(graceful_recovery_context)) + return; + +- log(L_INFO "Graceful restart started"); ++ log(L_INFO "Graceful recovery started"); + +- if (!graceful_restart_locks) +- { +- graceful_restart_done(NULL); +- return; +- } ++ _graceful_recovery_context.grc_state = GRS_ACTIVE; + +- graceful_restart_state = GRS_ACTIVE; +- gr_wait_timer = tm_new_init(proto_pool, graceful_restart_done, NULL, 0, 0); ++ _graceful_recovery_context.wait_timer = (timer) { .hook = graceful_recovery_timeout }; + u32 gr_wait = atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait; +- tm_start(gr_wait_timer, gr_wait S); +-} +- +-/** +- * graceful_restart_done - finalize graceful restart +- * @t: unused +- * +- * When there are no locks on graceful restart, the functions finalizes the +- * graceful restart recovery. Protocols postponing route export until the end of +- * the recovery are awakened and the export to them is enabled. All other +- * related state is cleared. The function is also called when the graceful +- * restart wait timer fires (but there are still some locks). +- */ +-static void +-graceful_restart_done(timer *t) +-{ +- log(L_INFO "Graceful restart done"); +- graceful_restart_state = GRS_DONE; +- +- WALK_TLIST(proto, p, &global_proto_list) +- { +- if (!p->gr_recovery) +- continue; +- +- struct channel *c; +- WALK_LIST(c, p->channels) +- { +- /* Resume postponed export of routes */ +- if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify) +- channel_start_export(c); ++ tm_start(&_graceful_recovery_context.wait_timer, gr_wait S); + +- /* Cleanup */ +- c->gr_wait = 0; +- c->gr_lock = 0; +- } +- +- p->gr_recovery = 0; +- } ++ callback_init(&_graceful_recovery_context.obstacles_cleared, graceful_recovery_done, &main_birdloop); + +- graceful_restart_locks = 0; +- +- rfree(t); ++ /* The last clearing of obstacle reference will cause ++ * the graceful recovery finish immediately. */ ++ OBSREF_CLEAR(graceful_recovery_context); + } + + void + graceful_restart_show_status(void) + { +- if (graceful_restart_state != GRS_ACTIVE) ++ if (_graceful_recovery_context.grc_state != GRS_ACTIVE) + return; + + cli_msg(-24, "Graceful restart recovery in progress"); +- cli_msg(-24, " Waiting for %d channels to recover", graceful_restart_locks); +- cli_msg(-24, " Wait timer is %t/%u", tm_remains(gr_wait_timer), ++ cli_msg(-24, " Waiting for %u channels to recover", ++ obstacle_target_count(&_graceful_recovery_context.obstacles)); ++ cli_msg(-24, " Wait timer is %t/%u", ++ tm_remains(&_graceful_recovery_context.wait_timer), + atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait); + } + +@@ -2032,14 +2048,22 @@ graceful_restart_show_status(void) + void + channel_graceful_restart_lock(struct channel *c) + { +- ASSERT(graceful_restart_state == GRS_INIT); +- ASSERT(c->proto->gr_recovery); ++ ASSERT_DIE(birdloop_inside(&main_birdloop)); + +- if (c->gr_lock) ++ if (OBSREF_GET(c->gr_lock)) + return; + +- c->gr_lock = 1; +- graceful_restart_locks++; ++ switch (_graceful_recovery_context.grc_state) ++ { ++ case GRS_INIT: ++ case GRS_ACTIVE: ++ OBSREF_SET(c->gr_lock, &_graceful_recovery_context); ++ break; ++ ++ case GRS_NONE: ++ case GRS_DONE: ++ break; ++ } + } + + /** +@@ -2052,18 +2076,10 @@ channel_graceful_restart_lock(struct channel *c) + void + channel_graceful_restart_unlock(struct channel *c) + { +- if (!c->gr_lock) +- return; +- +- c->gr_lock = 0; +- graceful_restart_locks--; +- +- if ((graceful_restart_state == GRS_ACTIVE) && !graceful_restart_locks) +- tm_start(gr_wait_timer, 0); ++ OBSREF_CLEAR(c->gr_lock); + } + + +- + /** + * protos_dump_all - dump status of all protocols + * +@@ -2615,9 +2631,9 @@ channel_show_info(struct channel *c) + cli_msg(-1006, " Input filter: %s", filter_name(c->in_filter)); + cli_msg(-1006, " Output filter: %s", filter_name(c->out_filter)); + +- if (graceful_restart_state == GRS_ACTIVE) ++ if (_graceful_recovery_context.grc_state == GRS_ACTIVE) + cli_msg(-1006, " GR recovery: %s%s", +- c->gr_lock ? " pending" : "", ++ OBSREF_GET(c->gr_lock) ? " pending" : "", + c->gr_wait ? " waiting" : ""); + + channel_show_limit(&c->rx_limit, "Receive limit:", c->limit_active & (1 << PLD_RX), c->limit_actions[PLD_RX]); +diff --git a/nest/protocol.h b/nest/protocol.h +index 2bfa1628a..ec561b263 100644 +--- nest/protocol.h ++++ nest/protocol.h +@@ -659,7 +659,7 @@ struct channel { + + u8 channel_state; + u8 reloadable; /* Hook reload_routes() is allowed on the channel */ +- u8 gr_lock; /* Graceful restart mechanism should wait for this channel */ ++ OBSREF(struct graceful_recovery_context) gr_lock; /* Graceful restart mechanism should wait for this channel */ + u8 gr_wait; /* Route export to channel is postponed until graceful restart */ + + u32 obstacles; /* External obstacles remaining before cleanup */ +@@ -763,4 +763,16 @@ void *channel_config_new(const struct channel_class *cc, const char *name, uint + void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); + int channel_reconfigure(struct channel *c, struct channel_config *cf); + ++struct graceful_recovery_context { ++ struct obstacle_target obstacles; ++ struct callback obstacles_cleared; ++ enum { ++ GRS_NONE, ++ GRS_INIT, ++ GRS_ACTIVE, ++ GRS_DONE, ++ } grc_state; ++ timer wait_timer; ++}; ++ + #endif +-- +GitLab + |