X Tutup
Skip to content

Commit 81d66fa

Browse files
committed
oomd: split swap and mem pressure event timers
One thing that came out of the test week is that systoomd needs to poll more frequently so as not to race with the kernel oom killer in situations where memory is eaten quickly. Memory pressure counters are lagging so it isn't worthwhile to change the current read rate; however swap is not lagging and can be checked more frequently. So let's split these into 2 different timer events. As a result, swap now also doesn't have to be subject to the post-action (post-kill) delay that we need for memory pressure events. Addresses some of slowness to kill discussed in https://bugzilla.redhat.com/show_bug.cgi?id=1941340
1 parent 8ab34a4 commit 81d66fa

File tree

2 files changed

+127
-56
lines changed

2 files changed

+127
-56
lines changed

src/oom/oomd-manager.c

Lines changed: 121 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -299,8 +299,7 @@ static int acquire_managed_oom_connect(Manager *m) {
299299
return 0;
300300
}
301301

302-
static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
303-
_cleanup_set_free_ Set *targets = NULL;
302+
static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
304303
Manager *m = userdata;
305304
usec_t usec_now;
306305
int r;
@@ -313,7 +312,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
313312
if (r < 0)
314313
return log_error_errno(r, "Failed to reset event timer: %m");
315314

316-
r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
315+
r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
317316
if (r < 0)
318317
return log_error_errno(r, "Failed to set relative time for timer: %m");
319318

@@ -324,13 +323,89 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
324323
return log_error_errno(r, "Failed to acquire varlink connection: %m");
325324
}
326325

327-
/* Update the cgroups used for detection/action */
328-
r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
329-
if (r == -ENOMEM)
330-
return log_oom();
326+
/* We still try to acquire swap information for oomctl even if no units want swap monitoring */
327+
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
328+
/* If there are no units depending on swap actions, the only error we exit on is ENOMEM.
329+
* Allow ENOENT in the event that swap is disabled on the system. */
330+
if (r == -ENOENT) {
331+
zero(m->system_context);
332+
return 0;
333+
} else if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
334+
return log_error_errno(r, "Failed to acquire system context: %m");
335+
336+
/* Return early if nothing is requesting swap monitoring */
337+
if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
338+
return 0;
339+
340+
/* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
341+
* system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
342+
* is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
343+
* nodes are the ones that matter). */
344+
345+
if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
346+
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
347+
_cleanup_free_ char *selected = NULL;
348+
349+
log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
350+
m->system_context.swap_used, m->system_context.swap_total,
351+
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
352+
353+
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
354+
if (r == -ENOMEM)
355+
return log_oom();
356+
if (r < 0)
357+
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
358+
359+
r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
360+
if (r == -ENOMEM)
361+
return log_oom();
362+
if (r < 0)
363+
log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
364+
else {
365+
if (selected)
366+
log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
367+
PERMYRIAD_AS_PERCENT_FORMAT_STR,
368+
selected, m->system_context.swap_used, m->system_context.swap_total,
369+
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
370+
return 0;
371+
}
372+
}
373+
374+
return 0;
375+
}
376+
377+
static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
378+
_cleanup_set_free_ Set *targets = NULL;
379+
Manager *m = userdata;
380+
usec_t usec_now;
381+
int r;
382+
383+
assert(s);
384+
assert(userdata);
385+
386+
/* Reset timer */
387+
r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
331388
if (r < 0)
332-
log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
389+
return log_error_errno(r, "Failed to reset event timer: %m");
390+
391+
r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
392+
if (r < 0)
393+
return log_error_errno(r, "Failed to set relative time for timer: %m");
394+
395+
/* Reconnect if our connection dropped */
396+
if (!m->varlink) {
397+
r = acquire_managed_oom_connect(m);
398+
if (r < 0)
399+
return log_error_errno(r, "Failed to acquire varlink connection: %m");
400+
}
333401

402+
/* Return early if nothing is requesting memory pressure monitoring */
403+
if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts)) {
404+
hashmap_clear(m->monitored_mem_pressure_cgroup_contexts_candidates);
405+
return 0;
406+
}
407+
408+
/* Update the cgroups used for detection/action */
334409
r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
335410
if (r == -ENOMEM)
336411
return log_oom();
@@ -344,23 +419,16 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
344419
if (r < 0)
345420
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
346421

347-
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
348-
/* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
349-
* Allow ENOENT in the event that swap is disabled on the system. */
350-
if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
351-
return log_error_errno(r, "Failed to acquire system context: %m");
352-
else if (r == -ENOENT)
353-
zero(m->system_context);
354-
355422
if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
356423
m->last_reclaim_at = usec_now;
357424

358-
/* If we're still recovering from a kill, don't try to kill again yet */
359-
if (m->post_action_delay_start > 0) {
360-
if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
425+
/* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
426+
* values and go on a kill storm. */
427+
if (m->mem_pressure_post_action_delay_start > 0) {
428+
if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
361429
return 0;
362430
else
363-
m->post_action_delay_start = 0;
431+
m->mem_pressure_post_action_delay_start = 0;
364432
}
365433

366434
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
@@ -396,7 +464,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
396464
log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
397465
else {
398466
/* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
399-
m->post_action_delay_start = usec_now;
467+
m->mem_pressure_post_action_delay_start = usec_now;
400468
if (selected)
401469
log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
402470
" for > %s with reclaim activity",
@@ -412,47 +480,42 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
412480
}
413481
}
414482

415-
if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
416-
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
417-
_cleanup_free_ char *selected = NULL;
483+
return 0;
484+
}
418485

419-
log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
420-
m->system_context.swap_used, m->system_context.swap_total,
421-
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
486+
static int monitor_swap_contexts(Manager *m) {
487+
_cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
488+
int r;
422489

423-
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
424-
if (r == -ENOMEM)
425-
return log_oom();
426-
if (r < 0)
427-
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
490+
assert(m);
491+
assert(m->event);
428492

429-
r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
430-
if (r == -ENOMEM)
431-
return log_oom();
432-
if (r < 0)
433-
log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
434-
else {
435-
m->post_action_delay_start = usec_now;
436-
if (selected)
437-
log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
438-
PERMYRIAD_AS_PERCENT_FORMAT_STR,
439-
selected, m->system_context.swap_used, m->system_context.swap_total,
440-
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
441-
return 0;
442-
}
443-
}
493+
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
494+
if (r < 0)
495+
return r;
496+
497+
r = sd_event_source_set_exit_on_failure(s, true);
498+
if (r < 0)
499+
return r;
444500

501+
r = sd_event_source_set_enabled(s, SD_EVENT_ON);
502+
if (r < 0)
503+
return r;
504+
505+
(void) sd_event_source_set_description(s, "oomd-swap-timer");
506+
507+
m->swap_context_event_source = TAKE_PTR(s);
445508
return 0;
446509
}
447510

448-
static int monitor_cgroup_contexts(Manager *m) {
511+
static int monitor_memory_pressure_contexts(Manager *m) {
449512
_cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
450513
int r;
451514

452515
assert(m);
453516
assert(m->event);
454517

455-
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
518+
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
456519
if (r < 0)
457520
return r;
458521

@@ -464,17 +527,18 @@ static int monitor_cgroup_contexts(Manager *m) {
464527
if (r < 0)
465528
return r;
466529

467-
(void) sd_event_source_set_description(s, "oomd-timer");
530+
(void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
468531

469-
m->cgroup_context_event_source = TAKE_PTR(s);
532+
m->mem_pressure_context_event_source = TAKE_PTR(s);
470533
return 0;
471534
}
472535

473536
Manager* manager_free(Manager *m) {
474537
assert(m);
475538

476539
varlink_close_unref(m->varlink);
477-
sd_event_source_unref(m->cgroup_context_event_source);
540+
sd_event_source_unref(m->swap_context_event_source);
541+
sd_event_source_unref(m->mem_pressure_context_event_source);
478542
sd_event_unref(m->event);
479543

480544
bus_verify_polkit_async_registry_free(m->polkit_registry);
@@ -596,7 +660,11 @@ int manager_start(
596660
if (r < 0)
597661
return r;
598662

599-
r = monitor_cgroup_contexts(m);
663+
r = monitor_memory_pressure_contexts(m);
664+
if (r < 0)
665+
return r;
666+
667+
r = monitor_swap_contexts(m);
600668
if (r < 0)
601669
return r;
602670

src/oom/oomd-manager.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
#include "varlink.h"
88

99
/* Polling interval for monitoring stats */
10-
#define INTERVAL_USEC (1 * USEC_PER_SEC)
10+
#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
11+
/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
12+
#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
1113

1214
/* Used to weight the averages */
1315
#define AVERAGE_SIZE_DECAY 4
@@ -45,9 +47,10 @@ struct Manager {
4547
OomdSystemContext system_context;
4648

4749
usec_t last_reclaim_at;
48-
usec_t post_action_delay_start;
50+
usec_t mem_pressure_post_action_delay_start;
4951

50-
sd_event_source *cgroup_context_event_source;
52+
sd_event_source *swap_context_event_source;
53+
sd_event_source *mem_pressure_context_event_source;
5154

5255
Varlink *varlink;
5356
};

0 commit comments

Comments
 (0)
X Tutup