bcachefs: Improve journal pin flushing

Running the preempt tiering tests with a lower than normal journal
reclaim delay turned up a shutdown hang - a lost wakeup, caused because
flushing a journal pin (e.g. key cache/write buffer) can generate a new
journal pin.

The "simple" fix of adding the correct wakeup didn't work because of
ordering issues; if we flush btree node pins too aggressively before
other pins have completed, we end up spinning where each flush iteration
generates new work.

So to fix this correctly:
- The list of flushed journal pins is now broken out by type, so that
  we can wait for key cache/write buffer pin flushing to complete
  before flushing dirty btree nodes

- A new closure_waitlist is added for bch2_journal_flush_pins; this one
  is only used under or when we're taking the journal lock, so it's
  pretty cheap to add rigorously correct wakeups to journal_pin_set()
  and journal_pin_drop().

Additionally, bch2_journal_seq_pins_to_text() is moved to
journal_reclaim.c, where it belongs, along with a bit of other small
renaming and refactoring.

Besides fixing the hang, the better ordering between key cache/write
buffer flushing and btree node flushing should help or fix the "unmount
taking excessively long" a few users have been noticing.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2025-01-25 19:22:50 -05:00
parent 0c74c85bbe
commit 35f5197009
6 changed files with 136 additions and 85 deletions

View File

@ -20,6 +20,7 @@
#include "extents.h"
#include "fsck.h"
#include "inode.h"
#include "journal_reclaim.h"
#include "super.h"
#include <linux/console.h>

View File

@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq)
static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
{
unsigned i;
for (i = 0; i < ARRAY_SIZE(p->list); i++)
INIT_LIST_HEAD(&p->list[i]);
INIT_LIST_HEAD(&p->flushed);
for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
INIT_LIST_HEAD(&p->unflushed[i]);
for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
}
@ -1626,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
__bch2_journal_debug_to_text(out, j);
spin_unlock(&j->lock);
}
bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
spin_lock(&j->lock);
if (!test_bit(JOURNAL_running, &j->flags)) {
spin_unlock(&j->lock);
return true;
}
*seq = max(*seq, j->pin.front);
if (*seq >= j->pin.back) {
spin_unlock(&j->lock);
return true;
}
out->atomic++;
pin_list = journal_seq_pin(j, *seq);
prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
printbuf_indent_add(out, 2);
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
list_for_each_entry(pin, &pin_list->list[i], list)
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
if (!list_empty(&pin_list->flushed))
prt_printf(out, "flushed:\n");
list_for_each_entry(pin, &pin_list->flushed, list)
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
printbuf_indent_sub(out, 2);
--out->atomic;
spin_unlock(&j->lock);
return false;
}
void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
u64 seq = 0;
while (!bch2_journal_seq_pins_to_text(out, j, &seq))
seq++;
}

View File

@ -430,8 +430,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);

View File

@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j)
popped = true;
}
if (popped)
if (popped) {
bch2_journal_space_available(j);
__closure_wake_up(&j->reclaim_flush_wait);
}
}
bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j,
pin->seq = 0;
list_del_init(&pin->list);
if (j->reclaim_flush_wait.list.first)
__closure_wake_up(&j->reclaim_flush_wait);
/*
* Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
{
if (fn == bch2_btree_node_flush0 ||
fn == bch2_btree_node_flush1)
return JOURNAL_PIN_btree;
return JOURNAL_PIN_TYPE_btree;
else if (fn == bch2_btree_key_cache_journal_flush)
return JOURNAL_PIN_key_cache;
return JOURNAL_PIN_TYPE_key_cache;
else
return JOURNAL_PIN_other;
return JOURNAL_PIN_TYPE_other;
}
static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
atomic_inc(&pin_list->count);
pin->seq = seq;
pin->flush = flush_fn;
list_add(&pin->list, &pin_list->list[type]);
if (list_empty(&pin_list->unflushed[type]) &&
j->reclaim_flush_wait.list.first)
__closure_wake_up(&j->reclaim_flush_wait);
list_add(&pin->list, &pin_list->unflushed[type]);
}
void bch2_journal_pin_copy(struct journal *j,
@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j,
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
unsigned i;
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
if (*seq > seq_to_flush && !allowed_above_seq)
break;
for (i = 0; i < JOURNAL_PIN_NR; i++)
if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
((1U << i) & allowed_above_seq)) {
ret = list_first_entry_or_null(&pin_list->list[i],
for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
(BIT(i) & allowed_above_seq)) {
ret = list_first_entry_or_null(&pin_list->unflushed[i],
struct journal_entry_pin, list);
if (ret)
return ret;
@ -544,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j,
}
if (min_key_cache) {
allowed_above |= 1U << JOURNAL_PIN_key_cache;
allowed_below |= 1U << JOURNAL_PIN_key_cache;
allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
}
cond_resched();
@ -553,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j,
j->last_flushed = jiffies;
spin_lock(&j->lock);
pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
pin = journal_get_next_pin(j, seq_to_flush,
allowed_below,
allowed_above, &seq);
if (pin) {
BUG_ON(j->flush_in_progress);
j->flush_in_progress = pin;
@ -576,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j,
spin_lock(&j->lock);
/* Pin might have been dropped or rearmed: */
if (likely(!err && !j->flush_in_progress_dropped))
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
j->flush_in_progress = NULL;
j->flush_in_progress_dropped = false;
spin_unlock(&j->lock);
@ -816,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j)
return 0;
}
static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
unsigned types)
{
struct journal_entry_pin_list *pin_list;
u64 seq;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
if (seq > seq_to_flush)
break;
for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
if ((BIT(i) & types) &&
(!list_empty(&pin_list->unflushed[i]) ||
!list_empty(&pin_list->flushed[i]))) {
spin_unlock(&j->lock);
return true;
}
}
spin_unlock(&j->lock);
return false;
}
static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
unsigned types)
{
return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
journal_pins_still_flushing(j, seq_to_flush, types);
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool *did_work)
{
int ret;
int ret = 0;
ret = bch2_journal_error(j);
if (ret)
@ -827,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
if (journal_flush_pins(j, seq_to_flush,
(1U << JOURNAL_PIN_key_cache)|
(1U << JOURNAL_PIN_other), 0, 0, 0) ||
journal_flush_pins(j, seq_to_flush,
(1U << JOURNAL_PIN_btree), 0, 0, 0))
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
BIT(JOURNAL_PIN_TYPE_key_cache)|
BIT(JOURNAL_PIN_TYPE_other))) {
*did_work = true;
goto unlock;
}
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
BIT(JOURNAL_PIN_TYPE_btree))) {
*did_work = true;
goto unlock;
}
if (seq_to_flush > journal_cur_seq(j))
bch2_journal_entry_close(j);
@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
!fifo_used(&j->pin);
spin_unlock(&j->lock);
unlock:
mutex_unlock(&j->reclaim_lock);
return ret;
@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
if (!test_bit(JOURNAL_running, &j->flags))
return false;
closure_wait_event(&j->async_wait,
closure_wait_event(&j->reclaim_flush_wait,
journal_flush_done(j, seq_to_flush, &did_work));
return did_work;
@ -926,3 +975,54 @@ err:
return ret;
}
bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
spin_lock(&j->lock);
if (!test_bit(JOURNAL_running, &j->flags)) {
spin_unlock(&j->lock);
return true;
}
*seq = max(*seq, j->pin.front);
if (*seq >= j->pin.back) {
spin_unlock(&j->lock);
return true;
}
out->atomic++;
pin_list = journal_seq_pin(j, *seq);
prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
printbuf_indent_add(out, 2);
prt_printf(out, "unflushed:\n");
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
list_for_each_entry(pin, &pin_list->unflushed[i], list)
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
prt_printf(out, "flushed:\n");
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
list_for_each_entry(pin, &pin_list->flushed[i], list)
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
printbuf_indent_sub(out, 2);
--out->atomic;
spin_unlock(&j->lock);
return false;
}
void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
{
u64 seq = 0;
while (!bch2_journal_seq_pins_to_text(out, j, &seq))
seq++;
}

View File

@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j)
int bch2_journal_flush_device_pins(struct journal *, int);
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */

View File

@ -53,15 +53,15 @@ struct journal_buf {
*/
enum journal_pin_type {
JOURNAL_PIN_btree,
JOURNAL_PIN_key_cache,
JOURNAL_PIN_other,
JOURNAL_PIN_NR,
JOURNAL_PIN_TYPE_btree,
JOURNAL_PIN_TYPE_key_cache,
JOURNAL_PIN_TYPE_other,
JOURNAL_PIN_TYPE_NR,
};
struct journal_entry_pin_list {
struct list_head list[JOURNAL_PIN_NR];
struct list_head flushed;
struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
struct list_head flushed[JOURNAL_PIN_TYPE_NR];
atomic_t count;
struct bch_devs_list devs;
};
@ -226,6 +226,7 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
struct closure_waitlist reclaim_flush_wait;
struct delayed_work write_work;
struct workqueue_struct *wq;