Optimize the computation of cur_win_nsec = win_nsec / sqrt(scale_step + 1)
in blk-wbt by introducing a fast inverse square root algorithm. This
replaces the original int_sqrt() and division with a more efficient
approximation: a 16-entry lookup table covers small scaling steps, and a
Newton-Raphson step in Q0.32 fixed point handles the rest.

Signed-off-by: Meng Shao Liu <sau525@xxxxxxxxx>
---
Since this fast inverse square root algorithm now appears in three
locations (blk-wbt, sch_cake, codel), it might be worth refactoring the
implementation into a shared helper to reduce duplication and keep the
copies consistent. This patch, however, focuses solely on introducing
the optimization in blk-wbt.

 block/blk-wbt.c | 60 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index a50d4cd55..1fd5af3ba 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -80,6 +80,8 @@ struct rq_wb {
 	u64 win_nsec;				/* default window size */
 	u64 cur_win_nsec;			/* current window size */
 
+	u32 rec_inv_sqrt;			/* reciprocal value of sqrt(scaling step + 1) */
+
 	struct blk_stat_callback *cb;
 
 	u64 sync_issue;
@@ -130,6 +132,11 @@ enum {
 	 */
 	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,
 
+	/*
+	 * Initial reciprocal value of sqrt(scaling step + 1)
+	 */
+	RWB_REC_INV_SQRT	= 0,
+
 	/*
 	 * Disregard stats, if we don't meet this minimum
 	 */
@@ -395,20 +402,55 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 	rwb_trace_step(rwb, tracepoint_string("scale down"));
 }
 
+#define REC_INV_SQRT_CACHE (16)
+static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
+		~0,	    ~0, 3037000500, 2479700525,
+	2147483647, 1920767767, 1753413056, 1623345051,
+	1518500250, 1431655765, 1358187914, 1294981364,
+	1239850263, 1191209601, 1147878294, 1108955788
+};
+
+/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+
+static void rwb_newton_step(struct rq_wb *rwb)
+{
+	struct rq_depth *rqd = &rwb->rq_depth;
+	u32 invsqrt, invsqrt2;
+	u64 val;
+
+	invsqrt = rwb->rec_inv_sqrt;
+	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
+	val = (3LL << 32) - ((u64)(rqd->scale_step + 1) * invsqrt2);
+
+	val >>= 2; /* avoid overflow in following multiply */
+	val = (val * invsqrt) >> (32 - 2 + 1);
+
+	rwb->rec_inv_sqrt = val;
+}
+
+static void rwb_invsqrt(struct rq_wb *rwb)
+{
+	struct rq_depth *rqd = &rwb->rq_depth;
+
+	if (rqd->scale_step + 1 < REC_INV_SQRT_CACHE)
+		rwb->rec_inv_sqrt = inv_sqrt_cache[rqd->scale_step + 1];
+	else
+		rwb_newton_step(rwb);
+}
+
 static void rwb_arm_timer(struct rq_wb *rwb)
 {
 	struct rq_depth *rqd = &rwb->rq_depth;
 
 	if (rqd->scale_step > 0) {
-		/*
-		 * We should speed this up, using some variant of a fast
-		 * integer inverse square root calculation. Since we only do
-		 * this for every window expiration, it's not a huge deal,
-		 * though.
-		 */
-		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
-					int_sqrt((rqd->scale_step + 1) << 8));
-	} else {
+		rwb_invsqrt(rwb);
+		rwb->cur_win_nsec = reciprocal_scale(rwb->win_nsec,
+						     rwb->rec_inv_sqrt);
+	} else {
 		/*
 		 * For step < 0, we don't want to increase/decrease the
 		 * window size.
@@ -911,6 +953,7 @@ int wbt_init(struct gendisk *disk)
 
 	rwb->last_comp = rwb->last_issue = jiffies;
 	rwb->win_nsec = RWB_WINDOW_NSEC;
+	rwb->rec_inv_sqrt = RWB_REC_INV_SQRT;
 	rwb->enable_state = WBT_STATE_ON_DEFAULT;
 	rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
 	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
-- 
2.50.1
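
P.S. For anyone who wants to sanity-check the fixed-point math outside the
kernel, here is a small userspace sketch, not part of the patch, with helper
names invented for illustration. It mirrors the lookup table and the Q0.32
Newton step added above, applies the same scaling as the kernel's
reciprocal_scale(), and compares the result against win_nsec / sqrt(step + 1)
computed in floating point. Build with: gcc -O2 -o invsqrt_demo invsqrt_demo.c -lm

/*
 * invsqrt_demo.c - standalone check of the Q0.32 inverse square root used
 * in the patch above. Illustration only.
 */
#include <stdio.h>
#include <stdint.h>
#include <math.h>

#define REC_INV_SQRT_CACHE 16

static const uint32_t inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
	       ~0U,        ~0U, 3037000500, 2479700525,
	2147483647, 1920767767, 1753413056, 1623345051,
	1518500250, 1431655765, 1358187914, 1294981364,
	1239850263, 1191209601, 1147878294, 1108955788
};

/* One Newton-Raphson step on a Q0.32 estimate of 1/sqrt(x):
 * new = (old / 2) * (3 - x * old^2), with the same shifts as rwb_newton_step().
 */
static uint32_t newton_step(uint32_t invsqrt, uint32_t x)
{
	uint32_t invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;
	uint64_t val = (3ULL << 32) - ((uint64_t)x * invsqrt2);

	val >>= 2;	/* avoid overflow in the following multiply */
	val = (val * invsqrt) >> (32 - 2 + 1);
	return (uint32_t)val;
}

/* Same operation as the kernel's reciprocal_scale(): val * q32 / 2^32 */
static uint32_t scale_q32(uint32_t val, uint32_t q32)
{
	return (uint32_t)(((uint64_t)val * q32) >> 32);
}

int main(void)
{
	const uint32_t win_nsec = 100 * 1000 * 1000;	/* RWB_WINDOW_NSEC */
	uint32_t rec_inv_sqrt = 0;			/* RWB_REC_INV_SQRT */
	int step;

	/* the kernel only takes this path for scale_step > 0 */
	for (step = 1; step <= 24; step++) {
		uint32_t x = step + 1;

		/* mirrors rwb_invsqrt(): cached value for small steps, Newton otherwise */
		if (x < REC_INV_SQRT_CACHE)
			rec_inv_sqrt = inv_sqrt_cache[x];
		else
			rec_inv_sqrt = newton_step(rec_inv_sqrt, x);

		printf("scale_step %2d: approx %9u ns   exact %9.0f ns\n",
		       step, (unsigned)scale_q32(win_nsec, rec_inv_sqrt),
		       win_nsec / sqrt((double)x));
	}
	return 0;
}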