diff --git a/src/cpu/riscv_fpu.c b/src/cpu/riscv_fpu.c index a1d60859c..a541305d2 100644 --- a/src/cpu/riscv_fpu.c +++ b/src/cpu/riscv_fpu.c @@ -54,46 +54,182 @@ static const uint32_t riscv_fli_table[32] = { 0x7FC00000UL, // Canonical NaN }; -static void riscv_prepare_rmm(rvvm_hart_t* vm, const uint32_t insn, const size_t rs1, const size_t rs2) +/* + * RMM (round to nearest, ties to max magnitude) == IEEE 754 roundTiesToAway. + * + * The host FPU has no such mode, so when frm == RMM the host is left in + * round-to-nearest-even (see fpu_set_rounding_mode()). RNE and roundTiesToAway + * produce identical results EXCEPT on an exact halfway tie, where RNE rounds to + * even and roundTiesToAway rounds to the larger-magnitude neighbour. + * + * So we compute the op in RNE, recover the EXACT rounding error via the library's + * error-free transforms (TwoSum / TwoProduct), and only when that error is + * exactly half a ULP away from zero do we step the result outward by one ULP. + * (The previous implementation rounded toward +/-inf unconditionally, which is + * correct on ties but wrong for every inexact non-tie. See issue #204.) + * + * fadd/fsub/fmul use riscv_rmm_apply (in riscv_fpu.h, shared with the FMA family). + * fsqrt never needs a fixup: a square + * root is irrational unless exact, and its result is always normal (the square + * root of even the smallest subnormal is ~2^-75), so RNE == roundTiesToAway. A + * *normally-rounded* quotient is likewise never an exact halfway case — but a + * *subnormal* quotient has reduced precision and CAN land exactly on a tie, so + * fdiv gets a dedicated subnormal fixup (riscv_rmm_div_apply). + * + * The error-free transforms run only for a finite result, and the per-op wrappers + * (riscv_rmm_add/mul/div) snapshot and restore the exception flags around them: + * TwoSum/TwoProduct do raw host arithmetic whose intermediate steps can raise + * spurious exceptions (inf-inf -> NV, near-FLT_MAX -> OF) that must not leak into + * fflags. The genuine flags are already set by the base op. + */ +// roundTiesToAway fixup for fadd/fsub (fsub passes b negated). Flag-isolated. +static forceinline fpu_f32_t riscv_rmm_add_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b) { - bool neg = false; + if (unlikely(!fpu_is_finite32(n))) { + return n; + } + const uint32_t exc = fpu_get_exceptions(); + const fpu_f32_t r = riscv_rmm_apply_f32(n, fpu_add_error32(n, a, b)); + fpu_set_exceptions(exc); + return r; +} - // Decide the sign of the output - switch (insn & 0xFE000000UL) { - case 0x00000000UL: // fadd.s - neg = fpu_signbit32(fpu_add32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); - break;; - case 0x02000000UL: // fadd.d - neg = fpu_signbit64(fpu_add64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); - break; - case 0x08000000UL: // fsub.s - neg = fpu_signbit32(fpu_sub32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); - break; - case 0x0A000000UL: // fsub.d - neg = fpu_signbit64(fpu_sub64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); - break; - case 0x10000000UL: // fmul.s - case 0x18000000UL: // fdiv.s - neg = fpu_signbit32(riscv_view_s(vm, rs1)) != fpu_signbit32(riscv_view_s(vm, rs2)); - break; - case 0x12000000UL: // fmul.d - case 0x1A000000UL: // fdiv.d - neg = fpu_signbit64(riscv_view_d(vm, rs1)) != fpu_signbit64(riscv_view_d(vm, rs2)); - break; - default: - neg = fpu_signbit64(riscv_view_d(vm, rs1)); - break; +static forceinline fpu_f64_t riscv_rmm_add_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b) +{ + if (unlikely(!fpu_is_finite64(n))) { + return n; } + const uint32_t exc = fpu_get_exceptions(); + const fpu_f64_t r = riscv_rmm_apply_f64(n, fpu_add_error64(n, a, b)); + fpu_set_exceptions(exc); + return r; +} - // Round to positive/negative infinity based on the result sign - if (neg) { - fpu_set_rounding_mode(FPU_LIB_ROUND_DN); - } else { - fpu_set_rounding_mode(FPU_LIB_ROUND_UP); +/* + * Dekker's product error (fpu_mul_error) loses bits once the product error drops + * below the smallest subnormal -- not only for a subnormal result, but throughout + * the smallest normal binades (the half-ULP ~2^(e-53) underflows for e <= -969). + * Those results need an exact tie test. + * + * For f32 the exact product widens losslessly into f64 (48 <= 53 bits) and the + * away-side midpoint is exact there (<= 25 bits, down to 2^-150), so the op is a + * tie iff a*b == m. This is exact for every f32 result, so fmul.s always uses it. + * + * For f64 there is no wider type, so scale both operands by 2^512 to lift the + * product into a binade where the fma-based TwoProduct is exact (a small product + * bounds |a|,|b|, so a*2^512 cannot overflow). The midpoint of two normals needs + * one bit more than the format holds, so rather than form it we test the residual: + * the op is a tie iff 2*(a*b - n) == ULP, evaluated in the scaled domain with an + * exact TwoSum cancellation. Mirrors the scaling in riscv_rmm_div_apply_f64. + */ +static forceinline fpu_f32_t riscv_rmm_mul_exact_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b) +{ + const fpu_f32_t away = fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(n) + 1); + const fpu_f64_t half = fpu_bit_u64_to_f64(0x3FE0000000000000ULL); // 0.5 + const fpu_f64_t dn = fpu_fcvt_f32_to_f64(n); + const fpu_f64_t dab = fpu_mul64(fpu_fcvt_f32_to_f64(a), fpu_fcvt_f32_to_f64(b)); // exact + const fpu_f64_t m = fpu_add64(dn, fpu_mul64(fpu_sub64(fpu_fcvt_f32_to_f64(away), dn), half)); + return ((fpu_bit_f64_to_u64(fpu_sub64(dab, m)) << 1) == 0) ? away : n; // tie iff a*b == m +} + +static forceinline fpu_f64_t riscv_rmm_mul_small_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b) +{ + const fpu_f64_t S = fpu_bit_u64_to_f64(0x5FF0000000000000ULL); // 2^512 + const fpu_f64_t as = fpu_mul64(a, S), bs = fpu_mul64(b, S); // exact: |a|,|b| bounded + const fpu_f64_t ps = fpu_mul64(as, bs); // a*b * 2^1024, normal + const fpu_f64_t pe = fpu_fma64(as, bs, fpu_neg64(ps)); // exact: a*b*2^1024 = ps + pe + const fpu_f64_t away = fpu_bit_u64_to_f64(fpu_bit_f64_to_u64(n) + 1); + const fpu_f64_t ns = fpu_mul64(fpu_mul64(n, S), S); // n * 2^1024, exact + const fpu_f64_t gaps = fpu_mul64(fpu_mul64(fpu_sub64(away, n), S), S); // ULP * 2^1024 + // residual (a*b - n)*2^1024 = dr + pe (dr exact, Sterbenz). Tie iff 2*(a*b-n) + // == ULP, i.e. 2*dr + 2*pe == gaps, via an exact TwoSum(2*dr, -gaps). + const fpu_f64_t dr = fpu_sub64(ps, ns); + const fpu_f64_t td = fpu_add64(dr, dr), tp = fpu_add64(pe, pe), ng = fpu_neg64(gaps); + const fpu_f64_t vh = fpu_add64(td, ng); + const fpu_f64_t vl = fpu_add_error64(vh, td, ng); // exact: (td - gaps) = vh + vl + return (((fpu_bit_f64_to_u64(vl) << 1) == 0) && ((fpu_bit_f64_to_u64(fpu_add64(vh, tp)) << 1) == 0)) + ? away : n; // tie iff td - gaps == -2*pe +} + +// roundTiesToAway fixup for fmul. Flag-isolated. fmul.s always uses the exact f64 +// widening; fmul.d uses the exact scaled test for small results (where the product +// error underflows) and the cheaper Dekker product error otherwise. +static forceinline fpu_f32_t riscv_rmm_mul_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b) +{ + if (unlikely(!fpu_is_finite32(n))) { + return n; + } + const uint32_t exc = fpu_get_exceptions(); + const fpu_f32_t r = riscv_rmm_mul_exact_f32(n, a, b); + fpu_set_exceptions(exc); + return r; +} + +static forceinline fpu_f64_t riscv_rmm_mul_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b) +{ + if (unlikely(!fpu_is_finite64(n))) { + return n; + } + const uint32_t exc = fpu_get_exceptions(); + // |n| >= 2^-959 (exp field >= 64): the product error stays representable, so + // the cheap Dekker path is exact. Below that, use the scaled residual test. + const fpu_f64_t r = (((fpu_bit_f64_to_u64(n) >> 52) & 0x7FFU) >= 64) + ? riscv_rmm_apply_f64(n, fpu_mul_error64(n, a, b)) + : riscv_rmm_mul_small_f64(n, a, b); + fpu_set_exceptions(exc); + return r; +} + +/* + * roundTiesToAway fixup for fdiv. Only a subnormal (or zero) quotient can be an + * exact tie; a normal quotient never is, so the common path returns immediately + * (non-finite results also have a non-zero exponent field and return unchanged). + * + * For a subnormal result the exact residual rho = a - n*b is representable, so + * fma(-n, b, a) recovers it exactly. The true quotient is n + rho/b, so it is the + * exact midpoint (a tie, rounded away) iff 2*rho == gap*b, where gap = away - n + * (= +/- the subnormal step). |rho| <= |b|*2^-1075, so every value below stays in + * range and exact. Flag-isolated, as the residual machinery can raise spurious + * exceptions. + */ +static forceinline fpu_f32_t riscv_rmm_div_apply_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b) +{ + if (likely(fpu_bit_f32_to_u32(n) & FPU_LIB_FP32_EXPONENT_MASK)) { + return n; // normal / inf / nan: RNE already == roundTiesToAway + } + const uint32_t exc = fpu_get_exceptions(); + const fpu_f32_t rho = fpu_fma32(fpu_neg32(n), b, a); // exact residual a - n*b + const fpu_f32_t away = fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(n) + 1); + const fpu_f32_t gap = fpu_sub32(away, n); // +/- 2^-149, exact + // Tie iff 2*rho == gap*b, evaluated in fp64 where both sides are exact + // (operands widen losslessly and gap*b ~ 2^-22 stays well inside the range). + const fpu_f64_t two_rho = fpu_add64(fpu_fcvt_f32_to_f64(rho), fpu_fcvt_f32_to_f64(rho)); + const fpu_f64_t gap_b = fpu_mul64(fpu_fcvt_f32_to_f64(gap), fpu_fcvt_f32_to_f64(b)); + const fpu_f32_t r = fpu_is_bit_equal64(two_rho, gap_b) ? away : n; + fpu_set_exceptions(exc); + return r; +} + +static forceinline fpu_f64_t riscv_rmm_div_apply_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b) +{ + if (likely(fpu_bit_f64_to_u64(n) & FPU_LIB_FP64_EXPONENT_MASK)) { + return n; } + const uint32_t exc = fpu_get_exceptions(); + const fpu_f64_t rho = fpu_fma64(fpu_neg64(n), b, a); + const fpu_f64_t away = fpu_bit_u64_to_f64(fpu_bit_f64_to_u64(n) + 1); + const fpu_f64_t gap = fpu_sub64(away, n); // +/- 2^-1074, exact + // fp64 has no wider type; gap*b underflows, so rescale 2*rho == gap*b by the + // gap magnitude (2^1074) as two exact power-of-two steps: rho*2^1075 == +/-b. + const fpu_f64_t scaled = fpu_mul64(fpu_mul64(rho, fpu_bit_u64_to_f64(0x7FE0000000000000ULL)), // 2^1023 + fpu_bit_u64_to_f64(0x4330000000000000ULL)); // 2^52 + const fpu_f64_t target = (fpu_bit_f64_to_u64(gap) >> 63) ? fpu_neg64(b) : b; + const fpu_f64_t r = fpu_is_bit_equal64(scaled, target) ? away : n; + fpu_set_exceptions(exc); + return r; } -slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint32_t insn) +static slow_path void riscv_emulate_f_opc_op_impl(rvvm_hart_t* vm, const uint32_t insn, const bool rmm) { const size_t rds = bit_ext_u32(insn, 7, 5); const uint32_t rm = bit_ext_u32(insn, 12, 3); @@ -102,48 +238,67 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 if (likely(riscv_fpu_is_enabled(vm))) { - if (unlikely(vm->csr.fcsr >> 5 == 0x04)) { - // Handle RMM rounding - riscv_prepare_rmm(vm, insn, rs1, rs2); - } - switch (insn & 0xFE007000UL) { /* * FPU computations */ - case RISCV_FPU_GEN_RM_CASES(0x00000000UL): // fadd.s - riscv_emit_s(vm, rds, fpu_add32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); - return; - case RISCV_FPU_GEN_RM_CASES(0x02000000UL): // fadd.d - riscv_emit_d(vm, rds, fpu_add64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); - return; - case RISCV_FPU_GEN_RM_CASES(0x08000000UL): // fsub.s - riscv_write_s(vm, rds, fpu_sub32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); - return; - case RISCV_FPU_GEN_RM_CASES(0x0A000000UL): // fsub.d - riscv_write_d(vm, rds, fpu_sub64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); - return; - case RISCV_FPU_GEN_RM_CASES(0x10000000UL): // fmul.s - riscv_emit_s(vm, rds, fpu_mul32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); - return; - case RISCV_FPU_GEN_RM_CASES(0x12000000UL): // fmul.d - riscv_emit_d(vm, rds, fpu_mul64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); - return; - case RISCV_FPU_GEN_RM_CASES(0x18000000UL): // fdiv.s - riscv_emit_s(vm, rds, fpu_div32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); - return; - case RISCV_FPU_GEN_RM_CASES(0x1A000000UL): // fdiv.d - riscv_emit_d(vm, rds, fpu_div64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); - return; + case RISCV_FPU_GEN_RM_CASES(0x00000000UL): { // fadd.s + const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2); + const fpu_f32_t n = fpu_add32(a, b); + riscv_write_s(vm, rds, rmm ? riscv_rmm_add_f32(n, a, b) : n); + return; + } + case RISCV_FPU_GEN_RM_CASES(0x02000000UL): { // fadd.d + const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2); + const fpu_f64_t n = fpu_add64(a, b); + riscv_write_d(vm, rds, rmm ? riscv_rmm_add_f64(n, a, b) : n); + return; + } + case RISCV_FPU_GEN_RM_CASES(0x08000000UL): { // fsub.s + const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2); + const fpu_f32_t n = fpu_sub32(a, b); + riscv_write_s(vm, rds, rmm ? riscv_rmm_add_f32(n, a, fpu_neg32(b)) : n); + return; + } + case RISCV_FPU_GEN_RM_CASES(0x0A000000UL): { // fsub.d + const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2); + const fpu_f64_t n = fpu_sub64(a, b); + riscv_write_d(vm, rds, rmm ? riscv_rmm_add_f64(n, a, fpu_neg64(b)) : n); + return; + } + case RISCV_FPU_GEN_RM_CASES(0x10000000UL): { // fmul.s + const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2); + const fpu_f32_t n = fpu_mul32(a, b); + riscv_write_s(vm, rds, rmm ? riscv_rmm_mul_f32(n, a, b) : n); + return; + } + case RISCV_FPU_GEN_RM_CASES(0x12000000UL): { // fmul.d + const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2); + const fpu_f64_t n = fpu_mul64(a, b); + riscv_write_d(vm, rds, rmm ? riscv_rmm_mul_f64(n, a, b) : n); + return; + } + case RISCV_FPU_GEN_RM_CASES(0x18000000UL): { // fdiv.s + const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2); + const fpu_f32_t n = fpu_div32(a, b); + riscv_write_s(vm, rds, rmm ? riscv_rmm_div_apply_f32(n, a, b) : n); + return; + } + case RISCV_FPU_GEN_RM_CASES(0x1A000000UL): { // fdiv.d + const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2); + const fpu_f64_t n = fpu_div64(a, b); + riscv_write_d(vm, rds, rmm ? riscv_rmm_div_apply_f64(n, a, b) : n); + return; + } case RISCV_FPU_GEN_RM_CASES(0x58000000UL): // fsqrt.s if (likely(!rs2)) { - riscv_emit_s(vm, rds, fpu_sqrt32(riscv_view_s(vm, rs1))); + riscv_write_s(vm, rds, fpu_sqrt32(riscv_read_s(vm, rs1))); return; } break; case RISCV_FPU_GEN_RM_CASES(0x5A000000UL): // fsqrt.s if (likely(!rs2)) { - riscv_emit_d(vm, rds, fpu_sqrt64(riscv_view_d(vm, rs1))); + riscv_write_d(vm, rds, fpu_sqrt64(riscv_view_d(vm, rs1))); return; } break; @@ -157,14 +312,14 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 return; case 0x04: // fround.s (Zfa) case 0x05: // TODO: froundx.s (Zfa) - riscv_emit_s(vm, rds, fpu_fcvt_i64_to_f32(fpu_round_f32_to_i64(riscv_view_s(vm, rs1), rm))); + riscv_emit_s(vm, rds, fpu_fcvt_i64_to_f32(fpu_round_f32_to_i64(riscv_read_s(vm, rs1), rm))); return; } break; case RISCV_FPU_GEN_RM_CASES(0x42000000UL): switch (rs2) { case 0x00: // fcvt.d.s - riscv_write_d(vm, rds, fpu_fcvt_f32_to_f64(riscv_view_s(vm, rs1))); + riscv_write_d(vm, rds, fpu_fcvt_f32_to_f64(riscv_read_s(vm, rs1))); return; case 0x04: // fround.s (Zfa) case 0x05: // TODO: froundx.s (Zfa) @@ -175,20 +330,20 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 case RISCV_FPU_GEN_RM_CASES(0xC0000000UL): switch (rs2) { case 0x00: // fcvt.w.s - riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_i32(riscv_view_s(vm, rs1), rm)); + riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_i32(riscv_read_s(vm, rs1), rm)); return; case 0x01: // fcvt.wu.s - riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_u32(riscv_view_s(vm, rs1), rm)); + riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_u32(riscv_read_s(vm, rs1), rm)); return; case 0x02: // fcvt.l.s if (likely(vm->rv64)) { - riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_i64(riscv_view_s(vm, rs1), rm)); + riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_i64(riscv_read_s(vm, rs1), rm)); return; } break; case 0x03: // fcvt.lu.s if (likely(vm->rv64)) { - riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_u64(riscv_view_s(vm, rs1), rm)); + riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_u64(riscv_read_s(vm, rs1), rm)); return; } break; @@ -272,10 +427,10 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 riscv_emit_s(vm, rds, fpu_fsgnj32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0x20001000UL: // fsgnjn.s - riscv_emit_s(vm, rds, fpu_fsgnjn32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_emit_s(vm, rds, fpu_fsgnjn32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0x20002000UL: // fsgnjx.s - riscv_emit_s(vm, rds, fpu_fsgnjx32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_emit_s(vm, rds, fpu_fsgnjx32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0x22000000UL: // fsgnj.d riscv_emit_d(vm, rds, fpu_fsgnj64(riscv_read_d(vm, rs1), riscv_read_d(vm, rs2))); @@ -290,16 +445,16 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 * FPU comparisons */ case 0x28000000UL: // fmin.s - riscv_emit_s(vm, rds, fpu_min32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_emit_s(vm, rds, fpu_min32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0x28001000UL: // fmax.s - riscv_emit_s(vm, rds, fpu_max32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_emit_s(vm, rds, fpu_max32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0x28002000UL: // fminm.s (Zfa) - riscv_write_s(vm, rds, fpu_min32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_write_s(vm, rds, fpu_min32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0x28003000UL: // fmaxm.s (Zfa) - riscv_write_s(vm, rds, fpu_max32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_write_s(vm, rds, fpu_max32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0x2A000000UL: // fmin.d riscv_emit_d(vm, rds, fpu_min64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); @@ -314,19 +469,19 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 riscv_write_d(vm, rds, fpu_max64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); return; case 0xA0000000UL: // fle.s - riscv_write_reg(vm, rds, fpu_is_fle32_sig(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_write_reg(vm, rds, fpu_is_fle32_sig(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0xA0001000UL: // flt.s - riscv_write_reg(vm, rds, fpu_is_flt32_sig(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_write_reg(vm, rds, fpu_is_flt32_sig(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0xA0002000UL: // feq.s - riscv_write_reg(vm, rds, fpu_is_equal32_quiet(riscv_read_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_write_reg(vm, rds, fpu_is_equal32_quiet(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0xA0004000UL: // fleq.s (Zfa) - riscv_write_reg(vm, rds, fpu_is_fle32_quiet(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_write_reg(vm, rds, fpu_is_fle32_quiet(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0xA0005000UL: // fltq.s (Zfa) - riscv_write_reg(vm, rds, fpu_is_flt32_quiet(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2))); + riscv_write_reg(vm, rds, fpu_is_flt32_quiet(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2))); return; case 0xA2000000UL: // fle.d riscv_write_reg(vm, rds, fpu_is_fle64_sig(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2))); @@ -354,7 +509,7 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 break; case 0xE0001000UL: // fclass.s if (likely(!rs2)) { - riscv_write_reg(vm, rds, 1U << fpu_fclass32(riscv_view_s(vm, rs1))); + riscv_write_reg(vm, rds, 1U << fpu_fclass32(riscv_read_s(vm, rs1))); return; } break; @@ -406,4 +561,32 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3 riscv_illegal_insn(vm, insn); } +/* + * RISC-V selects the rounding mode either dynamically (the frm CSR, when the + * instruction's rm field is DYN) or statically (the rm field itself). The host + * FPU tracks frm, so a static rm field that differs from frm must be applied + * around the op. RMM has no host mode at all and is synthesized in RNE by the + * fixups above (riscv_rmm_apply / riscv_rmm_div_apply), so its sub-ops must run + * in RNE. funct3 == rm only carries a rounding mode on rounding-capable ops, so + * this never misfires on fsgnj/fcmp/fclass/fmv. + */ +slow_path void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint32_t insn) +{ + const uint32_t rm = bit_ext_u32(insn, 12, 3); + const uint32_t frm = vm->csr.fcsr >> 5; + // Effective rounding mode: a static rm field overrides the dynamic frm CSR. + const uint32_t eff = (rm == 0x07) ? frm : rm; + const bool rmm = (eff == 0x04); + // Override the host mode when synthesizing RMM (run sub-ops in RNE) or when a + // static rm field selects a host-native mode other than the one frm left set. + if (unlikely(rmm || (rm != 0x07 && eff != frm))) { + const uint32_t host = fpu_get_rounding_mode(); + fpu_set_rounding_mode(rmm ? FPU_LIB_ROUND_NE : eff); + riscv_emulate_f_opc_op_impl(vm, insn, rmm); + fpu_set_rounding_mode(host); + } else { + riscv_emulate_f_opc_op_impl(vm, insn, false); + } +} + #endif diff --git a/src/cpu/riscv_fpu.h b/src/cpu/riscv_fpu.h index a5fb25d9d..6395bd320 100644 --- a/src/cpu/riscv_fpu.h +++ b/src/cpu/riscv_fpu.h @@ -26,13 +26,13 @@ static forceinline bool riscv_fpu_rm_is_valid(uint32_t rm) return rm > 1; } -// Bit-precise register reads +// Bit-precise register read (raw low 32 bits, no NaN-box check) — for fmv.x.w static forceinline fpu_f32_t riscv_view_s(rvvm_hart_t* vm, size_t reg) { return fpu_unpack_f32_from_f64(vm->fpu_registers[reg]); } -// Normalized register reads +// Value register read: a mal-boxed narrow operand reads as the canonical NaN static forceinline fpu_f32_t riscv_read_s(rvvm_hart_t* vm, size_t reg) { return fpu_nan_unbox_f32(vm->fpu_registers[reg]); @@ -82,6 +82,48 @@ static forceinline void riscv_write_d(rvvm_hart_t* vm, size_t reg, fpu_f64_t val slow_path void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint32_t insn); +/* + * roundTiesToAway core fixup, shared by the OP-FP dispatch (riscv_fpu.c) and the + * FMA family (riscv_rmm_fma_apply, below). Given a round-to-nearest-even result n + * and the EXACT rounding error err = (true result) - n, step n one ULP outward + * only on an exact halfway tie (err exactly half a ULP, pointing away from zero); + * otherwise RNE already equals roundTiesToAway. See the strategy comment in + * riscv_fpu.c. + */ +static forceinline fpu_f32_t riscv_rmm_apply_f32(fpu_f32_t n, fpu_f32_t err) +{ + const uint32_t un = fpu_bit_f32_to_u32(n); + const uint32_t ue = fpu_bit_f32_to_u32(err); + // Skip exact results (err == +/-0) and errors pointing toward zero: in both + // cases RNE already delivers the roundTiesToAway value. + if ((ue << 1) == 0 || (un >> 31) != (ue >> 31)) { + return n; + } + // n + 1 ULP toward larger magnitude, and the exact gap to it. + const fpu_f32_t away = fpu_bit_u32_to_f32(un + 1); + const fpu_f32_t spacing = fpu_sub32(away, n); // exact: adjacent floats + // Tie iff the exact result is the midpoint, i.e. 2*err == spacing. + if (fpu_is_bit_equal32(fpu_add32(err, err), spacing)) { + return away; + } + return n; +} + +static forceinline fpu_f64_t riscv_rmm_apply_f64(fpu_f64_t n, fpu_f64_t err) +{ + const uint64_t un = fpu_bit_f64_to_u64(n); + const uint64_t ue = fpu_bit_f64_to_u64(err); + if ((ue << 1) == 0 || (un >> 63) != (ue >> 63)) { + return n; + } + const fpu_f64_t away = fpu_bit_u64_to_f64(un + 1); + const fpu_f64_t spacing = fpu_sub64(away, n); + if (fpu_is_bit_equal64(fpu_add64(err, err), spacing)) { + return away; + } + return n; +} + #if defined(RISCV32) || defined(RISCV64) static forceinline void riscv_emulate_f_opc_load(rvvm_hart_t* vm, const uint32_t insn) @@ -126,6 +168,175 @@ static forceinline void riscv_emulate_f_opc_store(rvvm_hart_t* vm, const uint32_ riscv_illegal_insn(vm, insn); } +/* + * FMA honors the rounding mode like any other rounding op: dynamically via frm + * (the host already tracks it) or statically via the instruction's rm field. + * fpu_fma rounds in the host mode, so a static rm that differs from frm must be + * applied around the op; RMM has no host mode and is run in RNE. (RMM differs + * from RNE only on an exact halfway tie, and an exact FMA tie is vanishingly rare + * — the remaining ties-away gap is shared with the rest of the FP path.) + */ +// Underflow flag, RISC-V semantics: UF iff the result is tiny *after rounding* and +// inexact. A subnormal result is unambiguously tiny; a normal result above the +// smallest normal is not. The only hard case is a result of exactly the smallest +// normal: some hosts (e.g. aarch64) detect tininess *before* rounding and raise a +// spurious UF, but RISC-V follows the IEEE "after rounding" rule -- tiny iff the +// exact result, rounded to full precision with an *unbounded* exponent (the finer +// subnormal grid would not have crossed into the normal range), is still below the +// smallest normal. So at the boundary we recompute that: widen the exact a*b+c into +// f64 (lossless for a smallest-normal f32 result), scale into [0.5, 1), and round +// back to f32 in the effective mode -- a magnitude >= 1.0 means not tiny. The gate +// keeps the common FMA path at a single compare. +static forceinline void riscv_fma_fixup_uf32(fpu_f32_t r, fpu_f32_t a, fpu_f32_t b, fpu_f32_t c, uint32_t eff) +{ + if (likely((fpu_bit_f32_to_u32(r) & 0x7FFFFFFFU) != 0x00800000U)) { + return; // subnormal UF is genuine; a larger normal result has none + } + const uint32_t exc = fpu_get_exceptions(); + if (!(exc & FPU_LIB_FLAG_NX)) { + return; // an exact smallest-normal result never underflows + } + const fpu_f64_t e = fpu_add64(fpu_mul64(fpu_fcvt_f32_to_f64(a), fpu_fcvt_f32_to_f64(b)), + fpu_fcvt_f32_to_f64(c)); // exact + const fpu_f64_t scaled = fpu_mul64(e, fpu_bit_u64_to_f64(0x47D0000000000000ULL)); // * 2^126 -> ~[0.5,1) + const uint32_t host = fpu_get_rounding_mode(); + fpu_set_rounding_mode(eff == 0x04 ? FPU_LIB_ROUND_NE : eff); + const fpu_f32_t rn = fpu_fcvt_f64_to_f32(scaled); + fpu_set_rounding_mode(host); + const bool tiny = (fpu_bit_f32_to_u32(rn) & 0x7FFFFFFFU) < 0x3F800000U; // |rn| < 1.0 + fpu_set_exceptions(tiny ? (exc | FPU_LIB_FLAG_UF) : (exc & ~FPU_LIB_FLAG_UF)); +} + +// f64 has no wider type, so recompute the after-rounding tininess by scaling: a +// smallest-normal f64 result bounds |a|,|b| (a tiny product forces small operands), +// so a*2^200 cannot overflow, and fma(a*2^200, b, c*2^200) = (a*b+c)*2^200 lands in +// the normal range where the fused rounding is unbounded-exponent. The result is +// tiny iff that scaled magnitude is below the scaled smallest normal (2^-822). +static forceinline void riscv_fma_fixup_uf64(fpu_f64_t r, fpu_f64_t a, fpu_f64_t b, fpu_f64_t c, uint32_t eff) +{ + if (likely((fpu_bit_f64_to_u64(r) & 0x7FFFFFFFFFFFFFFFULL) != 0x0010000000000000ULL)) { + return; // subnormal UF is genuine; a larger normal result has none + } + const uint32_t exc = fpu_get_exceptions(); + if (!(exc & FPU_LIB_FLAG_NX)) { + return; // an exact smallest-normal result never underflows + } + const fpu_f64_t S = fpu_bit_u64_to_f64(0x4C70000000000000ULL); // 2^200 + const uint32_t host = fpu_get_rounding_mode(); + fpu_set_rounding_mode(eff == 0x04 ? FPU_LIB_ROUND_NE : eff); + const fpu_f64_t rs = fpu_fma64(fpu_mul64(a, S), b, fpu_mul64(c, S)); // (a*b+c) * 2^200 + fpu_set_rounding_mode(host); + const bool tiny = (fpu_bit_f64_to_u64(rs) & 0x7FFFFFFFFFFFFFFFULL) < 0x0C90000000000000ULL; // |rs| < 2^-822 + fpu_set_exceptions(tiny ? (exc | FPU_LIB_FLAG_UF) : (exc & ~FPU_LIB_FLAG_UF)); +} + +/* + * roundTiesToAway fixup for the FMA family. As with the OP-FP ops, the host runs + * in RNE under frm == RMM, so only an exact halfway tie needs correcting. Both are + * flag-isolated: the error-free arithmetic does raw host ops whose intermediate + * steps can raise spurious exceptions, while the genuine flags are already set by + * the base fma. + * + * f32: widen to f64, where a*b is exact (48 <= 53 bits) and the away-side midpoint + * m is exact (<= 25 significant bits, including the subnormal range). The op is an + * exact tie iff a*b + c == m, tested as ((a*b + c) - m) == 0 with every term + * exact. A subnormal f32 result is covered too: it widens to a normal f64. + */ +static forceinline fpu_f32_t riscv_rmm_fma_apply_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b, fpu_f32_t c) +{ + if (unlikely(!fpu_is_finite32(n))) { + return n; + } + const uint32_t exc = fpu_get_exceptions(); + const fpu_f32_t away = fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(n) + 1); // toward larger magnitude + const fpu_f64_t dab = fpu_mul64(fpu_fcvt_f32_to_f64(a), fpu_fcvt_f32_to_f64(b)); // exact + const fpu_f64_t dc = fpu_fcvt_f32_to_f64(c); + const fpu_f64_t s1 = fpu_add64(dab, dc); + const fpu_f64_t s1e = fpu_add_error64(s1, dab, dc); // exact: dab + dc == s1 + s1e + const fpu_f64_t dn = fpu_fcvt_f32_to_f64(n); + const fpu_f64_t half = fpu_bit_u64_to_f64(0x3FE0000000000000ULL); // 0.5 + const fpu_f64_t m = fpu_add64(dn, fpu_mul64(fpu_sub64(fpu_fcvt_f32_to_f64(away), dn), half)); + const fpu_f64_t resid = fpu_add64(fpu_sub64(s1, m), s1e); // (a*b + c) - m, exactly 0 only on a tie + const fpu_f32_t r = ((fpu_bit_f64_to_u64(resid) << 1) == 0) ? away : n; + fpu_set_exceptions(exc); + return r; +} + +/* + * f64: there is no wider host type, so recover the exact residual a*b + c - n via + * the Boldo-Muller error-free FMA (TwoProduct + two TwoSums + a final FastTwoSum), + * which splits it into a non-overlapping pair (r2, r3). An exact tie has the + * residual equal to +/- half a ULP -- a single float -- so r3 == 0 and r2 is that + * half-ULP, exactly the condition riscv_rmm_apply_f64 already tests. + */ +static forceinline fpu_f64_t riscv_rmm_fma_apply_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b, fpu_f64_t c) +{ + if (unlikely(!fpu_is_finite64(n))) { + return n; + } + const uint32_t exc = fpu_get_exceptions(); + const fpu_f64_t p = fpu_mul64(a, b); + const fpu_f64_t pe = fpu_fma64(a, b, fpu_neg64(p)); // TwoProduct tail: a*b == p + pe + const fpu_f64_t a1 = fpu_add64(c, pe); + const fpu_f64_t a2 = fpu_add_error64(a1, c, pe); // TwoSum(c, pe) + const fpu_f64_t b1 = fpu_add64(p, a1); + const fpu_f64_t b2 = fpu_add_error64(b1, p, a1); // TwoSum(p, a1) + const fpu_f64_t g = fpu_add64(fpu_sub64(b1, n), b2); + const fpu_f64_t r2 = fpu_add64(g, a2); // FastTwoSum(g, a2): residual == r2 + r3 + const fpu_f64_t r3 = fpu_sub64(a2, fpu_sub64(r2, g)); + const fpu_f64_t r = ((fpu_bit_f64_to_u64(r3) << 1) == 0) ? riscv_rmm_apply_f64(n, r2) : n; + fpu_set_exceptions(exc); + return r; +} + +static forceinline fpu_f32_t riscv_fma_round_f32(rvvm_hart_t* vm, uint32_t rm, fpu_f32_t a, fpu_f32_t b, fpu_f32_t c) +{ + const uint32_t frm = vm->csr.fcsr >> 5; + const uint32_t eff = (rm == 0x07) ? frm : rm; + fpu_f32_t r; + if (unlikely(eff == 0x04)) { + // RMM: compute in RNE, then apply the roundTiesToAway fixup. + const uint32_t host = fpu_get_rounding_mode(); + fpu_set_rounding_mode(FPU_LIB_ROUND_NE); + r = riscv_rmm_fma_apply_f32(fpu_fma32(a, b, c), a, b, c); + fpu_set_rounding_mode(host); + } else if (rm != 0x07 && eff != frm) { + // Static host-native mode that differs from frm. + const uint32_t host = fpu_get_rounding_mode(); + fpu_set_rounding_mode(eff); + r = fpu_fma32(a, b, c); + fpu_set_rounding_mode(host); + } else { + r = fpu_fma32(a, b, c); + } + riscv_fma_fixup_uf32(r, a, b, c, eff); + return r; +} + +static forceinline fpu_f64_t riscv_fma_round_f64(rvvm_hart_t* vm, uint32_t rm, fpu_f64_t a, fpu_f64_t b, fpu_f64_t c) +{ + const uint32_t frm = vm->csr.fcsr >> 5; + const uint32_t eff = (rm == 0x07) ? frm : rm; + fpu_f64_t r; + if (unlikely(eff == 0x04)) { + // RMM: compute in RNE, then apply the roundTiesToAway fixup. + const uint32_t host = fpu_get_rounding_mode(); + fpu_set_rounding_mode(FPU_LIB_ROUND_NE); + r = riscv_rmm_fma_apply_f64(fpu_fma64(a, b, c), a, b, c); + fpu_set_rounding_mode(host); + } else if (rm != 0x07 && eff != frm) { + // Static host-native mode that differs from frm. + const uint32_t host = fpu_get_rounding_mode(); + fpu_set_rounding_mode(eff); + r = fpu_fma64(a, b, c); + fpu_set_rounding_mode(host); + } else { + r = fpu_fma64(a, b, c); + } + riscv_fma_fixup_uf64(r, a, b, c, eff); + return r; +} + static forceinline void riscv_emulate_f_fmadd(rvvm_hart_t* vm, const uint32_t insn) { const size_t rds = bit_ext_u32(insn, 7, 5); @@ -137,14 +348,14 @@ static forceinline void riscv_emulate_f_fmadd(rvvm_hart_t* vm, const uint32_t in if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) { switch (bit_ext_u32(insn, 25, 2)) { case 0x0: // fmadd.s - riscv_emit_s(vm, rds, - fpu_fma32(riscv_view_s(vm, rs1), // - riscv_view_s(vm, rs2), // - riscv_view_s(vm, rs3))); + riscv_write_s(vm, rds, + riscv_fma_round_f32(vm, rm, riscv_read_s(vm, rs1), // + riscv_read_s(vm, rs2), // + riscv_read_s(vm, rs3))); return; case 0x1: // fmadd.d - riscv_emit_d(vm, rds, - fpu_fma64(riscv_view_d(vm, rs1), // + riscv_write_d(vm, rds, + riscv_fma_round_f64(vm, rm, riscv_view_d(vm, rs1), // riscv_view_d(vm, rs2), // riscv_view_d(vm, rs3))); return; @@ -165,14 +376,14 @@ static forceinline void riscv_emulate_f_fmsub(rvvm_hart_t* vm, const uint32_t in if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) { switch (bit_ext_u32(insn, 25, 2)) { case 0x0: // fmsub.s - riscv_emit_s(vm, rds, - fpu_fma32(riscv_view_s(vm, rs1), // - riscv_view_s(vm, rs2), // - fpu_neg32(riscv_view_s(vm, rs3)))); + riscv_write_s(vm, rds, + riscv_fma_round_f32(vm, rm, riscv_read_s(vm, rs1), // + riscv_read_s(vm, rs2), // + fpu_neg32(riscv_read_s(vm, rs3)))); return; case 0x1: // fmsub.d - riscv_emit_d(vm, rds, - fpu_fma64(riscv_view_d(vm, rs1), // + riscv_write_d(vm, rds, + riscv_fma_round_f64(vm, rm, riscv_view_d(vm, rs1), // riscv_view_d(vm, rs2), // fpu_neg64(riscv_view_d(vm, rs3)))); return; @@ -193,14 +404,14 @@ static forceinline void riscv_emulate_f_fnmsub(rvvm_hart_t* vm, const uint32_t i if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) { switch (bit_ext_u32(insn, 25, 2)) { case 0x0: // fnmsub.s - riscv_emit_s(vm, rds, - fpu_fma32(fpu_neg32(riscv_view_s(vm, rs1)), // - riscv_view_s(vm, rs2), // - riscv_view_s(vm, rs3))); + riscv_write_s(vm, rds, + riscv_fma_round_f32(vm, rm, fpu_neg32(riscv_read_s(vm, rs1)), // + riscv_read_s(vm, rs2), // + riscv_read_s(vm, rs3))); return; case 0x1: // fnmsub.d - riscv_emit_d(vm, rds, - fpu_fma64(fpu_neg64(riscv_view_d(vm, rs1)), // + riscv_write_d(vm, rds, + riscv_fma_round_f64(vm, rm, fpu_neg64(riscv_view_d(vm, rs1)), // riscv_view_d(vm, rs2), // riscv_view_d(vm, rs3))); return; @@ -220,17 +431,18 @@ static forceinline void riscv_emulate_f_fnmadd(rvvm_hart_t* vm, const uint32_t i if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) { switch (bit_ext_u32(insn, 25, 2)) { - case 0x0: // fnmadd.s - riscv_emit_s(vm, rds, - fpu_neg32(fpu_fma32(riscv_view_s(vm, rs1), // - riscv_view_s(vm, rs2), // - riscv_view_s(vm, rs3)))); + case 0x0: // fnmadd.s = -(rs1*rs2) - rs3; negate operands so the single + // rounding sees the correctly-signed result (directed modes) + riscv_write_s(vm, rds, + riscv_fma_round_f32(vm, rm, fpu_neg32(riscv_read_s(vm, rs1)), // + riscv_read_s(vm, rs2), // + fpu_neg32(riscv_read_s(vm, rs3)))); return; case 0x1: // fnmadd.d - riscv_emit_d(vm, rds, - fpu_neg64(fpu_fma64(riscv_view_d(vm, rs1), // + riscv_write_d(vm, rds, + riscv_fma_round_f64(vm, rm, fpu_neg64(riscv_view_d(vm, rs1)), // riscv_view_d(vm, rs2), // - riscv_view_d(vm, rs3)))); + fpu_neg64(riscv_view_d(vm, rs3)))); return; } } diff --git a/src/util/fpu_lib.c b/src/util/fpu_lib.c index c82f0451e..80a74bc42 100644 --- a/src/util/fpu_lib.c +++ b/src/util/fpu_lib.c @@ -407,52 +407,76 @@ slow_path uint32_t fpu_fclass64(fpu_f64_t d) slow_path fpu_f32_t fpu_round_f32_internal(fpu_f32_t f, uint32_t mode) { - uint32_t u = fpu_bit_f32_to_u32(f); - uint32_t s = u & FPU_LIB_FP32_SIGNEDFP_MASK; + // A non-fractional value (integer, +/-0, inf, NaN) is already rounded. Return it + // unchanged: the round-to-nearest path below adds +/-0.5, which is inexact once + // 0.5 underflows a large value's ULP and would raise a spurious INEXACT -- e.g. + // on an out-of-range fcvt-to-int, leaving NX wrongly set alongside NV. + if (likely(!fpu_is_fractional32(f))) { + return f; + } + // f is finite with a fractional part. The magic-number rounding (add/sub of 0.5 + // or 1.0) is itself inexact for the discarded fraction; snapshot and restore the + // exception flags so it leaks none. Each fcvt-to-int caller determines INEXACT + // itself from the round-trip (and only when the result is in range), so the + // rounding step must not pre-set a spurious NX -- e.g. alongside an NV from an + // out-of-range or negative-to-unsigned conversion. + const uint32_t exc = fpu_get_exceptions(); + const uint32_t s = fpu_bit_f32_to_u32(f) & FPU_LIB_FP32_SIGNEDFP_MASK; + fpu_f32_t r = f; if (unlikely(mode > FPU_LIB_ROUND_UP)) { mode = fpu_get_rounding_mode(); } switch (mode) { case FPU_LIB_ROUND_NE: case FPU_LIB_ROUND_MM: - return fpu_add32(f, fpu_bit_u32_to_f32(0x3F000000U | s)); + r = fpu_add32(f, fpu_bit_u32_to_f32(0x3F000000U | s)); + break; case FPU_LIB_ROUND_DN: - if (s && fpu_is_fractional32(f)) { - return fpu_sub32(f, fpu_bit_u32_to_f32(0x3F800000U)); + if (s) { + r = fpu_sub32(f, fpu_bit_u32_to_f32(0x3F800000U)); } break; case FPU_LIB_ROUND_UP: - if (!s && fpu_is_fractional32(f)) { - return fpu_add32(f, fpu_bit_u32_to_f32(0x3F800000U)); + if (!s) { + r = fpu_add32(f, fpu_bit_u32_to_f32(0x3F800000U)); } break; } - return f; + fpu_set_exceptions(exc); + return r; } slow_path fpu_f64_t fpu_round_f64_internal(fpu_f64_t d, uint32_t mode) { - uint64_t u = fpu_bit_f64_to_u64(d); - uint64_t s = u & FPU_LIB_FP64_SIGNEDFP_MASK; + // See fpu_round_f32_internal: a non-fractional value needs no rounding, and + // skipping it avoids a spurious INEXACT from the +/-0.5 round-to-nearest add. + if (likely(!fpu_is_fractional64(d))) { + return d; + } + const uint32_t exc = fpu_get_exceptions(); + const uint64_t s = fpu_bit_f64_to_u64(d) & FPU_LIB_FP64_SIGNEDFP_MASK; + fpu_f64_t r = d; if (unlikely(mode > FPU_LIB_ROUND_UP)) { mode = fpu_get_rounding_mode(); } switch (mode) { case FPU_LIB_ROUND_NE: case FPU_LIB_ROUND_MM: - return fpu_add64(d, fpu_bit_u64_to_f64(0x3FE0000000000000ULL | s)); + r = fpu_add64(d, fpu_bit_u64_to_f64(0x3FE0000000000000ULL | s)); + break; case FPU_LIB_ROUND_DN: - if (s && fpu_is_fractional64(d)) { - return fpu_sub64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL)); + if (s) { + r = fpu_sub64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL)); } break; case FPU_LIB_ROUND_UP: - if (!s && fpu_is_fractional64(d)) { - return fpu_add64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL)); + if (!s) { + r = fpu_add64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL)); } break; } - return d; + fpu_set_exceptions(exc); + return r; } #if defined(USE_SOFT_FPU_SQRT) diff --git a/src/util/fpu_lib.h b/src/util/fpu_lib.h index e9681e9ae..ecc8969e6 100644 --- a/src/util/fpu_lib.h +++ b/src/util/fpu_lib.h @@ -1071,12 +1071,12 @@ static forceinline fpu_f32_t fpu_sqrt32(fpu_f32_t f) #endif return ret; } - if (fpu_is_negative32(f)) { - // Raise invalid flag, return canonical NaN + if (fpu_is_negative32(f) && (fpu_bit_f32_to_u32(f) << 1) != 0) { + // Negative non-zero: raise invalid, return canonical NaN fpu_raise_invalid(); return fpu_bit_u32_to_f32(FPU_LIB_FP32_CANONICAL_NAN); } - // Perform NaN propagation + // NaN propagation, and sqrt(-0.0) == -0.0 (no exception) return f; } @@ -1093,12 +1093,12 @@ static forceinline fpu_f64_t fpu_sqrt64(fpu_f64_t d) #endif return ret; } - if (fpu_is_negative64(d)) { - // Raise invalid flag, return canonical NaN + if (fpu_is_negative64(d) && (fpu_bit_f64_to_u64(d) << 1) != 0) { + // Negative non-zero: raise invalid, return canonical NaN fpu_raise_invalid(); return fpu_bit_u64_to_f64(FPU_LIB_FP64_CANONICAL_NAN); } - // Perform NaN propagation + // NaN propagation, and sqrt(-0.0) == -0.0 (no exception) return d; }