diff --git a/src/cpu/riscv_fpu.c b/src/cpu/riscv_fpu.c
index a1d60859c..a541305d2 100644
--- a/src/cpu/riscv_fpu.c
+++ b/src/cpu/riscv_fpu.c
@@ -54,46 +54,182 @@ static const uint32_t riscv_fli_table[32] = {
     0x7FC00000UL, // Canonical NaN
 };
 
-static void riscv_prepare_rmm(rvvm_hart_t* vm, const uint32_t insn, const size_t rs1, const size_t rs2)
+/*
+ * RMM (round to nearest, ties to max magnitude) == IEEE 754 roundTiesToAway.
+ *
+ * The host FPU has no such mode, so when frm == RMM the host is left in
+ * round-to-nearest-even (see fpu_set_rounding_mode()). RNE and roundTiesToAway
+ * produce identical results EXCEPT on an exact halfway tie, where RNE rounds to
+ * even and roundTiesToAway rounds to the larger-magnitude neighbour.
+ *
+ * So we compute the op in RNE, recover the EXACT rounding error via the library's
+ * error-free transforms (TwoSum / TwoProduct), and only when that error is
+ * exactly half a ULP away from zero do we step the result outward by one ULP.
+ * (The previous implementation rounded toward +/-inf unconditionally, which is
+ * correct on ties but wrong for every inexact non-tie. See issue #204.)
+ *
+ * fadd/fsub/fmul use riscv_rmm_apply (in riscv_fpu.h, shared with the FMA family).
+ * fsqrt never needs a fixup: a square
+ * root is irrational unless exact, and its result is always normal (the square
+ * root of even the smallest subnormal is ~2^-75), so RNE == roundTiesToAway. A
+ * *normally-rounded* quotient is likewise never an exact halfway case — but a
+ * *subnormal* quotient has reduced precision and CAN land exactly on a tie, so
+ * fdiv gets a dedicated subnormal fixup (riscv_rmm_div_apply).
+ *
+ * The error-free transforms run only for a finite result, and the per-op wrappers
+ * (riscv_rmm_add/mul/div) snapshot and restore the exception flags around them:
+ * TwoSum/TwoProduct do raw host arithmetic whose intermediate steps can raise
+ * spurious exceptions (inf-inf -> NV, near-FLT_MAX -> OF) that must not leak into
+ * fflags. The genuine flags are already set by the base op.
+ */
+// roundTiesToAway fixup for fadd/fsub (fsub passes b negated). Flag-isolated.
+static forceinline fpu_f32_t riscv_rmm_add_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b)
 {
-    bool neg = false;
+    if (unlikely(!fpu_is_finite32(n))) {
+        return n;
+    }
+    const uint32_t exc = fpu_get_exceptions();
+    const fpu_f32_t r  = riscv_rmm_apply_f32(n, fpu_add_error32(n, a, b));
+    fpu_set_exceptions(exc);
+    return r;
+}
 
-    // Decide the sign of the output
-    switch (insn & 0xFE000000UL) {
-        case 0x00000000UL: // fadd.s
-            neg = fpu_signbit32(fpu_add32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
-            break;;
-        case 0x02000000UL: // fadd.d
-            neg = fpu_signbit64(fpu_add64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
-            break;
-        case 0x08000000UL: // fsub.s
-            neg = fpu_signbit32(fpu_sub32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
-            break;
-        case 0x0A000000UL: // fsub.d
-            neg = fpu_signbit64(fpu_sub64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
-            break;
-        case 0x10000000UL: // fmul.s
-        case 0x18000000UL: // fdiv.s
-            neg = fpu_signbit32(riscv_view_s(vm, rs1)) != fpu_signbit32(riscv_view_s(vm, rs2));
-            break;
-        case 0x12000000UL: // fmul.d
-        case 0x1A000000UL: // fdiv.d
-            neg = fpu_signbit64(riscv_view_d(vm, rs1)) != fpu_signbit64(riscv_view_d(vm, rs2));
-            break;
-        default:
-            neg = fpu_signbit64(riscv_view_d(vm, rs1));
-            break;
+static forceinline fpu_f64_t riscv_rmm_add_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b)
+{
+    if (unlikely(!fpu_is_finite64(n))) {
+        return n;
     }
+    const uint32_t exc = fpu_get_exceptions();
+    const fpu_f64_t r  = riscv_rmm_apply_f64(n, fpu_add_error64(n, a, b));
+    fpu_set_exceptions(exc);
+    return r;
+}
 
-    // Round to positive/negative infinity based on the result sign
-    if (neg) {
-        fpu_set_rounding_mode(FPU_LIB_ROUND_DN);
-    } else {
-        fpu_set_rounding_mode(FPU_LIB_ROUND_UP);
+/*
+ * Dekker's product error (fpu_mul_error) loses bits once the product error drops
+ * below the smallest subnormal -- not only for a subnormal result, but throughout
+ * the smallest normal binades (the half-ULP ~2^(e-53) underflows for e <= -969).
+ * Those results need an exact tie test.
+ *
+ * For f32 the exact product widens losslessly into f64 (48 <= 53 bits) and the
+ * away-side midpoint is exact there (<= 25 bits, down to 2^-150), so the op is a
+ * tie iff a*b == m. This is exact for every f32 result, so fmul.s always uses it.
+ *
+ * For f64 there is no wider type, so scale both operands by 2^512 to lift the
+ * product into a binade where the fma-based TwoProduct is exact (a small product
+ * bounds |a|,|b|, so a*2^512 cannot overflow). The midpoint of two normals needs
+ * one bit more than the format holds, so rather than form it we test the residual:
+ * the op is a tie iff 2*(a*b - n) == ULP, evaluated in the scaled domain with an
+ * exact TwoSum cancellation. Mirrors the scaling in riscv_rmm_div_apply_f64.
+ */
+static forceinline fpu_f32_t riscv_rmm_mul_exact_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b)
+{
+    const fpu_f32_t away = fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(n) + 1);
+    const fpu_f64_t half = fpu_bit_u64_to_f64(0x3FE0000000000000ULL);  // 0.5
+    const fpu_f64_t dn   = fpu_fcvt_f32_to_f64(n);
+    const fpu_f64_t dab  = fpu_mul64(fpu_fcvt_f32_to_f64(a), fpu_fcvt_f32_to_f64(b));  // exact
+    const fpu_f64_t m    = fpu_add64(dn, fpu_mul64(fpu_sub64(fpu_fcvt_f32_to_f64(away), dn), half));
+    return ((fpu_bit_f64_to_u64(fpu_sub64(dab, m)) << 1) == 0) ? away : n;  // tie iff a*b == m
+}
+
+static forceinline fpu_f64_t riscv_rmm_mul_small_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b)
+{
+    const fpu_f64_t S    = fpu_bit_u64_to_f64(0x5FF0000000000000ULL);  // 2^512
+    const fpu_f64_t as   = fpu_mul64(a, S), bs = fpu_mul64(b, S);      // exact: |a|,|b| bounded
+    const fpu_f64_t ps   = fpu_mul64(as, bs);                          // a*b * 2^1024, normal
+    const fpu_f64_t pe   = fpu_fma64(as, bs, fpu_neg64(ps));           // exact: a*b*2^1024 = ps + pe
+    const fpu_f64_t away = fpu_bit_u64_to_f64(fpu_bit_f64_to_u64(n) + 1);
+    const fpu_f64_t ns   = fpu_mul64(fpu_mul64(n, S), S);              // n * 2^1024, exact
+    const fpu_f64_t gaps = fpu_mul64(fpu_mul64(fpu_sub64(away, n), S), S);  // ULP * 2^1024
+    // residual (a*b - n)*2^1024 = dr + pe (dr exact, Sterbenz). Tie iff 2*(a*b-n)
+    // == ULP, i.e. 2*dr + 2*pe == gaps, via an exact TwoSum(2*dr, -gaps).
+    const fpu_f64_t dr   = fpu_sub64(ps, ns);
+    const fpu_f64_t td   = fpu_add64(dr, dr), tp = fpu_add64(pe, pe), ng = fpu_neg64(gaps);
+    const fpu_f64_t vh   = fpu_add64(td, ng);
+    const fpu_f64_t vl   = fpu_add_error64(vh, td, ng);               // exact: (td - gaps) = vh + vl
+    return (((fpu_bit_f64_to_u64(vl) << 1) == 0) && ((fpu_bit_f64_to_u64(fpu_add64(vh, tp)) << 1) == 0))
+               ? away : n;                                            // tie iff td - gaps == -2*pe
+}
+
+// roundTiesToAway fixup for fmul. Flag-isolated. fmul.s always uses the exact f64
+// widening; fmul.d uses the exact scaled test for small results (where the product
+// error underflows) and the cheaper Dekker product error otherwise.
+static forceinline fpu_f32_t riscv_rmm_mul_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b)
+{
+    if (unlikely(!fpu_is_finite32(n))) {
+        return n;
+    }
+    const uint32_t exc = fpu_get_exceptions();
+    const fpu_f32_t r  = riscv_rmm_mul_exact_f32(n, a, b);
+    fpu_set_exceptions(exc);
+    return r;
+}
+
+static forceinline fpu_f64_t riscv_rmm_mul_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b)
+{
+    if (unlikely(!fpu_is_finite64(n))) {
+        return n;
+    }
+    const uint32_t exc = fpu_get_exceptions();
+    // |n| >= 2^-959 (exp field >= 64): the product error stays representable, so
+    // the cheap Dekker path is exact. Below that, use the scaled residual test.
+    const fpu_f64_t r  = (((fpu_bit_f64_to_u64(n) >> 52) & 0x7FFU) >= 64)
+                             ? riscv_rmm_apply_f64(n, fpu_mul_error64(n, a, b))
+                             : riscv_rmm_mul_small_f64(n, a, b);
+    fpu_set_exceptions(exc);
+    return r;
+}
+
+/*
+ * roundTiesToAway fixup for fdiv. Only a subnormal (or zero) quotient can be an
+ * exact tie; a normal quotient never is, so the common path returns immediately
+ * (non-finite results also have a non-zero exponent field and return unchanged).
+ *
+ * For a subnormal result the exact residual rho = a - n*b is representable, so
+ * fma(-n, b, a) recovers it exactly. The true quotient is n + rho/b, so it is the
+ * exact midpoint (a tie, rounded away) iff 2*rho == gap*b, where gap = away - n
+ * (= +/- the subnormal step). |rho| <= |b|*2^-1075, so every value below stays in
+ * range and exact. Flag-isolated, as the residual machinery can raise spurious
+ * exceptions.
+ */
+static forceinline fpu_f32_t riscv_rmm_div_apply_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b)
+{
+    if (likely(fpu_bit_f32_to_u32(n) & FPU_LIB_FP32_EXPONENT_MASK)) {
+        return n;  // normal / inf / nan: RNE already == roundTiesToAway
+    }
+    const uint32_t  exc  = fpu_get_exceptions();
+    const fpu_f32_t rho  = fpu_fma32(fpu_neg32(n), b, a);  // exact residual a - n*b
+    const fpu_f32_t away = fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(n) + 1);
+    const fpu_f32_t gap  = fpu_sub32(away, n);             // +/- 2^-149, exact
+    // Tie iff 2*rho == gap*b, evaluated in fp64 where both sides are exact
+    // (operands widen losslessly and gap*b ~ 2^-22 stays well inside the range).
+    const fpu_f64_t two_rho = fpu_add64(fpu_fcvt_f32_to_f64(rho), fpu_fcvt_f32_to_f64(rho));
+    const fpu_f64_t gap_b   = fpu_mul64(fpu_fcvt_f32_to_f64(gap), fpu_fcvt_f32_to_f64(b));
+    const fpu_f32_t r       = fpu_is_bit_equal64(two_rho, gap_b) ? away : n;
+    fpu_set_exceptions(exc);
+    return r;
+}
+
+static forceinline fpu_f64_t riscv_rmm_div_apply_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b)
+{
+    if (likely(fpu_bit_f64_to_u64(n) & FPU_LIB_FP64_EXPONENT_MASK)) {
+        return n;
     }
+    const uint32_t  exc  = fpu_get_exceptions();
+    const fpu_f64_t rho  = fpu_fma64(fpu_neg64(n), b, a);
+    const fpu_f64_t away = fpu_bit_u64_to_f64(fpu_bit_f64_to_u64(n) + 1);
+    const fpu_f64_t gap  = fpu_sub64(away, n);             // +/- 2^-1074, exact
+    // fp64 has no wider type; gap*b underflows, so rescale 2*rho == gap*b by the
+    // gap magnitude (2^1074) as two exact power-of-two steps: rho*2^1075 == +/-b.
+    const fpu_f64_t scaled = fpu_mul64(fpu_mul64(rho, fpu_bit_u64_to_f64(0x7FE0000000000000ULL)),  // 2^1023
+                                       fpu_bit_u64_to_f64(0x4330000000000000ULL));                 // 2^52
+    const fpu_f64_t target = (fpu_bit_f64_to_u64(gap) >> 63) ? fpu_neg64(b) : b;
+    const fpu_f64_t r      = fpu_is_bit_equal64(scaled, target) ? away : n;
+    fpu_set_exceptions(exc);
+    return r;
 }
 
-slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint32_t insn)
+static slow_path void riscv_emulate_f_opc_op_impl(rvvm_hart_t* vm, const uint32_t insn, const bool rmm)
 {
     const size_t   rds = bit_ext_u32(insn, 7, 5);
     const uint32_t rm  = bit_ext_u32(insn, 12, 3);
@@ -102,48 +238,67 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
 
     if (likely(riscv_fpu_is_enabled(vm))) {
 
-        if (unlikely(vm->csr.fcsr >> 5 == 0x04)) {
-            // Handle RMM rounding
-            riscv_prepare_rmm(vm, insn, rs1, rs2);
-        }
-
         switch (insn & 0xFE007000UL) {
             /*
              * FPU computations
              */
-            case RISCV_FPU_GEN_RM_CASES(0x00000000UL): // fadd.s
-                riscv_emit_s(vm, rds, fpu_add32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
-                return;
-            case RISCV_FPU_GEN_RM_CASES(0x02000000UL): // fadd.d
-                riscv_emit_d(vm, rds, fpu_add64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
-                return;
-            case RISCV_FPU_GEN_RM_CASES(0x08000000UL): // fsub.s
-                riscv_write_s(vm, rds, fpu_sub32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
-                return;
-            case RISCV_FPU_GEN_RM_CASES(0x0A000000UL): // fsub.d
-                riscv_write_d(vm, rds, fpu_sub64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
-                return;
-            case RISCV_FPU_GEN_RM_CASES(0x10000000UL): // fmul.s
-                riscv_emit_s(vm, rds, fpu_mul32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
-                return;
-            case RISCV_FPU_GEN_RM_CASES(0x12000000UL): // fmul.d
-                riscv_emit_d(vm, rds, fpu_mul64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
-                return;
-            case RISCV_FPU_GEN_RM_CASES(0x18000000UL): // fdiv.s
-                riscv_emit_s(vm, rds, fpu_div32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
-                return;
-            case RISCV_FPU_GEN_RM_CASES(0x1A000000UL): // fdiv.d
-                riscv_emit_d(vm, rds, fpu_div64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
-                return;
+            case RISCV_FPU_GEN_RM_CASES(0x00000000UL): { // fadd.s
+                const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2);
+                const fpu_f32_t n = fpu_add32(a, b);
+                riscv_write_s(vm, rds, rmm ? riscv_rmm_add_f32(n, a, b) : n);
+                return;
+            }
+            case RISCV_FPU_GEN_RM_CASES(0x02000000UL): { // fadd.d
+                const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2);
+                const fpu_f64_t n = fpu_add64(a, b);
+                riscv_write_d(vm, rds, rmm ? riscv_rmm_add_f64(n, a, b) : n);
+                return;
+            }
+            case RISCV_FPU_GEN_RM_CASES(0x08000000UL): { // fsub.s
+                const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2);
+                const fpu_f32_t n = fpu_sub32(a, b);
+                riscv_write_s(vm, rds, rmm ? riscv_rmm_add_f32(n, a, fpu_neg32(b)) : n);
+                return;
+            }
+            case RISCV_FPU_GEN_RM_CASES(0x0A000000UL): { // fsub.d
+                const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2);
+                const fpu_f64_t n = fpu_sub64(a, b);
+                riscv_write_d(vm, rds, rmm ? riscv_rmm_add_f64(n, a, fpu_neg64(b)) : n);
+                return;
+            }
+            case RISCV_FPU_GEN_RM_CASES(0x10000000UL): { // fmul.s
+                const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2);
+                const fpu_f32_t n = fpu_mul32(a, b);
+                riscv_write_s(vm, rds, rmm ? riscv_rmm_mul_f32(n, a, b) : n);
+                return;
+            }
+            case RISCV_FPU_GEN_RM_CASES(0x12000000UL): { // fmul.d
+                const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2);
+                const fpu_f64_t n = fpu_mul64(a, b);
+                riscv_write_d(vm, rds, rmm ? riscv_rmm_mul_f64(n, a, b) : n);
+                return;
+            }
+            case RISCV_FPU_GEN_RM_CASES(0x18000000UL): { // fdiv.s
+                const fpu_f32_t a = riscv_read_s(vm, rs1), b = riscv_read_s(vm, rs2);
+                const fpu_f32_t n = fpu_div32(a, b);
+                riscv_write_s(vm, rds, rmm ? riscv_rmm_div_apply_f32(n, a, b) : n);
+                return;
+            }
+            case RISCV_FPU_GEN_RM_CASES(0x1A000000UL): { // fdiv.d
+                const fpu_f64_t a = riscv_view_d(vm, rs1), b = riscv_view_d(vm, rs2);
+                const fpu_f64_t n = fpu_div64(a, b);
+                riscv_write_d(vm, rds, rmm ? riscv_rmm_div_apply_f64(n, a, b) : n);
+                return;
+            }
             case RISCV_FPU_GEN_RM_CASES(0x58000000UL): // fsqrt.s
                 if (likely(!rs2)) {
-                    riscv_emit_s(vm, rds, fpu_sqrt32(riscv_view_s(vm, rs1)));
+                    riscv_write_s(vm, rds, fpu_sqrt32(riscv_read_s(vm, rs1)));
                     return;
                 }
                 break;
             case RISCV_FPU_GEN_RM_CASES(0x5A000000UL): // fsqrt.s
                 if (likely(!rs2)) {
-                    riscv_emit_d(vm, rds, fpu_sqrt64(riscv_view_d(vm, rs1)));
+                    riscv_write_d(vm, rds, fpu_sqrt64(riscv_view_d(vm, rs1)));
                     return;
                 }
                 break;
@@ -157,14 +312,14 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
                         return;
                     case 0x04: // fround.s (Zfa)
                     case 0x05: // TODO: froundx.s (Zfa)
-                        riscv_emit_s(vm, rds, fpu_fcvt_i64_to_f32(fpu_round_f32_to_i64(riscv_view_s(vm, rs1), rm)));
+                        riscv_emit_s(vm, rds, fpu_fcvt_i64_to_f32(fpu_round_f32_to_i64(riscv_read_s(vm, rs1), rm)));
                         return;
                 }
                 break;
             case RISCV_FPU_GEN_RM_CASES(0x42000000UL):
                 switch (rs2) {
                     case 0x00: // fcvt.d.s
-                        riscv_write_d(vm, rds, fpu_fcvt_f32_to_f64(riscv_view_s(vm, rs1)));
+                        riscv_write_d(vm, rds, fpu_fcvt_f32_to_f64(riscv_read_s(vm, rs1)));
                         return;
                     case 0x04: // fround.s (Zfa)
                     case 0x05: // TODO: froundx.s (Zfa)
@@ -175,20 +330,20 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
             case RISCV_FPU_GEN_RM_CASES(0xC0000000UL):
                 switch (rs2) {
                     case 0x00: // fcvt.w.s
-                        riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_i32(riscv_view_s(vm, rs1), rm));
+                        riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_i32(riscv_read_s(vm, rs1), rm));
                         return;
                     case 0x01: // fcvt.wu.s
-                        riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_u32(riscv_view_s(vm, rs1), rm));
+                        riscv_write_reg(vm, rds, (int32_t)fpu_round_f32_to_u32(riscv_read_s(vm, rs1), rm));
                         return;
                     case 0x02: // fcvt.l.s
                         if (likely(vm->rv64)) {
-                            riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_i64(riscv_view_s(vm, rs1), rm));
+                            riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_i64(riscv_read_s(vm, rs1), rm));
                             return;
                         }
                         break;
                     case 0x03: // fcvt.lu.s
                         if (likely(vm->rv64)) {
-                            riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_u64(riscv_view_s(vm, rs1), rm));
+                            riscv_write_reg(vm, rds, (int64_t)fpu_round_f32_to_u64(riscv_read_s(vm, rs1), rm));
                             return;
                         }
                         break;
@@ -272,10 +427,10 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
                 riscv_emit_s(vm, rds, fpu_fsgnj32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0x20001000UL: // fsgnjn.s
-                riscv_emit_s(vm, rds, fpu_fsgnjn32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_emit_s(vm, rds, fpu_fsgnjn32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0x20002000UL: // fsgnjx.s
-                riscv_emit_s(vm, rds, fpu_fsgnjx32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_emit_s(vm, rds, fpu_fsgnjx32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0x22000000UL: // fsgnj.d
                 riscv_emit_d(vm, rds, fpu_fsgnj64(riscv_read_d(vm, rs1), riscv_read_d(vm, rs2)));
@@ -290,16 +445,16 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
              * FPU comparisons
              */
             case 0x28000000UL: // fmin.s
-                riscv_emit_s(vm, rds, fpu_min32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_emit_s(vm, rds, fpu_min32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0x28001000UL: // fmax.s
-                riscv_emit_s(vm, rds, fpu_max32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_emit_s(vm, rds, fpu_max32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0x28002000UL: // fminm.s (Zfa)
-                riscv_write_s(vm, rds, fpu_min32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_write_s(vm, rds, fpu_min32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0x28003000UL: // fmaxm.s (Zfa)
-                riscv_write_s(vm, rds, fpu_max32(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_write_s(vm, rds, fpu_max32(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0x2A000000UL: // fmin.d
                 riscv_emit_d(vm, rds, fpu_min64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
@@ -314,19 +469,19 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
                 riscv_write_d(vm, rds, fpu_max64(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
                 return;
             case 0xA0000000UL: // fle.s
-                riscv_write_reg(vm, rds, fpu_is_fle32_sig(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_write_reg(vm, rds, fpu_is_fle32_sig(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0xA0001000UL: // flt.s
-                riscv_write_reg(vm, rds, fpu_is_flt32_sig(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_write_reg(vm, rds, fpu_is_flt32_sig(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0xA0002000UL: // feq.s
-                riscv_write_reg(vm, rds, fpu_is_equal32_quiet(riscv_read_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_write_reg(vm, rds, fpu_is_equal32_quiet(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0xA0004000UL: // fleq.s (Zfa)
-                riscv_write_reg(vm, rds, fpu_is_fle32_quiet(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_write_reg(vm, rds, fpu_is_fle32_quiet(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0xA0005000UL: // fltq.s (Zfa)
-                riscv_write_reg(vm, rds, fpu_is_flt32_quiet(riscv_view_s(vm, rs1), riscv_view_s(vm, rs2)));
+                riscv_write_reg(vm, rds, fpu_is_flt32_quiet(riscv_read_s(vm, rs1), riscv_read_s(vm, rs2)));
                 return;
             case 0xA2000000UL: // fle.d
                 riscv_write_reg(vm, rds, fpu_is_fle64_sig(riscv_view_d(vm, rs1), riscv_view_d(vm, rs2)));
@@ -354,7 +509,7 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
                 break;
             case 0xE0001000UL: // fclass.s
                 if (likely(!rs2)) {
-                    riscv_write_reg(vm, rds, 1U << fpu_fclass32(riscv_view_s(vm, rs1)));
+                    riscv_write_reg(vm, rds, 1U << fpu_fclass32(riscv_read_s(vm, rs1)));
                     return;
                 }
                 break;
@@ -406,4 +561,32 @@ slow_path func_opt_size void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint3
     riscv_illegal_insn(vm, insn);
 }
 
+/*
+ * RISC-V selects the rounding mode either dynamically (the frm CSR, when the
+ * instruction's rm field is DYN) or statically (the rm field itself). The host
+ * FPU tracks frm, so a static rm field that differs from frm must be applied
+ * around the op. RMM has no host mode at all and is synthesized in RNE by the
+ * fixups above (riscv_rmm_apply / riscv_rmm_div_apply), so its sub-ops must run
+ * in RNE. funct3 == rm only carries a rounding mode on rounding-capable ops, so
+ * this never misfires on fsgnj/fcmp/fclass/fmv.
+ */
+slow_path void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint32_t insn)
+{
+    const uint32_t rm  = bit_ext_u32(insn, 12, 3);
+    const uint32_t frm = vm->csr.fcsr >> 5;
+    // Effective rounding mode: a static rm field overrides the dynamic frm CSR.
+    const uint32_t eff = (rm == 0x07) ? frm : rm;
+    const bool     rmm = (eff == 0x04);
+    // Override the host mode when synthesizing RMM (run sub-ops in RNE) or when a
+    // static rm field selects a host-native mode other than the one frm left set.
+    if (unlikely(rmm || (rm != 0x07 && eff != frm))) {
+        const uint32_t host = fpu_get_rounding_mode();
+        fpu_set_rounding_mode(rmm ? FPU_LIB_ROUND_NE : eff);
+        riscv_emulate_f_opc_op_impl(vm, insn, rmm);
+        fpu_set_rounding_mode(host);
+    } else {
+        riscv_emulate_f_opc_op_impl(vm, insn, false);
+    }
+}
+
 #endif
diff --git a/src/cpu/riscv_fpu.h b/src/cpu/riscv_fpu.h
index a5fb25d9d..6395bd320 100644
--- a/src/cpu/riscv_fpu.h
+++ b/src/cpu/riscv_fpu.h
@@ -26,13 +26,13 @@ static forceinline bool riscv_fpu_rm_is_valid(uint32_t rm)
     return rm > 1;
 }
 
-// Bit-precise register reads
+// Bit-precise register read (raw low 32 bits, no NaN-box check) — for fmv.x.w
 static forceinline fpu_f32_t riscv_view_s(rvvm_hart_t* vm, size_t reg)
 {
     return fpu_unpack_f32_from_f64(vm->fpu_registers[reg]);
 }
 
-// Normalized register reads
+// Value register read: a mal-boxed narrow operand reads as the canonical NaN
 static forceinline fpu_f32_t riscv_read_s(rvvm_hart_t* vm, size_t reg)
 {
     return fpu_nan_unbox_f32(vm->fpu_registers[reg]);
@@ -82,6 +82,48 @@ static forceinline void riscv_write_d(rvvm_hart_t* vm, size_t reg, fpu_f64_t val
 
 slow_path void riscv_emulate_f_opc_op(rvvm_hart_t* vm, const uint32_t insn);
 
+/*
+ * roundTiesToAway core fixup, shared by the OP-FP dispatch (riscv_fpu.c) and the
+ * FMA family (riscv_rmm_fma_apply, below). Given a round-to-nearest-even result n
+ * and the EXACT rounding error err = (true result) - n, step n one ULP outward
+ * only on an exact halfway tie (err exactly half a ULP, pointing away from zero);
+ * otherwise RNE already equals roundTiesToAway. See the strategy comment in
+ * riscv_fpu.c.
+ */
+static forceinline fpu_f32_t riscv_rmm_apply_f32(fpu_f32_t n, fpu_f32_t err)
+{
+    const uint32_t un = fpu_bit_f32_to_u32(n);
+    const uint32_t ue = fpu_bit_f32_to_u32(err);
+    // Skip exact results (err == +/-0) and errors pointing toward zero: in both
+    // cases RNE already delivers the roundTiesToAway value.
+    if ((ue << 1) == 0 || (un >> 31) != (ue >> 31)) {
+        return n;
+    }
+    // n + 1 ULP toward larger magnitude, and the exact gap to it.
+    const fpu_f32_t away    = fpu_bit_u32_to_f32(un + 1);
+    const fpu_f32_t spacing = fpu_sub32(away, n);  // exact: adjacent floats
+    // Tie iff the exact result is the midpoint, i.e. 2*err == spacing.
+    if (fpu_is_bit_equal32(fpu_add32(err, err), spacing)) {
+        return away;
+    }
+    return n;
+}
+
+static forceinline fpu_f64_t riscv_rmm_apply_f64(fpu_f64_t n, fpu_f64_t err)
+{
+    const uint64_t un = fpu_bit_f64_to_u64(n);
+    const uint64_t ue = fpu_bit_f64_to_u64(err);
+    if ((ue << 1) == 0 || (un >> 63) != (ue >> 63)) {
+        return n;
+    }
+    const fpu_f64_t away    = fpu_bit_u64_to_f64(un + 1);
+    const fpu_f64_t spacing = fpu_sub64(away, n);
+    if (fpu_is_bit_equal64(fpu_add64(err, err), spacing)) {
+        return away;
+    }
+    return n;
+}
+
 #if defined(RISCV32) || defined(RISCV64)
 
 static forceinline void riscv_emulate_f_opc_load(rvvm_hart_t* vm, const uint32_t insn)
@@ -126,6 +168,175 @@ static forceinline void riscv_emulate_f_opc_store(rvvm_hart_t* vm, const uint32_
     riscv_illegal_insn(vm, insn);
 }
 
+/*
+ * FMA honors the rounding mode like any other rounding op: dynamically via frm
+ * (the host already tracks it) or statically via the instruction's rm field.
+ * fpu_fma rounds in the host mode, so a static rm that differs from frm must be
+ * applied around the op; RMM has no host mode and is run in RNE. (RMM differs
+ * from RNE only on an exact halfway tie, and an exact FMA tie is vanishingly rare
+ * — the remaining ties-away gap is shared with the rest of the FP path.)
+ */
+// Underflow flag, RISC-V semantics: UF iff the result is tiny *after rounding* and
+// inexact. A subnormal result is unambiguously tiny; a normal result above the
+// smallest normal is not. The only hard case is a result of exactly the smallest
+// normal: some hosts (e.g. aarch64) detect tininess *before* rounding and raise a
+// spurious UF, but RISC-V follows the IEEE "after rounding" rule -- tiny iff the
+// exact result, rounded to full precision with an *unbounded* exponent (the finer
+// subnormal grid would not have crossed into the normal range), is still below the
+// smallest normal. So at the boundary we recompute that: widen the exact a*b+c into
+// f64 (lossless for a smallest-normal f32 result), scale into [0.5, 1), and round
+// back to f32 in the effective mode -- a magnitude >= 1.0 means not tiny. The gate
+// keeps the common FMA path at a single compare.
+static forceinline void riscv_fma_fixup_uf32(fpu_f32_t r, fpu_f32_t a, fpu_f32_t b, fpu_f32_t c, uint32_t eff)
+{
+    if (likely((fpu_bit_f32_to_u32(r) & 0x7FFFFFFFU) != 0x00800000U)) {
+        return;  // subnormal UF is genuine; a larger normal result has none
+    }
+    const uint32_t exc = fpu_get_exceptions();
+    if (!(exc & FPU_LIB_FLAG_NX)) {
+        return;  // an exact smallest-normal result never underflows
+    }
+    const fpu_f64_t e = fpu_add64(fpu_mul64(fpu_fcvt_f32_to_f64(a), fpu_fcvt_f32_to_f64(b)),
+                                  fpu_fcvt_f32_to_f64(c));                          // exact
+    const fpu_f64_t scaled = fpu_mul64(e, fpu_bit_u64_to_f64(0x47D0000000000000ULL));  // * 2^126 -> ~[0.5,1)
+    const uint32_t host = fpu_get_rounding_mode();
+    fpu_set_rounding_mode(eff == 0x04 ? FPU_LIB_ROUND_NE : eff);
+    const fpu_f32_t rn = fpu_fcvt_f64_to_f32(scaled);
+    fpu_set_rounding_mode(host);
+    const bool tiny = (fpu_bit_f32_to_u32(rn) & 0x7FFFFFFFU) < 0x3F800000U;  // |rn| < 1.0
+    fpu_set_exceptions(tiny ? (exc | FPU_LIB_FLAG_UF) : (exc & ~FPU_LIB_FLAG_UF));
+}
+
+// f64 has no wider type, so recompute the after-rounding tininess by scaling: a
+// smallest-normal f64 result bounds |a|,|b| (a tiny product forces small operands),
+// so a*2^200 cannot overflow, and fma(a*2^200, b, c*2^200) = (a*b+c)*2^200 lands in
+// the normal range where the fused rounding is unbounded-exponent. The result is
+// tiny iff that scaled magnitude is below the scaled smallest normal (2^-822).
+static forceinline void riscv_fma_fixup_uf64(fpu_f64_t r, fpu_f64_t a, fpu_f64_t b, fpu_f64_t c, uint32_t eff)
+{
+    if (likely((fpu_bit_f64_to_u64(r) & 0x7FFFFFFFFFFFFFFFULL) != 0x0010000000000000ULL)) {
+        return;  // subnormal UF is genuine; a larger normal result has none
+    }
+    const uint32_t exc = fpu_get_exceptions();
+    if (!(exc & FPU_LIB_FLAG_NX)) {
+        return;  // an exact smallest-normal result never underflows
+    }
+    const fpu_f64_t S = fpu_bit_u64_to_f64(0x4C70000000000000ULL);  // 2^200
+    const uint32_t host = fpu_get_rounding_mode();
+    fpu_set_rounding_mode(eff == 0x04 ? FPU_LIB_ROUND_NE : eff);
+    const fpu_f64_t rs = fpu_fma64(fpu_mul64(a, S), b, fpu_mul64(c, S));  // (a*b+c) * 2^200
+    fpu_set_rounding_mode(host);
+    const bool tiny = (fpu_bit_f64_to_u64(rs) & 0x7FFFFFFFFFFFFFFFULL) < 0x0C90000000000000ULL;  // |rs| < 2^-822
+    fpu_set_exceptions(tiny ? (exc | FPU_LIB_FLAG_UF) : (exc & ~FPU_LIB_FLAG_UF));
+}
+
+/*
+ * roundTiesToAway fixup for the FMA family. As with the OP-FP ops, the host runs
+ * in RNE under frm == RMM, so only an exact halfway tie needs correcting. Both are
+ * flag-isolated: the error-free arithmetic does raw host ops whose intermediate
+ * steps can raise spurious exceptions, while the genuine flags are already set by
+ * the base fma.
+ *
+ * f32: widen to f64, where a*b is exact (48 <= 53 bits) and the away-side midpoint
+ * m is exact (<= 25 significant bits, including the subnormal range). The op is an
+ * exact tie iff a*b + c == m, tested as ((a*b + c) - m) == 0 with every term
+ * exact. A subnormal f32 result is covered too: it widens to a normal f64.
+ */
+static forceinline fpu_f32_t riscv_rmm_fma_apply_f32(fpu_f32_t n, fpu_f32_t a, fpu_f32_t b, fpu_f32_t c)
+{
+    if (unlikely(!fpu_is_finite32(n))) {
+        return n;
+    }
+    const uint32_t  exc   = fpu_get_exceptions();
+    const fpu_f32_t away  = fpu_bit_u32_to_f32(fpu_bit_f32_to_u32(n) + 1);  // toward larger magnitude
+    const fpu_f64_t dab   = fpu_mul64(fpu_fcvt_f32_to_f64(a), fpu_fcvt_f32_to_f64(b));  // exact
+    const fpu_f64_t dc    = fpu_fcvt_f32_to_f64(c);
+    const fpu_f64_t s1    = fpu_add64(dab, dc);
+    const fpu_f64_t s1e   = fpu_add_error64(s1, dab, dc);  // exact: dab + dc == s1 + s1e
+    const fpu_f64_t dn    = fpu_fcvt_f32_to_f64(n);
+    const fpu_f64_t half  = fpu_bit_u64_to_f64(0x3FE0000000000000ULL);  // 0.5
+    const fpu_f64_t m     = fpu_add64(dn, fpu_mul64(fpu_sub64(fpu_fcvt_f32_to_f64(away), dn), half));
+    const fpu_f64_t resid = fpu_add64(fpu_sub64(s1, m), s1e);  // (a*b + c) - m, exactly 0 only on a tie
+    const fpu_f32_t r     = ((fpu_bit_f64_to_u64(resid) << 1) == 0) ? away : n;
+    fpu_set_exceptions(exc);
+    return r;
+}
+
+/*
+ * f64: there is no wider host type, so recover the exact residual a*b + c - n via
+ * the Boldo-Muller error-free FMA (TwoProduct + two TwoSums + a final FastTwoSum),
+ * which splits it into a non-overlapping pair (r2, r3). An exact tie has the
+ * residual equal to +/- half a ULP -- a single float -- so r3 == 0 and r2 is that
+ * half-ULP, exactly the condition riscv_rmm_apply_f64 already tests.
+ */
+static forceinline fpu_f64_t riscv_rmm_fma_apply_f64(fpu_f64_t n, fpu_f64_t a, fpu_f64_t b, fpu_f64_t c)
+{
+    if (unlikely(!fpu_is_finite64(n))) {
+        return n;
+    }
+    const uint32_t  exc = fpu_get_exceptions();
+    const fpu_f64_t p   = fpu_mul64(a, b);
+    const fpu_f64_t pe  = fpu_fma64(a, b, fpu_neg64(p));  // TwoProduct tail: a*b == p + pe
+    const fpu_f64_t a1  = fpu_add64(c, pe);
+    const fpu_f64_t a2  = fpu_add_error64(a1, c, pe);     // TwoSum(c, pe)
+    const fpu_f64_t b1  = fpu_add64(p, a1);
+    const fpu_f64_t b2  = fpu_add_error64(b1, p, a1);     // TwoSum(p, a1)
+    const fpu_f64_t g   = fpu_add64(fpu_sub64(b1, n), b2);
+    const fpu_f64_t r2  = fpu_add64(g, a2);               // FastTwoSum(g, a2): residual == r2 + r3
+    const fpu_f64_t r3  = fpu_sub64(a2, fpu_sub64(r2, g));
+    const fpu_f64_t r   = ((fpu_bit_f64_to_u64(r3) << 1) == 0) ? riscv_rmm_apply_f64(n, r2) : n;
+    fpu_set_exceptions(exc);
+    return r;
+}
+
+static forceinline fpu_f32_t riscv_fma_round_f32(rvvm_hart_t* vm, uint32_t rm, fpu_f32_t a, fpu_f32_t b, fpu_f32_t c)
+{
+    const uint32_t frm = vm->csr.fcsr >> 5;
+    const uint32_t eff = (rm == 0x07) ? frm : rm;
+    fpu_f32_t r;
+    if (unlikely(eff == 0x04)) {
+        // RMM: compute in RNE, then apply the roundTiesToAway fixup.
+        const uint32_t host = fpu_get_rounding_mode();
+        fpu_set_rounding_mode(FPU_LIB_ROUND_NE);
+        r = riscv_rmm_fma_apply_f32(fpu_fma32(a, b, c), a, b, c);
+        fpu_set_rounding_mode(host);
+    } else if (rm != 0x07 && eff != frm) {
+        // Static host-native mode that differs from frm.
+        const uint32_t host = fpu_get_rounding_mode();
+        fpu_set_rounding_mode(eff);
+        r = fpu_fma32(a, b, c);
+        fpu_set_rounding_mode(host);
+    } else {
+        r = fpu_fma32(a, b, c);
+    }
+    riscv_fma_fixup_uf32(r, a, b, c, eff);
+    return r;
+}
+
+static forceinline fpu_f64_t riscv_fma_round_f64(rvvm_hart_t* vm, uint32_t rm, fpu_f64_t a, fpu_f64_t b, fpu_f64_t c)
+{
+    const uint32_t frm = vm->csr.fcsr >> 5;
+    const uint32_t eff = (rm == 0x07) ? frm : rm;
+    fpu_f64_t r;
+    if (unlikely(eff == 0x04)) {
+        // RMM: compute in RNE, then apply the roundTiesToAway fixup.
+        const uint32_t host = fpu_get_rounding_mode();
+        fpu_set_rounding_mode(FPU_LIB_ROUND_NE);
+        r = riscv_rmm_fma_apply_f64(fpu_fma64(a, b, c), a, b, c);
+        fpu_set_rounding_mode(host);
+    } else if (rm != 0x07 && eff != frm) {
+        // Static host-native mode that differs from frm.
+        const uint32_t host = fpu_get_rounding_mode();
+        fpu_set_rounding_mode(eff);
+        r = fpu_fma64(a, b, c);
+        fpu_set_rounding_mode(host);
+    } else {
+        r = fpu_fma64(a, b, c);
+    }
+    riscv_fma_fixup_uf64(r, a, b, c, eff);
+    return r;
+}
+
 static forceinline void riscv_emulate_f_fmadd(rvvm_hart_t* vm, const uint32_t insn)
 {
     const size_t   rds = bit_ext_u32(insn, 7, 5);
@@ -137,14 +348,14 @@ static forceinline void riscv_emulate_f_fmadd(rvvm_hart_t* vm, const uint32_t in
     if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) {
         switch (bit_ext_u32(insn, 25, 2)) {
             case 0x0: // fmadd.s
-                riscv_emit_s(vm, rds,
-                             fpu_fma32(riscv_view_s(vm, rs1), //
-                                       riscv_view_s(vm, rs2), //
-                                       riscv_view_s(vm, rs3)));
+                riscv_write_s(vm, rds,
+                             riscv_fma_round_f32(vm, rm, riscv_read_s(vm, rs1), //
+                                       riscv_read_s(vm, rs2), //
+                                       riscv_read_s(vm, rs3)));
                 return;
             case 0x1: // fmadd.d
-                riscv_emit_d(vm, rds,
-                             fpu_fma64(riscv_view_d(vm, rs1), //
+                riscv_write_d(vm, rds,
+                             riscv_fma_round_f64(vm, rm, riscv_view_d(vm, rs1), //
                                        riscv_view_d(vm, rs2), //
                                        riscv_view_d(vm, rs3)));
                 return;
@@ -165,14 +376,14 @@ static forceinline void riscv_emulate_f_fmsub(rvvm_hart_t* vm, const uint32_t in
     if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) {
         switch (bit_ext_u32(insn, 25, 2)) {
             case 0x0: // fmsub.s
-                riscv_emit_s(vm, rds,
-                             fpu_fma32(riscv_view_s(vm, rs1), //
-                                       riscv_view_s(vm, rs2), //
-                                       fpu_neg32(riscv_view_s(vm, rs3))));
+                riscv_write_s(vm, rds,
+                             riscv_fma_round_f32(vm, rm, riscv_read_s(vm, rs1), //
+                                       riscv_read_s(vm, rs2), //
+                                       fpu_neg32(riscv_read_s(vm, rs3))));
                 return;
             case 0x1: // fmsub.d
-                riscv_emit_d(vm, rds,
-                             fpu_fma64(riscv_view_d(vm, rs1), //
+                riscv_write_d(vm, rds,
+                             riscv_fma_round_f64(vm, rm, riscv_view_d(vm, rs1), //
                                        riscv_view_d(vm, rs2), //
                                        fpu_neg64(riscv_view_d(vm, rs3))));
                 return;
@@ -193,14 +404,14 @@ static forceinline void riscv_emulate_f_fnmsub(rvvm_hart_t* vm, const uint32_t i
     if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) {
         switch (bit_ext_u32(insn, 25, 2)) {
             case 0x0: // fnmsub.s
-                riscv_emit_s(vm, rds,
-                             fpu_fma32(fpu_neg32(riscv_view_s(vm, rs1)), //
-                                       riscv_view_s(vm, rs2),            //
-                                       riscv_view_s(vm, rs3)));
+                riscv_write_s(vm, rds,
+                             riscv_fma_round_f32(vm, rm, fpu_neg32(riscv_read_s(vm, rs1)), //
+                                       riscv_read_s(vm, rs2),            //
+                                       riscv_read_s(vm, rs3)));
                 return;
             case 0x1: // fnmsub.d
-                riscv_emit_d(vm, rds,
-                             fpu_fma64(fpu_neg64(riscv_view_d(vm, rs1)), //
+                riscv_write_d(vm, rds,
+                             riscv_fma_round_f64(vm, rm, fpu_neg64(riscv_view_d(vm, rs1)), //
                                        riscv_view_d(vm, rs2),            //
                                        riscv_view_d(vm, rs3)));
                 return;
@@ -220,17 +431,18 @@ static forceinline void riscv_emulate_f_fnmadd(rvvm_hart_t* vm, const uint32_t i
 
     if (likely(riscv_fpu_is_enabled(vm) && riscv_fpu_rm_is_valid(rm))) {
         switch (bit_ext_u32(insn, 25, 2)) {
-            case 0x0: // fnmadd.s
-                riscv_emit_s(vm, rds,
-                             fpu_neg32(fpu_fma32(riscv_view_s(vm, rs1), //
-                                                 riscv_view_s(vm, rs2), //
-                                                 riscv_view_s(vm, rs3))));
+            case 0x0: // fnmadd.s = -(rs1*rs2) - rs3; negate operands so the single
+                       // rounding sees the correctly-signed result (directed modes)
+                riscv_write_s(vm, rds,
+                             riscv_fma_round_f32(vm, rm, fpu_neg32(riscv_read_s(vm, rs1)), //
+                                                 riscv_read_s(vm, rs2), //
+                                                 fpu_neg32(riscv_read_s(vm, rs3))));
                 return;
             case 0x1: // fnmadd.d
-                riscv_emit_d(vm, rds,
-                             fpu_neg64(fpu_fma64(riscv_view_d(vm, rs1), //
+                riscv_write_d(vm, rds,
+                             riscv_fma_round_f64(vm, rm, fpu_neg64(riscv_view_d(vm, rs1)), //
                                                  riscv_view_d(vm, rs2), //
-                                                 riscv_view_d(vm, rs3))));
+                                                 fpu_neg64(riscv_view_d(vm, rs3))));
                 return;
         }
     }
diff --git a/src/util/fpu_lib.c b/src/util/fpu_lib.c
index c82f0451e..80a74bc42 100644
--- a/src/util/fpu_lib.c
+++ b/src/util/fpu_lib.c
@@ -407,52 +407,76 @@ slow_path uint32_t fpu_fclass64(fpu_f64_t d)
 
 slow_path fpu_f32_t fpu_round_f32_internal(fpu_f32_t f, uint32_t mode)
 {
-    uint32_t u = fpu_bit_f32_to_u32(f);
-    uint32_t s = u & FPU_LIB_FP32_SIGNEDFP_MASK;
+    // A non-fractional value (integer, +/-0, inf, NaN) is already rounded. Return it
+    // unchanged: the round-to-nearest path below adds +/-0.5, which is inexact once
+    // 0.5 underflows a large value's ULP and would raise a spurious INEXACT -- e.g.
+    // on an out-of-range fcvt-to-int, leaving NX wrongly set alongside NV.
+    if (likely(!fpu_is_fractional32(f))) {
+        return f;
+    }
+    // f is finite with a fractional part. The magic-number rounding (add/sub of 0.5
+    // or 1.0) is itself inexact for the discarded fraction; snapshot and restore the
+    // exception flags so it leaks none. Each fcvt-to-int caller determines INEXACT
+    // itself from the round-trip (and only when the result is in range), so the
+    // rounding step must not pre-set a spurious NX -- e.g. alongside an NV from an
+    // out-of-range or negative-to-unsigned conversion.
+    const uint32_t exc = fpu_get_exceptions();
+    const uint32_t s   = fpu_bit_f32_to_u32(f) & FPU_LIB_FP32_SIGNEDFP_MASK;
+    fpu_f32_t r = f;
     if (unlikely(mode > FPU_LIB_ROUND_UP)) {
         mode = fpu_get_rounding_mode();
     }
     switch (mode) {
         case FPU_LIB_ROUND_NE:
         case FPU_LIB_ROUND_MM:
-            return fpu_add32(f, fpu_bit_u32_to_f32(0x3F000000U | s));
+            r = fpu_add32(f, fpu_bit_u32_to_f32(0x3F000000U | s));
+            break;
         case FPU_LIB_ROUND_DN:
-            if (s && fpu_is_fractional32(f)) {
-                return fpu_sub32(f, fpu_bit_u32_to_f32(0x3F800000U));
+            if (s) {
+                r = fpu_sub32(f, fpu_bit_u32_to_f32(0x3F800000U));
             }
             break;
         case FPU_LIB_ROUND_UP:
-            if (!s && fpu_is_fractional32(f)) {
-                return fpu_add32(f, fpu_bit_u32_to_f32(0x3F800000U));
+            if (!s) {
+                r = fpu_add32(f, fpu_bit_u32_to_f32(0x3F800000U));
             }
             break;
     }
-    return f;
+    fpu_set_exceptions(exc);
+    return r;
 }
 
 slow_path fpu_f64_t fpu_round_f64_internal(fpu_f64_t d, uint32_t mode)
 {
-    uint64_t u = fpu_bit_f64_to_u64(d);
-    uint64_t s = u & FPU_LIB_FP64_SIGNEDFP_MASK;
+    // See fpu_round_f32_internal: a non-fractional value needs no rounding, and
+    // skipping it avoids a spurious INEXACT from the +/-0.5 round-to-nearest add.
+    if (likely(!fpu_is_fractional64(d))) {
+        return d;
+    }
+    const uint32_t exc = fpu_get_exceptions();
+    const uint64_t s   = fpu_bit_f64_to_u64(d) & FPU_LIB_FP64_SIGNEDFP_MASK;
+    fpu_f64_t r = d;
     if (unlikely(mode > FPU_LIB_ROUND_UP)) {
         mode = fpu_get_rounding_mode();
     }
     switch (mode) {
         case FPU_LIB_ROUND_NE:
         case FPU_LIB_ROUND_MM:
-            return fpu_add64(d, fpu_bit_u64_to_f64(0x3FE0000000000000ULL | s));
+            r = fpu_add64(d, fpu_bit_u64_to_f64(0x3FE0000000000000ULL | s));
+            break;
         case FPU_LIB_ROUND_DN:
-            if (s && fpu_is_fractional64(d)) {
-                return fpu_sub64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL));
+            if (s) {
+                r = fpu_sub64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL));
             }
             break;
         case FPU_LIB_ROUND_UP:
-            if (!s && fpu_is_fractional64(d)) {
-                return fpu_add64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL));
+            if (!s) {
+                r = fpu_add64(d, fpu_bit_u64_to_f64(0x3FF0000000000000ULL));
             }
             break;
     }
-    return d;
+    fpu_set_exceptions(exc);
+    return r;
 }
 
 #if defined(USE_SOFT_FPU_SQRT)
diff --git a/src/util/fpu_lib.h b/src/util/fpu_lib.h
index e9681e9ae..ecc8969e6 100644
--- a/src/util/fpu_lib.h
+++ b/src/util/fpu_lib.h
@@ -1071,12 +1071,12 @@ static forceinline fpu_f32_t fpu_sqrt32(fpu_f32_t f)
 #endif
         return ret;
     }
-    if (fpu_is_negative32(f)) {
-        // Raise invalid flag, return canonical NaN
+    if (fpu_is_negative32(f) && (fpu_bit_f32_to_u32(f) << 1) != 0) {
+        // Negative non-zero: raise invalid, return canonical NaN
         fpu_raise_invalid();
         return fpu_bit_u32_to_f32(FPU_LIB_FP32_CANONICAL_NAN);
     }
-    // Perform NaN propagation
+    // NaN propagation, and sqrt(-0.0) == -0.0 (no exception)
     return f;
 }
 
@@ -1093,12 +1093,12 @@ static forceinline fpu_f64_t fpu_sqrt64(fpu_f64_t d)
 #endif
         return ret;
     }
-    if (fpu_is_negative64(d)) {
-        // Raise invalid flag, return canonical NaN
+    if (fpu_is_negative64(d) && (fpu_bit_f64_to_u64(d) << 1) != 0) {
+        // Negative non-zero: raise invalid, return canonical NaN
         fpu_raise_invalid();
         return fpu_bit_u64_to_f64(FPU_LIB_FP64_CANONICAL_NAN);
     }
-    // Perform NaN propagation
+    // NaN propagation, and sqrt(-0.0) == -0.0 (no exception)
     return d;
 }