From cea38096c9921b9b419c9a04eaef257a8869e504 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 23 May 2026 01:17:48 +0200 Subject: [PATCH 1/4] JIT: Use faster mod for uint16 values --- src/coreclr/jit/assertionprop.cpp | 50 +++++- src/coreclr/jit/gentree.cpp | 4 +- src/coreclr/jit/gentree.h | 2 + src/coreclr/jit/lower.cpp | 75 +++++++- src/coreclr/jit/morph.cpp | 20 +++ .../Divide/Regressions/Regression4_Divide.cs | 169 ++++++++++++++++++ .../Regressions/Regression4_Divide.csproj | 9 + 7 files changed, 322 insertions(+), 7 deletions(-) create mode 100644 src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs create mode 100644 src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 2375158e7d11b1..23c9dc700b1f0a 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -228,6 +228,24 @@ bool IntegralRange::Contains(int64_t value) const rangeType = compiler->lvaGetDesc(node->AsLclVar())->TypeGet(); } + // If the local's conservative VN identifies it as the result of a CAST + // to a small type, the value stored is bounded by that type's range. + if (compiler->vnStore != nullptr) + { + ValueNum vn = compiler->vnStore->VNConservativeNormalValue(node->gtVNPair); + VNFuncApp funcApp; + if (compiler->vnStore->GetVNFunc(vn, &funcApp) && (funcApp.m_func == VNF_Cast)) + { + var_types castToType; + bool srcIsUnsigned; + compiler->vnStore->GetCastOperFromVN(funcApp.m_args[1], &castToType, &srcIsUnsigned); + if (varTypeIsSmall(castToType)) + { + return ForType(castToType); + } + } + } + if (varDsc->IsNeverNegative()) { return {SymbolicIntegerValue::Zero, UpperBoundForType(rangeType)}; @@ -532,12 +550,21 @@ bool IntegralRange::Contains(int64_t value) const // CAST(ulong/long <- int) - [INT_MIN..INT_MAX] if (!cast->gtOverflow()) { - if ((fromType == TYP_INT) && fromUnsigned) + IntegralRange typeRange = (fromType == TYP_INT) && fromUnsigned + ? IntegralRange{SymbolicIntegerValue::Zero, SymbolicIntegerValue::UIntMax} + : IntegralRange{SymbolicIntegerValue::IntMin, SymbolicIntegerValue::IntMax}; + + // Refine using the operand's known range: when the operand fits entirely + // inside what the cast can represent, the cast preserves the value, so we + // can tighten the output range to the operand's range. This handles cases + // like CAST(int <- ulong) of a value already known to be in uint16 range. + IntegralRange operandRange = ForNode(cast->CastOp(), compiler); + if (typeRange.Contains(operandRange)) { - return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::UIntMax}; + return operandRange; } - return {SymbolicIntegerValue::IntMin, SymbolicIntegerValue::IntMax}; + return typeRange; } SymbolicIntegerValue lowerBound; @@ -4044,6 +4071,7 @@ GenTree* Compiler::optAssertionProp_AddMulSub(ASSERT_VALARG_TP assertions, GenTr // 1) Convert DIV/MOD to UDIV/UMOD if both operands are proven to be never negative // 2) Marks DIV/UDIV/MOD/UMOD with GTF_DIV_MOD_NO_BY_ZERO if divisor is proven to be never zero // 3) Marks DIV/UDIV/MOD/UMOD with GTF_DIV_MOD_NO_OVERFLOW if both operands are proven to be never negative +// 4) Marks UMOD with GTF_UMOD_UINT16_OPERANDS if both operands are proven to be in uint16 range // // Arguments: // assertions - set of live assertions @@ -4091,6 +4119,22 @@ GenTree* Compiler::optAssertionProp_ModDiv(ASSERT_VALARG_TP assertions, changed = true; } +#ifdef TARGET_64BIT + // Detect "uint16 % const-uint16" patterns so lowering can emit a cheaper FastMod sequence. + if (((tree->gtFlags & GTF_UMOD_UINT16_OPERANDS) == 0) && tree->OperIs(GT_UMOD) && !opts.MinOpts() && + op2->IsCnsIntOrI()) + { + ssize_t divisorValue = op2->AsIntCon()->IconValue(); + if ((divisorValue > 0) && FitsIn(divisorValue) && + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) + { + JITDUMP("Both operands for UMOD are in uint16 range...\n") + tree->gtFlags |= GTF_UMOD_UINT16_OPERANDS; + changed = true; + } + } +#endif // TARGET_64BIT + return changed ? optAssertionProp_Update(tree, tree, stmt) : nullptr; } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 1946ace685bc22..162c5f9f828582 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2672,8 +2672,8 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK) } if (op1->OperIs(GT_MOD, GT_UMOD, GT_DIV, GT_UDIV)) { - if ((op1->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW)) != - (op2->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW))) + if ((op1->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW | GTF_UMOD_UINT16_OPERANDS)) != + (op2->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW | GTF_UMOD_UINT16_OPERANDS))) { return false; } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index a71ed68351c9b4..bec522a8efe539 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -528,6 +528,8 @@ enum GenTreeFlags : unsigned GTF_DIV_MOD_NO_OVERFLOW = 0x40000000, // GT_DIV, GT_MOD -- Div or mod definitely does not overflow. + GTF_UMOD_UINT16_OPERANDS = 0x80000000, // UMOD -- Both operands to a mod are in uint16 range. The divisor is non-zero constant. + GTF_CHK_INDEX_INBND = 0x80000000, // GT_BOUNDS_CHECK -- have proven this check is always in-bounds GTF_ARRLEN_NONFAULTING = 0x20000000, // GT_ARR_LENGTH -- An array length operation that cannot fault. Same as GT_IND_NONFAULTING. diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6e7f6824ecea3a..6ab398c96ee9c1 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -8088,7 +8088,10 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) assert(varTypeIsFloating(divMod->TypeGet())); #endif // USE_HELPERS_FOR_INT_DIV #if defined(TARGET_ARM64) - assert(!divMod->OperIs(GT_UMOD)); + // ARM64 has no remainder instruction. Morph rewrites every non-pow2 MOD/UMOD + // into a SUB-MUL-DIV form except when the FastMod path here can apply, which + // requires a constant uint16 divisor (and a uint16-range dividend). + assert(!divMod->OperIs(GT_UMOD) || divMod->gtGetOp2()->IsCnsIntOrI()); #endif // TARGET_ARM64 GenTree* dividend = divMod->gtGetOp1(); @@ -8170,10 +8173,78 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) } } + assert(divisorValue >= 3); + // TODO-ARM-CQ: Currently there's no GT_MULHI for ARM32 #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) - if (!m_compiler->opts.MinOpts() && (divisorValue >= 3)) + if (!m_compiler->opts.MinOpts()) { +#ifdef TARGET_64BIT + // Replace (uint16 % uint16) with a cheaper variant of FastMod, specialized for 16-bit operands. + // + // uint multiplier = uint.MaxValue / divisor + 1; + // ulong result = ((ulong)(dividend * multiplier) * divisor) >> 32; + // return (int)result; + // + // The dividend is first truncated to TYP_INT (safe because we know it fits in + // uint16), and the final value (always in [0, divisor)) is widened back to the + // mod's result type. The dividend's range may have been recovered either + // statically by IntegralRange::ForNode at this point, or earlier by assertion + // propagation (which leaves GTF_UMOD_UINT16_OPERANDS for us). + if (!isDiv && FitsIn(divisorValue) && + (((divMod->gtFlags & GTF_UMOD_UINT16_OPERANDS) != 0) || + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(dividend, m_compiler)))) + { + GenTree* dividendInt = dividend; + if (genActualType(dividend) != TYP_INT) + { + assert(genActualType(dividend) == TYP_LONG); + dividendInt = m_compiler->gtNewCastNode(TYP_INT, dividend, /* unsigned */ true, TYP_INT); + BlockRange().InsertBefore(divMod, dividendInt); + } + + GenTree* multiplier = + m_compiler->gtNewIconNode(static_cast((UINT32_MAX / divisorValue) + 1), TYP_INT); + GenTree* mul1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, dividendInt, multiplier); + mul1->SetUnsigned(); + GenTree* castUp = m_compiler->gtNewCastNode(TYP_LONG, mul1, /* unsigned */ true, TYP_LONG); + BlockRange().InsertBefore(divMod, multiplier, mul1, castUp); + + // Reuse the existing constant divisor as a TYP_LONG operand for the second multiply. + divisor->BashToConst(static_cast(divisorValue)); + + GenTree* mul2 = m_compiler->gtNewOperNode(GT_MUL, TYP_LONG, castUp, divisor); + mul2->SetUnsigned(); + GenTree* shiftAmount = m_compiler->gtNewIconNode(32); + BlockRange().InsertBefore(divMod, mul2, shiftAmount); + + GenTree* firstNode = (dividendInt == dividend) ? multiplier : dividendInt; + + if (type == TYP_LONG) + { + // The shift result is already TYP_LONG; turn divMod itself into the shift. + divMod->ChangeOper(GT_RSZ); + divMod->gtOp1 = mul2; + divMod->gtOp2 = shiftAmount; + } + else + { + assert(type == TYP_INT); + GenTree* shift = m_compiler->gtNewOperNode(GT_RSZ, TYP_LONG, mul2, shiftAmount); + BlockRange().InsertBefore(divMod, shift); + + divMod->ChangeOper(GT_CAST); + divMod->AsCast()->gtCastType = TYP_INT; + divMod->gtOp1 = shift; + divMod->gtOp2 = nullptr; + divMod->SetUnsigned(); + } + + ContainCheckRange(firstNode, divMod); + return true; + } +#endif // TARGET_64BIT + size_t magic; bool increment; int preShift; diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index f5dd9fe9bef06a..b24ce40938ac4e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -7424,6 +7424,26 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, bool* optAssertionPropDone) else if (tree->OperIs(GT_MOD) && op2->IsIntegralConst() && !op2->IsIntegralConstAbsPow2()) #endif { +#if defined(TARGET_64BIT) + // Skip this transform when lowering will be able to apply the + // cheaper uint16 FastMod sequence. This requires the divisor to + // be a non-zero constant in uint16 range and the dividend's + // static IntegralRange to already fit in uint16. + if (op2->IsCnsIntOrI()) + { + ssize_t modDivisor = op2->AsIntCon()->IconValue(); + if ((modDivisor > 0) && FitsIn(modDivisor) && + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) + { + if (tree->OperIs(GT_MOD)) + { + tree->SetOper(GT_UMOD); + } + break; + } + } +#endif // TARGET_64BIT + // Transformation: a % b = a - (a / b) * b; tree = fgMorphModToSubMulDiv(tree->AsOp()); op1 = tree->AsOp()->gtOp1; diff --git a/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs new file mode 100644 index 00000000000000..eec5af4a8dd5bc --- /dev/null +++ b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs @@ -0,0 +1,169 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using Xunit; + +public class Program +{ + private static ushort s_field1; + private static ulong s_field2; + + [MethodImpl(MethodImplOptions.NoInlining)] + public static uint Umod_U4_CharByZero(char c) + { + return (uint)c % 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_U2_CharByConst(char c) + { + return (ushort)(c % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static int Umod_I4_CharByConst(char c) + { + return c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static uint Umod_U4_CharByConst(char c) + { + return (uint)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static long Umod_I8_CharByConst(char c) + { + return (long)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ulong Umod_U8_CharByConst(char c) + { + return (ulong)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static bool TestIsWhiteSpace(char c) + { + ReadOnlySpan HashEntries = [' ', ' ', '\u00A0', ' ', ' ', ' ', ' ', ' ', ' ', '\t', '\n', '\v', '\f', '\r', ' ', ' ', '\u2028', '\u2029', ' ', ' ', ' ', ' ', ' ', '\u202F', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u3000', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u0085', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A', ' ', ' ', ' ', ' ', ' ', '\u205F', '\u1680', ' ', ' ', ' ', ' ', ' ', ' ']; + return HashEntries[c % HashEntries.Length] == c; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_TZC(ulong value) + { + return (ushort)(BitOperations.TrailingZeroCount(value) % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_TZC_Intrinsic(ulong value) + { + return (ushort)(Bmi1.X64.TrailingZeroCount(value) % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static int Umod_UInt16Range_ByConst(int value) + { + if (value is > 0 and < 1234) + { + return value % 123; + } + + return -1; + } + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + private static void Test1() + { + for (int i = 0; i < 2; i++) + { + Core(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void Core() + { + s_field1 = (ushort)(Bmi1.X64.TrailingZeroCount(s_field2) % 42); + } + } + + [Fact] + public static int TestEntryPoint() + { + try + { + Umod_U4_CharByZero('a'); + return 0; + } + catch (DivideByZeroException) { } + + if (Umod_U2_CharByConst('a') != 13) + return 0; + + if (Umod_I4_CharByConst('a') != 13) + return 0; + + if (Umod_U4_CharByConst('a') != 13) + return 0; + + if (Umod_I8_CharByConst('a') != 13) + return 0; + + if (Umod_U8_CharByConst('a') != 13) + return 0; + + if (!TestIsWhiteSpace(' ')) + return 0; + + if (!TestIsWhiteSpace('\u2029')) + return 0; + + if (TestIsWhiteSpace('\0')) + return 0; + + if (TestIsWhiteSpace('a')) + return 0; + + if (Umod_TZC(1L << 40) != 40) + return 0; + + if (Umod_TZC(1L << 50) != 8) + return 0; + + if (Bmi1.X64.IsSupported) + { + if (Umod_TZC_Intrinsic(1L << 40) != 40) + return 0; + + if (Umod_TZC_Intrinsic(1L << 50) != 8) + return 0; + } + + if (Umod_UInt16Range_ByConst(0) != -1) + return 0; + + if (Umod_UInt16Range_ByConst(42) != 42) + return 0; + + if (Umod_UInt16Range_ByConst(123) != 0) + return 0; + + if (Bmi1.X64.IsSupported) + { + s_field1 = 1; + s_field2 = 1L << 50; + Test1(); + + if (s_field1 != 8) + return 0; + } + + return 100; + } +} diff --git a/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj new file mode 100644 index 00000000000000..501217e4d86892 --- /dev/null +++ b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj @@ -0,0 +1,9 @@ + + + None + True + + + + + From 56f1398f83e523059f7d1bdb51d13c76430179ea Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 23 May 2026 04:14:51 +0200 Subject: [PATCH 2/4] Add a MinOpts check --- src/coreclr/jit/morph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index b24ce40938ac4e..b7aceaf6119f0d 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -7429,7 +7429,7 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, bool* optAssertionPropDone) // cheaper uint16 FastMod sequence. This requires the divisor to // be a non-zero constant in uint16 range and the dividend's // static IntegralRange to already fit in uint16. - if (op2->IsCnsIntOrI()) + if (!opts.MinOpts() && op2->IsCnsIntOrI()) { ssize_t modDivisor = op2->AsIntCon()->IconValue(); if ((modDivisor > 0) && FitsIn(modDivisor) && From 897296a0fa975079dfc5da6d0e8772dc7afaf0ce Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sun, 24 May 2026 00:02:50 +0200 Subject: [PATCH 3/4] Fix unsound IntegralRange refinement for NOL small-type locals The VN-based small-type refinement in IntegralRange::ForNode for GT_LCL_VAR claimed a tight range whenever the local's conservative VN was a CAST to a small type. That is unsound for normalize-on-load locals: their storage only contains the small-type bits in the low byte (upper bytes are stale), but the tight range let fgOptimizeCast drop a required sign/zero extending load downstream, causing it to read those stale bits. Restrict the refinement to locals whose storage is fully normalized: either non-small locals, or small locals with lvNormalizeOnStore set. Repros fuzzlyn seed 12902382323863156506 (and others) where '(short)(arg0 ^ arg2)' with sbyte arg0 began returning the unsigned byte interpretation in release. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/assertionprop.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 23c9dc700b1f0a..25f6d163f20e32 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -230,7 +230,14 @@ bool IntegralRange::Contains(int64_t value) const // If the local's conservative VN identifies it as the result of a CAST // to a small type, the value stored is bounded by that type's range. - if (compiler->vnStore != nullptr) + // + // This refinement is only sound when the local's storage is fully + // normalized to the cast's destination type. For "normalize on load" + // small-type locals the store writes only the low bits, leaving the + // upper bits stale; tightening the range here would let downstream + // optimizations (e.g. fgOptimizeCast) drop a required sign/zero + // extending load and read those stale bits. + if ((compiler->vnStore != nullptr) && (!varTypeIsSmall(varDsc->TypeGet()) || varDsc->lvNormalizeOnStore())) { ValueNum vn = compiler->vnStore->VNConservativeNormalValue(node->gtVNPair); VNFuncApp funcApp; From a222b5cc47f6705f6b169769c707575b5317e661 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Mon, 25 May 2026 15:48:22 +0200 Subject: [PATCH 4/4] Move detection to range checks --- src/coreclr/jit/assertionprop.cpp | 42 ------------- src/coreclr/jit/compiler.h | 11 ++++ src/coreclr/jit/gentree.h | 2 +- src/coreclr/jit/lower.cpp | 6 +- src/coreclr/jit/morph.cpp | 38 ++++++++--- src/coreclr/jit/rangecheck.cpp | 101 +++++++++++++++++++++++++++++- src/coreclr/jit/rangecheck.h | 6 ++ 7 files changed, 150 insertions(+), 56 deletions(-) diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 25f6d163f20e32..df611d8aa5b3ef 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -228,31 +228,6 @@ bool IntegralRange::Contains(int64_t value) const rangeType = compiler->lvaGetDesc(node->AsLclVar())->TypeGet(); } - // If the local's conservative VN identifies it as the result of a CAST - // to a small type, the value stored is bounded by that type's range. - // - // This refinement is only sound when the local's storage is fully - // normalized to the cast's destination type. For "normalize on load" - // small-type locals the store writes only the low bits, leaving the - // upper bits stale; tightening the range here would let downstream - // optimizations (e.g. fgOptimizeCast) drop a required sign/zero - // extending load and read those stale bits. - if ((compiler->vnStore != nullptr) && (!varTypeIsSmall(varDsc->TypeGet()) || varDsc->lvNormalizeOnStore())) - { - ValueNum vn = compiler->vnStore->VNConservativeNormalValue(node->gtVNPair); - VNFuncApp funcApp; - if (compiler->vnStore->GetVNFunc(vn, &funcApp) && (funcApp.m_func == VNF_Cast)) - { - var_types castToType; - bool srcIsUnsigned; - compiler->vnStore->GetCastOperFromVN(funcApp.m_args[1], &castToType, &srcIsUnsigned); - if (varTypeIsSmall(castToType)) - { - return ForType(castToType); - } - } - } - if (varDsc->IsNeverNegative()) { return {SymbolicIntegerValue::Zero, UpperBoundForType(rangeType)}; @@ -4078,7 +4053,6 @@ GenTree* Compiler::optAssertionProp_AddMulSub(ASSERT_VALARG_TP assertions, GenTr // 1) Convert DIV/MOD to UDIV/UMOD if both operands are proven to be never negative // 2) Marks DIV/UDIV/MOD/UMOD with GTF_DIV_MOD_NO_BY_ZERO if divisor is proven to be never zero // 3) Marks DIV/UDIV/MOD/UMOD with GTF_DIV_MOD_NO_OVERFLOW if both operands are proven to be never negative -// 4) Marks UMOD with GTF_UMOD_UINT16_OPERANDS if both operands are proven to be in uint16 range // // Arguments: // assertions - set of live assertions @@ -4126,22 +4100,6 @@ GenTree* Compiler::optAssertionProp_ModDiv(ASSERT_VALARG_TP assertions, changed = true; } -#ifdef TARGET_64BIT - // Detect "uint16 % const-uint16" patterns so lowering can emit a cheaper FastMod sequence. - if (((tree->gtFlags & GTF_UMOD_UINT16_OPERANDS) == 0) && tree->OperIs(GT_UMOD) && !opts.MinOpts() && - op2->IsCnsIntOrI()) - { - ssize_t divisorValue = op2->AsIntCon()->IconValue(); - if ((divisorValue > 0) && FitsIn(divisorValue) && - IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) - { - JITDUMP("Both operands for UMOD are in uint16 range...\n") - tree->gtFlags |= GTF_UMOD_UINT16_OPERANDS; - changed = true; - } - } -#endif // TARGET_64BIT - return changed ? optAssertionProp_Update(tree, tree, stmt) : nullptr; } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 583892ca426796..f8c7c82922fe1c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7884,6 +7884,7 @@ class Compiler #define OMF_HAS_STACK_ARRAY 0x00100000 // Method contains stack allocated arrays #define OMF_HAS_BOUNDS_CHECKS 0x00200000 // Method contains bounds checks #define OMF_HAS_EARLY_QMARKS 0x00400000 // Method contains early expandable QMARKs +#define OMF_HAS_UMOD_BY_CONST_UINT16_CANDIDATE 0x00800000 // Method contains a MOD/UMOD that may turn into a uint16 FastMod candidate during the range check phase // clang-format on @@ -7924,6 +7925,16 @@ class Compiler optMethodFlags |= OMF_HAS_BOUNDS_CHECKS; } + bool doesMethodHaveUModByConstUInt16Candidate() + { + return (optMethodFlags & OMF_HAS_UMOD_BY_CONST_UINT16_CANDIDATE) != 0; + } + + void setMethodHasUModByConstUInt16Candidate() + { + optMethodFlags |= OMF_HAS_UMOD_BY_CONST_UINT16_CANDIDATE; + } + bool doesMethodHaveExpandableCasts() { return (optMethodFlags & OMF_HAS_EXPANDABLE_CAST) != 0; diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index bec522a8efe539..c04e3fb6f44bb4 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -528,7 +528,7 @@ enum GenTreeFlags : unsigned GTF_DIV_MOD_NO_OVERFLOW = 0x40000000, // GT_DIV, GT_MOD -- Div or mod definitely does not overflow. - GTF_UMOD_UINT16_OPERANDS = 0x80000000, // UMOD -- Both operands to a mod are in uint16 range. The divisor is non-zero constant. + GTF_UMOD_UINT16_OPERANDS = 0x80000000, // GT_UMOD -- Both operands to a mod are in uint16 range. The divisor is a non-zero constant. GTF_CHK_INDEX_INBND = 0x80000000, // GT_BOUNDS_CHECK -- have proven this check is always in-bounds diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6ab398c96ee9c1..44053912b2c3c3 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -8189,8 +8189,8 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) // The dividend is first truncated to TYP_INT (safe because we know it fits in // uint16), and the final value (always in [0, divisor)) is widened back to the // mod's result type. The dividend's range may have been recovered either - // statically by IntegralRange::ForNode at this point, or earlier by assertion - // propagation (which leaves GTF_UMOD_UINT16_OPERANDS for us). + // statically by IntegralRange::ForNode at this point, or earlier by the range + // check phase (which leaves GTF_UMOD_UINT16_OPERANDS for us). if (!isDiv && FitsIn(divisorValue) && (((divMod->gtFlags & GTF_UMOD_UINT16_OPERANDS) != 0) || IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(dividend, m_compiler)))) @@ -8211,7 +8211,7 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) BlockRange().InsertBefore(divMod, multiplier, mul1, castUp); // Reuse the existing constant divisor as a TYP_LONG operand for the second multiply. - divisor->BashToConst(static_cast(divisorValue)); + divisor->BashToConst(static_cast(divisorValue), TYP_LONG); GenTree* mul2 = m_compiler->gtNewOperNode(GT_MUL, TYP_LONG, castUp, divisor); mul2->SetUnsigned(); diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index b7aceaf6119f0d..1afce54b5da81e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -7425,20 +7425,32 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, bool* optAssertionPropDone) #endif { #if defined(TARGET_64BIT) - // Skip this transform when lowering will be able to apply the - // cheaper uint16 FastMod sequence. This requires the divisor to - // be a non-zero constant in uint16 range and the dividend's - // static IntegralRange to already fit in uint16. + // Skip the SUB-MUL-DIV rewrite below when lowering may apply + // the cheaper uint16 FastMod sequence. That requires a non-zero + // constant uint16 divisor AND a dividend provable as uint16. + // The dividend's range may be provable statically here, or only + // later via assertion/VN information from the range check phase; + // defer to that phase by leaving the node as MOD and setting + // OMF_HAS_UMOD_BY_CONST_UINT16_CANDIDATE when we can't decide yet. if (!opts.MinOpts() && op2->IsCnsIntOrI()) { ssize_t modDivisor = op2->AsIntCon()->IconValue(); - if ((modDivisor > 0) && FitsIn(modDivisor) && - IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) + if ((modDivisor > 0) && FitsIn(modDivisor)) { - if (tree->OperIs(GT_MOD)) + if (IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) { - tree->SetOper(GT_UMOD); + if (tree->OperIs(GT_MOD)) + { + tree->SetOper(GT_UMOD); + } + break; } + + // Defer to the range check phase: leave the MOD/UMOD + // node intact so that VN- and assertion-based range + // analysis can decide whether the dividend fits in + // uint16. + setMethodHasUModByConstUInt16Candidate(); break; } } @@ -7449,6 +7461,16 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, bool* optAssertionPropDone) op1 = tree->AsOp()->gtOp1; op2 = tree->AsOp()->gtOp2; } +#if defined(TARGET_64BIT) && !defined(TARGET_ARM64) + else if (!opts.MinOpts() && tree->OperIs(GT_MOD, GT_UMOD) && !op2->IsIntegralConst() && + ((typ == TYP_INT) || (typ == TYP_LONG))) + { + // The divisor may fold to a uint16 constant later (e.g. a + // span length that is computed from an RVA static). Let the + // range check phase re-examine such nodes after VN/CSE. + setMethodHasUModByConstUInt16Candidate(); + } +#endif // TARGET_64BIT && !TARGET_ARM64 break; USE_HELPER_FOR_ARITH: diff --git a/src/coreclr/jit/rangecheck.cpp b/src/coreclr/jit/rangecheck.cpp index 5d3326548ec260..bf11707a036824 100644 --- a/src/coreclr/jit/rangecheck.cpp +++ b/src/coreclr/jit/rangecheck.cpp @@ -14,12 +14,21 @@ // PhaseStatus Compiler::rangeCheckPhase() { - if (!doesMethodHaveBoundsChecks() || (fgSsaPassesCompleted == 0)) + if ((!doesMethodHaveBoundsChecks() && !doesMethodHaveUModByConstUInt16Candidate()) || (fgSsaPassesCompleted == 0)) { return PhaseStatus::MODIFIED_NOTHING; } - const bool madeChanges = GetRangeCheck()->OptimizeRangeChecks(); + RangeCheck* rc = GetRangeCheck(); + bool madeChanges = false; + if (doesMethodHaveBoundsChecks()) + { + madeChanges = rc->OptimizeRangeChecks(); + } + if (doesMethodHaveUModByConstUInt16Candidate()) + { + madeChanges |= rc->TryMarkUModUInt16Operands(); + } return madeChanges ? PhaseStatus::MODIFIED_EVERYTHING : PhaseStatus::MODIFIED_NOTHING; } @@ -2401,3 +2410,91 @@ bool RangeCheck::OptimizeRangeChecks() return madeChanges; } + +//------------------------------------------------------------------------ +// TryMarkUModUInt16Operands: detect GT_MOD/GT_UMOD nodes where the divisor is +// a positive uint16 constant and the dividend is provably in uint16 range. +// Such nodes are converted to GT_UMOD (if signed) and tagged with +// GTF_UMOD_UINT16_OPERANDS so that lowering can emit a cheaper FastMod +// sequence specialized for 16-bit operands. +// +// Return Value: +// True if any node was modified. +// +bool RangeCheck::TryMarkUModUInt16Operands() +{ +#ifdef TARGET_64BIT + bool madeChanges = false; + + for (BasicBlock* const block : m_compiler->Blocks()) + { + for (Statement* const stmt : block->Statements()) + { + for (GenTree* const tree : stmt->TreeList()) + { + if (!tree->OperIs(GT_MOD, GT_UMOD)) + { + continue; + } + + if ((tree->gtFlags & GTF_UMOD_UINT16_OPERANDS) != 0) + { + continue; + } + + if (!tree->TypeIs(TYP_INT, TYP_LONG)) + { + continue; + } + + GenTree* divisor = tree->gtGetOp2(); + if (!divisor->IsCnsIntOrI()) + { + continue; + } + + ssize_t divisorValue = divisor->AsIntCon()->IconValue(); + if ((divisorValue <= 0) || !FitsIn(divisorValue)) + { + continue; + } + + GenTree* dividend = tree->gtGetOp1(); + + bool dividendFitsUint16 = + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(dividend, m_compiler)); + + if (!dividendFitsUint16 && (m_compiler->vnStore != nullptr)) + { + // Fall back to a richer VN-/assertion-based range analysis. + Range range = Range(Limit(Limit::keUndef)); + if (TryGetRange(block, dividend, &range) && range.IsConstantRange()) + { + const int lower = range.LowerLimit().GetConstant(); + const int upper = range.UpperLimit().GetConstant(); + dividendFitsUint16 = (lower >= 0) && (upper <= UINT16_MAX); + } + } + + if (!dividendFitsUint16) + { + continue; + } + + JITDUMP("Marking [%06u] as a uint16 UMOD candidate\n", Compiler::dspTreeID(tree)); + + if (tree->OperIs(GT_MOD)) + { + tree->SetOper(GT_UMOD); + } + tree->gtFlags |= GTF_UMOD_UINT16_OPERANDS; + madeChanges = true; + } + } + } + + return madeChanges; +#else + return false; +#endif // TARGET_64BIT +} diff --git a/src/coreclr/jit/rangecheck.h b/src/coreclr/jit/rangecheck.h index c9f189deeb73de..89abee415406f8 100644 --- a/src/coreclr/jit/rangecheck.h +++ b/src/coreclr/jit/rangecheck.h @@ -773,6 +773,12 @@ class RangeCheck // and assertion prop phases are completed. bool OptimizeRangeChecks(); + // Walks the IR looking for "uint16 % const-uint16" patterns and tags eligible + // GT_UMOD nodes with GTF_UMOD_UINT16_OPERANDS so lowering can emit a cheaper + // FastMod sequence. GT_MOD nodes whose dividend is non-negative and whose + // divisor is a positive uint16 constant are converted to GT_UMOD. + bool TryMarkUModUInt16Operands(); + bool TryGetRange(BasicBlock* block, GenTree* expr, Range* pRange); // Cheaper version of TryGetRange that is based only on incoming assertions.