[prev in list] [next in list] [prev in thread] [next in thread]
List: llvm-commits
Subject: [PATCH] D111846: [LV] Drop integer poison-generating flags from instructions that need predication
From: Diego Caballero via Phabricator via llvm-commits <llvm-commits () lists ! llvm ! org>
Date: 2021-10-31 22:01:56
Message-ID: faMzDUToRU-IcpfVWR8_Zg () ismtpd0132p1iad2 ! sendgrid ! net
[Download RAW message or body]
dcaballe updated this revision to Diff 383687.
dcaballe added a comment.
- Added check to make sure we only drop poison-generating flags from instructions contributing to the address computation of masked loads/stores.
- Removed logic to drop flags from widen GEPs (for gathers/scatters)
- Removed logic to drop flags from all the widen instructions.
- Reverted changes in impacted tests.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D111846/new/
https://reviews.llvm.org/D111846
Files:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
["D111846.383687.patch" (D111846.383687.patch)]
Index: llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
+++ llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll
@@ -112,8 +112,8 @@
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDEX]] to i16
; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], 0
; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, \
i16 0, i16 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [32 x i16], [32 x i16]* @src, i16 0, \
i16 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, i16* [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP7]], align 1
; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
Index: llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
+++ llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 \
-force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s +; \
RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 \
-force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
@@ -23,7 +23,7 @@
;CHECK-LABEL: @masked_strided(
;CHECK: vector.body:
-;CHECK-NEXT: %index = phi i32
+;CHECK-NEXT: %index = phi i32
;CHECK-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, \
i32 5, i32 6, i32 7> ;CHECK-NEXT: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], \
%{{broadcast.splat*}} ;CHECK-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], \
<i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
Index: llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
+++ llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll
@@ -382,8 +382,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* \
[[TMP1]], align 2 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp sgt <4 x \
i16> [[WIDE_LOAD]], zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[INDEX]], 3
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* \
[[POINTS:%.*]], i64 [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul \
i64 [[INDEX]], 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr i16, \
i16* [[POINTS:%.*]], i64 [[TMP3]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = \
bitcast i16* [[TMP4]] to <12 x i16>* ; ENABLED_MASKED_STRIDED-NEXT: \
[[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, \
<12 x i32> <i32 0, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 2, i32 \
undef, i32 undef, i32 3, i32 undef, i32 undef> ; ENABLED_MASKED_STRIDED-NEXT: \
[[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <12 x \
i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, \
i32 3>
Index: llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -121,7 +121,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; DISABLED_MASKED_STRIDED: pred.load.continue14:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], \
[[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: \
[[TMP51:%.*]] = bitcast i8* [[TMP50]] to <8 x i8>* ; DISABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP49]], <8 x i8>* [[TMP51]], i32 \
1, <8 x i1> [[TMP0]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw \
i32 [[INDEX]], 8 @@ -141,13 +141,13 @@
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ \
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: \
[[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, \
i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], \
[[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* \
[[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 \
[[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* \
[[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* \
[[TMP2]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = \
shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, \
i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, \
i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> \
@llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> \
[[INTERLEAVED_MASK]], <16 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: \
[[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, \
<8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* \
[[TMP5]], i32 1, <8 x i1> [[TMP0]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] \
= add nuw i32 [[INDEX]], 8 @@ -297,7 +297,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; DISABLED_MASKED_STRIDED: pred.load.continue14:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], \
[[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: \
[[TMP51:%.*]] = bitcast i8* [[TMP50]] to <8 x i8>* ; DISABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP49]], <8 x i8>* [[TMP51]], i32 \
1, <8 x i1> [[TMP0]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw \
i32 [[INDEX]], 8 @@ -317,14 +317,14 @@
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ \
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: \
[[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, \
i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], \
[[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* \
[[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 \
[[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* \
[[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* \
[[TMP2]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = \
shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, \
i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, \
i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> \
[[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 \
true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 \
false> ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> \
@llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x \
i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector \
<16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 \
6, i32 8, i32 10, i32 12, i32 14>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* \
[[TMP6]], i32 1, <8 x i1> [[TMP0]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] \
= add nuw i32 [[INDEX]], 8 @@ -489,7 +489,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; DISABLED_MASKED_STRIDED: pred.load.continue16:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], \
[[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: \
[[TMP53:%.*]] = bitcast i8* [[TMP52]] to <8 x i8>* ; DISABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP51]], <8 x i8>* [[TMP53]], i32 \
1, <8 x i1> [[TMP3]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 \
[[INDEX]], 8 @@ -518,15 +518,15 @@
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, \
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], \
[[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x \
i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* \
[[P:%.*]], i32 [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 \
[[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* \
[[P:%.*]], i32 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x \
i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* ; \
ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> \
[[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 \
3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], \
<i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, \
i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> ; \
ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> \
@llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x \
i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector \
<16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 \
6, i32 8, i32 10, i32 12, i32 14>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* \
[[TMP8]], i32 1, <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] \
= add i32 [[INDEX]], 8 @@ -696,7 +696,7 @@
; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
; DISABLED_MASKED_STRIDED: pred.load.continue16:
; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], \
[[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: \
[[TMP53:%.*]] = bitcast i8* [[TMP52]] to <8 x i8>* ; DISABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP51]], <8 x i8>* [[TMP53]], i32 \
1, <8 x i1> [[TMP3]]) ; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 \
[[INDEX]], 8 @@ -725,15 +725,15 @@
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, \
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], \
[[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x \
i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul nsw i32 [[INDEX]], 3
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* \
[[P:%.*]], i32 [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 \
[[INDEX]], 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* \
[[P:%.*]], i32 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x \
i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <24 x i8>* ; \
ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> \
[[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 \
2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 \
6, i32 6, i32 6, i32 7, i32 7, i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] \
= and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 \
false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, \
i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 \
true, i1 false, i1 false> ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] \
= call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x \
i1> [[TMP6]], <24 x i8> poison) ; ENABLED_MASKED_STRIDED-NEXT: \
[[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, \
<8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 \
21>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: \
call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* \
[[TMP8]], i32 1, <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] \
= add i32 [[INDEX]], 8 @@ -1432,8 +1432,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ \
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: \
[[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, \
i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], \
[[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* \
[[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl i32 \
[[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* \
[[P:%.*]], i32 [[TMP1]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* \
[[TMP2]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = \
shufflevector <8 x i1> [[TMP0]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, \
i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, \
i32 7> ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> \
@llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> \
[[INTERLEAVED_MASK]], <16 x i8> poison) @@ -1443,8 +1443,8 @@
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], \
[[STRIDED_VEC1]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> \
[[TMP5]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> \
zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* \
[[TMP8]], i32 [[TMP4]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 -1 +; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP9:%.*]] = getelementptr i8, i8* [[TMP8]], i32 [[TMP4]] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* ; \
ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> \
[[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, \
i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> ; \
ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> \
[[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) @@ \
-2619,8 +2619,8 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> \
[ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ \
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], \
[[BROADCAST_SPLAT]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[INDEX]], 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* \
[[P:%.*]], i32 [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl i32 \
[[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* \
[[P:%.*]], i32 [[TMP2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x \
i1> [[TMP1]], <8 x i1> [[TMP0]], <8 x i1> zeroinitializer ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* ; \
ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> \
[[TMP4]], <8 x i1> poison, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 \
3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> @@ -2631,8 +2631,8 \
@@ ; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> \
[[STRIDED_VEC]], [[STRIDED_VEC3]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = \
select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> \
zeroinitializer, [[TMP8]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* \
[[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* \
[[TMP10]], i32 [[TMP6]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = \
getelementptr i8, i8* [[Q:%.*]], i32 -1 +; ENABLED_MASKED_STRIDED-NEXT: \
[[TMP11:%.*]] = getelementptr i8, i8* [[TMP10]], i32 [[TMP6]] ; \
ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>* \
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> \
[[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, \
i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> ; \
ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> \
[[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> \
[[INTERLEAVED_MASK]])
Index: llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -49,13 +49,13 @@
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, \
!alias.scope !0 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], \
<i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100>
-; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x \
i32> poison), !alias.scope !3 ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> \
[[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x \
i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7 ; \
AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -136,40 +136,40 @@
; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: \
[[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 \
x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100>
-; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0
; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x \
i32> poison), !alias.scope !3
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8
; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x \
i32> poison), !alias.scope !3
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x \
i32> poison), !alias.scope !3
-; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 24
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 24
; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x \
i32> poison), !alias.scope !3 ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> \
[[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x \
i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw \
<8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] ; AVX2-NEXT: [[TMP35:%.*]] = \
add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; AVX2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0
; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x \
i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !5, \
!noalias !7
-; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 8
+; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 8
; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x \
i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !5, \
!noalias !7
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 16
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16
; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x \
i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !5, \
!noalias !7
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 24
+; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 24
; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x \
i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; \
AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -252,40 +252,40 @@
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX512-NEXT: [[TMP18:%.*]] = icmp \
slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100> ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 \
[[TMP0]]
-; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 \
[[TMP1]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 \
[[TMP2]]
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 \
[[TMP3]]
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0
; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], \
<16 x i32> poison), !alias.scope !3
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16
; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], \
<16 x i32> poison), !alias.scope !3
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 32
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 32
; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], \
<16 x i32> poison), !alias.scope !3
-; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 48
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 48
; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], \
<16 x i32> poison), !alias.scope !3 ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x \
i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw \
<16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = \
add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] ; AVX512-NEXT: \
[[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], \
[[WIDE_LOAD14]]
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 \
[[TMP0]]
-; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 \
[[TMP1]]
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 \
[[TMP2]]
-; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 \
[[TMP3]]
-; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]]
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]]
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]]
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]]
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0
; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], \
<16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !5, \
!noalias !7
-; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 16
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16
; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], \
<16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !5, \
!noalias !7
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 32
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 32
; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], \
<16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !5, \
!noalias !7
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 48
+; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 48
; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], \
<16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; \
AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 @@ -307,13 +307,13 @@
; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP52]], align \
4 ; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], <i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 \
[[TMP49]]
-; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP49]]
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP54]], i32 0
; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x \
i32> poison) ; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> \
[[WIDE_MASKED_LOAD22]], [[WIDE_LOAD21]]
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 \
[[TMP49]]
-; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[TMP58]], i32 0
+; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP49]]
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP58]], i32 0
; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 \
x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) ; AVX512-NEXT: [[INDEX_NEXT19]] = \
add nuw i64 [[INDEX18]], 8 @@ -408,13 +408,13 @@
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> \
addrspace(1)* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> \
addrspace(1)* [[TMP3]], align 4, !alias.scope !11 ; AVX1-NEXT: [[TMP4:%.*]] = \
icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100>
-; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], \
i64 [[TMP0]]
-; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP5]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32 addrspace(1)* \
[[B]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 \
addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* ; AVX1-NEXT: \
[[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> \
addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !14 \
; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> \
[[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
-; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], \
i64 [[TMP0]]
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP9]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32 addrspace(1)* \
[[A]], i64 [[TMP0]] +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP9]], i32 0 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 \
addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* ; AVX1-NEXT: call void \
@llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* \
[[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 ; AVX1-NEXT: \
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -495,40 +495,40 @@
; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: \
[[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 \
x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100>
-; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], \
i64 [[TMP0]]
-; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], \
i64 [[TMP1]]
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], \
i64 [[TMP2]]
-; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], \
i64 [[TMP3]]
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 0 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[B]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, \
i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr \
i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP23:%.*]] = \
getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] +; AVX2-NEXT: \
[[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX2-NEXT: \
[[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* ; \
AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> \
[[TMP16]], <8 x i32> poison), !alias.scope !14
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 8 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 \
addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: \
[[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> \
addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope \
!14
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 16 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 \
addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: \
[[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> \
addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope \
!14
-; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 24 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 \
addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: \
[[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> \
addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope \
!14 ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], \
[[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> \
[[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x \
i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] ; AVX2-NEXT: [[TMP35:%.*]] = add nsw \
<8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]]
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], \
i64 [[TMP0]]
-; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], \
i64 [[TMP1]]
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], \
i64 [[TMP2]]
-; AVX2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], \
i64 [[TMP3]]
-; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 0 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[A]], i64 [[TMP0]] +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, \
i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr \
i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] +; AVX2-NEXT: [[TMP39:%.*]] = \
getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] +; AVX2-NEXT: \
[[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX2-NEXT: \
[[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)* ; \
AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x \
i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias \
!18
-; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 8 +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP36]], i32 8 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 \
addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void \
@llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* \
[[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 16 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP36]], i32 16 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 \
addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void \
@llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* \
[[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 24 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP36]], i32 24 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 \
addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* ; AVX2-NEXT: call void \
@llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* \
[[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: \
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -611,40 +611,40 @@
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX512-NEXT: [[TMP18:%.*]] = icmp \
slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100> ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[B]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[B]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[B]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[B]], i64 [[TMP3]]
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 0 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[B]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr \
i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = \
getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] +; AVX512-NEXT: \
[[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] +; \
AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 \
; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> \
addrspace(1)* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x \
i1> [[TMP16]], <16 x i32> poison), !alias.scope !16
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 16 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 \
addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: \
[[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x \
i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison), \
!alias.scope !16
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 32 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 \
addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: \
[[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x \
i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison), \
!alias.scope !16
-; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP20]], i32 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 \
addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: \
[[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x \
i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison), \
!alias.scope !16 ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> \
[[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x \
i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add \
nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] ; AVX512-NEXT: \
[[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], \
[[WIDE_LOAD14]]
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[A]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[A]], i64 [[TMP1]]
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[A]], i64 [[TMP2]]
-; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[A]], i64 [[TMP3]]
-; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 0 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[A]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr \
i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP38:%.*]] = \
getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] +; AVX512-NEXT: \
[[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] +; \
AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 \
; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> \
addrspace(1)* ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x \
i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), \
!alias.scope !18, !noalias !20
-; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 16 +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 \
addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void \
@llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* \
[[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !18, !noalias \
!20
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 32 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 \
addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void \
@llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* \
[[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !18, !noalias \
!20
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP36]], i32 48 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 \
addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* ; AVX512-NEXT: call void \
@llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* \
[[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !18, !noalias !20 ; \
AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 @@ -666,13 +666,13 @@
; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32 addrspace(1)* [[TMP51]] to <8 x i32> \
addrspace(1)* ; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32> \
addrspace(1)* [[TMP52]], align 4 ; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x \
i32> [[WIDE_LOAD21]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100>
-; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[B]], i64 [[TMP49]]
-; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP54]], i32 0 +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[B]], i64 [[TMP49]] +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr \
i32, i32 addrspace(1)* [[TMP54]], i32 0 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast \
i32 addrspace(1)* [[TMP55]] to <8 x i32> addrspace(1)* ; AVX512-NEXT: \
[[WIDE_MASKED_LOAD22:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> \
addrspace(1)* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison) ; AVX512-NEXT: \
[[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD22]], \
[[WIDE_LOAD21]]
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[A]], i64 [[TMP49]]
-; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* \
[[TMP58]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 \
addrspace(1)* [[A]], i64 [[TMP49]] +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr \
i32, i32 addrspace(1)* [[TMP58]], i32 0 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast \
i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)* ; AVX512-NEXT: call void \
@llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP57]], <8 x i32> addrspace(1)* \
[[TMP60]], i32 4, <8 x i1> [[TMP53]]) ; AVX512-NEXT: [[INDEX_NEXT19]] = add nuw \
i64 [[INDEX18]], 8 @@ -776,14 +776,14 @@
; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, \
!alias.scope !21 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], \
<i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100>
-; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP0]]
-; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0
; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> \
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x \
float> poison), !alias.scope !24 ; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> \
[[WIDE_LOAD]] to <8 x float> ; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> \
[[WIDE_MASKED_LOAD]], [[TMP8]]
-; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP0]]
-; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 \
0 +; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0
; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>*
; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 \
x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 ; \
AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -865,20 +865,20 @@
; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: \
[[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 \
x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100>
-; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP0]]
-; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP3]]
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 \
0 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0
; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> \
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 \
x float> poison), !alias.scope !24
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 \
8 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 8
; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> \
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 \
x float> poison), !alias.scope !24
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 \
16 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 16
; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> \
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 \
x float> poison), !alias.scope !24
-; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 \
24 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 24
; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> \
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 \
x float> poison), !alias.scope !24 ; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> \
[[WIDE_LOAD]] to <8 x float> @@ -889,20 +889,20 @@
; AVX2-NEXT: [[TMP37:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]]
; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]]
; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]]
-; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP0]]
-; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP3]]
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 \
0 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]]
+; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]]
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0
; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 \
x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !26, \
!noalias !28
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 \
8 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 8
; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 \
x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !26, \
!noalias !28
-; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 \
16 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 16
; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 \
x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !26, \
!noalias !28
-; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 \
24 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 24
; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>*
; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 \
x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; \
AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -986,20 +986,20 @@
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX512-NEXT: [[TMP18:%.*]] = icmp \
slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100> ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP0]]
-; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP1]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP2]]
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP3]]
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], \
i32 0 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 \
[[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 \
[[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 \
[[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 \
[[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], \
i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> \
@llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> \
[[TMP16]], <16 x float> poison), !alias.scope !27
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], \
i32 16 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 \
16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> \
@llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> \
[[TMP17]], <16 x float> poison), !alias.scope !27
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], \
i32 32 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 \
32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> \
@llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> \
[[TMP18]], <16 x float> poison), !alias.scope !27
-; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], \
i32 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 \
48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> \
@llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> \
[[TMP19]], <16 x float> poison), !alias.scope !27 ; AVX512-NEXT: [[TMP32:%.*]] = \
sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> @@ -1010,20 +1010,20 @@
; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD15]], \
[[TMP33]] ; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> \
[[WIDE_MASKED_LOAD16]], [[TMP34]] ; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x \
float> [[WIDE_MASKED_LOAD17]], [[TMP35]]
-; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP0]]
-; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP1]]
-; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP2]]
-; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP3]]
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP40]], \
i32 0 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 \
[[TMP0]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 \
[[TMP1]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 \
[[TMP2]] +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 \
[[TMP3]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], \
i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> \
[[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !29, \
!noalias !31
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP40]], \
i32 16 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 \
16 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> \
[[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !29, \
!noalias !31
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP40]], \
i32 32 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 \
32 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> \
[[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !29, \
!noalias !31
-; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP40]], \
i32 48 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 \
48 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> \
[[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !29, \
!noalias !31 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
@@ -1045,14 +1045,14 @@
; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
; AVX512-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP56]], align \
4 ; AVX512-NEXT: [[TMP57:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD21]], <i32 100, \
i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 \
[[TMP53]]
-; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP58]], \
i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr float, float* [[B]], i64 \
[[TMP53]] +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr float, float* [[TMP58]], \
i32 0 ; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to <8 x float>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD22:%.*]] = call <8 x float> \
@llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP60]], i32 4, <8 x i1> [[TMP57]], <8 \
x float> poison) ; AVX512-NEXT: [[TMP61:%.*]] = sitofp <8 x i32> [[WIDE_LOAD21]] \
to <8 x float> ; AVX512-NEXT: [[TMP62:%.*]] = fadd <8 x float> \
[[WIDE_MASKED_LOAD22]], [[TMP61]]
-; AVX512-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[A]], i64 \
[[TMP53]]
-; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], \
i32 0 +; AVX512-NEXT: [[TMP63:%.*]] = getelementptr float, float* [[A]], i64 \
[[TMP53]] +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr float, float* [[TMP63]], \
i32 0 ; AVX512-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <8 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP62]], \
<8 x float>* [[TMP65]], i32 4, <8 x i1> [[TMP57]]) ; AVX512-NEXT: \
[[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 8 @@ -1176,20 +1176,20 @@
; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100> ; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> \
[[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100> ; AVX-NEXT: [[TMP19:%.*]] = \
icmp slt <4 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 \
100>
-; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP0]]
-; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP1]]
-; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP2]]
-; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP3]]
-; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 \
0 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 [[TMP0]]
+; AVX-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
+; AVX-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 [[TMP2]]
+; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]]
+; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0
; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> \
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], \
<4 x double> poison), !alias.scope !34
-; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 \
4 +; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4
; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> \
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], \
<4 x double> poison), !alias.scope !34
-; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 \
8 +; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8
; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> \
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], \
<4 x double> poison), !alias.scope !34
-; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 \
12 +; AVX-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12
; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> \
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], \
<4 x double> poison), !alias.scope !34 ; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x \
i32> [[WIDE_LOAD]] to <4 x double> @@ -1200,20 +1200,20 @@
; AVX-NEXT: [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]]
; AVX-NEXT: [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]]
; AVX-NEXT: [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
-; AVX-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP0]]
-; AVX-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP1]]
-; AVX-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP2]]
-; AVX-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP3]]
-; AVX-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 \
0 +; AVX-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 [[TMP0]]
+; AVX-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 [[TMP1]]
+; AVX-NEXT: [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 [[TMP2]]
+; AVX-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]]
+; AVX-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0
; AVX-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 \
x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !36, \
!noalias !38
-; AVX-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 \
4 +; AVX-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4
; AVX-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 \
x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !36, \
!noalias !38
-; AVX-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 \
8 +; AVX-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8
; AVX-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 \
x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !36, \
!noalias !38
-; AVX-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 \
12 +; AVX-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 12
; AVX-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x double>*
; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 \
x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; \
AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -1295,20 +1295,20 @@
; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX512-NEXT: \
[[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 \
100, i32 100, i32 100, i32 100, i32 100> ; AVX512-NEXT: [[TMP19:%.*]] = icmp slt \
<8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 \
100, i32 100>
-; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP0]]
-; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP1]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP2]]
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 \
[[TMP3]]
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], \
i32 0 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[B]], i64 \
[[TMP0]] +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[B]], i64 \
[[TMP1]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[B]], i64 \
[[TMP2]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 \
[[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], \
i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], \
<8 x double> poison), !alias.scope !38
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], \
i32 8 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 \
8 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], \
<8 x double> poison), !alias.scope !38
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], \
i32 16 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], \
i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], \
<8 x double> poison), !alias.scope !38
-; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], \
i32 24 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], \
i32 24 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], \
<8 x double> poison), !alias.scope !38 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 x \
i32> [[WIDE_LOAD]] to <8 x double> @@ -1319,20 +1319,20 @@
; AVX512-NEXT: [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD15]], \
[[TMP33]] ; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x double> \
[[WIDE_MASKED_LOAD16]], [[TMP34]] ; AVX512-NEXT: [[TMP39:%.*]] = fadd <8 x \
double> [[WIDE_MASKED_LOAD17]], [[TMP35]]
-; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP0]]
-; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP1]]
-; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP2]]
-; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP3]]
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP40]], \
i32 0 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[A]], i64 \
[[TMP0]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[A]], i64 \
[[TMP1]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr double, double* [[A]], i64 \
[[TMP2]] +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 \
[[TMP3]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], \
i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], \
<8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope \
!40, !noalias !42
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[TMP40]], \
i32 8 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 \
8 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], \
<8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope \
!40, !noalias !42
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP40]], \
i32 16 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], \
i32 16 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], \
<8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope \
!40, !noalias !42
-; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TMP40]], \
i32 24 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], \
i32 24 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], \
<8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !40, !noalias !42 \
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -1690,30 +1690,30 @@
; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE15]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE17]], zeroinitializer
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP0]]
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP3]]
-; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 0
-; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], \
i32 -3 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP3]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], \
i32 0 +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 \
-3 ; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> \
poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP30:%.*]] = \
bitcast double* [[TMP29]] to <4 x double>* ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] \
= call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, \
<4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !44 ; AVX2-NEXT: \
[[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> \
poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 -4
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], \
i32 -3 +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 \
-4 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3
; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> \
poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP33:%.*]] = \
bitcast double* [[TMP32]] to <4 x double>* ; AVX2-NEXT: \
[[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x \
double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope \
!44 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> \
[[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, \
i32 1, i32 0>
-; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 -8
-; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], \
i32 -3 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 \
-8 +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3
; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> \
poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP36:%.*]] = \
bitcast double* [[TMP35]] to <4 x double>* ; AVX2-NEXT: \
[[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x \
double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope \
!44 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> \
[[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, \
i32 1, i32 0>
-; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 -12
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], \
i32 -3 +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 \
-12 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3
; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> \
poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP39:%.*]] = \
bitcast double* [[TMP38]] to <4 x double>* ; AVX2-NEXT: \
[[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x \
double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope \
!44 @@ -1722,28 +1722,28 @@
; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> ; \
AVX2-NEXT: [[TMP42:%.*]] = fadd <4 x double> [[REVERSE25]], <double 5.000000e-01, \
double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> ; AVX2-NEXT: \
[[TMP43:%.*]] = fadd <4 x double> [[REVERSE28]], <double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01>
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP0]]
-; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP3]] +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT]], i64 \
[[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], i64 \
[[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], i64 \
[[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], i64 \
[[TMP3]] ; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP40]], \
<4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0
-; AVX2-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], \
i32 -3 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 \
0 +; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3
; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> \
[[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope \
!46, !noalias !48 ; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> \
[[TMP41]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 \
0>
-; AVX2-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 -4
-; AVX2-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], \
i32 -3 +; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 \
-4 +; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3
; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> \
[[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope \
!46, !noalias !48 ; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> \
[[TMP42]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 \
0>
-; AVX2-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 -8
-; AVX2-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], \
i32 -3 +; AVX2-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 \
-8 +; AVX2-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3
; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> \
[[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope \
!46, !noalias !48 ; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> \
[[TMP43]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 \
0>
-; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 -12
-; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], \
i32 -3 +; AVX2-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 \
-12 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> \
[[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope \
!46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
@@ -1833,30 +1833,30 @@
; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE15]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE17]], zeroinitializer
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP0]]
-; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP1]]
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP2]]
-; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 \
[[TMP3]]
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 0
-; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], \
i32 -7 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP2]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[IN]], i64 \
[[TMP3]] +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP24]], \
i32 0 +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 \
-7 ; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> \
poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; \
AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* ; \
AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> \
[[REVERSE18]], <8 x double> poison), !alias.scope !58 ; AVX512-NEXT: \
[[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> \
poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, \
i32 0>
-; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 -8
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], \
i32 -7 +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], \
i32 -8 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], \
i32 -7 ; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x \
i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; \
AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* ; \
AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> \
[[REVERSE20]], <8 x double> poison), !alias.scope !58 ; AVX512-NEXT: \
[[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> \
poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, \
i32 0>
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 -16
-; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], \
i32 -7 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], \
i32 -16 +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], \
i32 -7 ; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x \
i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; \
AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* ; \
AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> \
[[REVERSE23]], <8 x double> poison), !alias.scope !58 ; AVX512-NEXT: \
[[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> \
poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, \
i32 0>
-; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], \
i32 -24
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], \
i32 -7 +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], \
i32 -24 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], \
i32 -7 ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x \
i1> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ; \
AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* ; \
AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> \
@llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> \
[[REVERSE26]], <8 x double> poison), !alias.scope !58 @@ -1865,28 +1865,28 @@
; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> ; \
AVX512-NEXT: [[TMP42:%.*]] = fadd <8 x double> [[REVERSE25]], <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> ; \
AVX512-NEXT: [[TMP43:%.*]] = fadd <8 x double> [[REVERSE28]], <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01>
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP0]]
-; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP1]]
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP2]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP3]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* \
[[OUT]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, \
double* [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr \
double, double* [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP47:%.*]] = \
getelementptr double, double* [[OUT]], i64 [[TMP3]] ; AVX512-NEXT: \
[[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP40]], <8 x double> poison, <8 x \
i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0
-; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], \
i32 -7 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], \
i32 0 +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 \
-7 ; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> \
[[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope \
!60, !noalias !62 ; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> \
[[TMP41]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, \
i32 1, i32 0>
-; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 -8
-; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], \
i32 -7 +; AVX512-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], \
i32 -8 +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], \
i32 -7 ; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> \
[[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope \
!60, !noalias !62 ; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> \
[[TMP42]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, \
i32 1, i32 0>
-; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 -16
-; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], \
i32 -7 +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], \
i32 -16 +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], \
i32 -7 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> \
[[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope \
!60, !noalias !62 ; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> \
[[TMP43]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, \
i32 1, i32 0>
-; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 -24
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], \
i32 -7 +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], \
i32 -24 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], \
i32 -7 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> \
[[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope \
!60, !noalias !62 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
@@ -1996,34 +1996,34 @@
; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], \
i64 [[TMP0]]
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 \
[[TMP1]]
-; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 \
[[TMP2]]
-; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 \
[[TMP3]] +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], \
i64 [[TMP0]] +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], \
i64 [[TMP1]] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], \
i64 [[TMP2]] +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], \
i64 [[TMP3]] ; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> \
[[TMP22]], <i1 true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP31:%.*]] = xor \
<4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 \
0 ; AVX1-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> \
[[TMP28]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 4 +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 \
4 ; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> \
[[TMP29]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 8 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 \
8 ; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> \
[[TMP30]], <4 x double*> poison)
-; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 12 +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], \
i32 12 ; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> \
[[TMP31]], <4 x double*> poison) ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x \
double*> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX1-NEXT: [[TMP41:%.*]] = icmp \
eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX1-NEXT: \
[[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer ; \
AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], \
zeroinitializer
-; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], \
i64 [[TMP0]]
-; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP1]]
-; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP2]]
-; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP3]] +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], \
i64 [[TMP0]] +; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP1]] +; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP2]] +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP3]] ; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> \
[[TMP42]], <i1 true, i1 true, i1 true, i1 true> @@ -2032,16 +2032,16 @@
; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x \
i1> zeroinitializer ; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x \
i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP55:%.*]] = select <4 x \
i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0 +; AVX1-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 4 +; AVX1-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 8 +; AVX1-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 12 +; AVX1-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 \
12 ; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add \
nuw i64 [[INDEX]], 16 @@ -2120,34 +2120,34 @@
; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], \
i64 [[TMP0]]
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 \
[[TMP3]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double*, double** [[IN:%.*]], \
i64 [[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double*, double** [[IN]], \
i64 [[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double*, double** [[IN]], \
i64 [[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double*, double** [[IN]], \
i64 [[TMP3]] ; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> \
[[TMP22]], <i1 true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP31:%.*]] = xor \
<4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 0 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double*, double** [[TMP24]], i32 \
0 ; AVX2-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> \
[[TMP28]], <4 x double*> poison)
-; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 4 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double*, double** [[TMP24]], i32 \
4 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> \
[[TMP29]], <4 x double*> poison)
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 8 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double*, double** [[TMP24]], i32 \
8 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> \
[[TMP30]], <4 x double*> poison)
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], \
i32 12 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double*, double** [[TMP24]], \
i32 12 ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> \
@llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> \
[[TMP31]], <4 x double*> poison) ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x \
double*> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX2-NEXT: [[TMP41:%.*]] = icmp \
eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX2-NEXT: \
[[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer ; \
AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], \
zeroinitializer
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], \
i64 [[TMP0]]
-; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP3]] +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], \
i64 [[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP3]] ; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> \
[[TMP42]], <i1 true, i1 true, i1 true, i1 true> @@ -2156,16 +2156,16 @@
; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x \
i1> zeroinitializer ; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x \
i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP55:%.*]] = select <4 x \
i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0 +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 4 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 8 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 12 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 \
12 ; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add \
nuw i64 [[INDEX]], 16 @@ -2244,34 +2244,34 @@
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double*, double** \
[[IN:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double*, double** [[IN]], \
i64 [[TMP1]]
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double*, double** [[IN]], \
i64 [[TMP2]]
-; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double*, double** [[IN]], \
i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double*, double** \
[[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double*, \
double** [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr \
double*, double** [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP27:%.*]] = \
getelementptr double*, double** [[IN]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP28:%.*]] \
= xor <8 x i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 \
true, i1 true> ; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, \
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: \
[[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, \
i1 true, i1 true, i1 true> ; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], \
<i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 \
true>
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double*, double** \
[[TMP24]], i32 0 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double*, double** \
[[TMP24]], i32 0 ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double** [[TMP32]] to <8 \
x double*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> \
@llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP33]], i32 8, <8 x i1> \
[[TMP28]], <8 x double*> poison)
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** \
[[TMP24]], i32 8 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double*, double** \
[[TMP24]], i32 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 \
x double*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x double*> \
@llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> \
[[TMP29]], <8 x double*> poison)
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** \
[[TMP24]], i32 16 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double*, double** \
[[TMP24]], i32 16 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <8 \
x double*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x double*> \
@llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> \
[[TMP30]], <8 x double*> poison)
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** \
[[TMP24]], i32 24 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double*, double** \
[[TMP24]], i32 24 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <8 \
x double*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x double*> \
@llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> \
[[TMP31]], <8 x double*> poison) ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x \
double*> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP41:%.*]] = \
icmp eq <8 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX512-NEXT: \
[[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer ; \
AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD6]], \
zeroinitializer
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* \
[[OUT:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP1]]
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP2]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP3]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* \
[[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, \
double* [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr \
double, double* [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP47:%.*]] = \
getelementptr double, double* [[OUT]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP48:%.*]] \
= xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 \
true, i1 true> ; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, \
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: \
[[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, \
i1 true, i1 true, i1 true> @@ -2280,16 +2280,16 @@
; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 \
x i1> zeroinitializer ; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], \
<8 x i1> [[TMP50]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP55:%.*]] = \
select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> \
zeroinitializer
-; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0 +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 \
0 ; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 8 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 \
8 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
-; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 16 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], \
i32 16 ; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
-; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 24 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], \
i32 24 ; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = \
add nuw i64 [[INDEX]], 32 @@ -2413,34 +2413,34 @@
; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], \
i64 [[TMP0]]
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 \
[[TMP1]]
-; AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 \
[[TMP2]]
-; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 \
[[TMP3]] +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], \
i64 [[TMP0]] +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], \
i64 [[TMP1]] +; AVX1-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], \
i64 [[TMP2]] +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], \
i64 [[TMP3]] ; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> \
[[TMP22]], <i1 true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP31:%.*]] = xor \
<4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 0 +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 \
0 ; AVX1-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> \
[[TMP28]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 4 +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 \
4 ; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> \
[[TMP29]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 8 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 \
8 ; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> \
[[TMP30]], <4 x i32 ()*> poison)
-; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 12 +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], \
i32 12 ; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> \
[[TMP31]], <4 x i32 ()*> poison) ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 \
()*> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX1-NEXT: [[TMP41:%.*]] = icmp eq \
<4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX1-NEXT: [[TMP42:%.*]] = \
icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX1-NEXT: \
[[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], \
zeroinitializer
-; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], \
i64 [[TMP0]]
-; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP1]]
-; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP2]]
-; AVX1-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP3]] +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], \
i64 [[TMP0]] +; AVX1-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP1]] +; AVX1-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP2]] +; AVX1-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP3]] ; AVX1-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX1-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> \
[[TMP42]], <i1 true, i1 true, i1 true, i1 true> @@ -2449,16 +2449,16 @@
; AVX1-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x \
i1> zeroinitializer ; AVX1-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x \
i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP55:%.*]] = select <4 x \
i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0 +; AVX1-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX1-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 4 +; AVX1-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX1-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 8 +; AVX1-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX1-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 12 +; AVX1-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 \
12 ; AVX1-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add \
nuw i64 [[INDEX]], 16 @@ -2537,34 +2537,34 @@
; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer
; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer
; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x i8> [[TMP19]], zeroinitializer
-; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], \
i64 [[TMP0]]
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 \
[[TMP3]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** [[IN:%.*]], \
i64 [[TMP0]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], \
i64 [[TMP1]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], \
i64 [[TMP2]] +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr i32 ()*, i32 ()** [[IN]], \
i64 [[TMP3]] ; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP20]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP21]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> \
[[TMP22]], <i1 true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP31:%.*]] = xor \
<4 x i1> [[TMP23]], <i1 true, i1 true, i1 true, i1 true>
-; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 0 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 \
0 ; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> \
[[TMP28]], <4 x i32 ()*> poison)
-; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 4 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 \
4 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> \
[[TMP29]], <4 x i32 ()*> poison)
-; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 8 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], i32 \
8 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> \
[[TMP30]], <4 x i32 ()*> poison)
-; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], \
i32 12 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP24]], \
i32 12 ; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>*
; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> \
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> \
[[TMP31]], <4 x i32 ()*> poison) ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 \
()*> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX2-NEXT: [[TMP41:%.*]] = icmp eq \
<4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX2-NEXT: [[TMP42:%.*]] = \
icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX2-NEXT: \
[[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], \
zeroinitializer
-; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], \
i64 [[TMP0]]
-; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP1]]
-; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP2]]
-; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 \
[[TMP3]] +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[OUT:%.*]], \
i64 [[TMP0]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP1]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP2]] +; AVX2-NEXT: [[TMP47:%.*]] = getelementptr double, double* [[OUT]], \
i64 [[TMP3]] ; AVX2-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 \
true, i1 true, i1 true> ; AVX2-NEXT: [[TMP49:%.*]] = xor <4 x i1> [[TMP41]], <i1 \
true, i1 true, i1 true, i1 true> ; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> \
[[TMP42]], <i1 true, i1 true, i1 true, i1 true> @@ -2573,16 +2573,16 @@
; AVX2-NEXT: [[TMP53:%.*]] = select <4 x i1> [[TMP29]], <4 x i1> [[TMP49]], <4 x \
i1> zeroinitializer ; AVX2-NEXT: [[TMP54:%.*]] = select <4 x i1> [[TMP30]], <4 x \
i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP55:%.*]] = select <4 x \
i1> [[TMP31]], <4 x i1> [[TMP51]], <4 x i1> zeroinitializer
-; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0 +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 0
; AVX2-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP57]], i32 8, <4 x i1> [[TMP52]])
-; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 4 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 4
; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP59]], i32 8, <4 x i1> [[TMP53]])
-; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 8 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], i32 8
; AVX2-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP61]], i32 8, <4 x i1> [[TMP54]])
-; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 12 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], i32 \
12 ; AVX2-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <4 x double>*
; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x \
double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add \
nuw i64 [[INDEX]], 16 @@ -2661,34 +2661,34 @@
; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer
; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer
; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x i8> [[TMP19]], zeroinitializer
-; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32 ()*, i32 ()** \
[[IN:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], \
i64 [[TMP1]]
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], \
i64 [[TMP2]]
-; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], \
i64 [[TMP3]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32 ()*, i32 ()** \
[[IN:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32 ()*, \
i32 ()** [[IN]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32 \
()*, i32 ()** [[IN]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr \
i32 ()*, i32 ()** [[IN]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x \
i1> [[TMP20]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 \
true> ; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP21]], <i1 true, i1 true, \
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: \
[[TMP30:%.*]] = xor <8 x i1> [[TMP22]], <i1 true, i1 true, i1 true, i1 true, i1 true, \
i1 true, i1 true, i1 true> ; AVX512-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP23]], \
<i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 \
true>
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32 ()*, i32 ()** \
[[TMP24]], i32 0 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32 ()*, i32 ()** \
[[TMP24]], i32 0 ; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 ()** [[TMP32]] to <8 \
x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> \
@llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP33]], i32 8, <8 x i1> \
[[TMP28]], <8 x i32 ()*> poison)
-; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** \
[[TMP24]], i32 8 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32 ()*, i32 ()** \
[[TMP24]], i32 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 \
x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32 ()*> \
@llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> \
[[TMP29]], <8 x i32 ()*> poison)
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** \
[[TMP24]], i32 16 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32 ()*, i32 ()** \
[[TMP24]], i32 16 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <8 \
x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i32 ()*> \
@llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> \
[[TMP30]], <8 x i32 ()*> poison)
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** \
[[TMP24]], i32 24 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32 ()*, i32 ()** \
[[TMP24]], i32 24 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <8 \
x i32 ()*>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x i32 ()*> \
@llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> \
[[TMP31]], <8 x i32 ()*> poison) ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x i32 \
()*> [[WIDE_MASKED_LOAD]], zeroinitializer ; AVX512-NEXT: [[TMP41:%.*]] = icmp eq \
<8 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] \
= icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer ; AVX512-NEXT: \
[[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD6]], \
zeroinitializer
-; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* \
[[OUT:%.*]], i64 [[TMP0]]
-; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP1]]
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP2]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], \
i64 [[TMP3]] +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* \
[[OUT:%.*]], i64 [[TMP0]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr double, \
double* [[OUT]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr \
double, double* [[OUT]], i64 [[TMP2]] +; AVX512-NEXT: [[TMP47:%.*]] = \
getelementptr double, double* [[OUT]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP48:%.*]] \
= xor <8 x i1> [[TMP40]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 \
true, i1 true> ; AVX512-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP41]], <i1 true, \
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true> ; AVX512-NEXT: \
[[TMP50:%.*]] = xor <8 x i1> [[TMP42]], <i1 true, i1 true, i1 true, i1 true, i1 true, \
i1 true, i1 true, i1 true> @@ -2697,16 +2697,16 @@
; AVX512-NEXT: [[TMP53:%.*]] = select <8 x i1> [[TMP29]], <8 x i1> [[TMP49]], <8 \
x i1> zeroinitializer ; AVX512-NEXT: [[TMP54:%.*]] = select <8 x i1> [[TMP30]], \
<8 x i1> [[TMP50]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP55:%.*]] = \
select <8 x i1> [[TMP31]], <8 x i1> [[TMP51]], <8 x i1> \
zeroinitializer
-; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 0 +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP44]], i32 \
0 ; AVX512-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP57]], i32 8, <8 x i1> [[TMP52]])
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 8 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP44]], i32 \
8 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP59]], i32 8, <8 x i1> [[TMP53]])
-; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 16 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP44]], \
i32 16 ; AVX512-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP61]], i32 8, <8 x i1> [[TMP54]])
-; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP44]], \
i32 24 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr double, double* [[TMP44]], \
i32 24 ; AVX512-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <8 x double>*
; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double \
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x \
double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = \
add nuw i64 [[INDEX]], 32
Index: llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -61,20 +61,20 @@
; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], \
[[BROADCAST_SPLAT8]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp slt <4 x i64> \
[[STEP_ADD1]], [[BROADCAST_SPLAT10]] ; CHECK-NEXT: [[TMP19:%.*]] = icmp slt <4 x \
i64> [[STEP_ADD2]], [[BROADCAST_SPLAT12]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 4
; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8
; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 12
; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4
; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, \
i1 true> @@ -228,20 +228,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4
; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, \
i1 true> @@ -918,20 +918,20 @@
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 \
1 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], \
i32 2 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 \
[[TMP60]], i32 3
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP1]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP5]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP9]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP13]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP70]], i32 4, <4 x i1> [[TMP40]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8
+; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12
+; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 \
true, i1 true, i1 true> @@ -1091,20 +1091,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 \
true, i1 true, i1 true> @@ -1609,20 +1609,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 \
true, i1 true, i1 true> @@ -1776,20 +1776,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 \
true, i1 true, i1 true> @@ -1943,20 +1943,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 \
true, i1 true, i1 true> @@ -2119,20 +2119,20 @@
; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 \
1 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], \
i32 2 ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i1> [[TMP63]], i1 \
[[TMP60]], i32 3
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP1]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP5]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP9]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP13]]
-; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 0
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, i32* [[TMP65]], i32 0
; CHECK-NEXT: [[TMP70:%.*]] = bitcast i32* [[TMP69]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP70]], align 4
-; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4
+; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, i32* [[TMP65]], i32 4
; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP72]], align 4
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8
+; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, i32* [[TMP65]], i32 8
; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12
+; CHECK-NEXT: [[TMP75:%.*]] = getelementptr i32, i32* [[TMP65]], i32 12
; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP76]], align 4
; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], <i1 true, i1 true, i1 true, \
i1 true> @@ -2293,20 +2293,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 \
true, i1 true, i1 true> @@ -2461,20 +2461,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 \
true, i1 true, i1 true> @@ -2639,20 +2639,20 @@
; CHECK-NEXT: [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 \
1 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], \
i32 2 ; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 \
[[TMP59]], i32 3
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP0]]
-; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP4]]
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP8]]
-; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 \
[[TMP12]]
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
+; CHECK-NEXT: [[TMP64:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, i32* [[BASE]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, i32* [[TMP64]], i32 0
; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
+; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, i32* [[TMP64]], i32 4
; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
+; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, i32* [[TMP64]], i32 8
; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x \
i32> poison)
-; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
+; CHECK-NEXT: [[TMP74:%.*]] = getelementptr i32, i32* [[TMP64]], i32 12
; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> \
@llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x \
i32> poison) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 \
true, i1 true, i1 true>
Index: llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -261,7 +261,7 @@
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], \
[[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <16 x \
i32>* ; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP3]], \
align 4, !alias.scope !17, !noalias !20
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 \
[[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[C]], i64 \
[[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP5]], i32 8, <16 x i1> [[TMP2]], \
<16 x i32> poison), !alias.scope !23 ; CHECK-NEXT: call void \
@llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> \
[[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !24, !noalias !23 @@ \
-294,7 +294,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD28]], \
[[BROADCAST_SPLAT30]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <8 x \
i32>* ; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT32]], <8 x i32>* [[TMP10]], \
align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 \
[[INDEX25]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 \
[[INDEX25]] ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD33:%.*]] = call <8 x i32> \
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x \
i32> poison) ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> \
[[WIDE_MASKED_LOAD33]], <8 x i32*> [[BROADCAST_SPLAT35]], i32 4, <8 x \
i1> [[TMP9]])
Index: llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -30,14 +30,14 @@
; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
-; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 \
[[INDEX6]] +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], \
i64 [[INDEX6]] ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], \
<16 x i32> poison) ; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> \
[[WIDE_MASKED_LOAD]] to <16 x i64> ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr \
inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]] ; AVX512-NEXT: \
[[WIDE_MASKED_GATHER:%.*]] = call <16 x float> \
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> \
[[TMP2]], <16 x float> undef) ; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> \
[[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, \
float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01>
-; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], \
i64 [[INDEX6]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr float, float* \
[[OUT:%.*]], i64 [[INDEX6]] ; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] \
to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 \
x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]]) ; AVX512-NEXT: \
[[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16 @@ -45,14 +45,14 @@
; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], \
align 4 ; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], \
zeroinitializer
-; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 \
[[INDEX_NEXT]] +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[INDEX]], \
i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x \
i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> \
[[TMP12]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call \
<16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP14]], i32 4, <16 x i1> \
[[TMP12]], <16 x i32> poison) ; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> \
[[WIDE_MASKED_LOAD_1]] to <16 x i64> ; AVX512-NEXT: [[TMP16:%.*]] = getelementptr \
inbounds float, float* [[IN]], <16 x i64> [[TMP15]] ; AVX512-NEXT: \
[[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> \
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> \
[[TMP12]], <16 x float> undef) ; AVX512-NEXT: [[TMP17:%.*]] = fadd <16 x float> \
[[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 \
[[INDEX_NEXT]] +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* \
[[OUT]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* \
[[TMP18]] to <16 x float>* ; AVX512-NEXT: call void \
@llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], \
i32 4, <16 x i1> [[TMP12]]) ; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 \
[[INDEX6]], 32 @@ -60,14 +60,14 @@
; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], \
align 4 ; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], \
zeroinitializer
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 \
[[INDEX_NEXT_1]] +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* \
[[INDEX]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* \
[[TMP23]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> \
[[TMP22]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call \
<16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP24]], i32 4, <16 x i1> \
[[TMP22]], <16 x i32> poison) ; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> \
[[WIDE_MASKED_LOAD_2]] to <16 x i64> ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr \
inbounds float, float* [[IN]], <16 x i64> [[TMP25]] ; AVX512-NEXT: \
[[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> \
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> \
[[TMP22]], <16 x float> undef) ; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> \
[[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 \
[[INDEX_NEXT_1]] +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* \
[[OUT]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* \
[[TMP28]] to <16 x float>* ; AVX512-NEXT: call void \
@llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], \
i32 4, <16 x i1> [[TMP22]]) ; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 \
[[INDEX6]], 48 @@ -75,14 +75,14 @@
; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], \
align 4 ; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], \
zeroinitializer
-; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 \
[[INDEX_NEXT_2]] +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr i32, i32* \
[[INDEX]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* \
[[TMP33]] to <16 x i32>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> \
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> \
[[TMP32]], <16 x i32> poison) +; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call \
<16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP34]], i32 4, <16 x i1> \
[[TMP32]], <16 x i32> poison) ; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> \
[[WIDE_MASKED_LOAD_3]] to <16 x i64> ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr \
inbounds float, float* [[IN]], <16 x i64> [[TMP35]] ; AVX512-NEXT: \
[[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> \
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> \
[[TMP32]], <16 x float> undef) ; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> \
[[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float \
5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 \
[[INDEX_NEXT_2]] +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr float, float* \
[[OUT]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* \
[[TMP38]] to <16 x float>* ; AVX512-NEXT: call void \
@llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], \
i32 4, <16 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 \
[[INDEX6]], 64 @@ -112,18 +112,18 @@
; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD7]], zeroinitializer
; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer
; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer
-; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 \
[[INDEX6]] +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[INDEX:%.*]], \
i64 [[INDEX6]] ; FVW2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> \
@llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x \
i32> poison)
-; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2
+; FVW2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 2
; FVW2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <2 x i32> \
@llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP15]], i32 4, <2 x i1> \
[[TMP9]], <2 x i32> poison)
-; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
+; FVW2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <2 x i32> \
@llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x \
i32> poison) +; FVW2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 \
4 ; FVW2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> \
@llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP17]], i32 4, <2 x i1> \
[[TMP10]], <2 x i32> poison)
-; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 6
+; FVW2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> \
@llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x \
i32> poison) +; FVW2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 \
6 ; FVW2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; FVW2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <2 x i32> \
@llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP19]], i32 4, <2 x i1> \
[[TMP11]], <2 x i32> poison) +; FVW2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <2 x \
i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x i1> [[TMP11]], \
<2 x i32> poison) ; FVW2-NEXT: [[TMP20:%.*]] = sext <2 x i32> \
[[WIDE_MASKED_LOAD]] to <2 x i64> ; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> \
[[WIDE_MASKED_LOAD10]] to <2 x i64> ; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> \
[[WIDE_MASKED_LOAD11]] to <2 x i64> @@ -140,16 +140,16 @@
; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float \
5.000000e-01, float 5.000000e-01> ; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> \
[[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> ; FVW2-NEXT: \
[[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float \
5.000000e-01>
-; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], \
i64 [[INDEX6]] +; FVW2-NEXT: [[TMP32:%.*]] = getelementptr float, float* \
[[OUT:%.*]], i64 [[INDEX6]] ; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] \
to <2 x float>* ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x \
float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]])
-; FVW2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 \
2 +; FVW2-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP32]], i64 2
; FVW2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>*
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 \
x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]])
-; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 \
4 +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP32]], i64 4
; FVW2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>*
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 \
x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]])
-; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 \
6 +; FVW2-NEXT: [[TMP38:%.*]] = getelementptr float, float* [[TMP32]], i64 6
; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 \
x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]]) ; FVW2-NEXT: [[INDEX_NEXT]] = add \
nuw i64 [[INDEX6]], 8
Index: llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -26,10 +26,10 @@
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, \
{{.*}} ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], \
i64 [[TMP5]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 \
[[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, \
i1 true, i1 true>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> \
@llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x \
float> poison), !invariant.load !0 entry:
@@ -151,10 +151,10 @@
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 \
1> ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = sdiv exact i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], \
i64 [[TMP8]] +; CHECK-NEXT: [[TMP8:%.*]] = sdiv i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 \
[[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, \
i1 true, i1 true>
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 \
0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> \
@llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP12]], i32 4, <4 x i1> [[TMP10]], <4 \
x float> poison), !invariant.load !0 entry:
Index: llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -51,16 +51,16 @@
; CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_LOAD6]], <4 x \
double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: [[TMP8:%.*]] \
= fcmp une <4 x double> [[REVERSE]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = \
fcmp une <4 x double> [[REVERSE7]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[A]], i64 \
[[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[TMP10]], \
i64 -3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, double* [[A]], i64 \
[[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, double* [[TMP10]], \
i64 -3 ; CHECK-NEXT: [[REVERSE8:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> \
poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; CHECK-NEXT: [[TMP12:%.*]] = \
bitcast double* [[TMP11]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> \
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP12]], i32 8, <4 x i1> \
[[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[TMP10]], \
i64 -4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP13]], \
i64 -3 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> \
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP12]], i32 8, <4 x i1> \
[[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0 +; CHECK-NEXT: \
[[TMP13:%.*]] = getelementptr double, double* [[TMP10]], i64 -4 +; CHECK-NEXT: \
[[TMP14:%.*]] = getelementptr double, double* [[TMP13]], i64 -3 ; CHECK-NEXT: \
[[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> poison, <4 x i32> <i32 \
3, i32 2, i32 1, i32 0> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] \
to <4 x double>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> \
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP15]], i32 8, <4 x i1> \
[[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0 +; CHECK-NEXT: \
[[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x \
double>* [[TMP15]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope \
!3, !noalias !0 ; CHECK-NEXT: [[TMP16:%.*]] = fadd <4 x double> \
[[WIDE_MASKED_LOAD]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, \
double 1.000000e+00> ; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x double> \
[[WIDE_MASKED_LOAD11]], <double 1.000000e+00, double 1.000000e+00, double \
1.000000e+00, double 1.000000e+00> ; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* \
[[TMP11]] to <4 x double>*
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll
@@ -1,4 +1,4 @@
-; This is the loop in c++ being vectorize in this file with
+; This is the loop in c++ being vectorized in this file with
; experimental.vector.reverse
;#pragma clang loop vectorize_width(4, scalable)
@@ -18,7 +18,7 @@
define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
; CHECK-LABEL: vector.body:
; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> \
@llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
-; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> \
@llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* nonnull %{{.*}}, i32 8, \
<vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison) +; CHECK: \
%[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> \
@llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* %{{.*}}, i32 8, <vscale x \
4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison) ; CHECK-NEXT: %[[FADD:.*]] = \
fadd <vscale x 4 x double> %[[WIDEMSKLOAD]] ; CHECK: %[[REVERSE9:.*]] = call \
<vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}}) \
; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> \
%[[FADD]], <vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> \
%[[REVERSE9]]
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll
@@ -5,7 +5,7 @@
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt <vscale x 4 x float> %[[LOAD1]],
-; CHECK-NEXT: %[[GEPA:.*]] = getelementptr inbounds float, float* %a,
+; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, float* %a,
; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x \
float>* ; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> \
@llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* %[[MLOAD_PTRS]], i32 4, \
<vscale x 4 x i1> %[[MASK]] ; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x float> \
%[[LOAD1]], %[[LOAD2]] @@ -42,7 +42,7 @@
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
; CHECK-NEXT: %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD1]],
-; CHECK-NEXT: %[[GEPA:.*]] = getelementptr inbounds i32, i32* %a,
+; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, i32* %a,
; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> \
@llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[MLOAD_PTRS]], i32 4, \
<vscale x 4 x i1> %[[MASK]] ; CHECK-NEXT: %[[FADD:.*]] = add <vscale x 4 x i32> \
%[[LOAD1]], %[[LOAD2]]
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -511,9 +511,9 @@
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
- /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+ /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
/// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
+ void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
const VPIteration &Instance, bool IfPredicateInstr,
VPTransformState &State);
@@ -3022,8 +3022,49 @@
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
- VPUser &User,
+// Returns true if `Def` is part of the address computation of a recipe
+// generating a widened memory instruction. Returns false otherwise.
+static bool isAddressComputationOfWidenMemInstr(VPValue *Def) {
+ SmallVector<VPValue *, 16> Worklist;
+ SmallPtrSet<VPValue *, 16> Visited;
+ Worklist.push_back(Def);
+
+ // Traverse def-use chain from Def searching for a
+ // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
+ while (!Worklist.empty()) {
+ VPValue *CurDef = Worklist.back();
+ Worklist.pop_back();
+
+ if (!Visited.insert(CurDef).second)
+ continue;
+
+ // Address computation should only happen on integer or pointer types.
+ // TODO: Retrieve the type from the VPRecipes/VPInstructions when available.
+ Value *UnderlyingVal = CurDef->getUnderlyingValue();
+ if (!UnderlyingVal || (!UnderlyingVal->getType()->isIntOrIntVectorTy() &&
+ !UnderlyingVal->getType()->isPtrOrPtrVectorTy()))
+ return false;
+
+ for (VPUser *User : CurDef->users()) {
+ if (auto *RecBase = dyn_cast<VPRecipeBase>(User)) {
+ auto *WidenMemRec = dyn_cast<VPWidenMemoryInstructionRecipe>(RecBase);
+ auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(RecBase);
+ // Make sure the definition is not the value to store.
+ if ((WidenMemRec && WidenMemRec->getAddr() == CurDef) ||
+ (InterleaveRec && InterleaveRec->getAddr() == CurDef))
+ return true;
+
+ for (auto *SingleDef : RecBase->definedValues())
+ Worklist.push_back(SingleDef);
+ }
+ }
+ }
+
+ return false;
+}
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ VPReplicateRecipe *RepRecipe,
const VPIteration &Instance,
bool IfPredicateInstr,
VPTransformState &State) {
@@ -3044,17 +3085,29 @@
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
+ // If the scalarized instruction contributes to the address computation of a
+ // widen masked load/store, was in a basic block that needed predication
+ // and is not predicated after vectorization, we can't propagate
+ // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The control flow
+ // has been linearized and the instruction is no longer guarded by the
+ // predicate, which could lead to the generation of a poison value feeding the
+ // base address of the widen masked load/store.
+ if (!RepRecipe->isPredicated() && !State.Instance && State.VF.isVector() &&
+ Legal->blockNeedsPredication(Instr->getParent()) &&
+ isAddressComputationOfWidenMemInstr(RepRecipe))
+ Cloned->dropPoisonGeneratingFlags();
+
State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
Builder.GetInsertPoint());
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+ for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
auto InputInstance = Instance;
if (!Operand || !OrigLoop->contains(Operand) ||
(Cost->isUniformAfterVectorization(Operand, State.VF)))
InputInstance.Lane = VPLane::getFirstLane();
- auto *NewOp = State.get(User.getOperand(op), InputInstance);
+ auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -3062,7 +3115,7 @@
// Place the cloned scalar in the new loop.
Builder.Insert(Cloned);
- State.set(Def, Cloned, Instance);
+ State.set(RepRecipe, Cloned, Instance);
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<AssumeInst>(Cloned))
@@ -9798,8 +9851,8 @@
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
- *State.Instance, IsPredicated, State);
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
+ IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
// If we're constructing lane 0, initialize to start from poison.
@@ -9822,7 +9875,7 @@
"Can't scalarize a scalable vector");
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
VPIteration(Part, Lane), IsPredicated,
State);
}
[Attachment #4 (text/plain)]
_______________________________________________
llvm-commits mailing list
llvm-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic