List: llvm-commits
Subject: [PATCH] D140677: [AArch64][DAG] `canCombineShuffleToExtendVectorInreg()`: allow illegal types before
From: Roman Lebedev via Phabricator via llvm-commits <llvm-commits@lists.llvm.org>
Date: 2022-12-31 20:48:49
Message-ID: GzUtNaYgQqSUeQ28rMyG4Q@geopod-ismtpd-6-1
lebedev.ri updated this revision to Diff 485786.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D140677/new/
https://reviews.llvm.org/D140677
Files:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
llvm/test/CodeGen/AArch64/vselect-ext.ll
llvm/test/CodeGen/AArch64/zext-to-tbl.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
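
As a minimal sketch for readers (not part of this diff; the function name below is
made up for illustration), the shuffle shape that
canCombineShuffleToExtendVectorInreg() recognizes is a vector interleaved with
zeros, which is byte-for-byte a zero_extend_vector_inreg of the low elements;
per the title, this revision lets the combine fire even while the widened type
is still illegal:

; Result bytes are x0,0,x1,0,...,x7,0 - i.e. the low <8 x i8> of %x
; zero-extended in place to <8 x i16>, viewed as bytes (little-endian).
define <16 x i8> @shuffle_as_zext_inreg(<16 x i8> %x) {
  %r = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer,
         <16 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16,
                     i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16>
  ret <16 x i8> %r
}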
["D140677.485786.patch" (D140677.485786.patch)]
Index: llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
===================================================================
--- llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1700,21 +1700,23 @@
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
+; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec256_v32i8_to_v2i128_factor16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1725,8 +1727,8 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1737,8 +1739,8 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -1842,12 +1844,12 @@
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb (%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
@@ -2187,12 +2189,12 @@
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
-; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb (%rdx), %xmm2
-; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: paddb (%rdx), %xmm1
+; SSE42-NEXT: movdqa %xmm1, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
@@ -2709,13 +2711,16 @@
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2:
@@ -2724,7 +2729,7 @@
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2738,7 +2743,7 @@
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
@@ -2921,66 +2926,44 @@
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-SLOW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: vec384_v48i8_to_v12i32_factor4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -3160,65 +3143,43 @@
; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: vec384_v48i8_to_v6i64_factor8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v48i8_to_v6i64_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -3378,93 +3339,97 @@
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $24, %xmm0, %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 48(%rcx)
+; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: vec384_v48i8_to_v3i128_factor16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpsrld $24, %xmm0, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v48i8_to_v3i128_factor16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpsrld $24, %xmm0, %xmm0
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-SLOW-NEXT: vpsrld $24, %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_v48i8_to_v3i128_factor16:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-FAST-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,3,3,3,3,u,u,u,u]
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-FAST-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -3678,15 +3643,15 @@
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 16(%rsi), %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: pxor %xmm3, %xmm3
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb (%rdx), %xmm3
+; SSE42-NEXT: paddb (%rdx), %xmm2
; SSE42-NEXT: paddb 32(%rdx), %xmm1
; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm3, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
@@ -3699,13 +3664,16 @@
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, (%rcx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2:
@@ -3714,7 +3682,7 @@
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
@@ -3726,13 +3694,12 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -3740,10 +3707,7 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -3986,66 +3950,44 @@
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: vec384_v24i16_to_v6i64_factor4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -4053,9 +3995,7 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -4297,65 +4237,74 @@
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
@@ -4364,11 +4313,14 @@
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
@@ -4381,10 +4333,15 @@
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <4,u,u,u,5,u,u,u>
+; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
@@ -4393,35 +4350,19 @@
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpermt2w %ymm0, %ymm1, %ymm2
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_v24i16_to_v3i128_factor8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,40,17,18,19,20,21,22,23,41,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4659,15 +4600,15 @@
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 16(%rsi), %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm2
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT: pxor %xmm3, %xmm3
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb (%rdx), %xmm3
+; SSE42-NEXT: paddb (%rdx), %xmm2
; SSE42-NEXT: paddb 32(%rdx), %xmm1
; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm3, (%rcx)
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
@@ -4677,20 +4618,19 @@
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
@@ -4724,10 +4664,7 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -4956,54 +4893,60 @@
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = \
xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = \
ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
@@ -5011,11 +4954,13 @@
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = \
ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
@@ -5025,9 +4970,11 @@
;
; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u>
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
@@ -5038,35 +4985,19 @@
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: movb $17, %al
-; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-FAST-NEXT: movb $17, %al
-; AVX512BW-FAST-NEXT: kmovd %eax, %k1
-; AVX512BW-FAST-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = \
xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_v12i32_to_v3i128_factor4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5337,17 +5268,21 @@
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -5355,11 +5290,11 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
@@ -5371,14 +5306,16 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,9,1,11,2,13,u,u>
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: movb $85, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -5386,12 +5323,11 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: movb $5, %al
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movb $85, %al
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -5887,24 +5823,25 @@
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = \
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = \
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = \
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: paddb 16(%rdx), %xmm0
-; SSE2-NEXT: paddb (%rdx), %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = \
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = \
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: \
movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = \
xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = \
xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: paddb 48(%rdx), %xmm2
-; SSE2-NEXT: paddb 32(%rdx), %xmm3
-; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
-; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
-; SSE2-NEXT: movdqa %xmm4, (%rcx)
+; SSE2-NEXT: paddb 32(%rdx), %xmm4
+; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: paddb (%rdx), %xmm3
+; SSE2-NEXT: movdqa %xmm3, (%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: movdqa %xmm4, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8:
@@ -5913,20 +5850,19 @@
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pmovzxbq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero \
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = \
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psrlq $48, %xmm3
-; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = \
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero \
+; SSE42-NEXT: pmovzxbq {{.*#+}} xmm3 = \
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero \
; SSE42-NEXT: psrld $16, %xmm0 ; SSE42-NEXT: pmovzxbq {{.*#+}} xmm0 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero \
+; SSE42-NEXT: psrld $16, %xmm2 +; SSE42-NEXT: pmovzxbq {{.*#+}} xmm2 = \
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero \
+; SSE42-NEXT: paddb 48(%rdx), %xmm2 ; SSE42-NEXT: paddb 16(%rdx), %xmm0
-; SSE42-NEXT: paddb 48(%rdx), %xmm3
-; SSE42-NEXT: paddb 32(%rdx), %xmm2
+; SSE42-NEXT: paddb 32(%rdx), %xmm3
; SSE42-NEXT: paddb (%rdx), %xmm1
; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: movdqa %xmm2, 48(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v64i8_to_v8i64_factor8:
@@ -5936,9 +5872,9 @@
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
@@ -5997,159 +5933,127 @@
}
define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr \
%in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v64i8_to_v4i128_factor16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: paddb (%rsi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: pslldq {{.*#+}} xmm2 = \
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; SSE-NEXT: psrldq {{.*#+}} xmm2 = \
xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = \
xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = \
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; SSE-NEXT: psrldq {{.*#+}} xmm0 = \
xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: paddb 16(%rdx), %xmm0
-; SSE-NEXT: paddb 48(%rdx), %xmm3
-; SSE-NEXT: paddb 32(%rdx), %xmm2
-; SSE-NEXT: paddb (%rdx), %xmm1
-; SSE-NEXT: movdqa %xmm1, (%rcx)
-; SSE-NEXT: movdqa %xmm2, 32(%rcx)
-; SSE-NEXT: movdqa %xmm3, 48(%rcx)
-; SSE-NEXT: movdqa %xmm0, 16(%rcx)
-; SSE-NEXT: retq
+; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: paddb (%rsi), %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld $16, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-NEXT: psrldq {{.*#+}} xmm1 = \
xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero \
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = \
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; \
SSE2-NEXT: psrldq {{.*#+}} xmm0 = \
xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero \
+; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb 48(%rdx), %xmm1
+; SSE2-NEXT: paddb 32(%rdx), %xmm3
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
+; SSE2-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm1, 48(%rcx)
+; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: paddb (%rsi), %xmm0
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0]
+; SSE42-NEXT: pand %xmm0, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE42-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: paddb 48(%rdx), %xmm3
+; SSE42-NEXT: paddb (%rdx), %xmm2
+; SSE42-NEXT: paddb 32(%rdx), %xmm1
+; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v64i8_to_v4i128_factor16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpslldq {{.*#+}} xmm2 = \
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = \
xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpslldq {{.*#+}} xmm3 = \
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = \
xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpsrld $24, %xmm0, %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 48(%rcx)
+; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
+; AVX2-LABEL: vec512_v64i8_to_v4i128_factor16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT: vpsrld $24, %xmm0, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
-; AVX2-FAST-PERLANE-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = \
xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512F-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512F-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512F-SLOW-NEXT: vzeroupper
-; AVX512F-SLOW-NEXT: retq
-;
-; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX512F-FAST: # %bb.0:
-; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = \
xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512F-FAST-NEXT: vzeroupper
-; AVX512F-FAST-NEXT: retq
+; AVX512F-LABEL: vec512_v64i8_to_v4i128_factor16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpsrld $24, %xmm0, %xmm0
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512BW-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512BW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-SLOW-NEXT: vpsrld $24, %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT: vpbroadcastb %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-SLOW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -6159,13 +6063,13 @@
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = \
xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,3,3,3,3,u,u,u,u]
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-FAST-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
@@ -6234,13 +6138,14 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
-; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = \
ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -6248,10 +6153,9 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
-; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = \
ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -6372,16 +6276,16 @@
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 16(%rsi), %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pmovzxwd {{.*#+}} xmm3 = \
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE42-NEXT: pxor %xmm3, %xmm3
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE42-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: paddb (%rdx), %xmm4
; SSE42-NEXT: paddb 48(%rdx), %xmm1
-; SSE42-NEXT: paddb 32(%rdx), %xmm3
-; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT: paddb 32(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
; SSE42-NEXT: movdqa %xmm1, 48(%rcx)
; SSE42-NEXT: movdqa %xmm4, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
@@ -6525,8 +6429,8 @@
;
; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = \
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = \
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -6577,19 +6481,18 @@
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = \
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: paddb 16(%rdx), %xmm3
; SSE2-NEXT: paddb 48(%rdx), %xmm2
-; SSE2-NEXT: paddb 32(%rdx), %xmm0
-; SSE2-NEXT: paddb (%rdx), %xmm1
-; SSE2-NEXT: movdqa %xmm1, (%rcx)
-; SSE2-NEXT: movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT: paddb 32(%rdx), %xmm1
+; SSE2-NEXT: paddb (%rdx), %xmm0
+; SSE2-NEXT: movdqa %xmm0, (%rcx)
+; SSE2-NEXT: movdqa %xmm1, 32(%rcx)
; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
; SSE2-NEXT: movdqa %xmm3, 16(%rcx)
; SSE2-NEXT: retq
@@ -6599,21 +6502,21 @@
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE42-NEXT: pslldq {{.*#+}} xmm0 = \
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE42-NEXT: psrldq {{.*#+}} xmm0 = \
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
; SSE42-NEXT: psrldq {{.*#+}} xmm3 = \
xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT: paddb 16(%rdx), %xmm3
-; SSE42-NEXT: paddb 48(%rdx), %xmm2
-; SSE42-NEXT: paddb 32(%rdx), %xmm0
-; SSE42-NEXT: paddb (%rdx), %xmm1
-; SSE42-NEXT: movdqa %xmm1, (%rcx)
-; SSE42-NEXT: movdqa %xmm0, 32(%rcx)
-; SSE42-NEXT: movdqa %xmm2, 48(%rcx)
-; SSE42-NEXT: movdqa %xmm3, 16(%rcx)
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT: paddb 16(%rdx), %xmm0
+; SSE42-NEXT: paddb 48(%rdx), %xmm3
+; SSE42-NEXT: paddb 32(%rdx), %xmm1
+; SSE42-NEXT: paddb (%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm2, (%rcx)
+; SSE42-NEXT: movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT: movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
; SSE42-NEXT: retq
;
; AVX-LABEL: vec512_v32i16_to_v4i128_factor8:
@@ -6621,37 +6524,36 @@
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = \
xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm2 = \
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = \
ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
@@ -6659,17 +6561,17 @@
;
; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm2 = \
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = \
ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
@@ -6677,17 +6579,17 @@
;
; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = \
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = \
ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT: vzeroupper
@@ -6697,18 +6599,20 @@
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = \
ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX512F-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
@@ -6716,25 +6620,30 @@
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-FAST-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <4,u,u,u,5,u,u,u>
+; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX512F-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = \
ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = \
[32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = \
[32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,40,17,18,19,20,21,22,23,41,25,26,27,28,29,30,31]
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
@@ -6984,16 +6893,16 @@
; SSE42-NEXT: movdqa 16(%rdi), %xmm1
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 16(%rsi), %xmm1
-; SSE42-NEXT: pxor %xmm2, %xmm2
-; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: pxor %xmm3, %xmm3
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE42-NEXT: paddb 16(%rdx), %xmm0
; SSE42-NEXT: paddb (%rdx), %xmm4
; SSE42-NEXT: paddb 48(%rdx), %xmm1
-; SSE42-NEXT: paddb 32(%rdx), %xmm3
-; SSE42-NEXT: movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT: paddb 32(%rdx), %xmm2
+; SSE42-NEXT: movdqa %xmm2, 32(%rcx)
; SSE42-NEXT: movdqa %xmm1, 48(%rcx)
; SSE42-NEXT: movdqa %xmm4, (%rcx)
; SSE42-NEXT: movdqa %xmm0, 16(%rcx)
@@ -7005,26 +6914,19 @@
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = \
ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = \
ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
@@ -7081,22 +6983,23 @@
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: paddb (%rsi), %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrldq {{.*#+}} xmm2 = \
xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: xorps %xmm3, %xmm3
-; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrldq {{.*#+}} xmm4 = \
xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: paddb 16(%rdx), %xmm0
+; SSE2-NEXT: paddb 48(%rdx), %xmm3
; SSE2-NEXT: paddb 32(%rdx), %xmm4
-; SSE2-NEXT: paddb (%rdx), %xmm3
-; SSE2-NEXT: paddb 48(%rdx), %xmm2
-; SSE2-NEXT: movdqa %xmm2, 48(%rcx)
-; SSE2-NEXT: movdqa %xmm3, (%rcx)
+; SSE2-NEXT: paddb (%rdx), %xmm2
+; SSE2-NEXT: movdqa %xmm2, (%rcx)
; SSE2-NEXT: movdqa %xmm4, 32(%rcx)
+; SSE2-NEXT: movdqa %xmm3, 48(%rcx)
; SSE2-NEXT: movdqa %xmm0, 16(%rcx)
; SSE2-NEXT: retq
;
@@ -7126,40 +7029,40 @@
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, (%rcx)
; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = \
ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-SLOW-NEXT: vzeroupper
@@ -7167,17 +7070,17 @@
;
; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = \
ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
@@ -7187,15 +7090,15 @@
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u>
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u>
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-FAST-NEXT: vzeroupper
@@ -7203,31 +7106,31 @@
;
; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: movb $17, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z}
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -7519,8 +7422,8 @@
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
@@ -7541,13 +7444,13 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX2-NEXT: vzeroupper
@@ -7557,6 +7460,8 @@
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movb $85, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
@@ -7568,37 +7473,19 @@
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: movb $5, %al
-; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-FAST-NEXT: movb $5, %al
-; AVX512BW-FAST-NEXT: kmovd %eax, %k1
-; AVX512BW-FAST-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,5,3,7]
-; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm0, %ymm3
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec512_v8i64_to_v4i128_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movb $85, %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -7835,9 +7722,9 @@
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %xmm0, %xmm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: movb $51, %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
Index: llvm/test/CodeGen/AArch64/zext-to-tbl.ll
===================================================================
--- llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
-; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI --check-prefix=CHECK-BE %s
; CHECK-LABEL: lCPI0_0:
; CHECK-NEXT: .byte 0 ; 0x0
@@ -21,57 +21,6 @@
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT:lCPI0_1:
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 7 ; 0x7
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT:lCPI0_2:
-; CHECK-NEXT: .byte 8 ; 0x8
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 9 ; 0x9
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 10 ; 0xa
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 11 ; 0xb
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT:lCPI0_3:
-; CHECK-NEXT: .byte 12 ; 0xc
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 13 ; 0xd
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 14 ; 0xe
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 15 ; 0xf
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-BE: .LCPI0_0:
; CHECK-BE-NEXT: .byte 255 // 0xff
@@ -149,39 +98,28 @@
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh0:
; CHECK-NEXT: adrp x9, lCPI0_0@PAGE
-; CHECK-NEXT: Lloh1:
-; CHECK-NEXT: adrp x10, lCPI0_1@PAGE
-; CHECK-NEXT: Lloh2:
-; CHECK-NEXT: adrp x11, lCPI0_2@PAGE
-; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: adrp x12, lCPI0_3@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh4:
+; CHECK-NEXT: Lloh1:
; CHECK-NEXT: ldr q0, [x9, lCPI0_0@PAGEOFF]
-; CHECK-NEXT: Lloh5:
-; CHECK-NEXT: ldr q1, [x10, lCPI0_1@PAGEOFF]
-; CHECK-NEXT: Lloh6:
-; CHECK-NEXT: ldr q2, [x11, lCPI0_2@PAGEOFF]
-; CHECK-NEXT: Lloh7:
-; CHECK-NEXT: ldr q3, [x12, lCPI0_3@PAGEOFF]
; CHECK-NEXT: LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q4, [x0, x8]
+; CHECK-NEXT: ldr q1, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: tbl.16b v5, { v4 }, v3
-; CHECK-NEXT: tbl.16b v6, { v4 }, v2
-; CHECK-NEXT: tbl.16b v7, { v4 }, v1
+; CHECK-NEXT: dup.2d v2, v1[1]
+; CHECK-NEXT: dup.4s v3, v1[1]
+; CHECK-NEXT: tbl.16b v1, { v1 }, v0
+; CHECK-NEXT: dup.4s v4, v2[1]
+; CHECK-NEXT: tbl.16b v3, { v3 }, v0
+; CHECK-NEXT: tbl.16b v2, { v2 }, v0
; CHECK-NEXT: tbl.16b v4, { v4 }, v0
-; CHECK-NEXT: stp q6, q5, [x1, #32]
-; CHECK-NEXT: stp q4, q7, [x1], #64
+; CHECK-NEXT: stp q1, q3, [x1]
+; CHECK-NEXT: stp q2, q4, [x1, #32]
+; CHECK-NEXT: add x1, x1, #64
; CHECK-NEXT: b.ne LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7
-; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6
-; CHECK-NEXT: .loh AdrpLdr Lloh1, Lloh5
-; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh4
+; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
;
; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -566,23 +504,6 @@
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI6_1:
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 7 ; 0x7
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-BE: .LCPI6_0:
; CHECK-BE-NEXT: .byte 255 // 0xff
@@ -622,28 +543,24 @@
define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh8:
+; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x9, lCPI6_0@PAGE
-; CHECK-NEXT: Lloh9:
-; CHECK-NEXT: adrp x10, lCPI6_1@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh10:
+; CHECK-NEXT: Lloh3:
; CHECK-NEXT: ldr q0, [x9, lCPI6_0@PAGEOFF]
-; CHECK-NEXT: Lloh11:
-; CHECK-NEXT: ldr q1, [x10, lCPI6_1@PAGEOFF]
; CHECK-NEXT: LBB6_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr d2, [x0, x8]
+; CHECK-NEXT: ldr d1, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: tbl.16b v3, { v2 }, v1
+; CHECK-NEXT: dup.2s v2, v1[1]
+; CHECK-NEXT: tbl.16b v1, { v1 }, v0
; CHECK-NEXT: tbl.16b v2, { v2 }, v0
-; CHECK-NEXT: stp q2, q3, [x1], #64
+; CHECK-NEXT: stp q1, q2, [x1], #64
; CHECK-NEXT: b.ne LBB6_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh11
-; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh10
+; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
;
; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -1032,10 +949,10 @@
define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: Lloh4:
; CHECK-NEXT: adrp x9, lCPI11_0@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh13:
+; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr q0, [x9, lCPI11_0@PAGEOFF]
; CHECK-NEXT: LBB11_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1047,7 +964,7 @@
; CHECK-NEXT: b.ne LBB11_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh13
+; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
;
; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -1104,40 +1021,6 @@
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI12_1:
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 7 ; 0x7
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI12_2:
-; CHECK-NEXT: .byte 8 ; 0x8
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 9 ; 0x9
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 10 ; 0xa
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 11 ; 0xb
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-BE-LABEL: .LCPI12_0:
; CHECK-BE-NEXT: .byte 255 // 0xff
@@ -1194,35 +1077,27 @@
define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v12i8_to_v12i32_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: Lloh6:
; CHECK-NEXT: adrp x9, lCPI12_0@PAGE
-; CHECK-NEXT: Lloh15:
-; CHECK-NEXT: adrp x10, lCPI12_1@PAGE
-; CHECK-NEXT: Lloh16:
-; CHECK-NEXT: adrp x11, lCPI12_2@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh17:
+; CHECK-NEXT: Lloh7:
; CHECK-NEXT: ldr q0, [x9, lCPI12_0@PAGEOFF]
-; CHECK-NEXT: Lloh18:
-; CHECK-NEXT: ldr q1, [x10, lCPI12_1@PAGEOFF]
-; CHECK-NEXT: Lloh19:
-; CHECK-NEXT: ldr q2, [x11, lCPI12_2@PAGEOFF]
; CHECK-NEXT: LBB12_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q3, [x0, x8]
+; CHECK-NEXT: ldr q1, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: tbl.16b v4, { v3 }, v2
-; CHECK-NEXT: tbl.16b v5, { v3 }, v1
+; CHECK-NEXT: dup.2d v2, v1[1]
+; CHECK-NEXT: dup.4s v3, v1[1]
+; CHECK-NEXT: tbl.16b v1, { v1 }, v0
+; CHECK-NEXT: tbl.16b v2, { v2 }, v0
; CHECK-NEXT: tbl.16b v3, { v3 }, v0
-; CHECK-NEXT: stp q5, q4, [x1, #16]
-; CHECK-NEXT: str q3, [x1], #64
+; CHECK-NEXT: stp q3, q2, [x1, #16]
+; CHECK-NEXT: str q1, [x1], #64
; CHECK-NEXT: b.ne LBB12_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh19
-; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh18
-; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh17
+; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
;
; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -2192,22 +2067,22 @@
define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v20i8_to_v20i24_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh20:
+; CHECK-NEXT: Lloh8:
; CHECK-NEXT: adrp x9, lCPI20_0@PAGE
-; CHECK-NEXT: Lloh21:
+; CHECK-NEXT: Lloh9:
; CHECK-NEXT: adrp x10, lCPI20_1@PAGE
-; CHECK-NEXT: Lloh22:
+; CHECK-NEXT: Lloh10:
; CHECK-NEXT: adrp x11, lCPI20_2@PAGE
-; CHECK-NEXT: Lloh23:
+; CHECK-NEXT: Lloh11:
; CHECK-NEXT: adrp x12, lCPI20_3@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh24:
+; CHECK-NEXT: Lloh12:
; CHECK-NEXT: ldr q0, [x9, lCPI20_0@PAGEOFF]
-; CHECK-NEXT: Lloh25:
+; CHECK-NEXT: Lloh13:
; CHECK-NEXT: ldr q1, [x10, lCPI20_1@PAGEOFF]
-; CHECK-NEXT: Lloh26:
+; CHECK-NEXT: Lloh14:
; CHECK-NEXT: ldr q2, [x11, lCPI20_2@PAGEOFF]
-; CHECK-NEXT: Lloh27:
+; CHECK-NEXT: Lloh15:
; CHECK-NEXT: ldr q3, [x12, lCPI20_3@PAGEOFF]
; CHECK-NEXT: LBB20_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2228,10 +2103,10 @@
; CHECK-NEXT: b.ne LBB20_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh23, Lloh27
-; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh26
-; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25
-; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24
+; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15
+; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14
+; CHECK-NEXT: .loh AdrpLdr Lloh9, Lloh13
+; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh12
;
; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -2519,30 +2394,30 @@
define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v23i8_to_v23i48_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh28:
+; CHECK-NEXT: Lloh16:
; CHECK-NEXT: adrp x9, lCPI21_0@PAGE
-; CHECK-NEXT: Lloh29:
+; CHECK-NEXT: Lloh17:
; CHECK-NEXT: adrp x10, lCPI21_1@PAGE
-; CHECK-NEXT: Lloh30:
+; CHECK-NEXT: Lloh18:
; CHECK-NEXT: adrp x11, lCPI21_2@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh31:
+; CHECK-NEXT: Lloh19:
; CHECK-NEXT: ldr q0, [x9, lCPI21_0@PAGEOFF]
-; CHECK-NEXT: Lloh32:
+; CHECK-NEXT: Lloh20:
; CHECK-NEXT: adrp x9, lCPI21_3@PAGE
-; CHECK-NEXT: Lloh33:
+; CHECK-NEXT: Lloh21:
; CHECK-NEXT: ldr q1, [x10, lCPI21_1@PAGEOFF]
-; CHECK-NEXT: Lloh34:
+; CHECK-NEXT: Lloh22:
; CHECK-NEXT: adrp x10, lCPI21_4@PAGE
-; CHECK-NEXT: Lloh35:
+; CHECK-NEXT: Lloh23:
; CHECK-NEXT: ldr q2, [x11, lCPI21_2@PAGEOFF]
-; CHECK-NEXT: Lloh36:
+; CHECK-NEXT: Lloh24:
; CHECK-NEXT: adrp x11, lCPI21_5@PAGE
-; CHECK-NEXT: Lloh37:
+; CHECK-NEXT: Lloh25:
; CHECK-NEXT: ldr q3, [x9, lCPI21_3@PAGEOFF]
-; CHECK-NEXT: Lloh38:
+; CHECK-NEXT: Lloh26:
; CHECK-NEXT: ldr q4, [x10, lCPI21_4@PAGEOFF]
-; CHECK-NEXT: Lloh39:
+; CHECK-NEXT: Lloh27:
; CHECK-NEXT: ldr q5, [x11, lCPI21_5@PAGEOFF]
; CHECK-NEXT: LBB21_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2570,15 +2445,15 @@
; CHECK-NEXT: b.ne LBB21_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh36, Lloh39
-; CHECK-NEXT: .loh AdrpLdr Lloh34, Lloh38
-; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh37
-; CHECK-NEXT: .loh AdrpAdrp Lloh30, Lloh36
-; CHECK-NEXT: .loh AdrpLdr Lloh30, Lloh35
-; CHECK-NEXT: .loh AdrpAdrp Lloh29, Lloh34
-; CHECK-NEXT: .loh AdrpLdr Lloh29, Lloh33
-; CHECK-NEXT: .loh AdrpAdrp Lloh28, Lloh32
-; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh31
+; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh27
+; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh26
+; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh25
+; CHECK-NEXT: .loh AdrpAdrp Lloh18, Lloh24
+; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh23
+; CHECK-NEXT: .loh AdrpAdrp Lloh17, Lloh22
+; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21
+; CHECK-NEXT: .loh AdrpAdrp Lloh16, Lloh20
+; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh19
;
; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop:
; CHECK-BE: // %bb.0: // %entry
Index: llvm/test/CodeGen/AArch64/vselect-ext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -575,51 +575,39 @@
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: adrp x9, lCPI24_0@PAGE
-; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: adrp x10, lCPI24_1@PAGE
-; CHECK-NEXT: Lloh4:
-; CHECK-NEXT: adrp x11, lCPI24_2@PAGE
-; CHECK-NEXT: Lloh5:
-; CHECK-NEXT: adrp x12, lCPI24_3@PAGE
-; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh6:
-; CHECK-NEXT: ldr q0, [x9, lCPI24_0@PAGEOFF]
-; CHECK-NEXT: Lloh7:
-; CHECK-NEXT: ldr q1, [x10, lCPI24_1@PAGEOFF]
-; CHECK-NEXT: Lloh8:
-; CHECK-NEXT: ldr q3, [x11, lCPI24_2@PAGEOFF]
-; CHECK-NEXT: Lloh9:
-; CHECK-NEXT: ldr q4, [x12, lCPI24_3@PAGEOFF]
+; CHECK-NEXT: movi.2d v0, #0xffffffffffffffff
+; CHECK-NEXT: Lloh3:
+; CHECK-NEXT: ldr q1, [x9, lCPI24_0@PAGEOFF]
; CHECK-NEXT: LBB24_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q5, [x0, x8]
+; CHECK-NEXT: ldr q2, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
-; CHECK-NEXT: cmgt.16b v6, v5, v2
-; CHECK-NEXT: tbl.16b v7, { v5 }, v0
-; CHECK-NEXT: tbl.16b v16, { v5 }, v1
-; CHECK-NEXT: sshll2.8h v18, v6, #0
-; CHECK-NEXT: tbl.16b v17, { v5 }, v3
-; CHECK-NEXT: sshll2.4s v19, v18, #0
-; CHECK-NEXT: sshll.4s v18, v18, #0
-; CHECK-NEXT: tbl.16b v5, { v5 }, v4
-; CHECK-NEXT: sshll.8h v6, v6, #0
-; CHECK-NEXT: and.16b v7, v7, v19
-; CHECK-NEXT: and.16b v16, v16, v18
-; CHECK-NEXT: stp q16, q7, [x1, #32]
-; CHECK-NEXT: sshll2.4s v7, v6, #0
+; CHECK-NEXT: cmgt.16b v3, v2, v0
+; CHECK-NEXT: dup.2d v4, v2[1]
+; CHECK-NEXT: tbl.16b v5, { v2 }, v1
+; CHECK-NEXT: sshll2.8h v6, v3, #0
+; CHECK-NEXT: dup.4s v2, v2[1]
+; CHECK-NEXT: dup.4s v7, v4[1]
+; CHECK-NEXT: tbl.16b v4, { v4 }, v1
+; CHECK-NEXT: sshll.8h v3, v3, #0
+; CHECK-NEXT: tbl.16b v7, { v7 }, v1
+; CHECK-NEXT: tbl.16b v2, { v2 }, v1
+; CHECK-NEXT: sshll2.4s v16, v6, #0
; CHECK-NEXT: sshll.4s v6, v6, #0
-; CHECK-NEXT: and.16b v7, v17, v7
-; CHECK-NEXT: and.16b v5, v5, v6
-; CHECK-NEXT: stp q5, q7, [x1], #64
+; CHECK-NEXT: and.16b v7, v7, v16
+; CHECK-NEXT: and.16b v4, v4, v6
+; CHECK-NEXT: sshll2.4s v6, v3, #0
+; CHECK-NEXT: stp q4, q7, [x1, #32]
+; CHECK-NEXT: sshll.4s v3, v3, #0
+; CHECK-NEXT: and.16b v2, v2, v6
+; CHECK-NEXT: and.16b v3, v5, v3
+; CHECK-NEXT: stp q3, q2, [x1], #64
; CHECK-NEXT: b.ne LBB24_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9
-; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8
-; CHECK-NEXT: .loh AdrpLdr Lloh3, Lloh7
-; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh6
+; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
entry:
br label %loop
@@ -643,23 +631,23 @@
define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh10:
+; CHECK-NEXT: Lloh4:
; CHECK-NEXT: adrp x9, lCPI25_0@PAGE
-; CHECK-NEXT: Lloh11:
+; CHECK-NEXT: Lloh5:
; CHECK-NEXT: adrp x10, lCPI25_1@PAGE
-; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: Lloh6:
; CHECK-NEXT: adrp x11, lCPI25_2@PAGE
-; CHECK-NEXT: Lloh13:
+; CHECK-NEXT: Lloh7:
; CHECK-NEXT: adrp x12, lCPI25_3@PAGE
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: Lloh8:
; CHECK-NEXT: ldr q0, [x9, lCPI25_0@PAGEOFF]
-; CHECK-NEXT: Lloh15:
+; CHECK-NEXT: Lloh9:
; CHECK-NEXT: ldr q1, [x10, lCPI25_1@PAGEOFF]
-; CHECK-NEXT: Lloh16:
+; CHECK-NEXT: Lloh10:
; CHECK-NEXT: ldr q3, [x11, lCPI25_2@PAGEOFF]
-; CHECK-NEXT: Lloh17:
+; CHECK-NEXT: Lloh11:
; CHECK-NEXT: ldr q4, [x12, lCPI25_3@PAGEOFF]
; CHECK-NEXT: LBB25_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -686,10 +674,10 @@
; CHECK-NEXT: b.ne LBB25_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17
-; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16
-; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh15
-; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh14
+; CHECK-NEXT: .loh AdrpLdr Lloh7, Lloh11
+; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh10
+; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh9
+; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh8
entry:
br label %loop
@@ -714,23 +702,23 @@
define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) {
; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh18:
+; CHECK-NEXT: Lloh12:
; CHECK-NEXT: adrp x9, lCPI26_0@PAGE
-; CHECK-NEXT: Lloh19:
+; CHECK-NEXT: Lloh13:
; CHECK-NEXT: adrp x10, lCPI26_1@PAGE
-; CHECK-NEXT: Lloh20:
+; CHECK-NEXT: Lloh14:
; CHECK-NEXT: adrp x11, lCPI26_2@PAGE
-; CHECK-NEXT: Lloh21:
+; CHECK-NEXT: Lloh15:
; CHECK-NEXT: adrp x12, lCPI26_3@PAGE
; CHECK-NEXT: movi.2d v2, #0xffffffffffffffff
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh22:
+; CHECK-NEXT: Lloh16:
; CHECK-NEXT: ldr q0, [x9, lCPI26_0@PAGEOFF]
-; CHECK-NEXT: Lloh23:
+; CHECK-NEXT: Lloh17:
; CHECK-NEXT: ldr q1, [x10, lCPI26_1@PAGEOFF]
-; CHECK-NEXT: Lloh24:
+; CHECK-NEXT: Lloh18:
; CHECK-NEXT: ldr q3, [x11, lCPI26_2@PAGEOFF]
-; CHECK-NEXT: Lloh25:
+; CHECK-NEXT: Lloh19:
; CHECK-NEXT: ldr q4, [x12, lCPI26_3@PAGEOFF]
; CHECK-NEXT: LBB26_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -757,10 +745,10 @@
; CHECK-NEXT: b.ne LBB26_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25
-; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24
-; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23
-; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22
+; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh19
+; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh18
+; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh17
+; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh16
entry:
br label %loop
Index: llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -o - %s | FileCheck %s
+; RUN: llc -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios"
@@ -438,69 +438,48 @@
ret void
}
-; CHECK-LABEL: lCPI8_0:
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 7 ; 0x7
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI8_1:
-; CHECK-NEXT: .byte 0 ; 0x0
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 1 ; 0x1
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 2 ; 0x2
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 3 ; 0x3
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-LABEL: lCPI8_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 1 ; 0x1
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 2 ; 0x2
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 3 ; 0x3
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
; CHECK-LABEL: uitofp_v8i8_to_v8f32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh10:
; CHECK-NEXT: adrp x9, lCPI8_0@PAGE
-; CHECK-NEXT: Lloh11:
-; CHECK-NEXT: adrp x10, lCPI8_1@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: Lloh11:
; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF]
-; CHECK-NEXT: Lloh13:
-; CHECK-NEXT: ldr q1, [x10, lCPI8_1@PAGEOFF]
; CHECK-NEXT: LBB8_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr d2, [x0, x8, lsl #3]
+; CHECK-NEXT: ldr d1, [x0, x8, lsl #3]
; CHECK-NEXT: add x9, x1, x8, lsl #5
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: tbl.16b v3, { v2 }, v0
-; CHECK-NEXT: tbl.16b v2, { v2 }, v1
-; CHECK-NEXT: ucvtf.4s v3, v3
+; CHECK-NEXT: dup.2s v2, v1[1]
+; CHECK-NEXT: tbl.16b v1, { v1 }, v0
+; CHECK-NEXT: tbl.16b v2, { v2 }, v0
+; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: ucvtf.4s v2, v2
-; CHECK-NEXT: stp q2, q3, [x9]
+; CHECK-NEXT: stp q1, q2, [x9]
; CHECK-NEXT: b.eq LBB8_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh13
-; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh12
+; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh11
entry:
br label %loop
@@ -519,118 +498,55 @@
ret void
}
-; CHECK-LABEL: lCPI9_0:
-; CHECK-NEXT: .byte 12 ; 0xc
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 13 ; 0xd
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 14 ; 0xe
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 15 ; 0xf
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI9_1:
-; CHECK-NEXT: .byte 8 ; 0x8
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 9 ; 0x9
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 10 ; 0xa
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 11 ; 0xb
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI9_2:
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 7 ; 0x7
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI9_3:
-; CHECK-NEXT: .byte 0 ; 0x0
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 1 ; 0x1
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 2 ; 0x2
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 3 ; 0x3
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-LABEL: lCPI9_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 1 ; 0x1
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 2 ; 0x2
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 3 ; 0x3
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
; CHECK-LABEL: uitofp_v16i8_to_v16f32:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: Lloh12:
; CHECK-NEXT: adrp x9, lCPI9_0@PAGE
-; CHECK-NEXT: Lloh15:
-; CHECK-NEXT: adrp x10, lCPI9_1@PAGE
-; CHECK-NEXT: Lloh16:
-; CHECK-NEXT: adrp x11, lCPI9_2@PAGE
-; CHECK-NEXT: Lloh17:
-; CHECK-NEXT: adrp x12, lCPI9_3@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh18:
+; CHECK-NEXT: Lloh13:
; CHECK-NEXT: ldr q0, [x9, lCPI9_0@PAGEOFF]
-; CHECK-NEXT: Lloh19:
-; CHECK-NEXT: ldr q1, [x10, lCPI9_1@PAGEOFF]
-; CHECK-NEXT: Lloh20:
-; CHECK-NEXT: ldr q2, [x11, lCPI9_2@PAGEOFF]
-; CHECK-NEXT: Lloh21:
-; CHECK-NEXT: ldr q3, [x12, lCPI9_3@PAGEOFF]
; CHECK-NEXT: LBB9_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q4, [x0, x8, lsl #4]
+; CHECK-NEXT: ldr q1, [x0, x8, lsl #4]
; CHECK-NEXT: add x9, x1, x8, lsl #6
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: tbl.16b v5, { v4 }, v0
-; CHECK-NEXT: tbl.16b v6, { v4 }, v1
-; CHECK-NEXT: tbl.16b v7, { v4 }, v2
-; CHECK-NEXT: tbl.16b v4, { v4 }, v3
-; CHECK-NEXT: ucvtf.4s v5, v5
-; CHECK-NEXT: ucvtf.4s v6, v6
-; CHECK-NEXT: ucvtf.4s v7, v7
+; CHECK-NEXT: tbl.16b v2, { v1 }, v0
+; CHECK-NEXT: dup.4s v3, v1[1]
+; CHECK-NEXT: dup.2d v1, v1[1]
+; CHECK-NEXT: tbl.16b v3, { v3 }, v0
+; CHECK-NEXT: dup.4s v4, v1[1]
+; CHECK-NEXT: tbl.16b v1, { v1 }, v0
+; CHECK-NEXT: ucvtf.4s v2, v2
+; CHECK-NEXT: tbl.16b v4, { v4 }, v0
+; CHECK-NEXT: ucvtf.4s v3, v3
+; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: ucvtf.4s v4, v4
-; CHECK-NEXT: stp q6, q5, [x9, #32]
-; CHECK-NEXT: stp q4, q7, [x9]
+; CHECK-NEXT: stp q2, q3, [x9]
+; CHECK-NEXT: stp q1, q4, [x9, #32]
; CHECK-NEXT: b.eq LBB9_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21
-; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh20
-; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh19
-; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh18
+; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh13
entry:
br label %loop
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22947,8 +22947,8 @@
static SDValue combineShuffleToZeroExtendVectorInReg(ShuffleVectorSDNode *SVN,
SelectionDAG &DAG,
const TargetLowering &TLI,
+ bool LegalTypes,
bool LegalOperations) {
- bool LegalTypes = true;
EVT VT = SVN->getValueType(0);
assert(!VT.isScalableVector() && "Encountered scalable shuffle?");
unsigned NumElts = VT.getVectorNumElements();
@@ -24042,8 +24042,8 @@
// Perform this really late, because it could eliminate knowledge
// of undef elements created by this shuffle.
if (Level < AfterLegalizeTypes)
- if (SDValue V = combineShuffleToZeroExtendVectorInReg(SVN, DAG, TLI,
- LegalOperations))
+ if (SDValue V = combineShuffleToZeroExtendVectorInReg(
+ SVN, DAG, TLI, LegalTypes, LegalOperations))
return V;
return SDValue();
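
For readers skimming the diff: the only functional change is the DAGCombiner.cpp
hunk directly above. combineShuffleToZeroExtendVectorInReg() used to hard-code
"bool LegalTypes = true;" even though its sole call site is guarded by
"Level < AfterLegalizeTypes", i.e. it runs before type legalization has
finished. The patch threads the combiner's actual LegalTypes state through
instead, so the combine may also form extend-vector-inreg nodes whose types are
still illegal. The test churn is the payoff: the AArch64 tbl-based zero-extends
need fewer constant-pool shuffle masks (the updated RUN lines add
--implicit-check-not=LCPI precisely so FileCheck rejects any stray
constant-pool label), and the X86 vector-in-reg extension tests shift to
vpexpand/vpermt2-based sequences.

The mask shape this combine matches can be sketched standalone. Below is a
minimal model, not the LLVM implementation; the helper name, the use of -1 for
zeroable lanes, and the driver in main() are illustrative assumptions:

#include <cstdio>
#include <vector>

// Hypothetical model of a zero_extend_vector_inreg-style shuffle mask:
// lane I*Scale must read source lane I, and every other lane must be
// zeroable (-1 stands in for "reads a known zero or undef").
static bool isZeroExtendInRegMask(const std::vector<int> &Mask,
                                  unsigned Scale) {
  if (Scale < 2 || Mask.size() % Scale != 0)
    return false;
  for (size_t I = 0; I != Mask.size(); ++I) {
    int Expected = (I % Scale == 0) ? static_cast<int>(I / Scale) : -1;
    if (Mask[I] != Expected)
      return false;
  }
  return true;
}

int main() {
  // Shape of vec512_v16i32_to_v4i128_factor4 from the X86 test above:
  // 16 x i32 lanes, where lanes 0, 4, 8 and 12 carry source lanes 0..3
  // and everything else is zero.
  std::vector<int> Mask(16, -1);
  for (int I = 0; I != 4; ++I)
    Mask[I * 4] = I;
  std::printf("factor-4 zext mask: %s\n",
              isZeroExtendInRegMask(Mask, 4) ? "yes" : "no");
  return 0;
}

Before this patch a mask like that was rejected pre-legalization whenever the
widened element type (an i128 per four-lane group here) was not legal yet; with
LegalTypes forwarded, the combine fires earlier and type legalization then
splits the result, which appears to be where the shorter sequences in the
updated CHECK lines come from.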
_______________________________________________
llvm-commits mailing list
llvm-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits