
List:       llvm-commits
Subject:    [PATCH] D140677: [AArch64][DAG] `canCombineShuffleToExtendVectorInreg()`: allow illegal types before
From:       Roman Lebedev via Phabricator via llvm-commits <llvm-commits@lists.llvm.org>
Date:       2022-12-31 20:48:49
Message-ID: GzUtNaYgQqSUeQ28rMyG4Q@geopod-ismtpd-6-1
[Download RAW message or body]

lebedev.ri updated this revision to Diff 485786.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D140677/new/

https://reviews.llvm.org/D140677

Files:
  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
  llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
  llvm/test/CodeGen/AArch64/vselect-ext.ll
  llvm/test/CodeGen/AArch64/zext-to-tbl.ll
  llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll

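For archive readers, an illustrative sketch (not part of the patch; the function name below
is made up): the combine named in the subject recognizes shuffles of a vector with
zeroinitializer whose mask keeps only the low element of each wider lane, which is the
*_extend_vector_inreg pattern these tests exercise; the patch relaxes the type check so the
combine can also fire before the involved types have been legalized. For example, on a
little-endian target this IR is a v16i8 -> v2i64 zero_extend_vector_inreg in disguise:

  define <2 x i64> @zext_inreg_sketch(<16 x i8> %x) {
    ; keep bytes 0 and 1 of %x, fill every other byte from the zero vector
    %shuf = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer,
              <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16,
                          i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %res = bitcast <16 x i8> %shuf to <2 x i64>
    ret <2 x i64> %res
  }
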

["D140677.485786.patch" (D140677.485786.patch)]

Index: llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
===================================================================
--- llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1700,21 +1700,23 @@
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 16(%rcx)
+; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec256_v32i8_to_v2i128_factor16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -1725,8 +1727,8 @@
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512F-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
@@ -1737,8 +1739,8 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -1842,12 +1844,12 @@
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE42-NEXT:    paddb 16(%rdx), %xmm0
-; SSE42-NEXT:    paddb (%rdx), %xmm2
-; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
 ; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -2187,12 +2189,12 @@
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE42-NEXT:    paddb 16(%rdx), %xmm0
-; SSE42-NEXT:    paddb (%rdx), %xmm2
-; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    paddb (%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, (%rcx)
 ; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -2709,13 +2711,16 @@
 ; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
-; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
 +; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_v48i8_to_v24i16_factor2:
@@ -2724,7 +2729,7 @@
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
  ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -2738,7 +2743,7 @@
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
  ; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -2921,66 +2926,44 @@
 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: vec384_v48i8_to_v12i32_factor4:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-SLOW-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v12i32_factor4:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
-;
-; AVX2-FAST-LABEL: vec384_v48i8_to_v12i32_factor4:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
-; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: vec384_v48i8_to_v12i32_factor4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec384_v48i8_to_v12i32_factor4:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 +; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -3160,65 +3143,43 @@
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm2
 ; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: vec384_v48i8_to_v6i64_factor8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v6i64_factor8:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
-;
-; AVX2-FAST-LABEL: vec384_v48i8_to_v6i64_factor8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: vec384_v48i8_to_v6i64_factor8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec384_v48i8_to_v6i64_factor8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -3378,93 +3339,97 @@
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpsrld $24, %xmm0, %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 48(%rcx)
+; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
-;
-; AVX2-FAST-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: vec384_v48i8_to_v3i128_factor16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrld $24, %xmm0, %xmm2
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec384_v48i8_to_v3i128_factor16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpsrld $24, %xmm0, %xmm0
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512BW-LABEL: vec384_v48i8_to_v3i128_factor16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512BW-SLOW-LABEL: vec384_v48i8_to_v3i128_factor16:
+; AVX512BW-SLOW:       # %bb.0:
+; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BW-SLOW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-SLOW-NEXT:    vpsrld $24, %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-SLOW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT:    vzeroupper
+; AVX512BW-SLOW-NEXT:    retq
+;
+; AVX512BW-FAST-LABEL: vec384_v48i8_to_v3i128_factor16:
+; AVX512BW-FAST:       # %bb.0:
+; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-FAST-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BW-FAST-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,3,3,3,3,u,u,u,u]
+; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-FAST-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-FAST-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT:    vzeroupper
+; AVX512BW-FAST-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -3678,15 +3643,15 @@
 ; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
 ; SSE42-NEXT:    paddb 16(%rsi), %xmm1
-; SSE42-NEXT:    pxor %xmm2, %xmm2
 ; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE42-NEXT:    paddb 16(%rdx), %xmm0
-; SSE42-NEXT:    paddb (%rdx), %xmm3
+; SSE42-NEXT:    paddb (%rdx), %xmm2
 ; SSE42-NEXT:    paddb 32(%rdx), %xmm1
 ; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE42-NEXT:    movdqa %xmm3, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
 ; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -3699,13 +3664,16 @@
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_v24i16_to_v12i32_factor2:
@@ -3714,7 +3682,7 @@
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -3726,13 +3694,12 @@
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -3740,10 +3707,7 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -3986,66 +3950,44 @@
 ; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: vec384_v24i16_to_v6i64_factor4:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v6i64_factor4:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
-;
-; AVX2-FAST-LABEL: vec384_v24i16_to_v6i64_factor4:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: vec384_v24i16_to_v6i64_factor4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec384_v24i16_to_v6i64_factor4:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -4053,9 +3995,7 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[8,9],zero,zero,zero,zero,zero,zero,xmm0[10,11],zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4297,65 +4237,74 @@
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
-; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-PERLANE-LABEL: vec384_v24i16_to_v3i128_factor8:
 ; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -4364,11 +4313,14 @@
 ; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-SLOW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
 ; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
 ; AVX512F-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
 ; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
@@ -4381,10 +4333,15 @@
 ; AVX512F-FAST:       # %bb.0:
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-FAST-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <4,u,u,u,5,u,u,u>
+; AVX512F-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512F-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
 ; AVX512F-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512F-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
 ; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
@@ -4393,35 +4350,19 @@
 ; AVX512F-FAST-NEXT:    vzeroupper
 ; AVX512F-FAST-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_v24i16_to_v3i128_factor8:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_v24i16_to_v3i128_factor8:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,17,9,10,11,12,13,14,15]
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vpermt2w %ymm0, %ymm1, %ymm2
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_v24i16_to_v3i128_factor8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,40,17,18,19,20,21,22,23,41,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4659,15 +4600,15 @@
 ; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
 ; SSE42-NEXT:    paddb 16(%rsi), %xmm1
-; SSE42-NEXT:    pxor %xmm2, %xmm2
 ; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE42-NEXT:    paddb 16(%rdx), %xmm0
-; SSE42-NEXT:    paddb (%rdx), %xmm3
+; SSE42-NEXT:    paddb (%rdx), %xmm2
 ; SSE42-NEXT:    paddb 32(%rdx), %xmm1
 ; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
-; SSE42-NEXT:    movdqa %xmm3, (%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
 ; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -4677,20 +4618,19 @@
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
@@ -4724,10 +4664,7 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} ymm1 = \
                xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} ymm0 = \
                xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4956,54 +4893,60 @@
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = \
                ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-PERLANE-LABEL: vec384_v12i32_to_v3i128_factor4:
 ; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm2 = \
                xmm1[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = \
                ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
 ;
@@ -5011,11 +4954,13 @@
 ; AVX2-FAST:       # %bb.0:
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm2
-; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = \
                ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u>
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u>
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
 ; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -5025,9 +4970,11 @@
 ;
 ; AVX512F-LABEL: vec384_v12i32_to_v3i128_factor4:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u>
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
 ; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
@@ -5038,35 +4985,19 @@
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec384_v12i32_to_v3i128_factor4:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT:    movb $17, %al
-; AVX512BW-SLOW-NEXT:    kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec384_v12i32_to_v3i128_factor4:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-FAST-NEXT:    movb $17, %al
-; AVX512BW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-FAST-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = \
                xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec384_v12i32_to_v3i128_factor4:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
+; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5337,17 +5268,21 @@
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
-; AVX-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -5355,11 +5290,11 @@
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,3]
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -5371,14 +5306,16 @@
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,9,1,11,2,13,u,u>
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    movb $85, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -5386,12 +5323,11 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT:    movb $5, %al
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movb $85, %al
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -5887,24 +5823,25 @@
 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
 ; SSE2-NEXT:    paddb (%rsi), %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = \
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
                
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = \
                xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = \
                xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    paddb 16(%rdx), %xmm0
-; SSE2-NEXT:    paddb (%rdx), %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSE2-NEXT:    paddb 48(%rdx), %xmm2
-; SSE2-NEXT:    paddb 32(%rdx), %xmm3
-; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
-; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE2-NEXT:    movdqa %xmm4, (%rcx)
+; SSE2-NEXT:    paddb 32(%rdx), %xmm4
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb (%rdx), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, (%rcx)
 ; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    movdqa %xmm4, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: vec512_v64i8_to_v8i64_factor8:
@@ -5913,20 +5850,19 @@
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
 ; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm1 = \
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero \
                ; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = \
                xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
                
-; SSE42-NEXT:    movdqa %xmm0, %xmm3
-; SSE42-NEXT:    psrlq $48, %xmm3
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; SSE42-NEXT:    psrld $16, %xmm0
 ; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    psrld $16, %xmm2
+; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 48(%rdx), %xmm2
 ; SSE42-NEXT:    paddb 16(%rdx), %xmm0
-; SSE42-NEXT:    paddb 48(%rdx), %xmm3
-; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm3
 ; SSE42-NEXT:    paddb (%rdx), %xmm1
 ; SSE42-NEXT:    movdqa %xmm1, (%rcx)
-; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
-; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
 ; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, 48(%rcx)
 ; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v64i8_to_v8i64_factor8:
@@ -5936,9 +5872,9 @@
 ; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm2
 ; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm3 = \
                xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
@@ -5997,159 +5933,127 @@
 }
 
 define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr \
                %in.vec.bias.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind {
-; SSE-LABEL: vec512_v64i8_to_v4i128_factor16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    paddb (%rsi), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    pslldq {{.*#+}} xmm2 = \
                zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
                
-; SSE-NEXT:    psrldq {{.*#+}} xmm2 = \
                xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE-NEXT:    psrldq {{.*#+}} xmm3 = \
                xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = \
                zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
                
-; SSE-NEXT:    psrldq {{.*#+}} xmm0 = \
                xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; SSE-NEXT:    paddb 16(%rdx), %xmm0
-; SSE-NEXT:    paddb 48(%rdx), %xmm3
-; SSE-NEXT:    paddb 32(%rdx), %xmm2
-; SSE-NEXT:    paddb (%rdx), %xmm1
-; SSE-NEXT:    movdqa %xmm1, (%rcx)
-; SSE-NEXT:    movdqa %xmm2, 32(%rcx)
-; SSE-NEXT:    movdqa %xmm3, 48(%rcx)
-; SSE-NEXT:    movdqa %xmm0, 16(%rcx)
-; SSE-NEXT:    retq
+; SSE2-LABEL: vec512_v64i8_to_v4i128_factor16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    paddb (%rsi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld $16, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb 48(%rdx), %xmm1
+; SSE2-NEXT:    paddb 32(%rdx), %xmm3
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
+; SSE2-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm1, 48(%rcx)
+; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: vec512_v64i8_to_v4i128_factor16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movdqa (%rdi), %xmm0
+; SSE42-NEXT:    paddb (%rsi), %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0]
+; SSE42-NEXT:    pand %xmm0, %xmm2
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
+; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v64i8_to_v4i128_factor16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT:    vpslldq {{.*#+}} xmm2 = \
                zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
                
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = \
                xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; AVX-NEXT:    vpslldq {{.*#+}} xmm3 = \
                zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
                
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = \
                xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpsrld $24, %xmm0, %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 48(%rcx)
+; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = \
                [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-SLOW-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
+; AVX2-LABEL: vec512_v64i8_to_v4i128_factor16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpsrld $24, %xmm0, %xmm2
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
-; AVX2-FAST-PERLANE-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxbq {{.*#+}} xmm1 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = \
                [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FAST-PERLANE-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = \
                xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-PERLANE-NEXT:    vzeroupper
-; AVX2-FAST-PERLANE-NEXT:    retq
-;
-; AVX2-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm1 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = \
                [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FAST-NEXT:    # ymm2 = mem[0,1,0,1]
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX512F-SLOW:       # %bb.0:
-; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX512F-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX512F-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX512F-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512F-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
                
-; AVX512F-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-SLOW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512F-SLOW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX512F-SLOW-NEXT:    vzeroupper
-; AVX512F-SLOW-NEXT:    retq
-;
-; AVX512F-FAST-LABEL: vec512_v64i8_to_v4i128_factor16:
-; AVX512F-FAST:       # %bb.0:
-; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = \
                xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm0 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX512F-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
                
-; AVX512F-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-FAST-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512F-FAST-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
-; AVX512F-FAST-NEXT:    vzeroupper
-; AVX512F-FAST-NEXT:    retq
+; AVX512F-LABEL: vec512_v64i8_to_v4i128_factor16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpsrld $24, %xmm0, %xmm0
+; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-SLOW-LABEL: vec512_v64i8_to_v4i128_factor16:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX512BW-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX512BW-SLOW-NEXT:    vpmovzxbq {{.*#+}} xmm0 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512BW-SLOW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
                
-; AVX512BW-SLOW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-SLOW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BW-SLOW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-SLOW-NEXT:    vpsrld $24, %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX512BW-SLOW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-SLOW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-SLOW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
@@ -6159,13 +6063,13 @@
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = \
                xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX512BW-FAST-NEXT:    vpmovzxbq {{.*#+}} xmm0 = \
                xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
                
-; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512BW-FAST-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = \
[255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
                
-; AVX512BW-FAST-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512BW-FAST-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,1,u,u,u,u,u,u,u,u,u,u,u,16,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,3,3,3,3,u,u,u,u]
+; AVX512BW-FAST-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <0,u,u,u,1,u,u,u,4,u,u,u,18,u,u,u>
+; AVX512BW-FAST-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-FAST-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT:    vzeroupper
@@ -6234,13 +6138,14 @@
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
-; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = \
ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -6248,10 +6153,9 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
-; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = \
ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -6372,16 +6276,16 @@
 ; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
 ; SSE42-NEXT:    paddb 16(%rsi), %xmm1
-; SSE42-NEXT:    pxor %xmm2, %xmm2
-; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm3 = \
                xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE42-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE42-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE42-NEXT:    paddb 16(%rdx), %xmm0
 ; SSE42-NEXT:    paddb (%rdx), %xmm4
 ; SSE42-NEXT:    paddb 48(%rdx), %xmm1
-; SSE42-NEXT:    paddb 32(%rdx), %xmm3
-; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
 ; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
 ; SSE42-NEXT:    movdqa %xmm4, (%rcx)
 ; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
@@ -6525,8 +6429,8 @@
 ;
 ; AVX2-LABEL: vec512_v32i16_to_v8i64_factor4:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -6577,19 +6481,18 @@
 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
 ; SSE2-NEXT:    paddb (%rsi), %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = \
                zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; SSE2-NEXT:    paddb 16(%rdx), %xmm3
 ; SSE2-NEXT:    paddb 48(%rdx), %xmm2
-; SSE2-NEXT:    paddb 32(%rdx), %xmm0
-; SSE2-NEXT:    paddb (%rdx), %xmm1
-; SSE2-NEXT:    movdqa %xmm1, (%rcx)
-; SSE2-NEXT:    movdqa %xmm0, 32(%rcx)
+; SSE2-NEXT:    paddb 32(%rdx), %xmm1
+; SSE2-NEXT:    paddb (%rdx), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rcx)
+; SSE2-NEXT:    movdqa %xmm1, 32(%rcx)
 ; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
 ; SSE2-NEXT:    movdqa %xmm3, 16(%rcx)
 ; SSE2-NEXT:    retq
@@ -6599,21 +6502,21 @@
 ; SSE42-NEXT:    movdqa (%rdi), %xmm0
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
 ; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE42-NEXT:    pslldq {{.*#+}} xmm0 = \
                zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = \
                xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; SSE42-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    pxor %xmm2, %xmm2
+; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
 ; SSE42-NEXT:    psrldq {{.*#+}} xmm3 = \
                xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; SSE42-NEXT:    paddb 16(%rdx), %xmm3
-; SSE42-NEXT:    paddb 48(%rdx), %xmm2
-; SSE42-NEXT:    paddb 32(%rdx), %xmm0
-; SSE42-NEXT:    paddb (%rdx), %xmm1
-; SSE42-NEXT:    movdqa %xmm1, (%rcx)
-; SSE42-NEXT:    movdqa %xmm0, 32(%rcx)
-; SSE42-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE42-NEXT:    movdqa %xmm3, 16(%rcx)
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE42-NEXT:    paddb 16(%rdx), %xmm0
+; SSE42-NEXT:    paddb 48(%rdx), %xmm3
+; SSE42-NEXT:    paddb 32(%rdx), %xmm1
+; SSE42-NEXT:    paddb (%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, (%rcx)
+; SSE42-NEXT:    movdqa %xmm1, 32(%rcx)
+; SSE42-NEXT:    movdqa %xmm3, 48(%rcx)
+; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
 ; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: vec512_v32i16_to_v4i128_factor8:
@@ -6621,37 +6524,36 @@
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm2 = \
                xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; AVX-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpaddb 48(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: vec512_v32i16_to_v4i128_factor8:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = \
                xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = \
                ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
 ; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-SLOW-NEXT:    vzeroupper
@@ -6659,17 +6561,17 @@
 ;
 ; AVX2-FAST-PERLANE-LABEL: vec512_v32i16_to_v4i128_factor8:
 ; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} xmm2 = \
                xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm2 = \
                ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
@@ -6677,17 +6579,17 @@
 ;
 ; AVX2-FAST-LABEL: vec512_v32i16_to_v4i128_factor8:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm2 = \
                xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = \
                ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
 ; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
 ; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-NEXT:    vzeroupper
@@ -6697,18 +6599,20 @@
 ; AVX512F-SLOW:       # %bb.0:
 ; AVX512F-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = \
                xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-SLOW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
 ; AVX512F-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = \
                ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX512F-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512F-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
 ; AVX512F-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX512F-SLOW-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-SLOW-NEXT:    vzeroupper
 ; AVX512F-SLOW-NEXT:    retq
 ;
@@ -6716,25 +6620,30 @@
 ; AVX512F-FAST:       # %bb.0:
 ; AVX512F-FAST-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-FAST-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm1 = \
                xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512F-FAST-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-FAST-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512F-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <4,u,u,u,5,u,u,u>
+; AVX512F-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
 ; AVX512F-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm1 = \
                ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u]
+; AVX512F-FAST-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512F-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
 ; AVX512F-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
-; AVX512F-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
+; AVX512F-FAST-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX512F-FAST-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-FAST-NEXT:    vzeroupper
 ; AVX512F-FAST-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,17,18,19,20,21,22,23,35,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,40,17,18,19,20,21,22,23,41,25,26,27,28,29,30,31]
 ; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
@@ -6984,16 +6893,16 @@
 ; SSE42-NEXT:    movdqa 16(%rdi), %xmm1
 ; SSE42-NEXT:    paddb (%rsi), %xmm0
 ; SSE42-NEXT:    paddb 16(%rsi), %xmm1
-; SSE42-NEXT:    pxor %xmm2, %xmm2
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT:    pxor %xmm3, %xmm3
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
 ; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE42-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE42-NEXT:    paddb 16(%rdx), %xmm0
 ; SSE42-NEXT:    paddb (%rdx), %xmm4
 ; SSE42-NEXT:    paddb 48(%rdx), %xmm1
-; SSE42-NEXT:    paddb 32(%rdx), %xmm3
-; SSE42-NEXT:    movdqa %xmm3, 32(%rcx)
+; SSE42-NEXT:    paddb 32(%rdx), %xmm2
+; SSE42-NEXT:    movdqa %xmm2, 32(%rcx)
 ; SSE42-NEXT:    movdqa %xmm1, 48(%rcx)
 ; SSE42-NEXT:    movdqa %xmm4, (%rcx)
 ; SSE42-NEXT:    movdqa %xmm0, 16(%rcx)
@@ -7005,26 +6914,19 @@
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = \
                ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm1 = \
                ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vpaddb 48(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT:    vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
@@ -7081,22 +6983,23 @@
 ; SSE2-NEXT:    movdqa (%rdi), %xmm0
 ; SSE2-NEXT:    paddb (%rsi), %xmm0
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = \
                xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
                
-; SSE2-NEXT:    xorps %xmm3, %xmm3
-; SSE2-NEXT:    movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    psrldq {{.*#+}} xmm4 = \
                xmm4[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,3]
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE2-NEXT:    xorps %xmm4, %xmm4
+; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm3[0],xmm4[1,2,3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
 ; SSE2-NEXT:    paddb 16(%rdx), %xmm0
+; SSE2-NEXT:    paddb 48(%rdx), %xmm3
 ; SSE2-NEXT:    paddb 32(%rdx), %xmm4
-; SSE2-NEXT:    paddb (%rdx), %xmm3
-; SSE2-NEXT:    paddb 48(%rdx), %xmm2
-; SSE2-NEXT:    movdqa %xmm2, 48(%rcx)
-; SSE2-NEXT:    movdqa %xmm3, (%rcx)
+; SSE2-NEXT:    paddb (%rdx), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, (%rcx)
 ; SSE2-NEXT:    movdqa %xmm4, 32(%rcx)
+; SSE2-NEXT:    movdqa %xmm3, 48(%rcx)
 ; SSE2-NEXT:    movdqa %xmm0, 16(%rcx)
 ; SSE2-NEXT:    retq
 ;
@@ -7126,40 +7029,40 @@
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT:    vpaddb 48(%rdx), %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT:    vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vmovdqa %xmm2, (%rcx)
+; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm3, 16(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
-; AVX-NEXT:    vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT:    vmovdqa %xmm2, 48(%rcx)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm2 = \
                ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
 ; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-SLOW-NEXT:    vzeroupper
@@ -7167,17 +7070,17 @@
 ;
 ; AVX2-FAST-PERLANE-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX2-FAST-PERLANE:       # %bb.0:
-; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm2 = \
                ymm2[0],ymm0[1,2,3],ymm2[4],ymm0[5,6,7]
-; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
+; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
+; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, (%rcx)
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
@@ -7187,15 +7090,15 @@
 ; AVX2-FAST:       # %bb.0:
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
-; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm2, %ymm2
-; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u>
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
 ; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u>
 ; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm3, %ymm0
-; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
 ; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-FAST-NEXT:    vmovdqa %ymm1, (%rcx)
 ; AVX2-FAST-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-FAST-NEXT:    vzeroupper
@@ -7203,31 +7106,31 @@
 ;
 ; AVX512F-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT:    movw $4369, %ax # imm = 0x1111
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rcx)
+; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm0
+; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX512F-NEXT:    vmovdqa %ymm1, (%rcx)
+; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec512_v16i32_to_v4i128_factor4:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT:    movb $17, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,20,9,10,11,21,13,14,15]
 ; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,9,10,11,3,13,14,15]
-; AVX512BW-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm2, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -7519,8 +7422,8 @@
 ; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
 ; AVX-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[3]
@@ -7541,13 +7444,13 @@
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,1,3]
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
+; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
 ; AVX2-NEXT:    vmovdqa %ymm0, 32(%rcx)
 ; AVX2-NEXT:    vzeroupper
@@ -7557,6 +7460,8 @@
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    movb $85, %al
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
@@ -7568,37 +7473,19 @@
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512BW-SLOW-LABEL: vec512_v8i64_to_v4i128_factor2:
-; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT:    movb $5, %al
-; AVX512BW-SLOW-NEXT:    kmovd %eax, %k1
-; AVX512BW-SLOW-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
-; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = \
                ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT:    vzeroupper
-; AVX512BW-SLOW-NEXT:    retq
-;
-; AVX512BW-FAST-LABEL: vec512_v8i64_to_v4i128_factor2:
-; AVX512BW-FAST:       # %bb.0:
-; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-FAST-NEXT:    movb $5, %al
-; AVX512BW-FAST-NEXT:    kmovd %eax, %k1
-; AVX512BW-FAST-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,5,3,7]
-; AVX512BW-FAST-NEXT:    vpermi2q %ymm2, %ymm0, %ymm3
-; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT:    vzeroupper
-; AVX512BW-FAST-NEXT:    retq
+; AVX512BW-LABEL: vec512_v8i64_to_v4i128_factor2:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    movb $85, %al
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
   %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
   %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -7835,9 +7722,9 @@
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, %xmm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    movb $51, %al
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
Index: llvm/test/CodeGen/AArch64/zext-to-tbl.ll
===================================================================
--- llvm/test/CodeGen/AArch64/zext-to-tbl.ll
+++ llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
-; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s
+; RUN: llc -mtriple=aarch64_be-unknown-linux -mattr=+global-isel -mattr=+sve -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI --check-prefix=CHECK-BE %s
 
 ; CHECK-LABEL: lCPI0_0:
 ; CHECK-NEXT:    .byte   0                               ; 0x0
@@ -21,57 +21,6 @@
 ; CHECK-NEXT:    .byte   255                             ; 0xff
 ; CHECK-NEXT:    .byte   255                             ; 0xff
 ; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:lCPI0_1:
-; CHECK-NEXT:    .byte   4                               ; 0x4
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   5                               ; 0x5
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   6                               ; 0x6
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   7                               ; 0x7
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:lCPI0_2:
-; CHECK-NEXT:    .byte   8                               ; 0x8
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   9                               ; 0x9
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   10                              ; 0xa
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   11                              ; 0xb
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:lCPI0_3:
-; CHECK-NEXT:    .byte   12                              ; 0xc
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   13                              ; 0xd
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   14                              ; 0xe
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   15                              ; 0xf
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
-; CHECK-NEXT:    .byte   255                             ; 0xff
 
 ; CHECK-BE: .LCPI0_0:
 ; CHECK-BE-NEXT: 	.byte	255                             // 0xff
@@ -149,39 +98,28 @@
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh0:
 ; CHECK-NEXT:    adrp x9, lCPI0_0@PAGE
-; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    adrp x10, lCPI0_1@PAGE
-; CHECK-NEXT:  Lloh2:
-; CHECK-NEXT:    adrp x11, lCPI0_2@PAGE
-; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    adrp x12, lCPI0_3@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI0_0@PAGEOFF]
-; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    ldr q1, [x10, lCPI0_1@PAGEOFF]
-; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:    ldr q2, [x11, lCPI0_2@PAGEOFF]
-; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:    ldr q3, [x12, lCPI0_3@PAGEOFF]
 ; CHECK-NEXT:  LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q4, [x0, x8]
+; CHECK-NEXT:    ldr q1, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    tbl.16b v5, { v4 }, v3
-; CHECK-NEXT:    tbl.16b v6, { v4 }, v2
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v1
+; CHECK-NEXT:    dup.2d v2, v1[1]
+; CHECK-NEXT:    dup.4s v3, v1[1]
+; CHECK-NEXT:    tbl.16b v1, { v1 }, v0
+; CHECK-NEXT:    dup.4s v4, v2[1]
+; CHECK-NEXT:    tbl.16b v3, { v3 }, v0
+; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
 ; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
-; CHECK-NEXT:    stp q6, q5, [x1, #32]
-; CHECK-NEXT:    stp q4, q7, [x1], #64
+; CHECK-NEXT:    stp q1, q3, [x1]
+; CHECK-NEXT:    stp q2, q4, [x1, #32]
+; CHECK-NEXT:    add x1, x1, #64
 ; CHECK-NEXT:    b.ne LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
-; CHECK-NEXT:    .loh AdrpLdr Lloh1, Lloh5
-; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh4
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 ;
 ; CHECK-BE-LABEL: zext_v16i8_to_v16i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -566,23 +504,6 @@
 ; CHECK-NEXT:     .byte   255                             ; 0xff
 ; CHECK-NEXT:     .byte   255                             ; 0xff
 ; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT: lCPI6_1:
-; CHECK-NEXT:     .byte   4                               ; 0x4
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   5                               ; 0x5
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   6                               ; 0x6
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   7                               ; 0x7
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
 
 ; CHECK-BE:       .LCPI6_0:
 ; CHECK-BE-NEXT: 	.byte	255                             // 0xff
@@ -622,28 +543,24 @@
 define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v8i8_to_v8i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh8:
+; CHECK-NEXT:  Lloh2:
 ; CHECK-NEXT:    adrp x9, lCPI6_0@PAGE
-; CHECK-NEXT:  Lloh9:
-; CHECK-NEXT:    adrp x10, lCPI6_1@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:  Lloh3:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI6_0@PAGEOFF]
-; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    ldr q1, [x10, lCPI6_1@PAGEOFF]
 ; CHECK-NEXT:  LBB6_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr d2, [x0, x8]
+; CHECK-NEXT:    ldr d1, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    tbl.16b v3, { v2 }, v1
+; CHECK-NEXT:    dup.2s v2, v1[1]
+; CHECK-NEXT:    tbl.16b v1, { v1 }, v0
 ; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
-; CHECK-NEXT:    stp q2, q3, [x1], #64
+; CHECK-NEXT:    stp q1, q2, [x1], #64
 ; CHECK-NEXT:    b.ne LBB6_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh9, Lloh11
-; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh10
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
 ;
 ; CHECK-BE-LABEL: zext_v8i8_to_v8i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -1032,10 +949,10 @@
 define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v4i8_to_v4i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh12:
+; CHECK-NEXT:  Lloh4:
 ; CHECK-NEXT:    adrp x9, lCPI11_0@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh13:
+; CHECK-NEXT:  Lloh5:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI11_0@PAGEOFF]
 ; CHECK-NEXT:  LBB11_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1047,7 +964,7 @@
 ; CHECK-NEXT:    b.ne LBB11_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh5
 ;
 ; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -1104,40 +1021,6 @@
 ; CHECK-NEXT: 	.byte	255                             ; 0xff
 ; CHECK-NEXT: 	.byte	255                             ; 0xff
 ; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: lCPI12_1:
-; CHECK-NEXT: 	.byte	4                               ; 0x4
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	5                               ; 0x5
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	6                               ; 0x6
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	7                               ; 0x7
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: lCPI12_2:
-; CHECK-NEXT: 	.byte	8                               ; 0x8
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	9                               ; 0x9
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	10                              ; 0xa
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	11                              ; 0xb
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
-; CHECK-NEXT: 	.byte	255                             ; 0xff
 
 ; CHECK-BE-LABEL: .LCPI12_0:
 ; CHECK-BE-NEXT: 	.byte	255                             // 0xff
@@ -1194,35 +1077,27 @@
 define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v12i8_to_v12i32_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh14:
+; CHECK-NEXT:  Lloh6:
 ; CHECK-NEXT:    adrp x9, lCPI12_0@PAGE
-; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    adrp x10, lCPI12_1@PAGE
-; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    adrp x11, lCPI12_2@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh17:
+; CHECK-NEXT:  Lloh7:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI12_0@PAGEOFF]
-; CHECK-NEXT:  Lloh18:
-; CHECK-NEXT:    ldr q1, [x10, lCPI12_1@PAGEOFF]
-; CHECK-NEXT:  Lloh19:
-; CHECK-NEXT:    ldr q2, [x11, lCPI12_2@PAGEOFF]
 ; CHECK-NEXT:  LBB12_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q3, [x0, x8]
+; CHECK-NEXT:    ldr q1, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    tbl.16b v4, { v3 }, v2
-; CHECK-NEXT:    tbl.16b v5, { v3 }, v1
+; CHECK-NEXT:    dup.2d v2, v1[1]
+; CHECK-NEXT:    dup.4s v3, v1[1]
+; CHECK-NEXT:    tbl.16b v1, { v1 }, v0
+; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
 ; CHECK-NEXT:    tbl.16b v3, { v3 }, v0
-; CHECK-NEXT:    stp q5, q4, [x1, #16]
-; CHECK-NEXT:    str q3, [x1], #64
+; CHECK-NEXT:    stp q3, q2, [x1, #16]
+; CHECK-NEXT:    str q1, [x1], #64
 ; CHECK-NEXT:    b.ne LBB12_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh19
-; CHECK-NEXT:    .loh AdrpLdr Lloh15, Lloh18
-; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh17
+; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh7
 ;
 ; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -2192,22 +2067,22 @@
 define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v20i8_to_v20i24_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh20:
+; CHECK-NEXT:  Lloh8:
 ; CHECK-NEXT:    adrp x9, lCPI20_0@PAGE
-; CHECK-NEXT:  Lloh21:
+; CHECK-NEXT:  Lloh9:
 ; CHECK-NEXT:    adrp x10, lCPI20_1@PAGE
-; CHECK-NEXT:  Lloh22:
+; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x11, lCPI20_2@PAGE
-; CHECK-NEXT:  Lloh23:
+; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    adrp x12, lCPI20_3@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh24:
+; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI20_0@PAGEOFF]
-; CHECK-NEXT:  Lloh25:
+; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI20_1@PAGEOFF]
-; CHECK-NEXT:  Lloh26:
+; CHECK-NEXT:  Lloh14:
 ; CHECK-NEXT:    ldr q2, [x11, lCPI20_2@PAGEOFF]
-; CHECK-NEXT:  Lloh27:
+; CHECK-NEXT:  Lloh15:
 ; CHECK-NEXT:    ldr q3, [x12, lCPI20_3@PAGEOFF]
 ; CHECK-NEXT:  LBB20_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2228,10 +2103,10 @@
 ; CHECK-NEXT:    b.ne LBB20_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh23, Lloh27
-; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh26
-; CHECK-NEXT:    .loh AdrpLdr Lloh21, Lloh25
-; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh24
+; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh14
+; CHECK-NEXT:    .loh AdrpLdr Lloh9, Lloh13
+; CHECK-NEXT:    .loh AdrpLdr Lloh8, Lloh12
 ;
 ; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
@@ -2519,30 +2394,30 @@
 define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
 ; CHECK-LABEL: zext_v23i8_to_v23i48_in_loop:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh28:
+; CHECK-NEXT:  Lloh16:
 ; CHECK-NEXT:    adrp x9, lCPI21_0@PAGE
-; CHECK-NEXT:  Lloh29:
+; CHECK-NEXT:  Lloh17:
 ; CHECK-NEXT:    adrp x10, lCPI21_1@PAGE
-; CHECK-NEXT:  Lloh30:
+; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    adrp x11, lCPI21_2@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh31:
+; CHECK-NEXT:  Lloh19:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI21_0@PAGEOFF]
-; CHECK-NEXT:  Lloh32:
+; CHECK-NEXT:  Lloh20:
 ; CHECK-NEXT:    adrp x9, lCPI21_3@PAGE
-; CHECK-NEXT:  Lloh33:
+; CHECK-NEXT:  Lloh21:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI21_1@PAGEOFF]
-; CHECK-NEXT:  Lloh34:
+; CHECK-NEXT:  Lloh22:
 ; CHECK-NEXT:    adrp x10, lCPI21_4@PAGE
-; CHECK-NEXT:  Lloh35:
+; CHECK-NEXT:  Lloh23:
 ; CHECK-NEXT:    ldr q2, [x11, lCPI21_2@PAGEOFF]
-; CHECK-NEXT:  Lloh36:
+; CHECK-NEXT:  Lloh24:
 ; CHECK-NEXT:    adrp x11, lCPI21_5@PAGE
-; CHECK-NEXT:  Lloh37:
+; CHECK-NEXT:  Lloh25:
 ; CHECK-NEXT:    ldr q3, [x9, lCPI21_3@PAGEOFF]
-; CHECK-NEXT:  Lloh38:
+; CHECK-NEXT:  Lloh26:
 ; CHECK-NEXT:    ldr q4, [x10, lCPI21_4@PAGEOFF]
-; CHECK-NEXT:  Lloh39:
+; CHECK-NEXT:  Lloh27:
 ; CHECK-NEXT:    ldr q5, [x11, lCPI21_5@PAGEOFF]
 ; CHECK-NEXT:  LBB21_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -2570,15 +2445,15 @@
 ; CHECK-NEXT:    b.ne LBB21_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh36, Lloh39
-; CHECK-NEXT:    .loh AdrpLdr Lloh34, Lloh38
-; CHECK-NEXT:    .loh AdrpLdr Lloh32, Lloh37
-; CHECK-NEXT:    .loh AdrpAdrp Lloh30, Lloh36
-; CHECK-NEXT:    .loh AdrpLdr Lloh30, Lloh35
-; CHECK-NEXT:    .loh AdrpAdrp Lloh29, Lloh34
-; CHECK-NEXT:    .loh AdrpLdr Lloh29, Lloh33
-; CHECK-NEXT:    .loh AdrpAdrp Lloh28, Lloh32
-; CHECK-NEXT:    .loh AdrpLdr Lloh28, Lloh31
+; CHECK-NEXT:    .loh AdrpLdr Lloh24, Lloh27
+; CHECK-NEXT:    .loh AdrpLdr Lloh22, Lloh26
+; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh25
+; CHECK-NEXT:    .loh AdrpAdrp Lloh18, Lloh24
+; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh23
+; CHECK-NEXT:    .loh AdrpAdrp Lloh17, Lloh22
+; CHECK-NEXT:    .loh AdrpLdr Lloh17, Lloh21
+; CHECK-NEXT:    .loh AdrpAdrp Lloh16, Lloh20
+; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh19
 ;
 ; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop:
 ; CHECK-BE:       // %bb.0: // %entry
Index: llvm/test/CodeGen/AArch64/vselect-ext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -575,51 +575,39 @@
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh2:
 ; CHECK-NEXT:    adrp x9, lCPI24_0@PAGE
-; CHECK-NEXT:  Lloh3:
-; CHECK-NEXT:    adrp x10, lCPI24_1@PAGE
-; CHECK-NEXT:  Lloh4:
-; CHECK-NEXT:    adrp x11, lCPI24_2@PAGE
-; CHECK-NEXT:  Lloh5:
-; CHECK-NEXT:    adrp x12, lCPI24_3@PAGE
-; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh6:
-; CHECK-NEXT:    ldr q0, [x9, lCPI24_0@PAGEOFF]
-; CHECK-NEXT:  Lloh7:
-; CHECK-NEXT:    ldr q1, [x10, lCPI24_1@PAGEOFF]
-; CHECK-NEXT:  Lloh8:
-; CHECK-NEXT:    ldr q3, [x11, lCPI24_2@PAGEOFF]
-; CHECK-NEXT:  Lloh9:
-; CHECK-NEXT:    ldr q4, [x12, lCPI24_3@PAGEOFF]
+; CHECK-NEXT:    movi.2d v0, #0xffffffffffffffff
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    ldr q1, [x9, lCPI24_0@PAGEOFF]
 ; CHECK-NEXT:  LBB24_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q5, [x0, x8]
+; CHECK-NEXT:    ldr q2, [x0, x8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    cmp x8, #128
-; CHECK-NEXT:    cmgt.16b v6, v5, v2
-; CHECK-NEXT:    tbl.16b v7, { v5 }, v0
-; CHECK-NEXT:    tbl.16b v16, { v5 }, v1
-; CHECK-NEXT:    sshll2.8h v18, v6, #0
-; CHECK-NEXT:    tbl.16b v17, { v5 }, v3
-; CHECK-NEXT:    sshll2.4s v19, v18, #0
-; CHECK-NEXT:    sshll.4s v18, v18, #0
-; CHECK-NEXT:    tbl.16b v5, { v5 }, v4
-; CHECK-NEXT:    sshll.8h v6, v6, #0
-; CHECK-NEXT:    and.16b v7, v7, v19
-; CHECK-NEXT:    and.16b v16, v16, v18
-; CHECK-NEXT:    stp q16, q7, [x1, #32]
-; CHECK-NEXT:    sshll2.4s v7, v6, #0
+; CHECK-NEXT:    cmgt.16b v3, v2, v0
+; CHECK-NEXT:    dup.2d v4, v2[1]
+; CHECK-NEXT:    tbl.16b v5, { v2 }, v1
+; CHECK-NEXT:    sshll2.8h v6, v3, #0
+; CHECK-NEXT:    dup.4s v2, v2[1]
+; CHECK-NEXT:    dup.4s v7, v4[1]
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v1
+; CHECK-NEXT:    sshll.8h v3, v3, #0
+; CHECK-NEXT:    tbl.16b v7, { v7 }, v1
+; CHECK-NEXT:    tbl.16b v2, { v2 }, v1
+; CHECK-NEXT:    sshll2.4s v16, v6, #0
 ; CHECK-NEXT:    sshll.4s v6, v6, #0
-; CHECK-NEXT:    and.16b v7, v17, v7
-; CHECK-NEXT:    and.16b v5, v5, v6
-; CHECK-NEXT:    stp q5, q7, [x1], #64
+; CHECK-NEXT:    and.16b v7, v7, v16
+; CHECK-NEXT:    and.16b v4, v4, v6
+; CHECK-NEXT:    sshll2.4s v6, v3, #0
+; CHECK-NEXT:    stp q4, q7, [x1, #32]
+; CHECK-NEXT:    sshll.4s v3, v3, #0
+; CHECK-NEXT:    and.16b v2, v2, v6
+; CHECK-NEXT:    and.16b v3, v5, v3
+; CHECK-NEXT:    stp q3, q2, [x1], #64
 ; CHECK-NEXT:    b.ne LBB24_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh9
-; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
-; CHECK-NEXT:    .loh AdrpLdr Lloh3, Lloh7
-; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh6
+; CHECK-NEXT:    .loh AdrpLdr Lloh2, Lloh3
 entry:
   br label %loop
 
@@ -643,23 +631,23 @@
 define void @extension_in_loop_as_shuffle_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: extension_in_loop_as_shuffle_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh10:
+; CHECK-NEXT:  Lloh4:
 ; CHECK-NEXT:    adrp x9, lCPI25_0@PAGE
-; CHECK-NEXT:  Lloh11:
+; CHECK-NEXT:  Lloh5:
 ; CHECK-NEXT:    adrp x10, lCPI25_1@PAGE
-; CHECK-NEXT:  Lloh12:
+; CHECK-NEXT:  Lloh6:
 ; CHECK-NEXT:    adrp x11, lCPI25_2@PAGE
-; CHECK-NEXT:  Lloh13:
+; CHECK-NEXT:  Lloh7:
 ; CHECK-NEXT:    adrp x12, lCPI25_3@PAGE
 ; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh14:
+; CHECK-NEXT:  Lloh8:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI25_0@PAGEOFF]
-; CHECK-NEXT:  Lloh15:
+; CHECK-NEXT:  Lloh9:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI25_1@PAGEOFF]
-; CHECK-NEXT:  Lloh16:
+; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    ldr q3, [x11, lCPI25_2@PAGEOFF]
-; CHECK-NEXT:  Lloh17:
+; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    ldr q4, [x12, lCPI25_3@PAGEOFF]
 ; CHECK-NEXT:  LBB25_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -686,10 +674,10 @@
 ; CHECK-NEXT:    b.ne LBB25_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh17
-; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
-; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh15
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh14
+; CHECK-NEXT:    .loh AdrpLdr Lloh7, Lloh11
+; CHECK-NEXT:    .loh AdrpLdr Lloh6, Lloh10
+; CHECK-NEXT:    .loh AdrpLdr Lloh5, Lloh9
+; CHECK-NEXT:    .loh AdrpLdr Lloh4, Lloh8
 entry:
   br label %loop
 
@@ -714,23 +702,23 @@
 define void @shuffle_in_loop_is_no_extend_v16i8_to_v16i32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shuffle_in_loop_is_no_extend_v16i8_to_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh18:
+; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    adrp x9, lCPI26_0@PAGE
-; CHECK-NEXT:  Lloh19:
+; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    adrp x10, lCPI26_1@PAGE
-; CHECK-NEXT:  Lloh20:
+; CHECK-NEXT:  Lloh14:
 ; CHECK-NEXT:    adrp x11, lCPI26_2@PAGE
-; CHECK-NEXT:  Lloh21:
+; CHECK-NEXT:  Lloh15:
 ; CHECK-NEXT:    adrp x12, lCPI26_3@PAGE
 ; CHECK-NEXT:    movi.2d v2, #0xffffffffffffffff
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh22:
+; CHECK-NEXT:  Lloh16:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI26_0@PAGEOFF]
-; CHECK-NEXT:  Lloh23:
+; CHECK-NEXT:  Lloh17:
 ; CHECK-NEXT:    ldr q1, [x10, lCPI26_1@PAGEOFF]
-; CHECK-NEXT:  Lloh24:
+; CHECK-NEXT:  Lloh18:
 ; CHECK-NEXT:    ldr q3, [x11, lCPI26_2@PAGEOFF]
-; CHECK-NEXT:  Lloh25:
+; CHECK-NEXT:  Lloh19:
 ; CHECK-NEXT:    ldr q4, [x12, lCPI26_3@PAGEOFF]
 ; CHECK-NEXT:  LBB26_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -757,10 +745,10 @@
 ; CHECK-NEXT:    b.ne LBB26_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh21, Lloh25
-; CHECK-NEXT:    .loh AdrpLdr Lloh20, Lloh24
-; CHECK-NEXT:    .loh AdrpLdr Lloh19, Lloh23
-; CHECK-NEXT:    .loh AdrpLdr Lloh18, Lloh22
+; CHECK-NEXT:    .loh AdrpLdr Lloh15, Lloh19
+; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh18
+; CHECK-NEXT:    .loh AdrpLdr Lloh13, Lloh17
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh16
 entry:
   br label %loop
 
Index: llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -o - %s | FileCheck %s
+; RUN: llc -o - %s | FileCheck --implicit-check-not=LCPI --implicit-check-not=lCPI %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-ios"
@@ -438,69 +438,48 @@
   ret void
 }
 
-; CHECK-LABEL: lCPI8_0:
-; CHECK-NEXT:   .byte   4                               ; 0x4
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   5                               ; 0x5
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   6                               ; 0x6
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   7                               ; 0x7
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT: lCPI8_1:
-; CHECK-NEXT:   .byte   0                               ; 0x0
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   1                               ; 0x1
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   2                               ; 0x2
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   3                               ; 0x3
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
-; CHECK-NEXT:   .byte   255                             ; 0xff
+; CHECK-LABEL:lCPI8_0:
+; CHECK-NEXT:        .byte   0                               ; 0x0
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   1                               ; 0x1
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   2                               ; 0x2
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   3                               ; 0x3
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
 
 define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: uitofp_v8i8_to_v8f32:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:  Lloh10:
 ; CHECK-NEXT:    adrp x9, lCPI8_0@PAGE
-; CHECK-NEXT:  Lloh11:
-; CHECK-NEXT:    adrp x10, lCPI8_1@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh12:
+; CHECK-NEXT:  Lloh11:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI8_0@PAGEOFF]
-; CHECK-NEXT:  Lloh13:
-; CHECK-NEXT:    ldr q1, [x10, lCPI8_1@PAGEOFF]
 ; CHECK-NEXT:  LBB8_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr d2, [x0, x8, lsl #3]
+; CHECK-NEXT:    ldr d1, [x0, x8, lsl #3]
 ; CHECK-NEXT:    add x9, x1, x8, lsl #5
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
-; CHECK-NEXT:    tbl.16b v3, { v2 }, v0
-; CHECK-NEXT:    tbl.16b v2, { v2 }, v1
-; CHECK-NEXT:    ucvtf.4s v3, v3
+; CHECK-NEXT:    dup.2s v2, v1[1]
+; CHECK-NEXT:    tbl.16b v1, { v1 }, v0
+; CHECK-NEXT:    tbl.16b v2, { v2 }, v0
+; CHECK-NEXT:    ucvtf.4s v1, v1
 ; CHECK-NEXT:    ucvtf.4s v2, v2
-; CHECK-NEXT:    stp q2, q3, [x9]
+; CHECK-NEXT:    stp q1, q2, [x9]
 ; CHECK-NEXT:    b.eq LBB8_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh11, Lloh13
-; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh12
+; CHECK-NEXT:    .loh AdrpLdr Lloh10, Lloh11
 entry:
   br label %loop
 
@@ -519,118 +498,55 @@
   ret void
 }
 
-; CHECK-LABEL: lCPI9_0:
-; CHECK-NEXT:     .byte   12                              ; 0xc
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   13                              ; 0xd
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   14                              ; 0xe
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   15                              ; 0xf
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT: lCPI9_1:
-; CHECK-NEXT:     .byte   8                               ; 0x8
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   9                               ; 0x9
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   10                              ; 0xa
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   11                              ; 0xb
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT: lCPI9_2:
-; CHECK-NEXT:     .byte   4                               ; 0x4
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   5                               ; 0x5
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   6                               ; 0x6
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   7                               ; 0x7
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT: lCPI9_3:
-; CHECK-NEXT:     .byte   0                               ; 0x0
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   1                               ; 0x1
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   2                               ; 0x2
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   3                               ; 0x3
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
-; CHECK-NEXT:     .byte   255                             ; 0xff
+; CHECK-LABEL:lCPI9_0:
+; CHECK-NEXT:        .byte   0                               ; 0x0
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   1                               ; 0x1
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   2                               ; 0x2
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   3                               ; 0x3
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
+; CHECK-NEXT:        .byte   255                             ; 0xff
 
 define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
 ; CHECK-LABEL: uitofp_v16i8_to_v16f32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:  Lloh14:
+; CHECK-NEXT:  Lloh12:
 ; CHECK-NEXT:    adrp x9, lCPI9_0@PAGE
-; CHECK-NEXT:  Lloh15:
-; CHECK-NEXT:    adrp x10, lCPI9_1@PAGE
-; CHECK-NEXT:  Lloh16:
-; CHECK-NEXT:    adrp x11, lCPI9_2@PAGE
-; CHECK-NEXT:  Lloh17:
-; CHECK-NEXT:    adrp x12, lCPI9_3@PAGE
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:  Lloh18:
+; CHECK-NEXT:  Lloh13:
 ; CHECK-NEXT:    ldr q0, [x9, lCPI9_0@PAGEOFF]
-; CHECK-NEXT:  Lloh19:
-; CHECK-NEXT:    ldr q1, [x10, lCPI9_1@PAGEOFF]
-; CHECK-NEXT:  Lloh20:
-; CHECK-NEXT:    ldr q2, [x11, lCPI9_2@PAGEOFF]
-; CHECK-NEXT:  Lloh21:
-; CHECK-NEXT:    ldr q3, [x12, lCPI9_3@PAGEOFF]
 ; CHECK-NEXT:  LBB9_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr q4, [x0, x8, lsl #4]
+; CHECK-NEXT:    ldr q1, [x0, x8, lsl #4]
 ; CHECK-NEXT:    add x9, x1, x8, lsl #6
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    cmp x8, #1000
-; CHECK-NEXT:    tbl.16b v5, { v4 }, v0
-; CHECK-NEXT:    tbl.16b v6, { v4 }, v1
-; CHECK-NEXT:    tbl.16b v7, { v4 }, v2
-; CHECK-NEXT:    tbl.16b v4, { v4 }, v3
-; CHECK-NEXT:    ucvtf.4s v5, v5
-; CHECK-NEXT:    ucvtf.4s v6, v6
-; CHECK-NEXT:    ucvtf.4s v7, v7
+; CHECK-NEXT:    tbl.16b v2, { v1 }, v0
+; CHECK-NEXT:    dup.4s v3, v1[1]
+; CHECK-NEXT:    dup.2d v1, v1[1]
+; CHECK-NEXT:    tbl.16b v3, { v3 }, v0
+; CHECK-NEXT:    dup.4s v4, v1[1]
+; CHECK-NEXT:    tbl.16b v1, { v1 }, v0
+; CHECK-NEXT:    ucvtf.4s v2, v2
+; CHECK-NEXT:    tbl.16b v4, { v4 }, v0
+; CHECK-NEXT:    ucvtf.4s v3, v3
+; CHECK-NEXT:    ucvtf.4s v1, v1
 ; CHECK-NEXT:    ucvtf.4s v4, v4
-; CHECK-NEXT:    stp q6, q5, [x9, #32]
-; CHECK-NEXT:    stp q4, q7, [x9]
+; CHECK-NEXT:    stp q2, q3, [x9]
+; CHECK-NEXT:    stp q1, q4, [x9, #32]
 ; CHECK-NEXT:    b.eq LBB9_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:    .loh AdrpLdr Lloh17, Lloh21
-; CHECK-NEXT:    .loh AdrpLdr Lloh16, Lloh20
-; CHECK-NEXT:    .loh AdrpLdr Lloh15, Lloh19
-; CHECK-NEXT:    .loh AdrpLdr Lloh14, Lloh18
+; CHECK-NEXT:    .loh AdrpLdr Lloh12, Lloh13
 entry:
   br label %loop
 
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22947,8 +22947,8 @@
 static SDValue combineShuffleToZeroExtendVectorInReg(ShuffleVectorSDNode *SVN,
                                                      SelectionDAG &DAG,
                                                      const TargetLowering &TLI,
+                                                     bool LegalTypes,
                                                      bool LegalOperations) {
-  bool LegalTypes = true;
   EVT VT = SVN->getValueType(0);
   assert(!VT.isScalableVector() && "Encountered scalable shuffle?");
   unsigned NumElts = VT.getVectorNumElements();
@@ -24042,8 +24042,8 @@
   // Perform this really late, because it could eliminate knowledge
   // of undef elements created by this shuffle.
   if (Level < AfterLegalizeTypes)
-    if (SDValue V = combineShuffleToZeroExtendVectorInReg(SVN, DAG, TLI,
-                                                          LegalOperations))
+    if (SDValue V = combineShuffleToZeroExtendVectorInReg(
+            SVN, DAG, TLI, LegalTypes, LegalOperations))
       return V;
 
   return SDValue();
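
The only functional change in the DAGCombiner.cpp hunk above is that combineShuffleToZeroExtendVectorInReg() stops hard-coding LegalTypes = true and instead receives the combiner's own flag from its caller, so the combine can also consider illegal types before type legalization. Below is a minimal, self-contained C++ sketch of that parameter-threading pattern only; the names Combiner, tryCombine, and combineImpl are illustrative stand-ins, not LLVM APIs.

#include <iostream>
#include <optional>

namespace sketch {

// Hypothetical stand-in for the helper: it used to behave as if types were
// already legal; now the caller's LegalTypes flag is threaded through instead.
std::optional<int> combineImpl(int Value, bool LegalTypes, bool LegalOperations) {
  // Pre-legalization (both flags false), the combine is allowed to fire even
  // though the "type" is not yet legal.
  if (!LegalTypes && !LegalOperations)
    return Value * 2; // pretend we formed a wider extend_vector_inreg-style node
  return std::nullopt;
}

struct Combiner {
  bool LegalTypes = false;      // set once type legalization has run
  bool LegalOperations = false; // set once operation legalization has run

  std::optional<int> tryCombine(int Value) const {
    // The caller forwards its own flags rather than letting the helper guess.
    return combineImpl(Value, LegalTypes, LegalOperations);
  }
};

} // namespace sketch

int main() {
  sketch::Combiner PreLegalize; // both flags still false
  std::cout << PreLegalize.tryCombine(21).value_or(-1) << '\n'; // prints 42
}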

