'[Bug tree-optimization/83232] New: fma3d spec2000 regression on zen with -Ofast (generic tuning) aft'

[prev in list] [next in list] [prev in thread] [next in thread] 

List:       gcc-bugs
Subject:    [Bug tree-optimization/83232] New: fma3d spec2000 regression on zen with -Ofast (generic tuning) aft
From:       "hubicka at gcc dot gnu.org" <gcc-bugzilla () gcc ! gnu ! org>
Date:       2017-11-30 18:10:17
Message-ID: bug-83232-4 () http ! gcc ! gnu ! org/bugzilla/
[Download RAW message or body]

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83232

            Bug ID: 83232
           Summary: fma3d spec2000 regression on zen with -Ofast (generic
                    tuning) after r255268 by missed SLP opportunity
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

r255268 introduced a regression on fma3d. The change pays off on specfp2000
in general, but the regression should be tracked (and hopefully fixed).

Compiling with -fdisable-tree-cunroll makes the problem go away. The problem is
that the SLP vectorizer is confused by the presence of the unrolled vectorized
loop body and gives up on vectorizing the initialization sequence, which leads
to a memory mismatch stall.

The problem is in material_11.f90, where the loop of material_41_integration is
vectorized in both cases. The new code omits vectorization for alignment, but it
also introduces partial stores followed by a full-sized load:

  _13 = *stress_203(D)[0];
  sinc[0] = _13;
  _336 = *stress_203(D)[1];
  sinc[1] = _336;
  _343 = *stress_203(D)[2];
  sinc[2] = _343;
  _350 = *stress_203(D)[3];
  sinc[3] = _350;
  _22 = *dtnext_206(D);
  _23 = *dxx_207(D);
  _24 = _22 * _23;
  einc[0] = _24;
  _25 = *dyy_209(D);
  _26 = _22 * _25;
  einc[1] = _26;
  _27 = *dzz_211(D);
  _28 = _22 * _27;
  einc[2] = _28;
  _29 = *dxy_213(D);
  _30 = _22 * _29;
  einc[3] = _30;
  _31 = *dxz_215(D);
  _32 = _22 * _31;
  einc[4] = _32;
  _33 = *dyz_217(D);
  _34 = _22 * _33;
  einc[5] = _34;
  vect_cst__471 = {p3_194, p3_194};
  vect__37.58_227 = MEM[(real(kind=8) *)stress_203(D)];
  vect__38.61_323 = MEM[(real(kind=8) *)&einc];
  vect__39.62_444 = vect__38.61_323 * vect_cst__471;
  vect__40.63_443 = vect__37.58_227 + vect__39.62_444;
  MEM[(real(kind=8) *)stress_203(D)] = vect__40.63_443;
  vect__37.58_432 = MEM[(real(kind=8) *)stress_203(D) + 16B];
  vect__38.61_431 = MEM[(real(kind=8) *)&einc + 16B];
  vect__39.62_430 = vect__38.61_431 * vect_cst__471;
  vect__40.63_429 = vect__39.62_430 + vect__37.58_432;
  MEM[(real(kind=8) *)stress_203(D) + 16B] = vect__40.63_429;
  vect__37.58_475 = MEM[(real(kind=8) *)stress_203(D) + 32B];
  vect__38.61_472 = MEM[(real(kind=8) *)&einc + 32B];
  vect__39.62_463 = vect_cst__471 * vect__38.61_472;
  vect__40.63_462 = vect__39.62_463 + vect__37.58_475;
  MEM[(real(kind=8) *)stress_203(D) + 32B] = vect__40.63_462;
  _41 = *stress_203(D)[0];
  _342 = _25 + _27;
  _44 = _23 + _342;
  _8 = _22 * _44;
  _45 = _6 * _8;
  _46 = _41 + _45;
  *stress_203(D)[0] = _46;
  _47 = *stress_203(D)[1];
  _48 = _45 + _47;
  *stress_203(D)[1] = _48;
  *stress_203(D)[2] = 0.0;
  einc[4] = 0.0;
  einc[5] = 0.0;
  if (ak_202 == 0.0)
    goto <bb 3>; [50.00%]
  else
    goto <bb 4>; [50.00%]


while the old code did:

  vect__13.78_205 = MEM[(real(kind=8) *)stress_203(D)];
  vect__13.79_445 = MEM[(real(kind=8) *)stress_203(D) + 16B];
  MEM[(real(kind=8) *)&sinc] = vect__13.78_205;
  MEM[(real(kind=8) *)&sinc + 16B] = vect__13.79_445;
  _22 = *dtnext_206(D);
  _23 = *dxx_207(D);
  _24 = _22 * _23;
  _25 = *dyy_209(D);
  _26 = _22 * _25;
  _27 = *dzz_211(D);
  _28 = _22 * _27;
  _29 = *dxy_213(D);
  _30 = _22 * _29;
  _31 = *dxz_215(D);
  _32 = _22 * _31;
  _33 = *dyz_217(D);
  _34 = _22 * _33;
  vect_cst__479 = {_32, _34};
  vect_cst__482 = {_28, _30};
  vect_cst__486 = {_24, _26};
  MEM[(real(kind=8) *)&einc] = vect_cst__486;
  MEM[(real(kind=8) *)&einc + 16B] = vect_cst__482;
  MEM[(real(kind=8) *)&einc + 32B] = vect_cst__479;
  _441 = (unsigned long) stress_203(D);
  _440 = _441 >> 3;
  _439 = -_440;
  _438 = (unsigned int) _439;
  prolog_loop_niters.57_442 = _438 & 1;
  if (prolog_loop_niters.57_442 == 0)
    goto <bb 4>; [33.33%]
  else
    goto <bb 3>; [66.67%]

;;    succ:       3 [66.7% (guessed)]  count:102291806 (estimated locally)
(FALSE_VALUE,EXECUTABLE)
;;                4 [33.3% (guessed)]  count:51145904 (estimated locally)
(TRUE_VALUE,EXECUTABLE)

;;   basic block 3, loop depth 0, count 102291806 (estimated locally), maybe
hot
;;    prev block 2, next block 4, flags: (NEW, REACHABLE, VISITED)
;;    pred:       2 [66.7% (guessed)]  count:102291806 (estimated locally)
(FALSE_VALUE,EXECUTABLE)
  _471 = MEM[(real(kind=8)[6] *)stress_203(D)];
  _463 = einc[0];
  _462 = p3_194 * _463;
  _461 = _462 + _471;
  MEM[(real(kind=8)[6] *)stress_203(D)] = _461;
;;    succ:       4 [always]  count:102291806 (estimated locally)
(FALLTHRU,EXECUTABLE)

;;   basic block 4, loop depth 0, count 153437710 (estimated locally), maybe
hot
;;    prev block 3, next block 5, flags: (NEW, REACHABLE, VISITED)
;;    pred:       3 [always]  count:102291806 (estimated locally)
(FALLTHRU,EXECUTABLE)
;;                2 [33.3% (guessed)]  count:51145904 (estimated locally)
(TRUE_VALUE,EXECUTABLE)
  # i_435 = PHI <2(3), 1(2)>
  prolog_loop_adjusted_niters.58_432 = (sizetype) prolog_loop_niters.57_442;
  niters.59_431 = 6 - prolog_loop_niters.57_442;
  bnd.60_417 = niters.59_431 >> 1;
  _410 = prolog_loop_adjusted_niters.58_432 * 8;
  vectp_stress.65_411 = stress_203(D) + _410;
  vectp_einc.68_406 = &einc + _410;
  vect_cst__401 = {p3_194, p3_194};
  vect__37.66_250 = MEM[(real(kind=8) *)vectp_stress.65_411];
  vect__38.69_477 = MEM[(real(kind=8) *)vectp_einc.68_406];
  vect__39.70_219 = vect_cst__401 * vect__38.69_477;
  vect__40.71_227 = vect__39.70_219 + vect__37.66_250;
  MEM[(real(kind=8) *)vectp_stress.65_411] = vect__40.71_227;
  vect__37.66_382 = MEM[(real(kind=8) *)vectp_stress.65_411 + 16B];
  vect__38.69_381 = MEM[(real(kind=8) *)vectp_einc.68_406 + 16B];
  vect__39.70_380 = vect__38.69_381 * vect_cst__401;
  vect__40.71_379 = vect__39.70_380 + vect__37.66_382;
  MEM[(real(kind=8) *)vectp_stress.65_411 + 16B] = vect__40.71_379;
  if (bnd.60_417 == 2)
    goto <bb 6>; [33.33%]
  else
    goto <bb 5>; [66.67%]

;;    succ:       6 [33.3% (adjusted)]  count:51145904 (estimated locally)
(TRUE_VALUE,EXECUTABLE)
;;                5 [66.7% (adjusted)]  count:102291806 (estimated locally)
(FALSE_VALUE,EXECUTABLE)

;;   basic block 5, loop depth 0, count 68194539 (estimated locally), maybe hot
;;   Invalid sum of incoming counts 102291806 (estimated locally), should be
68194539 (estimated locally)
;;    prev block 4, next block 6, flags: (NEW, REACHABLE, VISITED)
;;    pred:       4 [66.7% (adjusted)]  count:102291806 (estimated locally)
(FALSE_VALUE,EXECUTABLE)
  vect__37.66_407 = MEM[(real(kind=8) *)vectp_stress.65_411 + 32B];
  vect__38.69_402 = MEM[(real(kind=8) *)vectp_einc.68_406 + 32B];
  vect__39.70_400 = vect_cst__401 * vect__38.69_402;
  vect__40.71_399 = vect__39.70_400 + vect__37.66_407;
  MEM[(real(kind=8) *)vectp_stress.65_411 + 32B] = vect__40.71_399;
;;    succ:       6 [always (adjusted)]  count:68194539 (estimated locally)
(FALLTHRU,EXECUTABLE)

;;   basic block 6, loop depth 0, count 153437710 (estimated locally), maybe
hot
;;   Invalid sum of incoming counts 119340443 (estimated locally), should be
153437710 (estimated locally)
;;    prev block 5, next block 7, flags: (NEW, VISITED)
;;    pred:       5 [always (adjusted)]  count:68194539 (estimated locally)
(FALLTHRU,EXECUTABLE)
;;                4 [33.3% (adjusted)]  count:51145904 (estimated locally)
(TRUE_VALUE,EXECUTABLE)
  niters_vector_mult_vf.61_416 = niters.59_431 & 4294967294;
  _414 = (integer(kind=4)) niters_vector_mult_vf.61_416;
  tmp.62_415 = _414 + i_435;
  if (niters_vector_mult_vf.61_416 == niters.59_431)
    goto <bb 8>; [33.33%]
  else
    goto <bb 7>; [66.67%]
;;    succ:       7 [66.7% (guessed)]  count:102291806 (estimated locally)
(FALSE_VALUE,EXECUTABLE)
;;                8 [33.3% (guessed)]  count:51145904 (estimated locally)
(TRUE_VALUE,EXECUTABLE)

;;   basic block 7, loop depth 0, count 613536076 (estimated locally), maybe
hot
;;   Invalid sum of incoming counts 102291806 (estimated locally), should be
613536076 (estimated locally)
;;    prev block 6, next block 8, flags: (NEW, REACHABLE, VISITED)
;;    pred:       6 [66.7% (guessed)]  count:102291806 (estimated locally)
(FALSE_VALUE,EXECUTABLE)
...

I would say the new code is nicer and should run faster if there were no partial
stores.

The SLP vectorizer behaves differently when analyzing einc:

Relevant difference in SLP vectorizer (old->new) is:
        offset alignment: 128
        step alignment: 128
        base_object: einc[5]
+Creating dr for MEM[(real(kind=8) *)stress_203(D)]
+analyze_innermost: success.
+       base_address: stress_203(D)
+       offset from base address: 0
+       constant offset from base address: 0
+       step: 0
+       base alignment: 8
+       base misalignment: 0
+       offset alignment: 128
+       step alignment: 128
+       base_object: MEM[(real(kind=8) *)stress_203(D)]
+Creating dr for MEM[(real(kind=8) *)&einc]
+analyze_innermost: success.
+       base_address: &einc
+       offset from base address: 0
+       constant offset from base address: 0
+       step: 0
+       base alignment: 16
+       base misalignment: 0
+       offset alignment: 128
+       step alignment: 128
+       base_object: MEM[(real(kind=8) *)&einc]

....

+Creating dr for MEM[(real(kind=8) *)vectp_einc.59_440]
+analyze_innermost: success.
+       base_address: &einc
+       offset from base address: 0
+       constant offset from base address: 16
+       step: 0
+       base alignment: 16
+       base misalignment: 0
+       offset alignment: 128
+       step alignment: 128
+       base_object: MEM[(real(kind=8) *)vectp_einc.59_440]
+Creating dr for MEM[(real(kind=8) *)vectp_stress.64_439]
+analyze_innermost: success.
+       base_address: stress_203(D)
+       offset from base address: 0
+       constant offset from base address: 16
+       step: 0
+       base alignment: 8
+       base misalignment: 0
+       offset alignment: 128
+       step alignment: 128
+       base_object: MEM[(real(kind=8) *)vectp_stress.64_439]

....

+Creating dr for MEM[(real(kind=8) *)vectp_einc.59_426]
+analyze_innermost: success.
+       base_address: &einc
+       offset from base address: 0
+       constant offset from base address: 32
+       step: 0
+       base alignment: 16
+       base misalignment: 0
+       offset alignment: 128
+       step alignment: 128
+       base_object: MEM[(real(kind=8) *)vectp_einc.59_426]

....

+Creating dr for einc[4]
+analyze_innermost: success.
+       base_address: &einc
+       offset from base address: 0
+       constant offset from base address: 32
+       step: 0
+       base alignment: 16
+       base misalignment: 0
+       offset alignment: 128
+       step alignment: 128
+       base_object: einc[4]
+Creating dr for einc[5]
+analyze_innermost: success.
+       base_address: &einc
+       offset from base address: 0
+       constant offset from base address: 40
+       step: 0
+       base alignment: 16
+       base misalignment: 0
+       offset alignment: 128
+       step alignment: 128
+       base_object: einc[5]


I think those are from the unrolled vectorized loop body.

+: note: not vectorized: no vectype for stmt: vect__37.58_227 =
MEM[(real(kind=8) *)stress_203(D)];
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: vect__38.61_323 =
MEM[(real(kind=8) *)&einc];
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: MEM[(real(kind=8)
*)stress_203(D)] = vect__40.63_443;
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: vect__37.58_432 =
MEM[(real(kind=8) *)vectp_stress.56_441];
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: vect__38.61_431 =
MEM[(real(kind=8) *)vectp_einc.59_440];
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: MEM[(real(kind=8)
*)vectp_stress.64_439] = vect__40.63_429;
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: vect__37.58_475 =
MEM[(real(kind=8) *)vectp_stress.56_427];
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: vect__38.61_472 =
MEM[(real(kind=8) *)vectp_einc.59_426];
+ scalar_type: vector(2) real(kind=8)
+: note: not vectorized: no vectype for stmt: MEM[(real(kind=8)
*)vectp_stress.64_425] = vect__40.63_462;
+ scalar_type: vector(2) real(kind=8)
+: note: got vectype for stmt: _41 = *stress_203(D)[0];
+vector(2) real(kind=8)
+: note: got vectype for stmt: *stress_203(D)[0] = _46;
+vector(2) real(kind=8)
+: note: got vectype for stmt: _47 = *stress_203(D)[1];
+vector(2) real(kind=8)
+: note: got vectype for stmt: *stress_203(D)[1] = _48;
+vector(2) real(kind=8)
+: note: got vectype for stmt: *stress_203(D)[2] = 0.0;
+vector(2) real(kind=8)
+: note: got vectype for stmt: einc[4] = 0.0;
+vector(2) real(kind=8)
+: note: got vectype for stmt: einc[5] = 0.0;
+vector(2) real(kind=8)
 : note: === vect_analyze_data_ref_accesses ===
 : note: Detected interleaving store einc[0] and einc[1]
 : note: Detected interleaving store einc[0] and einc[2]
 : note: Detected interleaving store einc[0] and einc[3]
 : note: Detected interleaving store einc[0] and einc[4]
-: note: Detected interleaving store einc[0] and einc[5]
+: note: Detected interleaving store einc[4] and einc[5]
 : note: Detected interleaving store sinc[0] and sinc[1]
 : note: Detected interleaving store sinc[0] and sinc[2]
 : note: Detected interleaving store sinc[0] and sinc[3]
@@ -475,9 +683,14 @@
 : note: Detected interleaving load MEM[(struct material_type[0:]
*)_1][_5].pval[6] and MEM[(struct material_type[0:] *)_1][_5].pval[15]
 : note: Detected interleaving load MEM[(struct material_type[0:]
*)_1][_5].pval[6] and MEM[(struct material_type[0:] *)_1][_5].pval[16]
 : note: Detected interleaving load *stress_203(D)[0] and *stress_203(D)[1]
+: note: Detected interleaving load *stress_203(D)[0] and *stress_203(D)[1]
 : note: Detected interleaving load *stress_203(D)[0] and *stress_203(D)[2]
 : note: Detected interleaving load *stress_203(D)[0] and *stress_203(D)[3]
-: note: Detected interleaving store of size 6 starting with einc[0] = _24;
+: note: Detected interleaving store *stress_203(D)[0] and *stress_203(D)[1]
+: note: Detected interleaving store *stress_203(D)[0] and *stress_203(D)[2]
+: note: Detected interleaving store of size 5 starting with einc[0] = _24;
+: note: Detected interleaving store of size 2 starting with einc[4] = 0.0;
+: note: not consecutive access einc[5] = 0.0;
 : note: not consecutive access _1 = material.data;
 : note: not consecutive access _2 = material.offset;
 : note: not consecutive access p1 = _6;
@@ -486,7 +699,10 @@
 : note: Detected interleaving load of size 11 starting with qr_195 =
MEM[(struct material_type[0:] *)_1][_5].pval[6];
 : note: not consecutive access _3 = *matid_191(D);
 : note: not consecutive access _16 = *efps_201(D);
-: note: Detected interleaving load of size 4 starting with _13 =
*stress_203(D)[0];
+: note: not consecutive access _13 = *stress_203(D)[0];
+: note: Two or more load stmts share the same dr.
+: note: Detected interleaving load of size 4 starting with _41 =
*stress_203(D)[0];
+: note: Detected interleaving store of size 3 starting with *stress_203(D)[0]
= _46;
 : note: not consecutive access _22 = *dtnext_206(D);
 : note: not consecutive access _23 = *dxx_207(D);
 : note: not consecutive access _25 = *dyy_209(D);
 : note: vect_is_simple_use: operand _22
 : note: def_stmt: _22 = *dtnext_206(D);
 : note: type of def: internal
-: note: vect_is_simple_use: operand _441
-: note: def_stmt: _441 = (unsigned long) stress_203(D);
+: note: vect_is_simple_use: operand vect__38.61_323
+: note: def_stmt: vect__38.61_323 = MEM[(real(kind=8) *)&einc];
+: note: type of def: internal
+: note: vect_is_simple_use: operand vect__38.61_431
+: note: def_stmt: vect__38.61_431 = MEM[(real(kind=8) *)vectp_einc.59_440];
 : note: type of def: internal
-: note: vect_is_simple_use: operand _438
-: note: def_stmt: _438 = (unsigned int) _439;
+: note: vect_is_simple_use: operand vect__38.61_472
+: note: def_stmt: vect__38.61_472 = MEM[(real(kind=8) *)vectp_einc.59_426];
 : note: type of def: internal
-: note: vect_is_simple_use: operand _439
-: note: def_stmt: _439 = -_440;
+: note: vect_is_simple_use: operand _22
+: note: def_stmt: _22 = *dtnext_206(D);
+: note: type of def: internal
+: note: vect_is_simple_use: operand _6
+: note: def_stmt: _6 = MEM[(struct material_type[0:] *)_1][_5].pval[7];
 : note: type of def: internal
 : note: === vect_analyze_slp ===
 : note: Build SLP for einc[0] = _24;
@@ -540,7 +762,6 @@
 : note: Build SLP for einc[2] = _28;
 : note: Build SLP for einc[3] = _30;
 : note: Build SLP for einc[4] = _32;
-: note: Build SLP for einc[5] = _34;
 : note: vect_is_simple_use: operand _24
 : note: def_stmt: _24 = _22 * _23;
 : note: type of def: internal
 : note: vect_is_simple_use: operand _32
 : note: def_stmt: _32 = _22 * _31;
 : note: type of def: internal
-: note: vect_is_simple_use: operand _34
-: note: def_stmt: _34 = _22 * _33;
-: note: type of def: internal
 : note: Build SLP for _24 = _22 * _23;
 : note: Build SLP for _26 = _22 * _25;
 : note: Build SLP for _28 = _22 * _27;
 : note: Build SLP for _30 = _22 * _29;
 : note: Build SLP for _32 = _22 * _31;
-: note: Build SLP for _34 = _22 * _33;
 : note: vect_is_simple_use: operand _22
 : note: def_stmt: _22 = *dtnext_206(D);
 : note: type of def: internal
@@ -595,12 +812,6 @@
 : note: vect_is_simple_use: operand _31
 : note: def_stmt: _31 = *dxz_215(D);
 : note: type of def: internal
-: note: vect_is_simple_use: operand _22
-: note: def_stmt: _22 = *dtnext_206(D);
-: note: type of def: internal
-: note: vect_is_simple_use: operand _33
-: note: def_stmt: _33 = *dyz_217(D);
-: note: type of def: internal
 : note: Build SLP for _22 = *dtnext_206(D);
 : note: Build SLP failed: unvectorizable statement _22 = *dtnext_206(D);
 : note: Building vector operands from scalars
@@ -608,21 +819,10 @@
 : note: Build SLP failed: unvectorizable statement _23 = *dxx_207(D);
 : note: Building vector operands from scalars
 : note: Building parent vector operands from scalars instead
-: note: Final SLP tree for instance:
-: note: node
-: note:        stmt 0 einc[0] = _24;
-: note:        stmt 1 einc[1] = _26;
-: note:        stmt 2 einc[2] = _28;
-: note:        stmt 3 einc[3] = _30;
-: note:        stmt 4 einc[4] = _32;
-: note:        stmt 5 einc[5] = _34;
[prev in list] [next in list] [prev in thread] [next in thread]
Configure | About | News | Add a list | Sponsored by KoreLogic