[prev in list] [next in list] [prev in thread] [next in thread]
List: gcc-bugs
Subject: Slow code generated by gcc-3.0 on i686-pc-linux-gnu
From: Matteo Frigo <athena () fftw ! org>
Date: 2001-06-29 20:56:31
[Download RAW message or body]
Dear gcc developers,
gcc-3.0 generates code which is twice as slow as the code generated by
gcc-2.95 for a simple matrix-transposition loop. This behavior
occurs with gcc-3.0 for i686-pc-linux-gnu. A complete program
that demonstrates the problem is the following:
typedef float fftw_real;

/*
 * Transpose, in place, an n x n matrix whose entries are held in two
 * parallel arrays: rA and iA (real and imaginary parts, going by the
 * names). Entry (i, j) sits at offset i * is + j * js in both arrays.
 *
 * Every entry strictly below the diagonal is exchanged with its mirror
 * entry above the diagonal; diagonal entries are left untouched.
 * Nothing is modified when n <= 1.
 */
void complex_transpose(fftw_real *rA, fftw_real *iA, int n, int is, int js)
{
    int row;

    for (row = 1; row < n; ++row) {
        int col = 0;

        while (col < row) {
            /* Offsets of entry (row, col) and of its mirror (col, row). */
            const int lower = row * is + col * js;
            const int upper = col * is + row * js;

            /* Read all four values before any store is issued. */
            const fftw_real lower_re = rA[lower];
            const fftw_real lower_im = iA[lower];
            const fftw_real upper_re = rA[upper];
            const fftw_real upper_im = iA[upper];

            rA[upper] = lower_re;
            iA[upper] = lower_im;
            rA[lower] = upper_re;
            iA[lower] = upper_im;

            ++col;
        }
    }
}
/*
 * Zero-initialised work buffer for the timing loop in main(). The call
 * made there passes A and A + 1 with n = 32, is = 2, js = 64, which
 * touches elements up to A[2045], so 2048 entries are sufficient.
 */
fftw_real A[2048];

/*
 * Timing driver: transposes the same 32 x 32 matrix held in A 100000
 * times so that the run time is dominated by complex_transpose().
 *
 * The command-line arguments are not used. Returns 0.
 */
int main(int argc, char **argv)
{
    int i;

    /* Arguments are intentionally ignored. */
    (void)argc;
    (void)argv;

    for (i = 0; i < 100000; ++i) {
        complex_transpose(A, A + 1, 32, 2, 64);
    }

    /* Explicit status: falling off the end of main() is only defined
       to return 0 from C99 onwards. */
    return 0;
}
On a 600 MHz Pentium III, the program runs in 0.9 seconds when compiled
with gcc-2.95 -O2, and in 2.0 seconds when compiled with gcc-3.0 -O2. I am
inclined to attribute the problem to the compiler for the following
reason.
gcc-2.95 -O2 -S generates the following code for the transpose function.
The code is perfectly reasonable.
# NOTE: lines starting with '#' are reader annotations added to the
# listing; they are not part of the compiler's output.
#
# gcc-2.95 -O2 code for complex_transpose(rA, iA, n, is, js).
# Stack arguments (i386 cdecl layout): 8(%ebp)=rA, 12(%ebp)=iA,
# 16(%ebp)=n, 20(%ebp)=is, 24(%ebp)=js.
# Register use: %esi=rA, %ebx=iA, %ecx=i (also the inner trip count),
# %eax=i*is+j*js, %edx=i*js+j*is, %edi=i+1.
complex_transpose:
# Prologue: set up the frame and save the callee-saved registers.
pushl %ebp
movl %esp,%ebp
pushl %edi
pushl %esi
pushl %ebx
# Both array base pointers are loaded into registers once, here.
movl 8(%ebp),%esi
movl 12(%ebp),%ebx
# i = 1; skip everything when i >= n.
movl $1,%ecx
cmpl 16(%ebp),%ecx
jge .L4
.p2align 4,,7
# Outer loop head.
.L6:
# Remember i+1 in %edi, because %ecx is counted down to zero below.
leal 1(%ecx),%edi
# Skip the inner loop when i <= 0.
testl %ecx,%ecx
jle .L5
# Initial offsets for j = 0: %edx = i*js, %eax = i*is.
movl %ecx,%edx
imull 24(%ebp),%edx
movl %ecx,%eax
imull 20(%ebp),%eax
.p2align 4,,7
# Inner loop: executes i times.
.L10:
# Push rA[%eax], iA[%eax], rA[%edx], iA[%edx] onto the x87 stack.
flds (%esi,%eax,4)
flds (%ebx,%eax,4)
flds (%esi,%edx,4)
flds (%ebx,%edx,4)
# Bring the old rA[%eax] to the top and store it to rA[%edx].
fxch %st(3)
fstps (%esi,%edx,4)
# Bring the old iA[%eax] to the top and store it to iA[%edx].
fxch %st(1)
fstps (%ebx,%edx,4)
# %edx += is
addl 20(%ebp),%edx
# Store the old rA[%edx] and iA[%edx] values to rA[%eax] and iA[%eax].
fstps (%esi,%eax,4)
fstps (%ebx,%eax,4)
# %eax += js
addl 24(%ebp),%eax
# Count down the remaining inner iterations.
decl %ecx
jnz .L10
.L5:
# i = i+1 (saved in %edi); repeat the outer loop while i < n.
movl %edi,%ecx
cmpl 16(%ebp),%ecx
jl .L6
.L4:
# Epilogue: restore the saved registers and return.
popl %ebx
popl %esi
popl %edi
leave
ret
However, gcc-3.0 -O2 -S generates code which is much more complicated, and
which performs more memory traffic than necessary:
# NOTE: lines starting with '#' are reader annotations added to the
# listing; they are not part of the compiler's output.
#
# gcc-3.0 -O2 code for complex_transpose(rA, iA, n, is, js).
# Stack arguments (i386 cdecl layout): 8(%ebp)=rA, 12(%ebp)=iA,
# 16(%ebp)=n, 20(%ebp)=is, 24(%ebp)=js.
# Stack locals: -16(%ebp)=i, -20(%ebp)=i*js, -24(%ebp)=i*is.
# Inner-loop registers: %esi=i*is+j*js, %edi=i*js+j*is, %eax=trip count.
complex_transpose:
# Prologue: set up the frame and save the callee-saved registers.
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
# Reserve 12 bytes for the three stack locals listed above.
subl $12, %esp
# i = 1 (kept in memory); skip everything when i >= n.
movl 16(%ebp), %eax
movl $1, -16(%ebp)
cmpl %eax, -16(%ebp)
jge .L12
# With i = 1: -20(%ebp) = js and -24(%ebp) = is.
movl 24(%ebp), %edx
movl 20(%ebp), %ecx
movl %edx, -20(%ebp)
movl %ecx, -24(%ebp)
.p2align 4
# Outer loop head.
.L5:
# Skip the inner loop when i <= 0.
movl -16(%ebp), %ebx
testl %ebx, %ebx
jle .L13
# Initial offsets for j = 0 and the trip count (i) are read from the stack.
movl -20(%ebp), %edi
movl -24(%ebp), %esi
movl -16(%ebp), %eax
.p2align 4
# Inner loop: executes i times. rA and iA are re-read from the stack
# three times each, and is and js once each, on every iteration.
.L9:
movl 8(%ebp), %ebx
movl 12(%ebp), %edx
# rA[%edi] is copied through the integer register %ecx.
movl (%ebx,%edi,4), %ecx
# rA[%esi] goes onto the x87 stack.
flds (%ebx,%esi,4)
# iA[%edi] is copied through %ebx (overwriting the rA pointer held there).
movl (%edx,%edi,4), %ebx
# iA[%esi] goes onto the x87 stack.
flds (%edx,%esi,4)
# Bring the old rA[%esi] to the top and store it to rA[%edi].
fxch %st(1)
movl 8(%ebp), %edx
fstps (%edx,%edi,4)
# Store the old iA[%esi] to iA[%edi].
movl 12(%ebp), %edx
fstps (%edx,%edi,4)
# rA[%esi] = old rA[%edi] (held in %ecx).
movl 8(%ebp), %edx
movl %ecx, (%edx,%esi,4)
# iA[%esi] = old iA[%edi] (held in %ebx).
movl 12(%ebp), %ecx
movl %ebx, (%ecx,%esi,4)
# %edi += is; %esi += js.
movl 20(%ebp), %ecx
movl 24(%ebp), %edx
addl %ecx, %edi
addl %edx, %esi
# Count down the remaining inner iterations.
decl %eax
jne .L9
.L13:
# ++i, then advance the stored products: i*js += js, i*is += is.
incl -16(%ebp)
movl 24(%ebp), %ebx
movl 20(%ebp), %eax
movl 16(%ebp), %edx
addl %ebx, -20(%ebp)
addl %eax, -24(%ebp)
# Repeat the outer loop while i < n.
cmpl %edx, -16(%ebp)
jl .L5
.L12:
# Epilogue: release the locals, restore the saved registers and return.
addl $12, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
Regards,
Matteo Frigo
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic