Transcript PPT

CS 3214
Computer Systems
Lecture 6
Godmar Back
Announcements
• Exercise 2 due Sep 8
– Grades of exercise 1 posted later today
• Exercise 3 coming out today
– Recommend you do this concurrently with
project 2, not after
• Project 2 due Sep 21
• Now’s the time to reshuffle your team
should you need to
CS 3214 Fall 2011
Inlined Assembly
• asm(“…” : <output> : <input> : <clobber>)
• A means to inject assembly into C code and link it
with the rest of the program in a controlled manner
• Compiler doesn’t “know” what the instructions do
– thus you must describe:
– a) the state the compiler must set up on entry: which
values must be in which registers, etc.
– b) the state produced by the inline instructions: which
registers contain which values, etc. – also: any
registers that may be clobbered (see the sketch after this slide)
CS 3214 Fall 2011
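The example on the next slide uses only input and output constraints. As a minimal hedged sketch (not from the lecture), here is the clobber list in use: the asm body scratches %ecx, so %ecx is declared clobbered instead of being exposed as an operand. The helper name add_with_clobber is invented for this illustration.

#include <stdint.h>

static inline int32_t add_with_clobber(int32_t a, int32_t b)
{
    int32_t sum;
    asm("movl %1, %%ecx\n\t"      /* copy a into %ecx                    */
        "addl %2, %%ecx\n\t"      /* %ecx += b                           */
        "movl %%ecx, %0"          /* move the sum into the output        */
        : "=r" (sum)              /* output: any register                */
        : "r" (a), "r" (b)        /* inputs: any registers               */
        : "ecx");                 /* clobber: tells gcc %ecx is trashed  */
    return sum;
}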
Inlined Assembly Example
#include <stdint.h>    // for uint32_t, uint64_t
#include <stdbool.h>   // for bool

bool imul32x32_64(uint32_t leftop,
                  uint32_t rightop,
                  uint64_t *presult)
{
    uint64_t result;
    bool overflow;
    asm("imull %2" "\n\t"
        "seto %%bl" "\n\t"
        : "=A" (result), "=b" (overflow)   // output constraints
        : "r" (leftop), "a" (rightop)      // input constraints
        );
    *presult = result;
    return overflow;
}
CS 3214 Fall 2011
Goal: exploit imull’s property to compute a 32x32→64
bit product: imull %ecx means
(%edx, %eax) := %ecx * %eax

Magic constraints:
“r” (leftop) – pick any 32-bit register and put leftop in it
“a” (rightop) – make sure %eax contains rightop
“%2” – substitute whichever register was picked for ‘leftop’
“=A” – the 64-bit result is in (%edx, %eax)
“=b” – the overflow flag (set by seto %%bl) is in %ebx
Inlined Assembly (2)
bool imul32x32_64(uint32_t leftop,
                  uint32_t rightop,
                  uint64_t *presult)
{
    uint64_t result;
    bool overflow;
    asm("imull %2" "\n\t"
        "seto %%bl" "\n\t"
        : "=A" (result), "=b" (overflow)   // output constraints
        : "r" (leftop), "a" (rightop)      // input constraints
        );
    *presult = result;
    return overflow;
}
CS 3214 Fall 2011
imul32x32_64:
pushl %ebp
movl %esp, %ebp
subl $12, %esp
movl %ebx, (%esp)
movl %esi, 4(%esp)
movl %edi, 8(%esp)
movl 8(%ebp), %ecx
movl 12(%ebp), %eax
#APP
imull %ecx
seto %bl
#NO_APP
movl %eax, %esi
movl 16(%ebp), %eax
movl %esi, (%eax)
movl %edx, 4(%eax)
movzbl %bl, %eax
movl (%esp), %ebx
movl 4(%esp), %esi
movl 8(%esp), %edi
movl %ebp, %esp
popl %ebp
ret
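A hedged usage sketch (not from the slides), assuming the imul32x32_64 definition above is in the same file and a 32-bit x86 build (the asm hard-codes %edx:%eax and %ebx):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* imul32x32_64 as defined on the slide is assumed to be defined above. */

int main(void)
{
    uint64_t product;
    bool ovf = imul32x32_64(100000u, 100000u, &product);

    /* imull is a signed multiply: the full 64-bit product lands in
       %edx:%eax, and seto records whether it overflowed 32 signed bits. */
    printf("product = %llu, overflowed 32 bits = %d\n",
           (unsigned long long)product, (int)ovf);   /* 10000000000, 1 */
    return 0;
}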
Floating Point on IA32
• History:
– First implemented in 8087 coprocessor
– “stack based” – FPU has 8 registers that form a stack
%st(0), %st(1), …
– Known as ‘x87’ floating point
• Weirdness: internal accuracy is 80 bits (rather than
IEEE 754’s 64 bits) – thus storing to memory involves rounding
– Results depend on how often values are moved out
of the FPU registers into memory (which depends on the
compiler’s code generation strategy/optimization
level) – not good! (illustrated in the sketch after this slide)
CS 3214 Fall 2011
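A minimal hedged illustration of that rounding effect (assuming a 32-bit x87 build such as gcc -m32 -mfpmath=387, and that the optimizer keeps the first intermediate in a register):

#include <stdio.h>

int main(void)
{
    double a = 1e16, b = 1.0, c = -1e16;

    /* If a + b stays in an 80-bit %st register, the added 1.0 survives. */
    double kept = (a + b) + c;

    /* A volatile forces the intermediate through a 64-bit memory slot,
       which rounds 1e16 + 1.0 back down to 1e16. */
    volatile double t = a + b;
    double spilled = t + c;

    /* With x87 extended precision this can print kept=1 spilled=0;
       with SSE math both are 0. */
    printf("kept=%g spilled=%g\n", kept, spilled);
    return 0;
}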
Floating Point Code Example
• Compute Inner Product of Two Vectors
– Single precision arithmetic
– Common computation

float ipf (float x[],
           float y[],
           int n)
{
    int i;
    float result = 0.0;
    for (i = 0; i < n; i++) {
        result += x[i] * y[i];
    }
    return result;
}

pushl %ebp              # setup
movl %esp,%ebp
pushl %ebx

movl 8(%ebp),%ebx       # %ebx=&x
movl 12(%ebp),%ecx      # %ecx=&y
movl 16(%ebp),%edx      # %edx=n
fldz                    # push +0.0
xorl %eax,%eax          # i=0
cmpl %edx,%eax          # if i>=n done
jge .L3
.L5:
flds (%ebx,%eax,4)      # push x[i]
fmuls (%ecx,%eax,4)     # st(0)*=y[i]
faddp                   # st(1)+=st(0); pop
incl %eax               # i++
cmpl %edx,%eax          # if i<n repeat
jl .L5
.L3:
movl -4(%ebp),%ebx      # finish
movl %ebp, %esp
popl %ebp
ret                     # st(0) = result

CS 3214 Fall 2011
Floating Point: SSE(*)
• Various extensions to x87 were introduced:
– SSE, SSE2, SSE3, SSE4, SSE5
• Use the 16 128-bit %xmm registers
– Can be used as 16x8-bit, 4x32-bit, 2x64-bit, etc. for both
integer and floating point operations
• Use the -mfpmath=sse -msse switches to enable (or
-msse2, -msse3, -msse4)
• All doubles are 64 bits internally – gives
reproducible results independent of loads/stores
– Aside: if 80-bit is ok, can combine -mfpmath=sse,387 for
24 registers
CS 3214 Fall 2011
Floating Point SSE
• Same code compiled with:
  -msse2 -mfpmath=sse

float ipf (float x[],
           float y[],
           int n)
{
    int i;
    float result = 0.0;
    for (i = 0; i < n; i++) {
        result += x[i] * y[i];
    }
    return result;
}

ipf:
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %ebx
    subl    $4, %esp
    movl    8(%ebp), %ebx
    movl    12(%ebp), %ecx
    movl    16(%ebp), %edx
    xorps   %xmm1, %xmm1
    testl   %edx, %edx
    jle     .L4
    movl    $0, %eax              ; i = 0
    xorps   %xmm1, %xmm1          ; result = 0.0
.L5:
    movss   (%ebx,%eax,4), %xmm0  ; t = x[i]
    mulss   (%ecx,%eax,4), %xmm0  ; t *= y[i]
    addss   %xmm0, %xmm1          ; result += t
    addl    $1, %eax              ; i = i+1
    cmpl    %edx, %eax
    jne     .L5
.L4:
    movss   %xmm1, -8(%ebp)
    flds    -8(%ebp)              ; %st(0) = result
    addl    $4, %esp
    popl    %ebx
    popl    %ebp
    ret

CS 3214 Fall 2011
Floating Point x86_64
• Uses SSE by default
• Return floating point values in %xmm0

float ipf (float x[],
           float y[],
           int n)
{
    int i;
    float result = 0.0;
    for (i = 0; i < n; i++) {
        result += x[i] * y[i];
    }
    return result;
}

ipf:
    testl   %edx, %edx
    jg      .L2
    xorps   %xmm0, %xmm0
    ret
.L2:
    subl    $1, %edx
    leaq    4(,%rdx,4), %rdx
    movl    $0, %eax
    xorps   %xmm0, %xmm0
.L4:
    movss   (%rdi,%rax), %xmm1
    mulss   (%rsi,%rax), %xmm1
    addss   %xmm1, %xmm0
    addq    $4, %rax
    cmpq    %rdx, %rax
    jne     .L4
    rep
    ret

Example generated with gcc 4.4.3 -O.
CS 3214 Fall 2011
Vectorization
• SSE* instruction sets can operate on
‘vectors’
• For instance, if two 128-bit registers are treated as
(d1, d0) and (e1, e0), a single instruction can compute
(d1+e1, d0+e0) – the additions execute in parallel
(see the sketch after this slide)
• Also known as “SIMD”
– Single instruction, multiple data
CS 3214 Fall 2011
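A minimal hedged sketch of this idea (assuming SSE2 is available, e.g. gcc -msse2): _mm_add_pd adds both 64-bit lanes with a single addpd instruction.

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdio.h>

int main(void)
{
    __m128d d = _mm_set_pd(2.0, 1.0);    /* (d1, d0) = (2.0, 1.0)       */
    __m128d e = _mm_set_pd(30.0, 10.0);  /* (e1, e0) = (30.0, 10.0)     */
    __m128d s = _mm_add_pd(d, e);        /* (d1+e1, d0+e0), in parallel */

    double out[2];
    _mm_storeu_pd(out, s);
    printf("%g %g\n", out[0], out[1]);   /* prints: 11 32 */
    return 0;
}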
Floating Point SSE - Vectorized
• Trying to make compiler achieve the
transformation shown on the right

float ipf_vector (float x[],
                  float y[],
                  int n)
{
    int i;
    float p[4];            // partial products
    float result = 0.0;
    for (i = 0; i < n; i += 4) {
        p[0] = x[i]   * y[i];
        p[1] = x[i+1] * y[i+1];
        p[2] = x[i+2] * y[i+2];
        p[3] = x[i+3] * y[i+3];
        result += p[0] + p[1] + p[2] + p[3];
    }
    return result;
}

float ipf (float x[],
           float y[],
           int n)
{
    int i;
    float result = 0.0;
    for (i = 0; i < n; i++) {
        result += x[i] * y[i];
    }
    return result;
}

Logical transformation, not actual code

CS 3214 Fall 2011
Example: GCC
Vector Extension
magic attribute that tells gcc that
v4sf is a type denoting vectors of 4 floats
typedef float v4sf
    __attribute__ ((vector_size (16)));

float ipf (v4sf x[],
           v4sf y[],
           int n)
{
    int i;
    float partialsum, result = 0.0;
    for (i = 0; i < n; i++) {
        v4sf p = x[i] * y[i];
        float * v = (float *)&p;   // treat vector as float *
        partialsum = v[0] + v[1] + v[2] + v[3];
        result += partialsum;
    }
    return result;
}
CS 3214 Fall 2011
Example: GCC Vector Extensions

typedef float v4sf
    __attribute__ ((vector_size (16)));

float ipf (v4sf x[],
           v4sf y[],
           int n)
{
    int i;
    float partialsum, result = 0.0;
    for (i = 0; i < n; i++) {
        v4sf p = x[i] * y[i];
        float * v = (float *)&p;
        partialsum = v[0] + v[1] + v[2] + v[3];
        result += partialsum;
    }
    return result;
}

ipf:
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %ebx
    subl    $36, %esp
    movl    16(%ebp), %ebx
    movl    8(%ebp), %edx
    movl    12(%ebp), %eax
    movl    $0, %ecx
    xorps   %xmm1, %xmm1
.L5:
    movaps  (%eax), %xmm0
    mulps   (%edx), %xmm0
    movaps  %xmm0, -24(%ebp)
    movss   -24(%ebp), %xmm0
    addss   -20(%ebp), %xmm0
    addss   -16(%ebp), %xmm0
    addss   -12(%ebp), %xmm0
    addss   %xmm0, %xmm1
    addl    $1, %ecx
    addl    $16, %edx
    addl    $16, %eax
    cmpl    %ebx, %ecx
    jne     .L5
    movss   %xmm1, -28(%ebp)
    flds    -28(%ebp)
    addl    $36, %esp
    popl    %ebx
    popl    %ebp
    ret

CS 3214 Fall 2011
Comments
• Assembly code on previous slide is slightly
simplified (omits the first i < n check for the case n == 0)
• Two problems with it
– Problem 1: the partial product vector ‘p’ is allocated on the
stack so that partialsum can be computed element by element
• the value is said to be “spilled” to the stack
– Problem 2:
• Does not use the vector unit for computing the sum
CS 3214 Fall 2011
SSE3: hadd_ps
• Treats a 128-bit value as 4 floats (“packed single”)
• Inputs are two 128-bit values,
(A3, A2, A1, A0) and (B3, B2, B1, B0)
• Computes
(B3 + B2, B1 + B0, A3 + A2, A1 + A0) – a “horizontal” operation, hence
“hadd”
• Apply twice to compute the sum of all 4 elements in the lowest element:
hadd((A3, A2, A1, A0), (0, 0, 0, 0)) → (0, 0, A3+A2, A1+A0),
then hadd with (0, 0, 0, 0) again → (0, 0, 0, A3+A2+A1+A0)
• Use “intrinsics” – they look like function calls, but are directions for the
compiler to use certain instructions (see the standalone sketch after this slide)
– Unlike ‘asm’, the compiler knows their meaning: no need to specify input
or output constraints, or what’s clobbered
– Compiler performs register allocation
CS 3214 Fall 2011
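A standalone hedged sketch of the double-hadd reduction described above (assuming SSE3, e.g. gcc -msse3); the lecture's actual use of it appears on the next slide.

#include <pmmintrin.h>   /* SSE3: _mm_hadd_ps */
#include <stdio.h>

int main(void)
{
    __m128 a    = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* (A3,A2,A1,A0) */
    __m128 zero = _mm_setzero_ps();

    __m128 once  = _mm_hadd_ps(a, zero);     /* (0, 0, A3+A2, A1+A0)   */
    __m128 twice = _mm_hadd_ps(once, zero);  /* (0, 0, 0, A3+A2+A1+A0) */

    float sum;
    _mm_store_ss(&sum, twice);               /* lowest element */
    printf("%g\n", sum);                     /* prints: 10     */
    return 0;
}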
GCC Vector Extensions + XMM Intrinsics
#include <pmmintrin.h>

typedef float v4sf
    __attribute__ ((vector_size (16)));

float ipf (v4sf x[],
           v4sf y[],
           int n)
{
    int i;
    float partialsum, result = 0.0;
    v4sf zero = _mm_setzero_ps();   // intrinsic, produces vector of 4 0.0f
    for (i = 0; i < n; i++) {
        v4sf p = x[i] * y[i];
        _mm_store_ss(
            &partialsum,
            _mm_hadd_ps(_mm_hadd_ps(p, zero), zero));
        result += partialsum;
    }
    return result;
}
CS 3214 Fall 2011
Example: GCC Vector Extensions + XMM Intrinsics

#include <pmmintrin.h>

typedef float v4sf
    __attribute__ ((vector_size (16)));

float ipf (v4sf x[],
           v4sf y[],
           int n)
{
    int i;
    float partialsum, result = 0.0;
    v4sf zero = _mm_setzero_ps();
    for (i = 0; i < n; i++) {
        v4sf p = x[i] * y[i];
        _mm_store_ss(
            &partialsum,
            _mm_hadd_ps(_mm_hadd_ps(p, zero), zero));
        result += partialsum;
    }
    return result;
}

ipf:
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %ebx
    subl    $4, %esp
    movl    16(%ebp), %ebx
    movl    8(%ebp), %edx
    movl    12(%ebp), %eax
    movl    $0, %ecx
    xorps   %xmm2, %xmm2
    xorps   %xmm1, %xmm1
.L5:
    movaps  (%eax), %xmm0
    mulps   (%edx), %xmm0
    haddps  %xmm1, %xmm0
    haddps  %xmm1, %xmm0
    addss   %xmm0, %xmm2
    addl    $1, %ecx
    addl    $16, %edx
    addl    $16, %eax
    cmpl    %ebx, %ecx
    jne     .L5
    movss   %xmm2, -8(%ebp)
    flds    -8(%ebp)
    addl    $4, %esp
    popl    %ebx
    popl    %ebp
    ret

CS 3214 Fall 2011