写了一个 MASM 版本的 zeromemory 供 D 语言调用,但发现没有明显的速度提升。SSE 的速度优势,是不是只体现在一条指令同时处理多个数据(即 SIMD)的情况下?
.686 ; create 32 bit code
.mmx
.xmm
.model flat, stdcall ; 32 bit memory model
option casemap :none ; case sensitive
.code
; ml -c /omf ad.asm
; Microsoft (R) Macro Assembler Version 10.00.30319.01
; Copyright (C) Microsoft Corporation. All rights reserved.
;-----------------------------------------------------------------------
; int _align16_zero_memory(unsigned _byte_num, void *_source)
; ABI:   32-bit cdecl (arguments on the stack, caller cleans up)
; In:    [esp+4] = _byte_num  number of bytes to clear (0 is allowed)
;        [esp+8] = _source    start of the buffer; MUST be 16-byte
;                             aligned because every wide store is MOVDQA
; Out:   eax = 0
; Clobb: xmm0, flags (edi and ecx are saved and restored)
;
; Layout of the work:
;   1. whole 128-byte blocks   -> unrolled loop of eight MOVDQA stores
;   2. whole 16-byte chunks    -> computed jump into a run of seven
;      (0..7 of them)             MOVDQA stores
;   3. last 0..15 bytes        -> REP STOSB
;-----------------------------------------------------------------------
; The frame is addressed through esp by hand, so automatic prologue and
; epilogue generation has to be off before PROC is seen.
option prologue:none, epilogue:none
_align16_zero_memory proc C _byte_num, _source
        mov     eax, [esp+4]            ; eax = _byte_num
        push    edi
        mov     edi, [esp+12]           ; edi = _source (+4: edi was pushed)
        push    ecx
        pxor    xmm0, xmm0              ; xmm0 = 16 zero bytes
        mov     ecx, eax                ; ecx keeps the full byte count
        shr     eax, 7                  ; eax = number of 128-byte blocks
        jz      _remain_byte_deal       ; fewer than 128 bytes: skip the loop

        align 16
main_loop:
        movdqa  [edi], xmm0
        movdqa  [edi+010h], xmm0
        movdqa  [edi+020h], xmm0
        movdqa  [edi+030h], xmm0
        movdqa  [edi+040h], xmm0
        movdqa  [edi+050h], xmm0
        movdqa  [edi+060h], xmm0
        movdqa  [edi+070h], xmm0
        add     edi, 128
        dec     eax
        jnz     main_loop

        ; Both paths arrive here with ecx = _byte_num and edi = first byte
        ; not yet cleared, so the remainder is always derived from the
        ; real count (also when the loop above was skipped).
_remain_byte_deal:
        mov     eax, ecx                ; eax = _byte_num, used for the byte tail
        and     ecx, 070h               ; ecx = 16*k, k = remaining chunks (0..7)
        lea     edi, [edi+ecx-070h]     ; bias edi so the LAST chunk is at [edi+060h]
        shr     ecx, 2                  ; ecx = 4*k = byte offset into the table
        jmp     dword ptr [RemainDQwordTable+ecx]

        ; Entering at chunk_k executes exactly the last k stores below.
        align 16
chunk_7:
        movdqa  [edi], xmm0
chunk_6:
        movdqa  [edi+010h], xmm0
chunk_5:
        movdqa  [edi+020h], xmm0
chunk_4:
        movdqa  [edi+030h], xmm0
chunk_3:
        movdqa  [edi+040h], xmm0
chunk_2:
        movdqa  [edi+050h], xmm0
chunk_1:
        movdqa  [edi+060h], xmm0
chunk_0:
        lea     edi, [edi+070h]         ; undo the bias: edi = first byte after the chunks
        and     eax, 15                 ; eax = trailing byte count (0..15)
        mov     ecx, eax
        xor     eax, eax                ; al = fill byte; also the return value 0
        cld                             ; make sure STOSB walks upwards
        rep stosb                       ; no-op when ecx = 0
        pop     ecx
        pop     edi
        ret

        ; Jump table indexed by the number of remaining 16-byte chunks.
        align 4
RemainDQwordTable dd chunk_0, chunk_1, chunk_2, chunk_3
                  dd chunk_4, chunk_5, chunk_6, chunk_7
_align16_zero_memory endp
end
import std.stdio;
// Benchmark buffer. It is 16-byte aligned because the assembly routine
// clears it with aligned 16-byte stores.
align(16) __gshared byte[65535] i;
// Iteration counter for the benchmark loop in main().
align(16) __gshared int ix;
// Implemented in the MASM object file. First argument: number of bytes to
// clear. Second argument: address of the buffer, passed as a 32-bit integer.
extern(C) void _align16_zero_memory(uint i, uint o);
// Reads the CPU time-stamp counter.
// The body is naked (no compiler-generated prologue/epilogue): it executes
// RDTSC and returns immediately, so whatever RDTSC left in EDX:EAX is the
// ulong result. NOTE(review): this relies on the 32-bit x86 convention of
// returning a ulong in EDX:EAX; confirm before building for another target.
extern(C)
ulong RDTSC () {
asm {
naked ;
rdtsc ;
ret ;
}
}
// Benchmark driver: runs 20 rounds and, per round, prints the TSC tick count
// of a built-in D slice fill (labelled "CRT") next to the tick count of the
// assembly clear routine (labelled "SSE").
// Note that the "CRT" side fills with the value 5, not 0.
void main()
{
    while (ix++ != 20) {
        // The assembly routine takes the buffer address as a 32-bit integer.
        uint cs = cast(uint)i.ptr;

        ulong a = RDTSC();
        i[0..$] = 5;
        ulong b = cast(uint)(RDTSC() - a);

        a = RDTSC();
        // Pass the real buffer size instead of repeating the literal.
        _align16_zero_memory(cast(uint)i.length, cs);
        // Both values are uint, so print them as unsigned.
        printf("CRT:%u SSE:%u\n", cast(uint)b, cast(uint)(RDTSC() - a));
    }
}
------解决思路----------------------
心理不够强大:曾经看过几眼速度优化方面的资料,发现再钻研下去就要神经质了。