Try turning on optimization and then comparing the results. Your fastcall
version has many redundant operations because it was compiled without optimization.
Here is the output of VS 2010 with /Ox.

fastcall:
; void __fastcall CallMe1(int *firstValue, int *secondValue)
; (signature decoded from the mangled name: YI = __fastcall, X = void, PAH = int *)
; Exchanges the two pointed-to 32-bit values, skipping the stores when they
; are already equal. Both arguments arrive in registers, so no argument is
; read from the stack:
; _firstValue$ = ecx
; _secondValue$ = edx
?CallMe1@@YIXPAH0@Z PROC ; CallMe1
; eax = *firstValue
mov eax, DWORD PTR [ecx]
; esi is used as a scratch register below; save it so it can be restored on exit
push esi
; esi = *secondValue
mov esi, DWORD PTR [edx]
; equal values: the exchange would change nothing, so skip both stores
cmp eax, esi
je SHORT $LN1@CallMe1
; *firstValue = old *secondValue
mov DWORD PTR [ecx], esi
; *secondValue = old *firstValue
mov DWORD PTR [edx], eax
$LN1@CallMe1:
; restore the caller's esi
pop esi
; ret 0: no argument bytes are popped, since both arguments were passed in registers
ret 0
?CallMe1@@YIXPAH0@Z ENDP ; CallMe1
stdcall:
; void __stdcall CallMe2(int *firstValue, int *secondValue)
; (signature decoded from the mangled name: YG = __stdcall, X = void, PAH = int *)
; Exchanges the two pointed-to 32-bit values, skipping the stores when they
; are already equal. Both arguments are read from the stack.
;
; Argument offsets emitted by the compiler. The esp-relative operands below
; add a correction (-4, then 0 after the push) for the current stack depth.
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe2@@YGXPAH0@Z PROC ; CallMe2
; edx = firstValue, loaded from [esp+4] (8 - 4), the slot just above the return address
mov edx, DWORD PTR _firstValue$[esp-4]
; eax = *firstValue
mov eax, DWORD PTR [edx]
; esi is used as a scratch register below; save it so it can be restored on exit.
; This push lowers esp by 4, which is why the next operand uses [esp] rather than [esp-4].
push esi
; esi = secondValue, loaded from [esp+12], i.e. entry esp + 8
mov esi, DWORD PTR _secondValue$[esp]
; ecx = *secondValue
mov ecx, DWORD PTR [esi]
; equal values: the exchange would change nothing, so skip both stores
cmp eax, ecx
je SHORT $LN1@CallMe2
; *firstValue = old *secondValue
mov DWORD PTR [edx], ecx
; *secondValue = old *firstValue
mov DWORD PTR [esi], eax
$LN1@CallMe2:
; restore the caller's esi
pop esi
; ret 8: the callee pops the 8 bytes of arguments (two 4-byte pointers) on return
ret 8
?CallMe2@@YGXPAH0@Z ENDP ; CallMe2
cdecl (what you mistakenly call stdcall in your example):
; void __cdecl CallMe3(int *firstValue, int *secondValue)
; (signature decoded from the mangled name: YA = __cdecl, X = void, PAH = int *)
; Exchanges the two pointed-to 32-bit values, skipping the stores when they
; are already equal. Both arguments are read from the stack.
;
; Argument offsets emitted by the compiler. The esp-relative operands below
; add a correction (-4, then 0 after the push) for the current stack depth.
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe3@@YAXPAH0@Z PROC ; CallMe3
; edx = firstValue, loaded from [esp+4] (8 - 4), the slot just above the return address
mov edx, DWORD PTR _firstValue$[esp-4]
; eax = *firstValue
mov eax, DWORD PTR [edx]
; esi is used as a scratch register below; save it so it can be restored on exit.
; This push lowers esp by 4, which is why the next operand uses [esp] rather than [esp-4].
push esi
; esi = secondValue, loaded from [esp+12], i.e. entry esp + 8
mov esi, DWORD PTR _secondValue$[esp]
; ecx = *secondValue
mov ecx, DWORD PTR [esi]
; equal values: the exchange would change nothing, so skip both stores
cmp eax, ecx
je SHORT $LN1@CallMe3
; *firstValue = old *secondValue
mov DWORD PTR [edx], ecx
; *secondValue = old *firstValue
mov DWORD PTR [esi], eax
$LN1@CallMe3:
; restore the caller's esi
pop esi
; ret 0: the arguments stay on the stack for the caller to remove
ret 0
?CallMe3@@YAXPAH0@Z ENDP ; CallMe3