Tecniche di ottimizzazione dell'assembly Intel x86 in un problema di esempio
-
06-07-2019 - |
Domanda
Sto imparando l'assemblatore da un po 'e sto provando a riscrivere alcune semplici procedure \ funzioni per vedere i benefici in termini di prestazioni (se presenti). Il mio principale strumento di sviluppo è Delphi 2007 e i primi esempi saranno in quella lingua, ma possono essere facilmente tradotti anche in altre lingue.
Il problema indica come:
Abbiamo fornito un valore di byte senza segno in cui ciascuno degli otto bit rappresenta un pixel in una riga di una schermata. Ogni singolo pixel può essere solido (1) o trasparente (0). Quindi, in altre parole, abbiamo 8 pixel racchiusi in un valore di byte. Voglio decomprimere quei pixel in un array di otto byte nel modo in cui il pixel più giovane (bit) arriverà sotto l'indice più basso dell'array e così via. Ecco un esempio:
One byte value -----------> eight byte array
10011011 -----------------> [1][1][0][1][1][0][0][1]
Array index number -------> 0 1 2 3 4 5 6 7
Di seguito vi presento cinque metodi che stanno risolvendo il problema. Successivamente mostrerò il loro confronto temporale e come ho misurato quei tempi.
Le mie domande consistono in due parti:
1.
Ti sto chiedendo una risposta dettagliata relativa ai metodi DecodePixels4a
e DecodePixels4b
. Perché il metodo 4b
è leggermente più lento del 4a
?
Se ad esempio è più lento perché il mio codice non è allineato correttamente, mostrami quali istruzioni in un determinato metodo potrebbero essere meglio allineate e come farlo per non interrompere il metodo.
Vorrei vedere esempi concreti dietro la teoria. Tieni presente che sto imparando l'assemblaggio e desidero acquisire la conoscenza delle tue risposte che mi consentirà in futuro di scrivere codice ottimizzato meglio.
2.
Puoi scrivere routine più veloce di DecodePixels4a
? In tal caso, presentalo e descrivi i passaggi di ottimizzazione che hai adottato.
Per routine più veloce intendo routine che viene eseguita nel più breve periodo di tempo nel tuo ambiente di test tra tutte le routine presentate qui.
Sono ammessi tutti i processori della famiglia Intel e quelli compatibili con essi.
Di seguito troverai le routine scritte da me:
procedure DecodePixels1(EncPixels: Byte; var DecPixels: TDecodedPixels);
var
i3: Integer;
begin
DecPixels[0] := EncPixels and $01;
for i3 := 1 to 7 do
begin
EncPixels := EncPixels shr 1;
DecPixels[i3] := EncPixels and $01;
//DecPixels[i3] := (EncPixels shr i3) and $01; //this is even slower if you replace above 2 lines with it
end;
end;
//Lets unroll the loop and see if it will be faster.
procedure DecodePixels2(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
DecPixels[0] := EncPixels and $01;
EncPixels := EncPixels shr 1;
DecPixels[1] := EncPixels and $01;
EncPixels := EncPixels shr 1;
DecPixels[2] := EncPixels and $01;
EncPixels := EncPixels shr 1;
DecPixels[3] := EncPixels and $01;
EncPixels := EncPixels shr 1;
DecPixels[4] := EncPixels and $01;
EncPixels := EncPixels shr 1;
DecPixels[5] := EncPixels and $01;
EncPixels := EncPixels shr 1;
DecPixels[6] := EncPixels and $01;
EncPixels := EncPixels shr 1;
DecPixels[7] := EncPixels and $01;
end;
procedure DecodePixels3(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
asm
push eax;
push ebx;
push ecx;
mov bl, al;
and bl, $01;
mov [edx], bl;
mov ecx, program Test;
{$APPTYPE CONSOLE}
uses
SysUtils, Windows;
type
TDecodedPixels = array[0..7] of Byte;
var
Pixels: TDecodedPixels;
Freq, TimeStart, TimeEnd :Int64;
Time1, Time2, Time3, Time4a, Time4b: Extended;
i, i2: Integer;
begin
if QueryPerformanceFrequency(Freq) then
begin
for i2 := 1 to 100 do
begin
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels1(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time1 := Time1 + ((TimeEnd - TimeStart) / Freq * 1000);
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels2(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time2 := Time2 + ((TimeEnd - TimeStart) / Freq * 1000);
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels3(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time3 := Time3 + ((TimeEnd - TimeStart) / Freq * 1000);
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels4a(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time4a := Time4a + ((TimeEnd - TimeStart) / Freq * 1000);
QueryPerformanceCounter(TimeStart);
for i := 1 to 100000 do
DecodePixels4b(155, Pixels);
QueryPerformanceCounter(TimeEnd);
Time4b := Time4b + ((TimeEnd - TimeStart) / Freq * 1000);
end;
Writeln('Time1 : ' + FloatToStr(Time1 / 100) + ' ms. <- Delphi loop.');
Writeln('Time2 : ' + FloatToStr(Time2 / 100) + ' ms. <- Delphi unrolled loop.');
Writeln('Time3 : ' + FloatToStr(Time3/ 100) + ' ms. <- BASM loop.');
Writeln('Time4a : ' + FloatToStr(Time4a / 100) + ' ms. <- BASM unrolled loop.');
Writeln('Time4b : ' + FloatToStr(Time4b / 100) + ' ms. <- BASM unrolled loop instruction switch.');
end;
Readln;
end.
;
@@Decode:
inc ecx;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + ecx], bl;
cmp ecx, $07;
jnz @@Decode;
pop ecx;
pop ebx;
pop eax;
end;
end;
//Unrolled assembly loop
procedure DecodePixels4a(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
asm
push eax;
push ebx;
mov bl, al;
and bl, $01;
mov [edx], bl;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + $01], bl;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + $02], bl;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + $03], bl;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + $04], bl;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + $05], bl;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + $06], bl;
shr al, $01;
mov bl, al;
and bl, $01;
mov [edx + $07], bl;
pop ebx;
pop eax;
end;
end;
// it differs compared to 4a only in switching two instructions (but seven times)
procedure DecodePixels4b(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
asm
push eax;
push ebx;
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx], bl; //
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $01], bl; //
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $02], bl; //
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $03], bl; //
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $04], bl; //
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $05], bl; //
mov bl, al;
and bl, $01;
shr al, $01; //
mov [edx + $06], bl; //
mov bl, al;
and bl, $01;
mov [edx + $07], bl;
pop ebx;
pop eax;
end;
end;
Ed ecco come li collaudo:
Time1 : 1,68443549919493 ms. <- Delphi loop.
Time2 : 1,33773024572211 ms. <- Delphi unrolled loop.
Time3 : 1,37015271374424 ms. <- BASM loop.
Time4a : 0,822916962526627 ms. <- BASM unrolled loop.
Time4b : 0,862914462301607 ms. <- BASM unrolled loop instruction switch.
Ecco i risultati della mia macchina (Intel & # 174; Pentium & # 174; E2180 su Win32 XP):
Time1 : 1,66535493194387 ms. <- Delphi loop.
Time2 : 1,29115785420688 ms. <- Delphi unrolled loop.
Time3 : 1,33716934524107 ms. <- BASM loop.
Time4a : 0,795041753757838 ms. <- BASM unrolled loop.
Time4b : 0,843520166815013 ms. <- BASM unrolled loop instruction switch.
Time5 : 1,49457681191307 ms. <- Wouter van Nifterick, Delphi unrolled
Time6 : 0,400587402866258 ms. <- PhiS, table lookup Delphi
Time7 : 0,325472442519827 ms. <- PhiS, table lookup Delphi inline
Time8 : 0,37350491544239 ms. <- PhiS, table lookup BASM
I risultati sono abbastanza stabili - i tempi variano solo di qualche percento tra ogni test che ho effettuato. E questo era sempre vero: Time1 > Time3 > Tempo 2 > Time4b > Time4a
Quindi penso che la differenza tra Time4a e Time4b dipenda dal fatto che le istruzioni cambiano nel metodo DecodePixels4b
. A volte è del 4% a volte è fino al 10% ma 4b
è sempre più lento di 4a
.
Stavo pensando a un altro metodo con l'uso delle istruzioni MMX per scrivere in memoria otto byte alla volta, ma non riesco a capire un modo veloce per decomprimere byte nel registro a 64 bit.
Grazie per il tuo tempo.
Grazie ragazzi per il vostro prezioso contributo. Per cui potrei rispondere a tutti voi allo stesso tempo, sfortunatamente rispetto alle moderne CPU ho solo una "pipe" e può eseguire solo un'istruzione "rispondi" al tempo ;-) Quindi, proverò a riassumere alcune cose qui e scriverò ulteriori commenti sotto le tue risposte.
Prima di tutto, volevo dire che prima di pubblicare la mia domanda ho trovato la soluzione presentata da Wouter van Nifterick ed era in realtà molto più lento del mio codice assembly. Quindi ho deciso di non pubblicare quella routine qui, ma potresti vedere che ho adottato lo stesso approccio anche nel mio ciclo versione Delphi della routine. È stato commentato lì perché mi stava dando risultati peggiori.
Questo è un mistero per me. Ho eseguito di nuovo il mio codice con le routine di Wouter e PhilS e qui ci sono i risultati:
Time1 : 1,22508325749317 ms. <- Delphi loop.
Time2 : 1,33004145373084 ms. <- Delphi unrolled loop.
Time3 : 1,1473583622526 ms. <- BASM loop.
Time4a : 0,77322594033463 ms. <- BASM unrolled loop.
Time4b : 0,846033593023372 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,688689382044384 ms. <- Wouter van Nifterick, Delphi unrolled
Time6 : 0,503233741036693 ms. <- PhiS, table lookup Delphi
Time7 : 0,385254722925063 ms. <- PhiS, table lookup Delphi inline
Time8 : 0,432993919452751 ms. <- PhiS, table lookup BASM
Time9 : 0,362680491244212 ms. <- PhiS, table lookup BASM with assembler directive
Guarda il risultato di Time5, abbastanza strano vero? Immagino di avere una versione di Delphi diversa, poiché il mio codice assembly generato differisce da quello fornito da Wouter.
Seconda modifica principale:
So perché la routine 5
è stata più lenta sul mio machnie. Avevo controllato " Controllo intervallo " e "quotazione di overflow" nelle mie opzioni di compilatore. Ho aggiunto la direttiva assembler
alla routine 9
per vedere se aiuta. Sembra che con questa direttiva la procedura di assemblaggio sia buona quanto la variante in linea Delphi o anche leggermente migliore.
Ecco i risultati finali:
procedure DecodePixels4c(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
asm
push ebx;
mov bl, al;
and bl, 1;
mov [edx], bl;
mov bl, al;
shr bl, 1;
and bl, 1;
mov [edx + $01], bl;
mov bl, al;
shr bl, 2;
and bl, 1;
mov [edx + $02], bl;
mov bl, al;
shr bl, 3;
and bl, 1;
mov [edx + $03], bl;
mov bl, al;
shr bl, 4;
and bl, 1;
mov [edx + $04], bl;
mov bl, al;
shr bl, 5;
and bl, 1;
mov [edx + $05], bl;
mov bl, al;
shr bl, 6;
and bl, 1;
mov [edx + $06], bl;
shr al, 7;
and al, 1;
mov [edx + $07], al;
pop ebx;
end;
end;
Terza modifica principale:
Secondo l'opinione @Pascal Cuoq e @j_random_hacker la differenza nei tempi di esecuzione tra le routine 4a
, 4b
e 5
è causata dalla dipendenza dei dati . Tuttavia, non sono d'accordo con tale opinione sulla base degli ulteriori test che ho effettuato.
Ho anche inventato la nuova routine 4c
basata su 4a
. Eccolo:
Test1 Test2 Test3 Test4
Time1 : 1,211 1,210 1,220 1,213
Time2 : 1,280 1,258 1,253 1,332
Time3 : 1,129 1,138 1,130 1,160
Time4a : 0,690 0,682 0,617 0,635
Time4b : 0,707 0,698 0,706 0,659
Time4c : 0,679 0,685 0,626 0,625
Time5 : 0,715 0,682 0,686 0,679
Time6 : 0,490 0,485 0,522 0,514
Time7 : 0,323 0,333 0,336 0,318
Time8 : 0,407 0,403 0,373 0,354
Time9 : 0,352 0,378 0,355 0,355
Time10a : 1,823 1,812 1,807 1,813
Time10b : 1,113 1,120 1,115 1,118
Time10c : 0,652 0,630 0,653 0,633
Time10d : 0,156 0,155 0,172 0,160 <-- current winner!
Direi che dipende piuttosto dai dati.
Ed ecco i test e i risultati. Ho fatto quattro prove per assicurarmi che non ci siano incidenti. Ho anche aggiunto nuovi tempi per le routine proposte da GJ (Time10a, Time10b).
<*> Come puoi vedere i risultati di 4a
, 4b
, 4c
e 5
sono molto vicini a ciascuno altro.
Perché? Perché ho rimosso da 4a, 4b (4c già non ce l'hanno) due istruzioni: push eax
e pop eax
. Dal momento che so che non userò in nessun altro punto del mio codice il valore sotto eax, non devo prenotarlo.
Ora il mio codice ha solo una coppia di push / pop come la routine 5.
La routine 5 conserva il valore di eax perché prima ne fa una copia sotto ecx ma non merita di riservare ecx.
Quindi la mia conclusione è che: la differenza nell'esecuzione temporale di 5 e 4a e 4b (prima della terza modifica) non riguardava la dipendenza dei dati ma era causata da una coppia aggiuntiva di istruzioni push / pop .
Sono molto interessato ai tuoi commenti.
Dopo alcuni giorni GJ ha inventato una routine ancora più veloce (Time 10d) di quella di PhiS. Bel lavoro GJ!
Soluzione
Il tuo codice asm è relativamente lento perché usa la fine dello stack per 8 volte nella memoria. Controlla questo ...
procedure DecodePixels(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
xor ecx, ecx
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 1
mov [DecPixels + 4], ecx
xor ecx, ecx
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 8
add al, al
rcl ecx, 1
mov [DecPixels], ecx
end;
Forse è persino più veloce del codice con la tabella di ricerca!
Versione migliorata:
procedure DecodePixelsI(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
mov ecx, 0 //Faster than: xor ecx, ecx
add al, al
rcl ch, 1
add al, al
rcl cl, 1
ror ecx, 16
add al, al
rcl ch, 1
add al, al
rcl cl, 1
mov [DecPixels + 4], ecx
mov ecx, 0 //Faster than: xor ecx, ecx
add al, al
rcl ch, 1
add al, al
rcl cl, 1
ror ecx, 16
add al, al
rcl ch, 1
add al, al
rcl cl, 1
mov [DecPixels], ecx
end;
Versione 3:
procedure DecodePixelsX(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
add al, al
setc byte ptr[DecPixels + 7]
add al, al
setc byte ptr[DecPixels + 6]
add al, al
setc byte ptr[DecPixels + 5]
add al, al
setc byte ptr[DecPixels + 4]
add al, al
setc byte ptr[DecPixels + 3]
add al, al
setc byte ptr[DecPixels + 2]
add al, al
setc byte ptr[DecPixels + 1]
setnz byte ptr[DecPixels]
end;
Versione 4:
const Uint32DecPix : array [0..15] of cardinal = (
<*>, $00000001, $00000100, $00000101,
$00010000, $00010001, $00010100, $00010101,
$01000000, $01000001, $01000100, $01000101,
$01010000, $01010001, $01010100, $01010101
);
procedure DecodePixelsY(EncPixels: byte; var DecPixels: TDecodedPixels); inline;
begin
pcardinal(@DecPixels)^ := Uint32DecPix[EncPixels and <*>F];
pcardinal(cardinal(@DecPixels) + 4)^ := Uint32DecPix[(EncPixels and $F0) shr 4];
end;
Altri suggerimenti
In generale, starei personalmente lontano dal tentativo di ottimizzare il codice usando trucchi a livello di assemblatore, a meno che non abbia davvero bisogno di quel 2 o 3% di velocità in più, e sei disposto a pagare il prezzo del codice più difficile da leggere, mantenere e trasferire.
Per spremere quell'ultimo 1%, potresti anche dover mantenere diverse versioni ottimizzate per processore e se arrivano processori più recenti e un compilatore Pascal migliorato, non ne trarrai beneficio.
Questo codice Delphi è più veloce del tuo codice assemblatore più veloce:
procedure DecodePixels5(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
DecPixels[0] := (EncPixels shr 0) and $01;
DecPixels[1] := (EncPixels shr 1) and $01;
DecPixels[2] := (EncPixels shr 2) and $01;
DecPixels[3] := (EncPixels shr 3) and $01;
DecPixels[4] := (EncPixels shr 4) and $01;
DecPixels[5] := (EncPixels shr 5) and $01;
DecPixels[6] := (EncPixels shr 6) and $01;
DecPixels[7] := (EncPixels shr 7) and $01;
end;
Results:
Time1 : 1,03096806151283 ms. <- Delphi loop.
Time2 : 0,740308641141395 ms. <- Delphi unrolled loop.
Time3 : 0,996602425688886 ms. <- BASM loop.
Time4a : 0,608267951561275 ms. <- BASM unrolled loop.
Time4b : 0,574162510648039 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,499628206138524 ms. !!! <- Delphi unrolled loop 5.
È veloce perché le operazioni possono essere eseguite solo con i registri, invece di dover archiviare e recuperare memoria. I processori moderni eseguono questo in parte in parallelo (una nuova operazione può essere avviata prima della precedente), perché i risultati delle istruzioni consecutive sono indipendenti l'uno dall'altro.
Il codice macchina è simile al seguente:
push ebx;
// DecPixels[0] := (EncPixels shr 0) and 1;
movzx ecx,al
mov ebx,ecx
// shr ebx,var
PixelLookup:Array[byte] of TDecodedPixels;
// You could precalculate, but the performance gain would hardly be worth it because you call this once only.
for I := 0 to 255 do
DecodePixels5b(I, PixelLookup[I]);
procedure DecodePixels7(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
DecPixels := PixelLookup[EncPixels];
end;
Results:
Time1 : 1,03096806151283 ms. <- Delphi loop.
Time2 : 0,740308641141395 ms. <- Delphi unrolled loop.
Time3 : 0,996602425688886 ms. <- BASM loop.
Time4a : 0,608267951561275 ms. <- BASM unrolled loop.
Time4b : 0,574162510648039 ms. <- BASM unrolled loop instruction switch.
Time5 : 0,499628206138524 ms. !!! <- Delphi unrolled loop 5.
Time7 : 0,251533475182096 ms. <- simple table lookup
and bl,$01
mov [edx],bl
// DecPixels[1] := (EncPixels shr 1) and 1;
mov ebx,ecx
shr ebx,1
and bl,$01
mov [edx+$01],bl
// DecPixels[2] := (EncPixels shr 2) and 1;
mov ebx,ecx
shr ebx,$02
and bl,$01
mov [edx+$02],bl
// DecPixels[3] := (EncPixels shr 3) and 1;
mov ebx,ecx
shr ebx,$03
and bl,$01
mov [edx+$03],bl
// DecPixels[4] := (EncPixels shr 4) and 1;
mov ebx,ecx
shr ebx,$04
and bl,$01
mov [edx+$04],bl
// DecPixels[5] := (EncPixels shr 5) and 1;
mov ebx,ecx
shr ebx,$05
and bl,$01
mov [edx+$05],bl
// DecPixels[6] := (EncPixels shr 6) and 1;
mov ebx,ecx
shr ebx,$06
and bl,$01
mov [edx+$06],bl
// DecPixels[7] := (EncPixels shr 7) and 1;
shr ecx,$07
and cl,$01
mov [edx+$07],cl
pop ebx;
Modifica: come suggerito, la ricerca di una tabella è davvero più veloce.
<*>Expanding on Nick D's answer, I tried the following table-lookup based versions, all of which are faster than the implementations you give (and faster than Wouter van Nifterick's code).
Given the following packed array:
const Uint64DecPix : PACKED ARRAY [0..255] OF UINT64 =
( $0000000000000000, $0000000000000001, $0000000000000100, $0000000000000101, $0000000000010000, $0000000000010001, $0000000000010100, $0000000000010101, $0000000001000000, $0000000001000001, $0000000001000100, $0000000001000101, $0000000001010000, $0000000001010001, $0000000001010100, $0000000001010101,
$0000000100000000, $0000000100000001, $0000000100000100, $0000000100000101, $0000000100010000, $0000000100010001, $0000000100010100, $0000000100010101, $0000000101000000, $0000000101000001, $0000000101000100, $0000000101000101, $0000000101010000, $0000000101010001, $0000000101010100, $0000000101010101,
$0000010000000000, $0000010000000001, $0000010000000100, $0000010000000101, $0000010000010000, $0000010000010001, $0000010000010100, $0000010000010101, $0000010001000000, $0000010001000001, $0000010001000100, $0000010001000101, $0000010001010000, $0000010001010001, $0000010001010100, $0000010001010101,
$0000010100000000, $0000010100000001, $0000010100000100, $0000010100000101, $0000010100010000, $0000010100010001, $0000010100010100, $0000010100010101, $0000010101000000, $0000010101000001, $0000010101000100, $0000010101000101, $0000010101010000, $0000010101010001, $0000010101010100, $0000010101010101,
$0001000000000000, $0001000000000001, $0001000000000100, $0001000000000101, $0001000000010000, $0001000000010001, $0001000000010100, $0001000000010101, $0001000001000000, $0001000001000001, $0001000001000100, $0001000001000101, $0001000001010000, $0001000001010001, $0001000001010100, $0001000001010101,
$0001000100000000, $0001000100000001, $0001000100000100, $0001000100000101, $0001000100010000, $0001000100010001, $0001000100010100, $0001000100010101, $0001000101000000, $0001000101000001, $0001000101000100, $0001000101000101, $0001000101010000, $0001000101010001, $0001000101010100, $0001000101010101,
$0001010000000000, $0001010000000001, $0001010000000100, $0001010000000101, $0001010000010000, $0001010000010001, $0001010000010100, $0001010000010101, $0001010001000000, $0001010001000001, $0001010001000100, $0001010001000101, $0001010001010000, $0001010001010001, $0001010001010100, $0001010001010101,
$0001010100000000, $0001010100000001, $0001010100000100, $0001010100000101, $0001010100010000, $0001010100010001, $0001010100010100, $0001010100010101, $0001010101000000, $0001010101000001, $0001010101000100, $0001010101000101, $0001010101010000, $0001010101010001, $0001010101010100, $0001010101010101,
$0100000000000000, $0100000000000001, $0100000000000100, $0100000000000101, $0100000000010000, $0100000000010001, $0100000000010100, $0100000000010101, $0100000001000000, $0100000001000001, $0100000001000100, $0100000001000101, $0100000001010000, $0100000001010001, $0100000001010100, $0100000001010101,
$0100000100000000, $0100000100000001, $0100000100000100, $0100000100000101, $0100000100010000, $0100000100010001, $0100000100010100, $0100000100010101, $0100000101000000, $0100000101000001, $0100000101000100, $0100000101000101, $0100000101010000, $0100000101010001, $0100000101010100, $0100000101010101,
$0100010000000000, $0100010000000001, $0100010000000100, $0100010000000101, $0100010000010000, $0100010000010001, $0100010000010100, $0100010000010101, $0100010001000000, $0100010001000001, $0100010001000100, $0100010001000101, $0100010001010000, $0100010001010001, $0100010001010100, $0100010001010101,
$0100010100000000, $0100010100000001, $0100010100000100, $0100010100000101, $0100010100010000, $0100010100010001, $0100010100010100, $0100010100010101, $0100010101000000, $0100010101000001, $0100010101000100, $0100010101000101, $0100010101010000, $0100010101010001, $0100010101010100, $0100010101010101,
$0101000000000000, $0101000000000001, $0101000000000100, $0101000000000101, $0101000000010000, $0101000000010001, $0101000000010100, $0101000000010101, $0101000001000000, $0101000001000001, $0101000001000100, $0101000001000101, $0101000001010000, $0101000001010001, $0101000001010100, $0101000001010101,
$0101000100000000, $0101000100000001, $0101000100000100, $0101000100000101, $0101000100010000, $0101000100010001, $0101000100010100, $0101000100010101, $0101000101000000, $0101000101000001, $0101000101000100, $0101000101000101, $0101000101010000, $0101000101010001, $0101000101010100, $0101000101010101,
$0101010000000000, $0101010000000001, $0101010000000100, $0101010000000101, $0101010000010000, $0101010000010001, $0101010000010100, $0101010000010101, $0101010001000000, $0101010001000001, $0101010001000100, $0101010001000101, $0101010001010000, $0101010001010001, $0101010001010100, $0101010001010101,
$0101010100000000, $0101010100000001, $0101010100000100, $0101010100000101, $0101010100010000, $0101010100010001, $0101010100010100, $0101010100010101, $0101010101000000, $0101010101000001, $0101010101000100, $0101010101000101, $0101010101010000, $0101010101010001, $0101010101010100, $0101010101010101);
PUint64DecPix : pointer = @Uint64DecPix;
you can write the following:
procedure DecodePixelsPS1Pas (EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
DecPixels := TDecodedPixels(Uint64DecPix[EncPixels]);
end;
procedure DecodePixelsPS1PasInline (EncPixels: Byte; var DecPixels: TDecodedPixels);
inline;
begin
DecPixels := TDecodedPixels(Uint64DecPix[EncPixels]);
end;
procedure DecodePixelsPS1Asm (EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
lea ecx, Uint64DecPix //[<-Added in EDIT 3]
//mov ecx, dword ptr PUint64DecPix - alternative to the above line (slower for me)
movzx eax, al
movq xmm0, [8*eax+ecx] //Using XMM rather than MMX so we don't have to issue emms at the end
movq [edx], xmm0 //use MOVQ because it doesn't need mem alignment
end;
The standard PAS and ASM implementations are fairly similar speed-wise, but the PAS implementation marked with "INLINE" is the fastest because it gets rid of all the call/ret involved in calling the routine.
--EDIT--: I forgot to say: since you are implicitly assuming something about the memory layout of your TDecodedPixels structure, it would be better if you declare it as
PACKED ARRAY [0..7] of byte
--EDIT2--: Here are my results for comparison:
Time1 : 2.51638266874701 ms. <- Delphi loop.
Time2 : 2.11277620479698 ms. <- Delphi unrolled loop.
Time3 : 2.21972066282167 ms. <- BASM loop.
Time4a : 1.34093090043567 ms. <- BASM unrolled loop.
Time4b : 1.52222070123437 ms. <- BASM unrolled loop instruction switch.
Time5 : 1.17106364076999 ms. <- Wouter van Nifterick
TimePS1 : 0.633099318488802 ms. <- PS.Pas
TimePS2 : 0.551617593856202 ms. <- PS.Pas Inline
TimePS3 : 0.70921094720139 ms. <- PS.Asm (speed for version before 3rd EDIT)
Compilers do very good job at optimizing small routines.
I would optimize your code by using a lookup table.
Since you decode a single byte - 256 different states - you can precalculate 256 arrays with the unpacked values.
Edit: Note that Pentium processors can execute specific instructions in parallel (Superscalar architecture), it is called pairing.
Pure software solution
Using the beautiful technique from this question, which is again inspired by this question we'll have a great solution like this with only one line of code (excluding declarations)
type TPackedDecodedPixels = record
case integer of
0: (a: TDecodedPixels);
1: (v: Int64);
end;
procedure DecodePixels(EncPixels: byte; var DecPixels: TDecodedPixels); inline;
const
magic = $8040201008040201;
mask = $8080808080808080;
begin
TPackedDecodedPixels(DecPixels).v := SwapEndian(((EncPixels*magic) and mask) shr 7);
end;
Of course you need to make sure that DecPixels
is properly 8-byte aligned or you may suffer from some slow down (or even segfaults on other architectures). You can also easily vectorize the function to make it faster
Explanation:
Assume we have the following bit pattern as abcdefgh
. We'll want the output array to contain
0000000a 0000000b 0000000c 0000000d 0000000e 0000000f 0000000g 0000000h (1)
Reading that in little endian as a 64-bit integer we'll get %0000000h0000000g0000000f0000000e0000000d0000000c0000000b0000000a
. We have to find a magic number that shifts the original bits to the positions that we can extract the necessary bits
Let's multiply the value with the magic number
| b7 || b6 || b4 || b4 || b3 || b2 || b1 || b0 |
abcdefgh (1-byte value)
x 1000000001000000001000000001000000001000000001000000001000000001
__________________________________________________________________
= h0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh
At this point all the pixels' bits have been moved to the most significant bits of the corresponding bytes. As they already lied in the right place, we just need to strip out the remaining bits with and
| b7 || b6 || b4 || b4 || b3 || b2 || b1 || b0 |
h0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh0abcdefgh
& 1000000010000000100000001000000010000000100000001000000010000000
__________________________________________________________________
= h0000000g0000000f0000000e0000000d0000000c0000000b0000000a0000000 (8-byte array)
Now the pixels' bits are in the most significant bits of the corresponding bytes, we need to do a logical right shift by 7 to move them to the least significant position. Because the OP wants the value in reversed order, we need SwapEndian()
to convert the bytes to big endian. If you just want little endian you can stop at this step
So the magic number is %1000000001000000001000000001000000001000000001000000001000000001 = $8040201008040201
and the mask is %1000000010000000100000001000000010000000100000001000000010000000 = $8080808080808080
. Of course in reality to solve the problem and get those values we need to do backwards from the final result → multiplied result → magic number
But why did I put the bytes in little endian at (1) and then have to convert back to big endian? Why don't just arrange the bytes in big endian order and find the magic number for that? In case you're wondering about that then it's because that way it'll only work for at most 7 bits at a time. I did that way in my old answer and have to split a bit off then combine it back later
0abcdefg
x 0000000000000010000001000000100000010000001000000100000010000001
__________________________________________________________________
= 00000000abcdefgabcdefgabcdefgabcdefgabcdefgabcdefgabcdefgabcdefg
& 0000000000000001000000010000000100000001000000010000000100000001
__________________________________________________________________
= 000000000000000a0000000b0000000c0000000d0000000e0000000f0000000g
Hardware support
This is actually a special case of bit expand with a constant mask. In AVX2 Intel introduced the pdep
instruction in BMI2 instruction set for that purpose, so you just need a single instruction to get the result. In other languages you can use this with the intrinsic function _pext_u64
. Unfortunately AFAIK Free Pascal doesn't support it and you have to use assembly directly. However the expression will look like
TPackedDecodedPixels(DecPixels).v := _pext_u64(EncPixels, $0101010101010101);
Correctness check:
I tried comparing the OP's version with both my versions and didn't find any problem until now. The compiler output is like this
mov al, dil
mov rbx, rsi
movzx edi, al
movabs rax, 0x8040201008040201
imul rdi, rax
movabs rax, 0x8080808080808080
and rdi, rax
shr rdi, 0x7
call 4016a0 <SYSTEM_$$_SWAPENDIAN$INT64$$INT64>
mov QWORD PTR [rbx], rax
The FPC output is still sub-optimal because the compiler doesn't know to replace the call to SwapEndian
with BSWAP
, and it copies data unnecessarily. Why mov al, dil; movzx edi, al
instead of just movzx edi, dil
?. As you can see, outputs from C and C++ compilers are a lot better
I was about to give the same algorithm as Wouter van Nifterick.
In addition, I would explain the better performance in terms of dependency chains. In each of the versions that you proposed, when you unrolled your basic loop, you kept a dependency between two successive iterations: each of your shr al, $01; requires the previous value of al to have been computed. If you organize your unrolled iterations such that they can be executed in parallel, they will actually be on a modern processor. Don't be fooled by false dependencies that can be suppressed by register renaming.
Someone pointed out that the Pentium can execute two instructions at once. That's true, but modern processors (since the Pentium Pro, PII,...,Core, Core 2) are executing much more than two instructions at the same time, when they have the chance -- that is, when there is no dependency between the instructions being executed. Note how in Wouter van Nifterick's version each line can be executed independently from the others.
http://www.agner.org/optimize/ has all the information you could ever need to understand the architecture of modern processors and how to take advantage of them.
if you only support 80386 and above you can use BTcc and SETcc set of instructions in this manner:
BT ax,1
SETC [dx]
inc dx
BT ax,2
SETC [dx]
inc dx
etc
How about something like:
/* input byte in eax, address to store result in edx */
and eax, 0xff /* may not be needed */
mov ebx, eax
shl ebx, 7
or eax, ebx
mov ebx, eax
shl ebx, 14
or eax, ebx
mov ebx, eax
and eax, 0x01010101
mov [edx], eax
shr ebx, 4
and ebx, 0x01010101
mov [edx+4], ebx
The likely reason that 4b is faster than 4a is that it parallelizes better. From 4a:
mov bl, al;
and bl, $01; // data dep (bl)
mov [edx], bl; // data dep (bl)
shr al, $01;
mov bl, al; // data dep (al)
and bl, $01; // data dep (bl)
mov [edx + $01], bl; // data dep (bl)
Instructions marked "data dep" cannot begin executing until the previous instruction has finished, and I've written the registers that cause this data dependency. Modern CPUs are capable of starting an instruction before the last one has completed, if there is no dependency. But the way you've ordered these operations prevents this.
In 4b, you have fewer data dependencies:
mov bl, al;
and bl, $01; // data dep (bl)
shr al, $01;
mov [edx], bl;
mov bl, al;
and bl, $01; // data dep (bl)
shr al, $01;
mov [edx + $01], bl;
With this instruction ordering, fewer of the instructions depend on the previous instruction, so there is more opportunity for parallelism.
I can't guarantee that this is the reason for the speed difference, but it is a likely candidate. Unfortunately it is hard to come across answers as absolute as the ones you are looking for; modern processors have branch predictors, multi-level caches, hardware pre-fetchers, and all sorts of other complexities that can make it difficult to isolate the reasons for performance differences. The best you can do is read a lot, perform experiments, and get familiar with the tools for taking good measurements.
I guess it's that writing to memory (actually, cache memory) is slower than working with registers.
So,
mov [edx+...], bl
shr al, $01;
mov bl, al;
gives the processor some time to write bl
to memory before the bl
register is needed again, while
shr al, $01;
mov [edx], bl;
mov bl, al;
needs bl
immediately so the processor has to stop and wait for the memory write to complete.
This is surprising to me. Modern Intel processors do crazy pipelining and register renaming so in my opinion, if anything, DecodePixels4b should be faster, since the dependencies of each instruction are further back. The above is all the explanation I can offer, apart from this:
x86 is a terrible instruction set, and Intel does amazing and very advanced hocus-pocus to make it efficient. If I were you, I would look into something else. There's very little demand for megaMcOptimised software for PCs today. My friendly suggestion is to look into processors for mobile devices (mainly ARM), because in mobile devices, processor speed, power consumption and battery life concerns mean that micro-optimised software is more important. And ARM has a superior instruction set to x86.
SIMD
If you extend the algorithm to processing arrays, then SIMD becomes an optimisation option. Here's a SIMD version that's 1/3 the time of an optimised C equivalent:
int main ()
{
const int
size = 0x100000;
unsigned char
*source = new unsigned char [size],
*dest,
*dest1 = new unsigned char [size * 32],
*dest2 = new unsigned char [size * 32];
for (int i = 0 ; i < size ; ++i)
{
source [i] = rand () & 0xff;
}
LARGE_INTEGER
start,
middle,
end;
QueryPerformanceCounter (&start);
dest = dest1;
for (int i = 0 ; i < size ; ++i)
{
unsigned char
v = source [i];
for (int b = 0 ; b < 8 ; ++b)
{
*(dest++) = (v >> b) & 1;
}
}
unsigned char
bits [] = {1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128},
zero [] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
ones [] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
QueryPerformanceCounter (&middle);
__asm
{
movdqu xmm1,bits
movdqu xmm2,zero
movdqu xmm3,ones
mov ecx,0x100000/4
mov esi,source
mov edi,dest2
l1:
lodsd
movd xmm0,eax
movd xmm4,eax
punpcklbw xmm0,xmm0
punpcklbw xmm4,xmm4
punpcklwd xmm0,xmm0
punpcklwd xmm4,xmm4
punpckldq xmm0,xmm0
punpckhdq xmm4,xmm4
pand xmm0,xmm1
pand xmm4,xmm1
pcmpeqb xmm0,xmm2
pcmpeqb xmm4,xmm2
paddb xmm0,xmm3
paddb xmm4,xmm3
movdqu [edi],xmm0
movdqu [edi+16],xmm4
add edi,32
dec ecx
jnz l1
}
QueryPerformanceCounter (&end);
cout << "Time taken = " << (middle.QuadPart - start.QuadPart) << endl;
cout << "Time taken = " << (end.QuadPart - middle.QuadPart) << endl;
cout << "memcmp = " << memcmp (dest1, dest2, size * 32) << endl;
return 0;
}
Incredible smart solution Chris, what would you do with the inverse problem: make a byte from an array of 8 bytes?
Non optimized solution for the inverse problem:
BtBld PROC Array:DWORD, Pixels:DWORD
mov eax, [Array]
add eax, 7
mov edx, [Pixels]
mov bx, 0
mov ecx, 8
rpt: or bx, [eax]
dec eax
shl bx, 1
loop rpt
shr bx, 1
mov [edx], bl
ret
BtBld ENDP
As you notice, the difference of speed in 4a and 4b implementation is because of CPU optimization (by execute multiple instructions in parallel / pipelining instruction). But the factor is not in the operands, but because of the nature of operator itself.
4a Instruction Sequence:
AND - MOV - SHR
4b Instruction Sequence:
AND - SHR - MOV
Both AND and SHR use Flags register, so these two instructions has wait state in their pipeline.
Read them as follow:
4a: AND (piped) MOV (piped) SHR
4b: AND (WAIT) SHR (piped) MOV
Conclusion: 4b has 7 more wait-state in it's pipeline than 4a, thus it's slower.
Josh mentioned that there's data dependencies, i.e.:
mov bl, al;
and bl, $01; // data dep (bl)
but it's not entirely true since those two instruction can partially be executed in paralel in CPU level:
mov bl, al -> (A:) read al (B:) write bl => (2 clocks in i386)
and bl, 01 -> (C:) read 01 (D:) write bl => idem
Sequentially they take 4 clocks, but pipelined they take only 3 "clocks" (actually the term "clock" is not adequate in pipeline perspective but I used it in context of simplicity)
[--A--][--B--]
[--C--]<wait>[---D--]