Because you are dealing with large-scale data processing, I'd recommend trying some assembler: the MMX and SSE2 instructions are designed specifically for such tasks. For example, the paddusb instruction can add 16 pairs of bytes at once with saturation (it clamps the results to the byte range). (Don't forget to align the data chunks properly.)
Example (not thoroughly tested) for a 32-bit compiler. It runs roughly 8–9x faster than the Pascal version when processing 256M arrays (604 vs 5100 ms for 10 repetitions). Note that for reasonably sized data the Pascal version is pretty fast as well.
program Project1;
{$APPTYPE CONSOLE}
uses SysUtils;
procedure AddBytesSat(const A, B, Res: PByteArray; Len: Integer);
// Adds two byte arrays element-wise with unsigned saturation:
//   Res[i] := Min(A[i] + B[i], 255)
//
// Parameters:
//   A, B - source arrays; Res - destination array. All three must be
//          aligned to a 16-byte boundary (movdqa and the memory operand of
//          paddusb fault on unaligned addresses).
//   Len  - number of bytes to process. Only whole 16-byte blocks are
//          handled: Len is rounded down to a multiple of 16, and a value
//          below 16 (including zero or negative) makes the call a no-op.
//
// 32-bit register calling convention: A in EAX, B in EDX, Res in ECX,
// Len on the stack.
asm
  push esi                // ESI must be preserved for the caller
  mov esi, ecx            // ECX is needed as the counter, so keep Res in ESI
  mov ecx, Len
  test ecx, ecx
  jle @@done              // zero or negative length: nothing to do
  shr ecx, 4              // number of 16-byte blocks (Len div 16)
  jz @@done               // fewer than 16 bytes: no complete block
@@next:
  movdqa xmm0, [eax]      // load 16 aligned bytes from A
  paddusb xmm0, [edx]     // add 16 unsigned bytes from B with saturation
  movdqa [esi], xmm0      // store the 16 result bytes into Res
  add eax, 16             // advance all three pointers to the next block
  add edx, 16
  add esi, 16
  dec ecx
  jnz @@next              // repeat until every block has been processed
@@done:
  pop esi
end;
const
  // Demo buffer size in bytes; must be a multiple of 16 for AddBytesSat
  BufSize = 32;
var
  A, B, C: PByteArray;
  i: Integer;
begin
  // Ask the memory manager for 16-byte aligned blocks, as required by the
  // aligned SSE2 loads/stores inside AddBytesSat
  SetMinimumBlockAlignment(System.mba16Byte);
  A := nil;
  B := nil;
  C := nil;
  try
    GetMem(A, BufSize);
    GetMem(B, BufSize);
    GetMem(C, BufSize);
    // A holds 0, 8, 16, ... 248; B holds a constant 200, so the sums pass
    // 255 from index 7 onwards
    for i := 0 to BufSize - 1 do
    begin
      A[i] := 8 * i;
      B[i] := 200;
    end;
    AddBytesSat(A, B, C, BufSize);
    // Clamping demonstration: the first 16 results show the values
    // saturating at 255
    for i := 0 to 15 do
      Writeln(C[i]);
    Readln;
  finally
    // Release the buffers even if an exception occurred above
    FreeMem(C);
    FreeMem(B);
    FreeMem(A);
  end;
end.