Question

I have the following code (RAD Studio XE2, Windows 7 x64):

program letters;

{$APPTYPE CONSOLE}

{$DEFINE BOO}

const
  ENGLISH_ALPHABET = 'abcdefghijklmnopqrstuvwxyz';

begin
{$IFDEF BOO}
  writeln;
{$ENDIF}
  write(ENGLISH_ALPHABET[1]:3);

  readln;
end.

When {$DEFINE BOO} directive is turned off, I have the following (expected) output (spaces are replaced with dots for readability):

..a

When the directive is turned on, I have the following (unexpected) output:

// empty line here
?..a

instead of expected

// empty line here
..a

When I change const ENGLISH_ALPHABET to const ENGLISH_ALPHABET: AnsiString, the expected output is printed without question character. When :3 formatting is removed or changed to :1, there is no question mark. When the output is redirected to file (either by AssignFile(Output, 'boo.log') or from command line), there is no question mark again.

What is the correct explanation for this behavior?

Was it helpful?

Solution

This is a rather odd bug in the RTL. The call to write resolves to a call to _WriteWChar. This function is implemented like this:

function _WriteWChar(var t: TTextRec; c: WideChar; width: Integer): Pointer;
begin
  if width <= 1 then
    result := _Write0WChar(t, c)
  else
  begin
    if t.UTF16Buffer[0] <> #0 then
    begin
      _Write0WChar(t, '?');
      t.UTF16Buffer[0] := #0;
    end;

    _WriteSpaces(t, width - 1);
    Result := _Write0WChar(t, c);
  end;
end;

The ? that you see is emitted by the code above.

So, why does this happen. The simplest SSCCE that I can construct is this:

{$APPTYPE CONSOLE}
const
  ENGLISH_ALPHABET = 'abcdefghijklmnopqrstuvwxyz';
begin
  writeln;
  write(ENGLISH_ALPHABET[1]:3);
end.

So, your first call writeln and that resolves to this:

function _WriteLn(var t: TTextRec): Pointer;
begin
  if (t.Flags and tfCRLF) <> 0 then
    _Write0Char(t, _AnsiChr(cCR));
  Result := _Write0Char(t, _AnsiChr(cLF));
  _Flush(t);
end;

Here you push a single character, cLF, ASCII character 10, linefeed, onto the output text record. This results in t.MBCSBuffer being fed the cLF character. That character is left in the buffer which is fine because System._Write0Char.WriteUnicodeFromMBCSBuffer does this:

t.MBCSLength := 0;
t.MBCSBufPos := 0;

But when _WriteWChar executes, it indiscriminately looks in t.UTF16Buffer. Which is declared in TTextRec like this:

type
  TTextRec = packed record 
    ....
    MBCSLength: ShortInt;
    MBCSBufPos: Byte;
    case Integer of
      0: (MBCSBuffer: array[0..5] of _AnsiChr);
      1: (UTF16Buffer: array[0..2] of WideChar);
  end;

So, MBCSBuffer and UTF16Buffer share the same storage.

The bug is that _WriteWChar should not look at the content of t.UTF16Buffer without first checking the length of the buffer. Something that is not immediately obvious how to achieve because TTextRec has not UTF16Length. Instead, if t.UTF16Buffer contains meaningful content, the convention is that its length is given by -t.MBCSLength!

So _WriteWChar should perhaps be:

function _WriteWChar(var t: TTextRec; c: WideChar; width: Integer): Pointer;
begin
  if width <= 1 then
    result := _Write0WChar(t, c)
  else
  begin
    if (t.MBCSLength < 0) and (t.UTF16Buffer[0] <> #0) then
    begin
      _Write0WChar(t, '?');
      t.UTF16Buffer[0] := #0;
    end;

    _WriteSpaces(t, width - 1);
    Result := _Write0WChar(t, c);
  end;
end;

Here is a rather vile hack that fixes _WriteWChar. Note that I have not been able to get the address of System._WriteSpaces to be able to call it. That's something that could be done if you were desperate to fix this.

{$APPTYPE CONSOLE}

uses
  Windows;

procedure PatchCode(Address: Pointer; const NewCode; Size: Integer);
var
  OldProtect: DWORD;
begin
  if VirtualProtect(Address, Size, PAGE_EXECUTE_READWRITE, OldProtect) then
  begin
    Move(NewCode, Address^, Size);
    FlushInstructionCache(GetCurrentProcess, Address, Size);
    VirtualProtect(Address, Size, OldProtect, @OldProtect);
  end;
end;

type
  PInstruction = ^TInstruction;
  TInstruction = packed record
    Opcode: Byte;
    Offset: Integer;
  end;

procedure RedirectProcedure(OldAddress, NewAddress: Pointer);
var
  NewCode: TInstruction;
begin
  NewCode.Opcode := $E9;//jump relative
  NewCode.Offset := NativeInt(NewAddress)-NativeInt(OldAddress)-SizeOf(NewCode);
  PatchCode(OldAddress, NewCode, SizeOf(NewCode));
end;

var
  _Write0WChar: function(var t: TTextRec; c: WideChar): Pointer;

function _Write0WCharAddress: Pointer;
asm
  MOV     EAX, offset System.@Write0WChar
end;

function _WriteWCharAddress: Pointer;
asm
  MOV     EAX, offset System.@WriteWChar
end;

function _WriteWChar(var t: TTextRec; c: WideChar; width: Integer): Pointer;
var
  i: Integer;
begin
  if width <= 1 then
    result := _Write0WChar(t, c)
  else
  begin
    if (t.MBCSLength < 0) and (t.UTF16Buffer[0] <> #0) then
    begin
      _Write0WChar(t, '?');
      t.UTF16Buffer[0] := #0;
    end;

    for i := 1 to width - 1 do
      _Write0WChar(t, ' ');
    Result := _Write0WChar(t, c);
  end;
end;

const
  ENGLISH_ALPHABET = 'abcdefghijklmnopqrstuvwxyz';

begin
  @_Write0WChar := _Write0WCharAddress;
  RedirectProcedure(_WriteWCharAddress, @_WriteWChar);

  writeln;
  write(ENGLISH_ALPHABET[1]:3);
end.

I submitted QC#123157.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top