buffer overflow when using peg/leg

https://stackoverflow.com/questions/16688745

30-05-2022
|

Question

I'm using Ian Piumarta's peg/leg software to parse a toy language that I'm working on. Unfortunately some inputs to the parser are causing a crash that Clang's address sanitizer says is caused by a heap buffer overflow. Here's the output I get and the relevant lines of code:

==27761== ERROR: AddressSanitizer: heap-buffer-overflow on address 0x7f051b06c398 at pc 0x419e18 bp 0x7fffd2223780 sp 0x7fffd2223778
WRITE of size 8 at 0x7f051b06c398 thread T0
    #0 0x419e17 in yySet grammar.c:257
    #1 0x41767b in yyDone grammar.c:224
    #2 0x41745e in yyparsefrom grammar.c:2061
    #3 0x418dec in xr_parse_dump_from_stdin grammar.c:2129
    #4 0x42a6d9 in main parse_dump.c:19
    #5 0x7f051b09652c in ?? ??:0
0x7f051b06c398 is located 88 bytes to the right of 256-byte region [0x7f051b06c240,0x7f051b06c340)
allocated by thread T0 here:
    #0 0x42f4e0 in malloc ??:0
    #1 0x417234 in yyparsefrom grammar.c:2054
    #2 0x418dec in xr_parse_dump_from_stdin grammar.c:2129
    #3 0x42a6d9 in main parse_dump.c:19
    #4 0x7f051b09652c in ?? ??:0
Shadow byte and word:
  0x1fe0a360d873: fa
  0x1fe0a360d870: fa fa fa fa fa fa fa fa
More shadow bytes:
  0x1fe0a360d850: 00 00 00 00 00 00 00 00
  0x1fe0a360d858: 00 00 00 00 00 00 00 00
  0x1fe0a360d860: 00 00 00 00 00 00 00 00
  0x1fe0a360d868: fa fa fa fa fa fa fa fa
=>0x1fe0a360d870: fa fa fa fa fa fa fa fa
  0x1fe0a360d878: fa fa fa fa fa fa fa fa
  0x1fe0a360d880: fa fa fa fa fa fa fa fa
  0x1fe0a360d888: fa fa fa fa fa fa fa fa
  0x1fe0a360d890: fa fa fa fa fa fa fa fa
Stats: 0M malloced (0M for red zones) by 136 calls
Stats: 0M realloced by 11 calls
Stats: 0M freed by 15 calls
Stats: 0M really freed by 0 calls
Stats: 3M (898 full pages) mmaped in 7 calls
  mmaps   by size class: 7:4095; 8:2047; 9:1023; 10:511; 11:255; 12:128; 13:64;
  mallocs by size class: 7:112; 8:2; 9:3; 10:14; 11:3; 12:1; 13:1;
  frees   by size class: 7:8; 8:2; 9:2; 10:1; 11:1; 12:1;
  rfrees  by size class:
Stats: malloc large: 0 small slow: 7
==27761== ABORTING

Here's where the memory is allocated in grammar.c (generated from my grammar.leg specification)

  yyctx->valslen= 32;
  yyctx->vals= (YYSTYPE *)malloc(sizeof(YYSTYPE) * yyctx->valslen);

And then accessed here:

YY_LOCAL(void) yySet(yycontext *ctx, char *text, int count)   { ctx->val[count]= ctx->yy; }

It seems that yyctx->val is set to yyctx->vals before parsing.

I had assumed that this meant I was accessing a parser variable generated from some rule that hadn't been defined properly or something but but I can't see any instance of this in my grammar (which is admittedly a bit slapped together with almost certainly a host of other problems). Hopefully somebody can at least point me in the right direction. Thanks!

Edit: To address Dayal rai's question and hopefully give more useful information... if I use this code where it normally overflows:

printf("COUNT %d  w/ TEXT: %s\n", count, text);
fflush(stdout);
ctx->val[count]= ctx->yy;

and this parser input:

#!./parse_dump
object $root;
method foo()
{
    var i = 0;

    while (i < 10) {
        i = i + 1
    }
}

I get this output before the buffer overflow:

COUNT -2  w/ TEXT: root
COUNT -3  w/ TEXT: foo
COUNT -2  w/ TEXT: foo
COUNT -5  w/ TEXT: i
COUNT -2  w/ TEXT: 0
COUNT -1  w/ TEXT: 0
COUNT -2  w/ TEXT: 0
COUNT -2  w/ TEXT: 0
COUNT -3  w/ TEXT: 0
COUNT -2  w/ TEXT:
COUNT -2  w/ TEXT:
COUNT -6  w/ TEXT:
COUNT -2  w/ TEXT:
COUNT -2  w/ TEXT: i
COUNT -1  w/ TEXT: i
COUNT -2  w/ TEXT: i
COUNT -2  w/ TEXT: i
COUNT -3  w/ TEXT: i
COUNT -2  w/ TEXT:
COUNT -2  w/ TEXT: 10
COUNT -1  w/ TEXT: 10
COUNT -2  w/ TEXT: 10
COUNT -2  w/ TEXT: 10
COUNT -3  w/ TEXT: 10
COUNT -1  w/ TEXT:
COUNT -2  w/ TEXT:
COUNT -6  w/ TEXT:
COUNT -5  w/ TEXT: i
COUNT -2  w/ TEXT: i

I don't really understand how the memory is supposed to be laid out here, but it parses some programs just fine if I don't have Clang catch this error and if it doesn't end up corrupting some other important memory and causing a segfault elsewhere. I supposed I should include the grammar too if I'm already dumping all this text here. It's kind of dodgy mainly because I'm still not exactly sure what the language is going to end up looking like but here we go:

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "internal.h"
#include "elixr.h"
#include "compile.h"
#include "types.h"

#define YYSTYPE XR
#define YY_CTX_LOCAL
#define YY_CTX_MEMBERS XR src; XR method; XR object; XR dump;

unsigned int lineNumber;

#define YY_INPUT(buf, result, max)      \
  {                     \
    int c= getc(stdin);             \
    if ('\n' == c || '\r' == c) ++lineNumber;   \
    result= (EOF == c) ? 0 : (*(buf)= c, 1);    \
  }


#define XR_AST1(type, a)       ast_node(AST_##type, a, VAL_NIL, VAL_NIL)
#define XR_AST2(type, a, b)    ast_node(AST_##type, a, b, VAL_NIL)
#define XR_AST3(type, a, b, c) ast_node(AST_##type, a, b, c)
%}

Dump            =     -- o1:Object          { $$ = o1 = list_new(o1); }
                    ( -- o2:Object          { $$ = o1 = list_append(0, o1, o2); }
                    )*                      { ctx->dump = o1;}

Object          = "object" - '$' i:ID ';' - EOL {ctx->object = (i == xr_sym("root")) ? root : object_new(0, i); }
                    (m:Method { object_add_method(ctx->object, m); })*
                    { $$ = ctx->object; }

Method    = -- "method" - i:ID a:Params -- s:Code  { struct XRMethod *m = xr_method_new(i, a); printf("CODE\n-----\n"); qsend(s, "source", val_num(0)); xr_ast_compile(s, m); m->object = ctx->object; $$ = ctx->method = (XR)m; assert(ctx->object); }

Params      = '(' - ')' { $$ = list_empty(); }
            | '(' - p1:ID { p1 = list_new(p1); }
            (',' - p2:ID { p1 = list_append(0, p1, p2); }
            )* ')' { $$ = p1; } 

Code      = SBLOCK -- s:Statements -- CBLOCK   { $$ = XR_AST1(CODE, s);}

Statements  =  (s1:Stmt            { s1 = list_new(s1); }
                 (SEP s2:Stmt     { s1 = list_append(0, s1, s2); })* SEP?
               ) { $$ = s1; }

Stmt        = e:BitAndOr                { $$ = XR_AST1(EXPRSTMT, e); }
            | VAR i:ID !ASSIGN      { $$ = XR_AST1(VDECL, i); }
            | VAR i:ID ASSIGN e:BitAndOr{ $$ = XR_AST2(VINIT, i, e); }
            | i:ID ASSIGN s:BitAndOr    { $$ = XR_AST2(ASSIGN, i, s); }
            | c:Code                { $$ = c; }
            | "if" - '(' - e:BitAndOr - ')' -- t:Stmt -- "else" -- f:Stmt { $$ = XR_AST3(IFELSE, e, t, f); }
            | "if" - '(' - e:BitAndOr - ')' -- c:Stmt { $$ = XR_AST2(IF, e, c); }
            | "while" - '(' - e:BitAndOr - ')' -- c:Stmt { $$ = XR_AST2(WHILE, e, c); }
            | "debug" { $$ = XR_AST1(DEBUG, 0); }
            | "print" - s:BitAndOr { $$ = XR_AST1(PRINT, s); }
            #| "do" -- c:Stmt "while" - '(' - e:Cmp - ')' { $$ = XR_AST2(DOWHILE, e, c; }

BitAndOr        = s:Cmp
              ( AND s2:Cmp { s = XR_AST2(AND, s, s2); }
              | OR s2:Cmp  { s = XR_AST2(OR, s, s2); }
              )* { $$ = s; }

Cmp         = s:Msg
              ( EQ  s2:Msg { s = XR_AST2(EQ, s, s2); }
              | NEQ s2:Msg { s = XR_AST2(NEQ, s, s2); }
              | GT s2:Msg { s = XR_AST2(GT, s, s2); }
              | LT s2:Msg { s = XR_AST2(LT, s, s2); }
              | GTE s2:Msg { s = XR_AST2(GTE, s, s2); }
              | LTE s2:Msg { s = XR_AST2(LTE, s, s2); }
              )*              { $$ = s; }

Msg         = s:Sum
              ( i:ID             { s = XR_AST3(SEND, s, XR_AST1(SYMBOL, i), VAL_NIL);}
              | i:ID a:Arguments { s = XR_AST3(SEND, s, XR_AST1(SYMBOL, i), a); }
              )*                 { $$ = s; }

Arguments = OPEN CLOSE           - { $$ = VAL_NIL; }
          | '(' - l:ExprList ')' - { $$ = l; }

List = '[' -- ']' - { puts("EMPTY LSIT\n"); $$ = list_empty(); }
     | '[' -- l:ExprList -- ']' - { $$ = l; }

ExprList = h:Cmp { h = list_new(h); } (',' -- t:Cmp { list_append(0, h, t);} )* { $$ = h; }


Sum         = l:Product
              ( PLUS  r:Product   { l = XR_AST2(PLUS, l, r); }
              | MINUS r:Product   { l = XR_AST2(MINUS, l, r); }
              )*                  { $$ = l; }

Product     = l:BinaryNot
              ( TIMES  r:BinaryNot    { l = XR_AST2(TIMES, l, r); }
              | DIVIDE r:BinaryNot    { l = XR_AST2(DIVIDE, l, r); }
              )*                  { $$ = l; }

BinaryNot = v:Value     { $$ = v;}
       | NOT v:Value { $$ = XR_AST1(NOT, v); }

#FIXME: sort out values/symbols/ids
Value       = v:NUMBER            { $$ = XR_AST1(NUMBER, v); }
            | v:STRING            { $$ = XR_AST1(STRING, v); }
            | ':' v:ID             { $$ = XR_AST1(SYMBOL, v); }
            | v:List              { $$ = XR_AST1(LIST, v); }
            | "false" -           { $$ = XR_AST1(VALUE, VAL_FALSE); }
            | "true" -            { $$ = XR_AST1(VALUE, VAL_TRUE); }
            | "nil" -             { $$ = XR_AST1(VALUE, VAL_NIL); }
            | "self" -            { $$ = XR_AST1(SELF, 0); }
            | v:ID !ASSIGN        { $$ = XR_AST1(VAR, v); }
            | OPEN e:BitAndOr CLOSE    { $$ = e; }


NUMBER  = FIXNUM | DECIMAL
FIXNUM  = < '-'? [0-9]+ !'.' >          - { $$ = val_num(atoi(yytext)); }
DECIMAL = < [0-9]* '.'? [0-9]+ >        - { $$ = xr_double(atof(yytext)); }
ID      = !KEYWORD < [a-z] [a-z0-9_]* > - { $$ = xr_sym_n(yytext, yyleng); }

#FIXME: escapes
CHAR    = !EOL ('\\' [abefnrtv'"\[\]\\]
               | '\\' [0-3][0-7][0-7]
               | '\\' [0-7][0-7]?
               | !'\\' .
               )
STRING  = ["] < ( !["] CHAR )* > ["] - { $$ = xr_strn(yytext, yyleng); }



SEP     = (';' | EOL)   --

# TODO: sort out keywords
KEYWORD = "debug" | "nil" | "while" | "self" | "else" | "true" | "false" | "if" |"var" | "print"

ASSIGN  = '=' !'='      -
PLUS    = '+'           -
MINUS   = '-'           -
TIMES   = '*'           -
DIVIDE  = '/'           -
OPEN    = '('           -
CLOSE   = ')'           -
VAR     = "var"         -
SBLOCK  = '{'           -
CBLOCK  = '}'           -
NOT     = '!'           -

EQ      = '=='          -
NEQ     = '!='          -
GT      = '>'           -
LT      = '<'           -
GTE     = '>='          -
LTE     = '<='          -

OR      = '||'          -
AND     = '&&'          -

SPACE = ' ' | '\f' | '\v' | '\t'
COMMENT = '#' (!EOL .)*
EOL     = '\n' | '\r\n' | '\r'
-       = (SPACE | COMMENT)*
--      = (SPACE | COMMENT | EOL)*
#EOF     = !.
%%

void yyerror(char *message, yycontext *yyctx)
{

char *fileName = "<stdin>";
FILE *input = stdin;
  fprintf(stderr, "%s:%d: %s", fileName, lineNumber, message);
  if (yyctx->text[0]) fprintf(stderr, " near token '%s'", yyctx->text);
  if (yyctx->pos < yyctx->limit || !feof(input))
    {
      yyctx->buf[yyctx->limit]= '\0';
      fprintf(stderr, " before text \"");
      while (yyctx->pos < yyctx->limit)
    {
      if ('\n' == yyctx->buf[yyctx->pos] || '\r' == yyctx->buf[yyctx->pos]) break;
      fputc(yyctx->buf[yyctx->pos++], stderr);
    }
      if (yyctx->pos == yyctx->limit)
    {
      int c;
      while (EOF != (c= fgetc(input)) && '\n' != c && '\r' != c)
        fputc(c, stderr);
    }
      fputc('\"', stderr);
    }
  fprintf(stderr, "\n");
}

XR xr_parse_code_from_stdin()
{
    yycontext ctx;
    memset(&ctx, 0, sizeof(yycontext));

    while (yyparsefrom(&ctx, yy_Code));


    return ctx.src;
}

XR xr_parse_method_from_stdin()
{
    yycontext ctx;
    memset(&ctx, 0, sizeof(yycontext));

    while (yyparsefrom(&ctx, yy_Method));

    return ctx.method;
}

XR xr_parse_dump_from_stdin()
{
    lineNumber = 1;

    yycontext ctx;
    memset(&ctx, 0, sizeof(yycontext));

    while (yyparsefrom(&ctx, yy_Dump));

    /*yyerror("syntax error", &ctx);*/

    return ctx.dump;
}

Solution

It seems that the problem was that my grammar was causing too much recursion and yyctx->valslen= 32; wasn't enough to hold it. Leg doesn't have any bounds checks on this either so I just assumed it would be fine. I though the vals memory just held the variables used in the grammar rules and I only have ~1-5 of those in each rule. This guy got peg/leg to build on windows and also fixed this bug by including a #define for a bigger vals array as well as an assert to check bounds in yyPush. Thanks dude!

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow