Question

So I've been working on this for a little bit and I'm having some weird problems. The end goal is to split up an input string by both whitespace and quotes (e.g. this "is a" very "very complex" example goes to {this, is a, very, very complex, example}). Right now it seems to split everything correctly, with the exception of the first string.

Here it is (buff is being passed in with a value from getline):

// Fragment: walks buff one character at a time and records, in tokens,
// pointers to the start of each piece, writing NUL bytes into buff at the
// delimiters. buff, command, running and WORD_SIZE are defined outside this
// snippet.
// Note: this initial allocation is sizeof(char) bytes, i.e. a single byte.
char **tokens = (char **)malloc(sizeof(char));
// NOTE(review): declared as char *, yet it receives the (char **) result of
// realloc below and is dereferenced as a pointer to a token slot.
char *temp;
int count = 0;
// Index in buff where the token currently being scanned begins.
int prev = 0;
// Get tokens
// The bound comes from strlen(command) although the body indexes buff; the
// `<=` lets i reach the index of the terminator as well.
for (int i = 0; i <= strlen(command) && running; i++) {
    // Case 1: a space that is past prev ends an unquoted token.
    if (i > prev && strncmp((buff + i), " ", 1) == 0) {
        // The requested size is expressed in bytes: WORD_SIZE * count.
        temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
        if (temp == NULL) {
            fprintf(stderr, "Error in parsing: ran out of memory\n");
            running = false;
            free(tokens);
        }
        else {
            tokens = temp;
            // temp still equals tokens here, so the next line stores the
            // delimiter's address in the FIRST slot, replacing whatever
            // was stored there before.
            // NOTE(review): this looks like the reason token 0 prints empty.
            *(temp) = (buff + i);
            // Copies one byte from "\0": writes a NUL at the delimiter.
            strncpy(*(temp), "\0", 1);
            // Pointer arithmetic on tokens advances in whole elements, so
            // this moves WORD_SIZE * (count - 1) slots, while the realloc
            // above was sized in bytes.
            // NOTE(review): confirm this stays inside the allocation.
            temp = tokens + WORD_SIZE * (count - 1);
            *(temp) = buff+prev;
            prev = i+1;
        }
    }
    // Case 2: an opening double quote.
    else if (strncmp((buff + i), "\"", 1) == 0) {
        // temp has not been reassigned in this branch yet: it holds whatever
        // an earlier iteration left in it (nothing at all if this is the
        // first branch taken). The realloc only happens further down.
        *(temp) = (buff + i);
            strncpy(*(temp), "\0", 1);
        i++;
        prev = i;
        // Advance i to the next double quote; there is no end-of-string
        // check in this loop.
        for (; strncmp((buff + i), "\"", 1) != 0; i++) { }
        temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
        if (temp == NULL) {
            fprintf(stderr, "Error in parsing: ran out of memory\n");
            running = false;
            free(tokens);
        }
        else {
            tokens = temp;
            // Same sequence as in case 1: the first slot is overwritten with
            // the address of the closing quote, which is then set to NUL.
            *(temp) = (buff + i);
            strncpy(*(temp), "\0", 1);
            temp = tokens + WORD_SIZE * (count - 1);
            *(temp) = buff+prev;
            prev = i+1;
        }
    }
    // Case 3: the byte at buff + i is NUL; record the token starting at prev.
    else if (strncmp((buff + i), "\0", 1) == 0) {
        temp = (char **)realloc(tokens, (sizeof(char)) * WORD_SIZE * (++count));
        if (temp == NULL) {
            fprintf(stderr, "Error in parsing: ran out of memory\n");
            running = false;
            free(tokens);
        }
        else {
            tokens = temp;
            temp = tokens + WORD_SIZE * (count - 1);
            *(temp) = buff+prev;
            prev = i+1;
        }
    }
}
// Print the tokens. Unary * binds tighter than +, so the argument below is
// the first slot's pointer plus a byte offset, not the i-th slot.
for (int i = 0; i < count; i++)
     printf("\t%i: %s\n", i, *tokens + sizeof(char) * WORD_SIZE * i);

Right now if I input "this is a test" (no quotes) I get:
0:
1: is
2: a
3: test

Quotes are a little more messed up, for "this \"is a\" very \"very complex\" test" I get:
0:
1: is a
2:
3: very complex
4: test

Was it helpful?

Solution 2

Here is a complete rewrite from scratch, as that was easier than fixing your own code (apologies if that was not what you wanted). A few notes:

  1. No need to test for previous mallocs. You can safely realloc a NULL pointer.
  2. if (strncmp((buff + i), "\"", 1) == 0) -- you can test buff[i] immediately.
  3. Why all that prev shuffling? :) It's enough to loop once over your string.
  4. I left the temp test for a successful realloc in because you had it as well. It's actually unnecessary in my code, as it merely exits main.
  5. added: the character " also introduces a new 'word' when not preceded by a space.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Demo: split a fixed input string into tokens and print them.
 *
 * A token is either a run of characters delimited by spaces or double
 * quotes, or the text between a pair of double quotes (which may contain
 * spaces). A double quote also starts a new token when it directly follows
 * a word. An unterminated quote extends to the end of the input.
 *
 * Each token is copied into its own heap buffer; everything is released
 * before returning.
 *
 * Returns 0 on success, -1 if memory ran out.
 */
int main (void)
{
    const char *input = "this \"is a\" very \"very complex\" test";
    const char *iterate = input;
    char **tokens = NULL;
    int count = 0;
    int status = 0;
    int i;

    while (*iterate)
    {
        /* Skip the spaces that separate tokens. */
        while (*iterate == ' ')
            iterate++;

        if (!*iterate)
            break;

        /* An opening quote switches the delimiter set for this token. */
        int quoted = (*iterate == '"');
        if (quoted)
            iterate++;

        /* Quoted tokens stop at '"' only; plain words also stop at ' '. */
        size_t length = 0;
        while (iterate[length] && iterate[length] != '"'
               && (quoted || iterate[length] != ' '))
            length++;

        /* Grow the pointer array through a temporary so the old block
           is still reachable (and freeable) if realloc fails. */
        char **grown = realloc(tokens, sizeof *tokens * (size_t)(count + 1));
        if (grown != NULL)
            tokens = grown;

        char *word = malloc(length + 1);
        if (grown == NULL || word == NULL)
        {
            fprintf(stderr, "Error in parsing: ran out of memory\n");
            free(word);
            status = -1;
            break;
        }

        memcpy(word, iterate, length);
        word[length] = '\0';
        tokens[count++] = word;

        iterate += length;
        /* Step over the closing quote, if there is one. */
        if (quoted && *iterate == '"')
            iterate++;
    }

    if (status == 0)
    {
        for (i = 0; i < count; i++)
            printf("\t%i: %s\n", i, tokens[i]);
    }

    for (i = 0; i < count; i++)
        free(tokens[i]);
    free(tokens);

    return status;
}

Output for this "is a" very "very complex" test:

0: this
1: is a
2: very
3: very complex
4: test

OTHER TIPS

You said an alternative code would be okay. Simple string parsing algorithms are almost always easier and produce more maintainable code if you use a Deterministic Finite Automaton model to think about them. There are many free references for DFAs on the web.

Here's a DFA that solves your problem.

dfa

The meaning of [any] is "all else". In other words, if no other transition matches, take this one. It becomes the default case in a C switch. The meaning of [eos] is "end of string" or the null character.

Note that the DFA lets you be systematic about all cases, for example a quote appearing while in the middle of a word. Here I treated this as the end of the current word and beginning of a new quoted one. If the spec changes, the DFA is quite easy to change and the changes translate to code with no hard thinking.

All that remains is to add "action code" to capture the token starts and to write null terminators over the delimiters at the obvious places. In C, we have:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Split str IN PLACE into tokens, driven by a three-state DFA
 * (Start, InQuoted, InWord).
 *
 * A token is either a word delimited by ' ', '\t', '\n' or '"', or the text
 * between a pair of double quotes. Delimiters that end a token are
 * overwritten with '\0', so the returned pointers point into str itself.
 *
 * Parameters:
 *   str          - NUL-terminated, writable string; it is modified.
 *   n_tokens_rtn - if non-NULL, receives the number of tokens found
 *                  (not counting the terminating NULL entry).
 *
 * Returns a malloc'ed, NULL-terminated array of token pointers, or NULL if
 * the array could not be allocated (*n_tokens_rtn is then set to 0). The
 * caller frees the array with free(); the tokens themselves belong to str.
 *
 * A missing closing quote prints "Syntax error." to stderr; the tokens
 * collected so far, including the unterminated one, are still returned.
 */
char **tokenize(char *str, int *n_tokens_rtn)
{
  // State of the DFA.
  enum { Error = -1, Start, InQuoted, InWord } state = Start;

  // Index of the current character in str.
  size_t cp = 0;

#define CURRENT_CHAR (str[cp])
#define ADVANCE_TO_NEXT_CHAR do { ++cp; } while (0)
#define MARK_END_OF_TOKEN do { str[cp] = '\0'; } while (0)

  // Number of tokens stored so far.
  size_t tp = 0;

  // Every token starts at a distinct index in [0, strlen(str)], because cp
  // strictly increases between two saves. That bounds the token count by
  // strlen(str) + 1; one more slot holds the terminating NULL.
  // (Input such as a""b""c packs 5 tokens into 7 characters, so a
  // "half the length" estimate is not enough.)
  char **tokens = malloc((strlen(str) + 2) * sizeof *tokens);
  if (tokens == NULL) {
    if (n_tokens_rtn) *n_tokens_rtn = 0;
    return NULL;
  }

#define SAVE_TOKEN do { tokens[tp++] = &str[cp]; } while (0)

  // Each iteration is one DFA transition.
  for (;;) {
    switch (state) {
    case Start:
      switch (CURRENT_CHAR) {
      case '\0':
        goto done_scanning;

      case ' ': case '\t': case '\n':
        ADVANCE_TO_NEXT_CHAR;
        break;

      case '"':
        // The token begins after the opening quote.
        state = InQuoted;
        ADVANCE_TO_NEXT_CHAR;
        SAVE_TOKEN;
        break;

      default:
        state = InWord;
        SAVE_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        break;
      }
      break;

    case InQuoted:
      switch (CURRENT_CHAR) {
      case '\0':
        state = Error; // Missing close quote.
        break;

      case '"':
        state = Start;
        MARK_END_OF_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        break;

      default:
        ADVANCE_TO_NEXT_CHAR;
        break;
      }
      break;

    case InWord:
      switch (CURRENT_CHAR) {
      case '\0':
        goto done_scanning;

      case ' ': case '\t': case '\n':
        state = Start;
        MARK_END_OF_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        break;

      case '"': // Word ended in quote, not space.
        state = InQuoted;
        MARK_END_OF_TOKEN;
        ADVANCE_TO_NEXT_CHAR;
        SAVE_TOKEN;
        break;

      default:
        ADVANCE_TO_NEXT_CHAR;
        break;
      }
      break;

    case Error:
      fprintf(stderr, "Syntax error.\n");
      goto done_scanning;
    }
  }

 done_scanning:
  // Return number of tokens if caller is interested.
  if (n_tokens_rtn) *n_tokens_rtn = (int)tp;

  // Append a null terminator so callers can iterate without the count.
  tokens[tp++] = NULL;

  // Trim the array to the size actually used. If the shrink fails the
  // original block is still valid, so hand that back instead of leaking it.
  char **trimmed = realloc(tokens, tp * sizeof *tokens);
  return trimmed ? trimmed : tokens;
}

// The helper macros are only meaningful inside tokenize().
#undef SAVE_TOKEN
#undef MARK_END_OF_TOKEN
#undef ADVANCE_TO_NEXT_CHAR
#undef CURRENT_CHAR

/*
 * Demo driver: tokenize a sample string and print one token per line.
 * Returns 0 on success, 1 if tokenize() could not produce a token array.
 */
int main(void)
{
  // Must be a writable array, not a string literal: tokenize() writes
  // terminators into it.
  char str[] = "this \"is a\" very \"very complex\" example";
  char **tokens = tokenize(str, NULL);

  // tokenize() hands back the result of a realloc, which may be NULL.
  if (tokens == NULL) {
    fprintf(stderr, "tokenize failed\n");
    return 1;
  }

  // The array is NULL-terminated, so no count is needed.
  for (int i = 0; tokens[i]; i++)
    printf("%s\n", tokens[i]);

  // Only the pointer array is heap-allocated; the tokens live inside str.
  free(tokens);
  return 0;
}

This appears to be a relatively simple problem, so rather than write a full parser, I wrote up a solution that uses the standard C library to do the heavy lifting. Judge for yourself whether this solution is appealing. There are probably ways to improve on what I've done and make the code a bit clearer as well; I'll leave that as an exercise for anyone so inclined.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
 * Demo: split a fixed string into tokens using the standard string-search
 * functions, then print the tokens.
 *
 * A token is either a run of characters up to the next space or double
 * quote, or the text between a pair of double quotes. An unterminated
 * quote runs to the end of the input. Leading, repeated and trailing
 * spaces never produce empty tokens.
 *
 * Each token is copied into its own heap buffer; everything is released
 * before returning.
 *
 * Returns 0 on success, -1 if an allocation failed.
 */
int main(void)
{
    char input_string[] = "this \"is a\" very \"very complex\" test";
    char **tokens = NULL;
    int token_count = 0;
    int status = 0;
    int i;
    const char *ptr = input_string;

    for (;;)
    {
        /* Skip separators first, so trailing spaces end the loop here
           instead of yielding an empty token. */
        ptr += strspn(ptr, " ");
        if (*ptr == '\0')
            break;

        /* A quoted token starts after the quote and ends at the next
           quote; a plain word ends at a space or at a quote. */
        int quoted = (*ptr == '"');
        const char *start = ptr + quoted;
        size_t length = strcspn(start, quoted ? "\"" : " \"");
        const char *end = start + length;

        /* Grow through a temporary so the old array is not lost if
           realloc fails. */
        char **grown = realloc(tokens, (size_t)(token_count + 1) * sizeof *tokens);
        if (grown == NULL)
        {
            status = -1;
            break;
        }
        tokens = grown;

        char *copy = malloc(length + 1);
        if (copy == NULL)
        {
            status = -1;
            break;
        }
        memcpy(copy, start, length);
        copy[length] = '\0';
        tokens[token_count++] = copy;

        /* Step over the closing quote; a delimiter that ended a plain
           word is left in place for the next iteration to examine. */
        ptr = (quoted && *end == '"') ? end + 1 : end;
    }

    if (status == 0)
    {
        for (i = 0; i < token_count; ++i)
            printf("[%d]: %s\n", i, tokens[i]);
    }

    for (i = 0; i < token_count; ++i)
        free(tokens[i]);
    free(tokens);

    return status;
}

Output:

[0]: this
[1]: is a
[2]: very
[3]: very complex
[4]: test
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top