Use gzbuffer to read gzipped file quickly and then split content line by line

StackOverflow https://stackoverflow.com/questions/22275917

  •  11-06-2023
  •  | 
  •  

سؤال

I want a function that takes as input a file name and a vector of strings, and that fills the vector line by line by efficiently reading the file. Here is what I did so far:

/** \brief Read the whole file in a vector of lines
 */
  int
  readFile(
    const string & pathToFile,
    vector<string> & lines)
  {
    gzFile stream;
    openFile(pathToFile, stream, "rb");

    int errnum;
    const char * error_msg = NULL;

    size_t nb_bytes_to_read = 256000; // 8192 is default for gzbuffer
    if(gzbuffer(stream, nb_bytes_to_read) == -1){
      error_msg = gzerror(stream, &errnum);
      if(errnum != Z_OK){
        cerr << "ERROR: gzbuffer failed with " << nb_bytes_to_read
             << " bytes" << endl;
        cerr << error_msg << endl;
        exit(EXIT_FAILURE);
      }
    }

    size_t buf_len = nb_bytes_to_read;
    char * buf = (char *) malloc(buf_len);
    if(buf == NULL){
      cerr << "ERROR: can't allocate " << nb_bytes_to_read
           << " bytes" << endl;
      exit(EXIT_FAILURE);
    }

    size_t nb_bytes_read = 0, tot_nb_bytes_read = 0;
    while(! gzeof(stream)){
      nb_bytes_read = gzread(stream, buf + tot_nb_bytes_read,
                             nb_bytes_to_read);
      tot_nb_bytes_read += nb_bytes_read;
      if(nb_bytes_read < nb_bytes_to_read && ! gzeof(stream)){
        error_msg = gzerror(stream, &errnum);
        if(errnum != Z_OK){
          cerr << "ERROR: gzread failed on " << pathToFile << endl;
          cerr << error_msg << endl;
          exit(EXIT_FAILURE);
        }
      }
      if(tot_nb_bytes_read == buf_len){
        buf_len += nb_bytes_to_read;
        buf = (char*) realloc(buf, buf_len);
        if(buf == NULL){
          cerr << "ERROR: can't allocate " << nb_bytes_to_read
               << " bytes" << endl;
          exit(EXIT_FAILURE);
        }
      }
    }

    closeFile(pathToFile, stream);

    lines = split(buf, "\n", lines);

    free(buf);

    return 0;
  }

The documentation of zlib for gzread mentions: "If something other than a gzip stream is encountered after a gzip stream, that remaining trailing garbage is ignored (and no error is returned)". However, for some files, my code above reads "one line too far". More specifically, the output "lines" vector has N elements whereas the input file has N-1 lines. As a result, the last element of the "lines" vector can be something like "\223(\305ĿV".

How can I solve that?

Here are the other functions used in the code above:

  /** \brief Open a file through zlib's gzopen(), exiting on failure
   *
   *  \param pathToFile path handed to gzopen()
   *  \param fileStream receives the handle returned by gzopen()
   *  \param mode gzopen() mode string (e.g. "rb")
   *
   *  On failure a message is printed on cerr and exit(EXIT_FAILURE) is called.
   */
  void
  openFile(
    const string & pathToFile,
    gzFile & fileStream,
    const char * mode)
  {
    // Reset errno so that the value reported below belongs to this call and
    // is not left over from some earlier, unrelated failure.
    errno = 0;
    fileStream = gzopen(pathToFile.c_str(), mode);
    if(fileStream == NULL){
      // Print the whole mode string; dereferencing it would show only its
      // first character ("r" instead of "rb").
      cerr << "ERROR: can't open file " << pathToFile
           << " with mode " << mode
           << " (errno=" << errno << ")" << endl;
      exit(EXIT_FAILURE);
    }
  }

  /** \brief Close a handle with gzclose(), exiting if it reports a failure
   *
   *  \param pathToFile path of the file, used only in the error message
   *  \param fileStream handle passed to gzclose()
   *
   *  If gzclose() returns anything other than Z_OK, a message is printed on
   *  cerr and exit(EXIT_FAILURE) is called.
   */
  void
  closeFile(
    const string & pathToFile,
    gzFile & fileStream)
  {
    const int status = gzclose(fileStream);
    if(status == Z_OK)
      return;

    cerr << "ERROR: can't close the file " << pathToFile
         << ", gzclose() returned " << status << endl;
    exit(EXIT_FAILURE);
  }

  /** \brief Split a NUL-terminated buffer into its non-empty tokens
   *
   *  \param buf NUL-terminated text to cut; it is only read
   *  \param delim set of delimiter characters (each character of the string
   *         is a delimiter on its own)
   *  \param tokens output vector; cleared first, then filled with the tokens
   *  \return reference to \p tokens
   *
   *  Runs of consecutive delimiters count as a single separator, so empty
   *  tokens are never produced. The scan relies on strspn()/strcspn() rather
   *  than strtok(): strtok() keeps hidden global state, which makes it
   *  non-reentrant and would silently break a strtok() sequence that the
   *  caller has in progress.
   */
  vector<string> &
  split(
    char * buf,
    const char * delim,
    vector<string> & tokens)
  {
    tokens.clear();
    if(buf == NULL || delim == NULL)
      return tokens;

    // Skip any leading delimiters, then alternate token / delimiter run.
    const char * cursor = buf + strspn(buf, delim);
    while(*cursor != '\0'){
      const size_t token_len = strcspn(cursor, delim);
      tokens.push_back(string(cursor, token_len));
      cursor += token_len;
      cursor += strspn(cursor, delim);
    }
    return tokens;
  }

(As I'm not a professional programmer, any other advice is welcome!)

هل كانت مفيدة؟

المحلول

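strtok() operates on a null-terminated string. You are providing a buffer read from, presumably, a text file. There is no null. So strtok() is reading past the end of your buffer until it finds an accidental zero in memory. To fix it, allocate one byte more than the data you intend to store and write a terminator after the last byte read (buf[tot_nb_bytes_read] = '\0') before calling split().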

By the way, strtok() has issues, and is not even reentrant. Read the man pages for strtok and strsep.

مرخصة بموجب: CC-BY-SA مع الإسناد
لا تنتمي إلى StackOverflow
scroll top