Efficient parsing of mmap file

Question 1

read in header information to get the number of points
reserve space in a std::vector for N*num_points (N=3 assuming only X,Y,Z, 6 with normals, 9 with normals and rgb)
load the remainder of the file into a string
boost::spirit::qi::phrase_parse into the vector.

//code here can parse a file with 40M points (> 1GB) in about 14s on my 2 year old macbook:

#include <boost/spirit/include/qi.hpp>
#include <fstream>
#include <vector>

template <typename Iter>
bool parse_into_vec(Iter p_it, Iter p_end, std::vector<float>& vf) {
    using boost::spirit::qi::phrase_parse;
    using boost::spirit::qi::float_;
    using boost::spirit::qi::ascii::space;

    bool ret = phrase_parse(p_it, p_end, *float_, space, vf);
    return p_it != p_end ? false : ret;
}

int main(int argc, char **args) {
    if(argc < 2) {
        std::cerr << "need a file" << std::endl;
        return -1;
    }
    std::ifstream in(args[1]);

    size_t numPoints;
    in >> numPoints;

    std::istreambuf_iterator<char> eos;
    std::istreambuf_iterator<char> it(in);
    std::string strver(it, eos);

    std::vector<float> vf;
    vf.reserve(3 * numPoints);

    if(!parse_into_vec(strver.begin(), strver.end(), vf)) {
        std::cerr << "failed during parsing" << std::endl;
        return -1;
    }

    return 0;
}

Question 2

AFAICT, you're currently copying the entire contents of the file into strBuffer.

What I think you want to do is use boost::iostreams::stream with your mapped_file_source instead.

Here's an untested example, based on the linked documentation:

// Create the stream
boost::iostreams::stream<boost::iostreams::mapped_file_source> str("some/path/file");
// Alternately, you can create the mapped_file_source separately and tell the stream to open it (using a copy of your mapped_file_source)
boost::iostreams::stream<boost::iostreams::mapped_file_source> str2;
str2.open(file);

// Now you can use std::getline as you normally would.
std::getline(str, strMaxPts);

As an aside, I'll note that by default mapped_file_source maps the entire file, so there's no need to pass the size explicitly.

Question 3

You can go with something like this (just a fast concept, you'll need to add some additional error checking etc.):

#include "boost/iostreams/stream.hpp"
#include "boost/iostreams/device/mapped_file.hpp"
#include "boost/filesystem.hpp"
#include "boost/lexical_cast.hpp"

double parse_double(const std::string & str)
{
  double value = 0;
  bool decimal = false;
  double divisor = 1.0;
  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
  {
    switch (*it)
    {
    case '.':
    case ',':
      decimal = true;
      break;
    default:
      {
        const int x = *it - '0';
        value = value * 10 + x;
        if (decimal)
          divisor *= 10;
      }
      break;
    }
  }
  return value / divisor;
}


void process_value(const bool initialized, const std::string & str, std::vector< double > & values)
{
  if (!initialized)
  {
    // convert the value count and prepare the output vector
    const size_t count = boost::lexical_cast< size_t >(str);
    values.reserve(count);
  }
  else
  {
    // convert the value
    //const double value = 0; // ~ 0:20 min
    const double value = parse_double(str); // ~ 0:35 min
    //const double value = atof(str.c_str()); // ~ 1:20 min
    //const double value = boost::lexical_cast< double >(str); // ~ 8:00 min ?!?!?
    values.push_back(value);
  }
}


bool load_file(const std::string & name, std::vector< double > & values)
{
  const int granularity = boost::iostreams::mapped_file_source::alignment();
  const boost::uintmax_t chunk_size = ( (256 /* MB */ << 20 ) / granularity ) * granularity;
  boost::iostreams::mapped_file_params in_params(name);
  in_params.offset = 0;
  boost::uintmax_t left = boost::filesystem::file_size(name);
  std::string value;
  bool whitespace = true;
  bool initialized = false;
  while (left > 0)
  {
    in_params.length = static_cast< size_t >(std::min(chunk_size, left));
    boost::iostreams::mapped_file_source in(in_params);
    if (!in.is_open())
      return false;
    const boost::iostreams::mapped_file_source::size_type size = in.size();
    const char * data = in.data();
    for (boost::iostreams::mapped_file_source::size_type i = 0; i < size; ++i, ++data)
    {
      const char c = *data;
      if (strchr(" \t\n\r", c))
      {
        // c is whitespace
        if (!whitespace)
        {
          whitespace = true;
          // finished previous value
          process_value(initialized, value, values);
          initialized = true;
          // start a new value
          value.clear();
        }
      }
      else
      {
        // c is not whitespace
        whitespace = false;
        // append the char to the value
        value += c;
      }
    }
    if (size < chunk_size)
      break;
    in_params.offset += chunk_size;
    left -= chunk_size;
  }
  if (!whitespace)
  {
    // convert the last value
    process_value(initialized, value, values);
  }
  return true;
}

Note that your main problem will be the conversion from string to float, which is very slow (insanely slow in the case of boost::lexical_cast). With my custom special parse_double func it is faster, however it only allows a special format (e.g. you'll need to add sign detection if negative values are allowed etc. - or you can just go with atof if all possible formats are needed).

If you'll want to parse the file faster, you'll probably need to go for multithreading - for example one thread only parsing the string values and other one or more threads converting the loaded string values to floats. In that case you probably won't even need the memory mapped file, as the regular buffered file read might suffice (the file will be read only once anyway).

Question 4

A few quick comments on your code: 1) you're not reserving space for your vector so it's doing expansion every time you add a value. You have read the number of points from the file so call reserve(N) after the clear().

2) you're forcing a map of the entire file in one hit which will work on 64 bits but is probably slow AND is forcing another allocation of the same amount of memory with strBuffer << pbuffer;

http://www.boost.org/doc/libs/1_53_0/doc/html/interprocess/sharedmemorybetweenprocesses.html#interprocess.sharedmemorybetweenprocesses.mapped_file.mapped_file_mapping_regions shows how to getRegion

Use a loop through getRegion to load an estimated chunk of data containing many lines. You are going to have to handle partial buffers - each getRegion will likely end with part of a line you need to preserve and join to the next partial buffer starting the next region.