C++: read a metadata file and process fields separated by a mix of whitespace and ':' [closed]

Question 1

If those are variable numbers of "regions" and variable numbers of file name strings you could make your life easier by using vectors instead of arrays.

And in general, I wouldn't do something like this in C++, but rather Python, Ruby, etc. C++ is really not made for such tasks... Anyway, here is some quick and dirty code that should do the job. You can clean it up by refactoring...

#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <vector>
#include <numeric>
#include <algorithm>

using std::vector;
using std::cout;
using std::cin;
using std::endl;
using std::string;
using std::ifstream;
using std::istringstream;

// Splits `s` at every occurrence of `sep` and appends the pieces to `elems`.
//
// `elems` is not cleared first, so repeated calls accumulate into one vector.
// A leading separator, or two adjacent separators, yields an empty piece;
// a trailing separator does not add a final empty piece, and an empty `s`
// appends nothing.
void split(const std::string &s, std::vector<std::string> &elems, char sep=' ') {
  std::string element;
  for (const char c : s) {
    if (c != sep) {
      element.push_back(c);
    } else {
      // Separator reached: emit what has been collected (possibly empty).
      elems.push_back(element);
      element.clear();
    }
  }
  // Emit the last piece only if something follows the final separator.
  if (!element.empty())
    elems.push_back(element);
} // end split()


// Parses one line of space-separated "from:to" pairs and appends the numbers
// to `from` and `to` respectively.
//
// Returns false when the line does not hold an even number of ':'-separated
// tokens or when a token does not start with an integer; in that case `from`
// and `to` may already contain the pairs parsed before the bad token.
static bool parse_ranges(const string &line, vector<int> &from, vector<int> &to) {
  vector<string> pairs;
  split(line, pairs, ' ');

  // Flatten every "a:b" into the sequence a, b, a, b, ...
  vector<string> numbers;
  for (const string &pair : pairs)
    split(pair, numbers, ':');

  // Each "from" needs a matching "to"; an odd count would index past the end.
  if (numbers.size() % 2 != 0)
    return false;

  for (size_t i = 0; i < numbers.size(); i += 2) {
    int lo = 0;
    int hi = 0;
    if (!(istringstream(numbers[i]) >> lo) ||
        !(istringstream(numbers[i + 1]) >> hi))
      return false;
    from.push_back(lo);
    to.push_back(hi);
  }
  return true;
}

// Reads "input.txt": the first two lines contain "from:to" regions, every
// following line is stored verbatim as a file name. The collected values are
// then printed to stdout. Returns 1 (with a message on stderr) if the file
// cannot be opened or one of the two region lines is missing or malformed.
int main() {
  ifstream in_file("input.txt");
  if (!in_file) {
    std::cerr << "Could not open input.txt\n";
    return 1;
  }

  vector<int> from;
  vector<int> to;
  string line;

  // Both region lines are parsed the same way and feed the same vectors.
  for (int line_no = 1; line_no <= 2; ++line_no) {
    // Check getline: on failure `line` would still hold the previous line.
    if (!getline(in_file, line)) {
      std::cerr << "input.txt: missing region line " << line_no << "\n";
      return 1;
    }
    if (!parse_ranges(line, from, to)) {
      std::cerr << "input.txt: malformed region line " << line_no << "\n";
      return 1;
    }
  }

  // append the rest of the input file (i.e., the filenames)
  vector<string> filenames;
  while (getline(in_file, line)) {
    filenames.push_back(line);
  }

// Printing out the contents to make sure it worked

  cout << "FILENAMES:\n";
  for (const auto &ele : filenames)
    cout << ele << ",";

  cout << "\nFROM numbers:\n";
  for (const auto ele : from)
    cout << ele << ",";

  cout << "\nTO numbers:\n";
  for (const auto ele : to)
    cout << ele << ",";
  cout << endl;


/*
FILENAMES:
tr429a.frank/tr429a.reg1.0.pdb
,tr429a.frank/tr429a.reg1.1.pdb
,tr429a.frank/tr429a.reg1.2.pdb
,tr429a.frank/tr429a.reg1.3.pdb
,tr429a.frank/tr429a.reg1.4.pdb
,tr429a.frank/tr429a.reg1.5.pdb
,tr429a.frank/tr429a.reg1.6.pdb
,tr429a.frank/tr429a.reg1.7.pdb,
FROM numbers:
48,68,93,22,34,50,71,
TO numbers:
49,70,100,33,47,67,92,
*/

  return 0;
}

Question 2

It is so much easier to do more complicated parsing in steps. Sorry for the mix of styles:

// Reads up to two lines from std::cin and prints each space-separated piece
// of those lines on its own line of std::cout.
//
// Lines may be of any length. A leading space or two adjacent spaces produce
// an empty piece (printed as an empty line); a trailing space does not.
// Stops early if no further line can be read.
void parserange()
{
  for (int remaining = 2; remaining > 0; --remaining)
  {
    std::string line;
    if (!std::getline(std::cin, line)) { break; }

    std::string::size_type start = 0;
    while (start < line.size())
    {
      const std::string::size_type space = line.find(' ', start);
      if (space == std::string::npos)
      {
        // Last piece of the line: everything up to the end.
        std::cout << line.substr(start) << "\n";
        break;
      }
      // This is the point where a piece such as "from:to" could be parsed further.
      std::cout << line.substr(start, space - start) << "\n";
      start = space + 1;
    }
  }
}

So you read a line into s[100], then look for spaces and break the string up into parts. At the point where I do cout, you would further parse from & to.

Output from parserange():

Question 3

Try using Boost Spirit:

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <boost/fusion/adapted/struct.hpp>
#include <fstream>

// One "from:to" pair of 32-bit integers as read from the input file.
struct region {
    int32_t from;
    int32_t to;
};

// Exposes `region` to Boost.Fusion as the sequence (from, to), in that order,
// so that a Spirit rule declared with attribute `region()` can store its two
// parsed integers directly into the struct's members.
BOOST_FUSION_ADAPT_STRUCT(region, (int32_t,from)(int32_t,to))

// Parses "input.txt" in a single Spirit Qi expression: two lines of "from:to"
// regions followed by one file name per line, then prints what was parsed.
// Returns 1 with a message on stderr when the file cannot be read or does not
// match that layout.
int main()
{
    using namespace boost::spirit::qi;
    // One "from:to" pair; `blank_type` is the skipper, so spaces/tabs between
    // tokens are skipped while line ends stay significant.
    rule<boost::spirit::istream_iterator, region(), blank_type> region_ = int_ >> ':' >> int_ ;

    std::ifstream ifs("input.txt");

    std::vector<region> line1, line2;
    std::vector<std::string> filenames;

    // noskipws: the stream itself must not swallow whitespace, otherwise the
    // parser would never see the `eol`s that separate the lines.
    // lexeme[...] turns the skipper off so a file name is taken verbatim up to
    // the line end; `% eol` collects a list of them separated by line ends.
    if (ifs >> std::noskipws >> phrase_match(
                +region_ >> eol >> +region_ >> eol >>
                lexeme[+(char_ - eol)] % eol,
                blank, line1, line2, filenames))
    {
        std::cout << "Parse success\n";
        for (auto& r : line1) std::cout << "line1: from " << r.from << " to " << r.to << "\n";
        for (auto& r : line2) std::cout << "line2: from " << r.from << " to " << r.to << "\n";
        for (auto& fn : filenames) std::cout << "filename '" << fn << "'\n";
    }
    else
    {
        // Reached when the file could not be opened or the grammar did not match.
        std::cerr << "Parse failed\n";
        return 1;
    }
}

Coliru is down, but here's the output for your input file from my machine:

Parse success
line1: from 48 to 49
line1: from 68 to 70
line1: from 93 to 100
line2: from 22 to 33
line2: from 34 to 47
line2: from 50 to 67
line2: from 71 to 92
filename 'tr429a.frank/tr429a.reg1.0.pdb'
filename 'tr429a.frank/tr429a.reg1.1.pdb'
filename 'tr429a.frank/tr429a.reg1.2.pdb'
filename 'tr429a.frank/tr429a.reg1.3.pdb'
filename 'tr429a.frank/tr429a.reg1.4.pdb'
filename 'tr429a.frank/tr429a.reg1.5.pdb'
filename 'tr429a.frank/tr429a.reg1.6.pdb'
filename 'tr429a.frank/tr429a.reg1.7.pdb'