سؤال

The problem I'm trying to solve is optimising the input of some 3rd party code, which has the command line "program input_file output_file". The 3rd party code handles the input_file with standard fopen, fseek, fread etc. I want to be able to use multiple input files, treating them as a single file as if they were concatenated in the order they're supplied. I have the 3rd party code but want to modify it as little as possible. Currently I am concatenating the files and then calling the program with the concatenated file as input; I'm trying to eliminate the concatenation, as the files can be large and it takes time. Reading from stdin doesn't do what I want, because the program writes stdin to a file to allow seeks.

The solution I'm working on is to accept the input_file commandline argument as many files concatenated (? delimited), and adding concat_stream.h to the start of the program source (after including stdio). concat_stream.h implements transparently treating multiple streams as one stream by intercepting the standard calls, and implementing the concatenated streams with some global arrays of the streams and accompanying data. Here's a small portion of concat_stream.h as an example:

    /* fopen replacement: a name containing '?' is opened as a concat_stream,
       any other name is passed straight to the standard library. */
    FILE * fopen_concat_streams (char * filename, char * mode )
    {
      char * delim_pos=strchr(filename, '?');

      /*no delimiter: an ordinary file, use the standard library implementation*/
      if( delim_pos==NULL )
        return fopen(filename, mode);

      /*delimiter present: set up the concat_stream, the first stream is returned as its id*/
      return concat_streams_init(filename, mode);
    }

    /* ftell replacement: reports the position within the concatenated data
       when `stream` is a concat_stream and falls back to the library ftell
       otherwise. The concat branch is elided ("...") in this excerpt. */
    long int ftell_concat_streams( FILE * stream )
    {
      unsigned int index=is_stream_concat(stream);/*work out if stream refers to a concat_stream or regular stream*/
      if(index!=CONCAT_STREAMS_MAX)/*is stream a concat_stream? (CONCAT_STREAMS_MAX means "not found")*/
      {
        ...
        return answer;/*work out and return location in concat_stream*/
      }
      else
        return ftell(stream);/*standard library implementation*/
    }

    /* Redirect the program's fopen/ftell calls to the wrappers. The macros
       come after the wrapper definitions, so the fopen/ftell calls inside
       the wrappers themselves are not rewritten. */
    #define fopen(x, y) fopen_concat_streams(x, y)
    #define ftell(x) ftell_concat_streams(x)

My question is am I on the right track, and is there an easier way to do it? If there's a library to sort this out for me I'll use that instead, it seems like it should be a popular thing to do but I haven't found anything so far. A totally different way to solve the initial problem would also be accepted, multiple streams as one is just my best guess at the easiest solution.

هل كانت مفيدة؟

المحلول

If you know the paths and sizes of all files, then this might work. What you try to achieve is to create a virtual file that is made up of all the individual parts.

You will need to create a data structure which contains the file handle and the offset (in the virtual file) of each file. Then you can search in this structure for the real file handle and calculate the correct offsets.

Problems to be aware of:

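  • If you read over the end of a file with a single fread() call, the read has to be split: take what is left of the current file, then continue from the start of the next one.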

Other options:

  • If you don't need fseek(), you can try to teach the code to understand "-" as an alias for stdin and use cat to concatenate the files: cat file1 file2 file3 | program - output

  • Write a file system using the FUSE API. That's not as scary as it sounds in your case. That would allow you to keep the original code unchanged. Instead, you'd use FUSE to make the files appear like one huge file.

نصائح أخرى

It sounds like you want to achieve what bash 4.x achieves through 'process substitution':

the_program <(cat file1 file2 file3) output

That is, you have cat send the input files as one named stream (it'll be a name such as /dev/fd/64, probably) that the program can open and read from. This avoids all modifications to the program.

Does this satisfy your requirements (other than requiring C code to achieve the effect)? One possible problem would be if the program needs a seekable file; it is not clear that you'll be able to seek on the file stream that is opened.

Here's a cutdown intercept solution implementing the basics. Limited testing, limited error checks, about as robust as a feather. Not all functions are complete, and many are missing (if your code uses fseeki64, implement it here etc). It's a solution I'm shying away from (will try fuse as suggested), but if anyone else wants to do it this way this may be a starting point.

main

#include <stdio>
#include "concat_streams.h"
int main(int argc, char*argv[])
{
  char buf[16];
  concat_streams_global_init('?');
  FILE* file = fopen( "file1?file2?file3?file4", "rb" );
  ...
  fseek( file, 12, SEEK_SET);
  ...
  fread(buf, 1, 16, file);
  ...
  fclose(file);
}

concat_streams.h

/* Capacity of the concat_stream table. is_stream_concat also returns this
   value to mean "not a concat_stream". */
#define CONCAT_STREAMS_MAX 10 /*max number of concat streams*/
/* Per-slot state, indexed by concat_stream slot (0 .. CONCAT_STREAMS_MAX-1).
   The outer arrays are allocated by concat_streams_global_init. */
FILE*** concat_streams=NULL; /* [slot] -> array of the member FILE*s; NULL while the slot is free */
size_t** concat_streams_boundaries=NULL; /* [slot][i] = offset of member i's first byte within the concatenation; holds one entry more than there are members */
size_t* concat_streams_count=NULL; /* [slot] = number of member files */
size_t* concat_streams_selector=NULL; /* [slot] = index of the member currently being read; equal to the count once EOF is reached */
size_t* concat_streams_tot_size=NULL; /* [slot] = sum of the member file sizes */
char concat_streams_delim='?'; /* delimiter character stored by concat_streams_global_init */

/*
 * Look up `stream` in the concat_stream table. A concat_stream is identified
 * by the FILE* of its first member file.
 *
 * Returns the table index of the matching concat_stream, or
 * CONCAT_STREAMS_MAX when `stream` is not one. That includes the case where
 * the table has not been allocated yet, so the wrappers fall back to the
 * standard library instead of dereferencing a NULL table.
 */
int is_stream_concat(FILE* stream)
{
  unsigned int index;

  /*table not allocated: nothing can be a concat_stream*/
  if(concat_streams==NULL)
    return CONCAT_STREAMS_MAX;

  for(index=0; index<CONCAT_STREAMS_MAX; ++index)
  {
    /*free slots are NULL; occupied slots are matched on their first member*/
    if(concat_streams[index]!=NULL && concat_streams[index][0]==stream)
      break;
  }
  return index;
}

/*
 * Initialise the concat_stream store: remember the delimiter character and
 * allocate the five per-slot tables, zero-filled, with CONCAT_STREAMS_MAX
 * entries each.
 *
 * If any allocation fails, an error is printed, everything allocated so far
 * is released and all table pointers are left NULL.
 * Calling this a second time replaces the table pointers without freeing
 * the previous tables.
 */
void concat_streams_global_init(char delim_use)
{
  concat_streams_delim=delim_use;

  /*calloc zero-fills, so every slot starts out free (NULL) with zero counters*/
  concat_streams=(FILE***) calloc(CONCAT_STREAMS_MAX, sizeof(FILE**));
  concat_streams_boundaries=(size_t**) calloc(CONCAT_STREAMS_MAX, sizeof(size_t*));
  concat_streams_count=(size_t*) calloc(CONCAT_STREAMS_MAX, sizeof(size_t));
  concat_streams_selector=(size_t*) calloc(CONCAT_STREAMS_MAX, sizeof(size_t));
  concat_streams_tot_size=(size_t*) calloc(CONCAT_STREAMS_MAX, sizeof(size_t));

  if(concat_streams==NULL || concat_streams_boundaries==NULL ||
     concat_streams_count==NULL || concat_streams_selector==NULL ||
     concat_streams_tot_size==NULL)
  {
    puts("error, out of memory initialising cat streams");
    free(concat_streams);
    free(concat_streams_boundaries);
    free(concat_streams_count);
    free(concat_streams_selector);
    free(concat_streams_tot_size);
    concat_streams=NULL;
    concat_streams_boundaries=NULL;
    concat_streams_count=NULL;
    concat_streams_selector=NULL;
    concat_streams_tot_size=NULL;
  }
}

/*The meat of fopen*/
FILE* concat_streams_init(char* files_question_delim, char * mode)
{
  unsigned int concat_streams_next_set=0;
  while(concat_streams_next_set<CONCAT_STREAMS_MAX)
  {
    if(concat_streams[concat_streams_next_set]==NULL)
      break;
    ++concat_streams_next_set;
  }
  if(concat_streams_next_set==CONCAT_STREAMS_MAX)
    return NULL;
  char*files_question_delim_cpy=NULL;
  unsigned int i=0;
  while(files_question_delim[i]!=0)
  {
    if(files_question_delim[i]=='?')
      ++concat_streams_count[concat_streams_next_set];
    ++i;
  }
  ++concat_streams_count[concat_streams_next_set];

  files_question_delim_cpy=(char*)malloc(i);
  memcpy(files_question_delim_cpy, files_question_delim, i);

  concat_streams[concat_streams_next_set]=(FILE**)malloc(sizeof(FILE*)*concat_streams_count[concat_streams_next_set]);
  concat_streams_boundaries[concat_streams_next_set]=(size_t*)malloc(sizeof(size_t)*(concat_streams_count[concat_streams_next_set]+1));
  concat_streams_boundaries[concat_streams_next_set][0]=0;


  char* next_file;
  next_file=strtok(files_question_delim_cpy, "?");
  while(next_file!=NULL)
  {
    concat_streams[concat_streams_next_set][concat_streams_selector[concat_streams_next_set]]=fopen(next_file, "rb");
    if(concat_streams[concat_streams_next_set][concat_streams_selector[concat_streams_next_set]]==NULL)
    {
      fclose_concat_streams(concat_streams[concat_streams_next_set][0]);
      return NULL;/*fopen failed*/
    }
    fseek(concat_streams[concat_streams_next_set][concat_streams_selector[concat_streams_next_set]], 0, SEEK_END);
    concat_streams_boundaries[concat_streams_next_set][1+concat_streams_selector[concat_streams_next_set]] = concat_streams_boundaries[concat_streams_next_set][concat_streams_selector[concat_streams_next_set]] + ftell(concat_streams[concat_streams_next_set][concat_streams_selector[concat_streams_next_set]]);
    concat_streams_tot_size[concat_streams_next_set]+=ftell(concat_streams[concat_streams_next_set][concat_streams_selector[concat_streams_next_set]]);
    rewind(concat_streams[concat_streams_next_set][concat_streams_selector[concat_streams_next_set]]);
    ++concat_streams_selector[concat_streams_next_set];
    next_file=strtok(NULL, "?");
  }
  concat_streams_selector[concat_streams_next_set]=0;

  free(files_question_delim_cpy);
  return concat_streams[concat_streams_next_set][0];
}

FILE * fopen_concat_streams (char * filename, char * mode )
{
  if( strchr(filename, '?')!=NULL )
    return concat_streams_init(filename, mode);
  else
    return fopen(filename, mode);
}

/*only implemented origin==SEEK_SET*/
int fseek_concat_streams( FILE * stream, long int offset, int origin )
{
  unsigned int i=0;
  unsigned int index=is_stream_concat(stream);
  if(index!=CONCAT_STREAMS_MAX)
  {
    switch(origin)
    {
      case SEEK_SET:
        while(i<concat_streams_count[index])
        {
          if(offset>=concat_streams_boundaries[index][i] && offset<concat_streams_boundaries[index][i+1])
            break;
          ++i;
        }
        if(i==concat_streams_count[index])
          return 1;/*out of range*/
        concat_streams_selector[index]=i;
        return fseek(concat_streams[index][concat_streams_selector[index]], offset-concat_streams_boundaries[index][concat_streams_selector[index]], SEEK_SET);
      default:
          puts("error, Only SEEK_SET supported when using cat streams");
        return 1;/*not implemented*/
    }
  }
  else
    return fseek(stream, offset, origin);/*just a normal file*/
}

long int ftell_concat_streams( FILE * stream )
{
  unsigned int index=is_stream_concat(stream);
  if(index!=CONCAT_STREAMS_MAX)
  {
    /*Found*/
    return concat_streams_boundaries[index][concat_streams_selector[index]] + ftell(concat_streams[index][concat_streams_selector[index]]);
  }
  else
    return ftell(stream);
}

/*
 * feof replacement. A concat_stream is at EOF once its selector has moved
 * past the last member, i.e. equals the member count; ordinary streams are
 * answered by the library feof.
 */
int feof_concat_streams( FILE * stream )
{
  unsigned int slot=is_stream_concat(stream);

  /*not in the table: a regular stream*/
  if(slot==CONCAT_STREAMS_MAX)
    return feof(stream);

  return (concat_streams_selector[slot]==concat_streams_count[slot]) ? 1 : 0;
}

/*
 * fread replacement. For a concat_stream, reads up to size*count bytes into
 * `ptr`, continuing into the next member file whenever the current one is
 * exhausted. Data is read straight into the caller's buffer.
 *
 * Returns the number of complete items read (bytes read divided by `size`),
 * as the library fread does. Returns 0 when size or count is 0, when
 * size*count would overflow size_t, or when the stream is already at EOF.
 * A short read that is not an end-of-file prints an error and ends the read
 * with whatever was obtained so far.
 * Ordinary streams are passed to the library fread.
 */
size_t fread_concat_streams (void * ptr, size_t size, size_t count, FILE * stream )
{
  unsigned int index=is_stream_concat(stream);
  char* out_ptr=(char*)ptr;
  size_t wanted;
  size_t done=0;

  if(index==CONCAT_STREAMS_MAX)
    return fread(ptr, size, count, stream);

  if(size==0 || count==0)
    return 0;
  if(count>(size_t)-1/size)
    return 0;/*size*count does not fit in size_t*/
  wanted=size*count;

  while(done<wanted && concat_streams_selector[index]<concat_streams_count[index])
  {
    FILE* current=concat_streams[index][concat_streams_selector[index]];

    done+=fread(out_ptr+done, 1, wanted-done, current);
    if(done==wanted)
      break;

    if(feof(current)==0)
    {
      puts("EOF not set, read error");
      break;
    }

    /*current member exhausted: reset it, then move on to the start of the next one*/
    rewind(current);
    ++concat_streams_selector[index];
    if(concat_streams_selector[index]<concat_streams_count[index])
      rewind(concat_streams[index][concat_streams_selector[index]]);
  }
  return done/size;
}

/*
 * fwrite replacement. Concat_streams are read-only: writing to one prints an
 * error and reports zero items written. Ordinary streams are passed to the
 * library fwrite.
 */
size_t fwrite_concat_streams ( const void * ptr, size_t size, size_t count, FILE * stream )
{
  unsigned int slot=is_stream_concat(stream);

  /*regular stream: nothing special to do*/
  if(slot==CONCAT_STREAMS_MAX)
    return fwrite(ptr, size, count, stream);

  puts("error, writing to cat_streams not supported");
  return 0;
}

/*
 * fclose replacement. For a concat_stream, closes every member file, frees
 * the slot's arrays and marks the slot free again.
 *
 * Returns 0 when every member closed cleanly and EOF if any fclose failed
 * (the remaining members are still closed and the slot is still released).
 * Ordinary streams are passed to the library fclose.
 */
int fclose_concat_streams ( FILE * stream )
{
  unsigned int index=is_stream_concat(stream);
  size_t i;
  int result=0;

  if(index==CONCAT_STREAMS_MAX)
    return fclose(stream);

  for(i=0; i<concat_streams_count[index]; ++i)
  {
    /*a NULL entry means the stream was only partially set up; entries
      from there on do not hold open files, and fclose(NULL) is undefined*/
    if(concat_streams[index][i]==NULL)
      break;
    if(fclose(concat_streams[index][i])!=0)
      result=EOF;
  }
  free(concat_streams[index]);
  concat_streams[index]=NULL;
  free(concat_streams_boundaries[index]);
  concat_streams_boundaries[index]=NULL;
  concat_streams_count[index]=0;
  concat_streams_selector[index]=0;
  concat_streams_tot_size[index]=0;
  return result;
}

/* Redirect the host program's stdio calls to the concat-aware wrappers.
   These macros come after the wrapper definitions, so the stdio calls
   written inside the wrappers are not rewritten; only code that follows
   this point is redirected. */
#define fseek(x, y, z) fseek_concat_streams(x, y, z)
#define fread(w, x, y, z) fread_concat_streams(w, x, y, z)
#define fwrite(w, x, y, z) fwrite_concat_streams(w, x, y, z)
#define fopen(x, y) fopen_concat_streams(x, y)
#define ftell(x) ftell_concat_streams(x)
#define feof(x) feof_concat_streams(x)
#define fclose(x) fclose_concat_streams(x)
مرخصة بموجب: CC-BY-SA مع الإسناد
لا تنتمي إلى StackOverflow
scroll top