How to best write out a std::vector < std::string > container to a HDF5 dataset?
Question
Given a vector of strings, what is the best way to write them out to a HDF5 dataset? At the moment I'm doing something like the following:
// Width, in bytes, of the fixed-size character buffer held by each record.
const unsigned int MaxStrLength = 512;
// One fixed-size record holding a single string. writeVector describes this
// layout to HDF5 via HOFFSET(TempContainer, string), so the member name and
// position are part of the on-disk format.
struct TempContainer {
char string[MaxStrLength];
};
/// Writes every string in `v` to a new one-dimensional dataset named "files"
/// inside `group`, storing each string as one fixed-width TempContainer record.
///
/// Strings longer than MaxStrLength - 1 characters are truncated so that every
/// stored record is NUL-terminated.
///
/// @param group  HDF5 location identifier under which the dataset is created.
/// @param v      Strings to store; may be empty.
void writeVector (hid_t group, std::vector<std::string> const & v)
{
    //
    // Firstly copy the contents of the vector into fixed-size records
    std::vector<TempContainer> tc;
    tc.reserve (v.size ());
    for (std::vector<std::string>::const_iterator i = v.begin ()
           , end = v.end ()
         ; i != end
         ; ++i)
    {
        TempContainer t;
        // strncpy zero-pads short input but does NOT terminate the buffer
        // when the source fills it, so copy one byte less and terminate
        // explicitly.
        strncpy (t.string, i->c_str (), MaxStrLength - 1);
        t.string[MaxStrLength - 1] = '\0';
        tc.push_back (t);
    }

    //
    // Write the records to a dataset
    hsize_t dims[] = { tc.size () } ;
    hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                       , dims
                                       , NULL);

    hid_t strtype = H5Tcopy (H5T_C_S1);
    H5Tset_size (strtype, MaxStrLength);

    hid_t datatype = H5Tcreate (H5T_COMPOUND, sizeof (TempContainer));
    H5Tinsert (datatype
               , "string"
               , HOFFSET(TempContainer, string)
               , strtype);

    hid_t dataset = H5Dcreate1 (group
                                , "files"
                                , datatype
                                , dataspace
                                , H5P_DEFAULT);

    // &tc[0] is undefined for an empty vector, so only write when there is
    // at least one record; the (empty) dataset is still created.
    if (!tc.empty ())
    {
        H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &tc[0] );
    }

    H5Dclose (dataset);
    H5Sclose (dataspace);
    H5Tclose (strtype);
    H5Tclose (datatype);
}
At a minimum, I would really like to change the above so that:
- It uses variable length strings
- I don't need to have a temporary container
I have no restrictions over how I store the data so for example, it doesn't have to be a COMPOUND datatype if there is a better way to do this.
EDIT: Just to narrow the problem down, I'm relatively familiar with playing with the data on the C++ side, it's the HDF5 side where I need most of the help.
Thanks for your help.
Solution
[Many thanks to dirkgently for his help in answering this.]
To write a variable length string in HDF5 use the following:
// Build the variable-length string datatype: copy the H5T_C_S1 base type
// and set its size to H5T_VARIABLE.
hid_t datatype = H5Tcopy (H5T_C_S1);
H5Tset_size (datatype, H5T_VARIABLE);
//
// H5Dwrite is handed the ADDRESS of the char pointer (&s), not the
// pointer itself.
// NOTE(review): `v` and `dataset` are declared outside this fragment;
// `v` is presumably a single std::string here - confirm.
const char * s = v.c_str ();
H5Dwrite (dataset
, datatype
, H5S_ALL
, H5S_ALL
, H5P_DEFAULT
, &s );
One solution for writing a container is to write each element individually. This can be achieved using hyperslabs.
For example:
class WriteString
{
public:
WriteString (hid_t dataset, hid_t datatype
, hid_t dataspace, hid_t memspace)
: m_dataset (dataset), m_datatype (datatype)
, m_dataspace (dataspace), m_memspace (memspace)
, m_pos () {}
private:
hid_t m_dataset;
hid_t m_datatype;
hid_t m_dataspace;
hid_t m_memspace;
int m_pos;
//...
public:
void operator ()(std::vector<std::string>::value_type const & v)
{
// Select the file position, 1 record at position 'pos'
hsize_t count[] = { 1 } ;
hsize_t offset[] = { m_pos++ } ;
H5Sselect_hyperslab( m_dataspace
, H5S_SELECT_SET
, offset
, NULL
, count
, NULL );
const char * s = v.c_str ();
H5Dwrite (m_dataset
, m_datatype
, m_memspace
, m_dataspace
, H5P_DEFAULT
, &s );
}
};
// ...
/// Writes every string in `v` to a new one-dimensional dataset named "files"
/// inside `group`, one variable-length string per element.
///
/// The strings are written one at a time by a WriteString functor, each into
/// its own single-element hyperslab of the file dataspace.
///
/// @param group  HDF5 location identifier under which the dataset is created.
/// @param v      Strings to store.
void writeVector (hid_t group, std::vector<std::string> const & v)
{
    // File dataspace: one element per string in the vector being written.
    hsize_t dims[] = { v.size () } ;
    hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                       , dims, NULL);

    // Memory dataspace: a single element, reused for every write.
    dims[0] = 1;
    hid_t memspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                      , dims, NULL);

    hid_t datatype = H5Tcopy (H5T_C_S1);
    H5Tset_size (datatype, H5T_VARIABLE);

    hid_t dataset = H5Dcreate1 (group, "files", datatype
                                , dataspace, H5P_DEFAULT);

    //
    // Select the "memory" to be written out - just 1 record.
    hsize_t offset[] = { 0 } ;
    hsize_t count[] = { 1 } ;
    H5Sselect_hyperslab( memspace, H5S_SELECT_SET, offset
                         , NULL, count, NULL );

    std::for_each (v.begin ()
                   , v.end ()
                   , WriteString (dataset, datatype, dataspace, memspace));

    H5Dclose (dataset);
    H5Sclose (dataspace);
    H5Sclose (memspace);
    H5Tclose (datatype);
}
OTHER TIPS
Here is some working code for writing a vector of variable length strings using the HDF5 c++ API.
I incorporate some of the suggestions in the other posts:
- use H5T_C_S1 and H5T_VARIABLE
- use string::c_str() to obtain pointers to the strings
- place the pointers into a vector of char* and pass to the HDF5 API
It is not necessary to create expensive copies of the string (e.g. with strdup()). c_str() returns a pointer to the null-terminated data of the underlying string. This is precisely what the function is intended for. Of course, strings with embedded nulls will not work with this...
std::vector is guaranteed to have contiguous underlying storage, so using vector and vector::data() is the same as using raw arrays, but is of course much neater and safer than the clunky, old-fashioned C way of doing things.
#include "H5Cpp.h"
/// Writes `strings` to a new one-dimensional dataset of variable-length
/// strings named `data_set_name` in `file`.
///
/// @param file           Open HDF5 file in which the dataset is created.
/// @param data_set_name  Name of the dataset to create.
/// @param strings        Strings to store; only the characters up to the
///                       first NUL of each string are visible to HDF5.
/// @throws std::runtime_error  wrapping any H5::Exception raised by the
///                             HDF5 C++ API.
void write_hdf5(H5::H5File file, const std::string& data_set_name,
                const std::vector<std::string>& strings )
{
    H5::Exception::dontPrint();

    try
    {
        // HDF5 only understands vector of char* :-(
        // The pointers borrow from `strings`, which outlives this call.
        std::vector<const char*> arr_c_str;
        arr_c_str.reserve(strings.size());
        for (std::vector<std::string>::size_type ii = 0; ii < strings.size(); ++ii)
            arr_c_str.push_back(strings[ii].c_str());

        //
        // one dimension
        //
        hsize_t str_dimsf[1] {arr_c_str.size()};
        H5::DataSpace dataspace(1, str_dimsf);

        // Variable length string
        H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE);
        H5::DataSet str_dataset = file.createDataSet(data_set_name, datatype, dataspace);

        str_dataset.write(arr_c_str.data(), datatype);
    }
    catch (const H5::Exception& err)
    {
        // `std::string` must be qualified: nothing here brings `string`
        // into scope.
        throw std::runtime_error(std::string("HDF5 Error in " )
                                 + err.getFuncName()
                                 + ": "
                                 + err.getDetailMsg());
    }
}
If you are looking at cleaner code: I suggest you create a functor that'll take a string and save it to the HDF5 Container (in a desired mode). Richard, I used the wrong algorithm, please re-check!
// NOTE(review): `write_hdf5` here is presumably an instance of the hdf5
// functor sketched below - confirm.
std::for_each(v.begin(), v.end(), write_hdf5);

// Sketch of a functor that appends each string it is called with to an
// HDF5 container. The `...` placeholders are left for the reader to fill in.
struct hdf5 : public std::unary_function<std::string, void> {
    hdf5() : _dataset(...) {}  // initialize the HDF5 db
    ~hdf5() {}                 // close the HDF5 db (a destructor takes no initializer list)
    // Takes the string by const reference so the functor also works when
    // iterating over a const container.
    void operator()(const std::string& s) {
        // append
        // use s.c_str() ?
    }
};
Does that help get started?
I had a similar issue, with the caveat that I wanted a vector of strings stored as an attribute. The tricky thing with attributes is that we can't use fancy dataspace features like hyperslabs (at least with the C++ API).
But in either case, it may be useful to enter a vector of strings into a single entry in a dataset (if, for example, you always expect to read them together). In this case all the magic comes with the type, not with the dataspace itself.
There are basically 4 steps:
- Make a vector<const char*> which points to the strings.
- Create a hvl_t structure that points to the vector and contains its length.
- Create the datatype. This is a H5::VarLenType wrapping a (variable length) H5::StrType.
- Write the hvl_t type to a dataset.
The really nice part of this method is that you're stuffing the whole entry into what HDF5 considers a scalar value. This means that making it an attribute (rather than a dataset) is trivial.
Whether you choose this solution or the one with each string in its own dataset entry is probably also a matter of the desired performance: if you're looking for random access to specific strings, it's probably better to write the strings out in a dataset so they can be indexed. If you're always going to read them all out together this solution may work just as well.
Here's a short example of how to do this, using the C++ API and a simple scalar dataset:
#include <vector>
#include <string>
#include "H5Cpp.h"
int main(int argc, char* argv[]) {
// Part 0: make up some data
std::vector<std::string> strings;
for (int iii = 0; iii < 10; iii++) {
strings.push_back("this is " + std::to_string(iii));
}
// Part 1: grab pointers to the chars
std::vector<const char*> chars;
for (const auto& str: strings) {
chars.push_back(str.data());
}
// Part 2: create the variable length type
hvl_t hdf_buffer;
hdf_buffer.p = chars.data();
hdf_buffer.len = chars.size();
// Part 3: create the type
auto s_type = H5::StrType(H5::PredType::C_S1, H5T_VARIABLE);
s_type.setCset(H5T_CSET_UTF8); // just for fun, you don't need this
auto svec_type = H5::VarLenType(&s_type);
// Part 4: write the output to a scalar dataset
H5::H5File out_file("vtest.h5", H5F_ACC_EXCL);
H5::DataSet dataset(
out_file.createDataSet("the_ds", svec_type, H5S_SCALAR));
dataset.write(&hdf_buffer, svec_type);
return 0;
}
Instead of a TempContainer, you can use a simple std::vector (you could also templatize it to match T -> basic_string<T>). Something like this:
#include <algorithm>
#include <cstring>
#include <functional>
#include <string>
#include <vector>
/// Function object that converts a std::string into a NUL-terminated
/// std::vector<char>.
///
/// Only the characters up to the first NUL in the string are copied, followed
/// by the terminating NUL itself; a string with embedded NULs is therefore
/// truncated (s.size() != strlen(s.c_str()) in that case).
///
/// The nested typedefs replace the former std::unary_function base, whose
/// template arguments were reversed and which is no longer available in
/// C++17.
class StringToVector {
public:
    typedef std::string argument_type;
    typedef std::vector<char> result_type;

    /// @param s  String to convert.
    /// @return   The characters of `s` up to and including the first NUL.
    std::vector<char> operator()(const std::string &s) const {
        // assumes you want a NUL-terminated string
        const char* str = s.c_str();
        const std::size_t size = 1 + std::strlen(str);
        return std::vector<char>(str, str + size);
    }
};
/// Converts every string in `vi` into a NUL-terminated character buffer.
///
/// @param vi  Input strings.
/// @param vo  Output buffers; resized to vi.size() and overwritten so that
///            vo[k] holds the conversion of vi[k].
void conv(const std::vector<std::string> &vi,
          std::vector<std::vector<char> > &vo)
{
    // std::transform writes through vo.begin() without growing the
    // container, so guarantee vo.size() == vi.size() up front instead of
    // relying on the caller.
    vo.resize(vi.size());
    std::transform(vi.begin(), vi.end(),
                   vo.begin(),
                   StringToVector());
}
In the interest of having the ability to read std::vector<std::string>
I'm posting my solution, based on the hints from Leo here https://stackoverflow.com/a/15220532/364818.
I've mixed C and C++ APIs. Please feel free to edit this and make it simpler.
Note that the HDF5 API returns a list of char* pointers when you call read. These char* pointers must be freed after use, otherwise there is a memory leak.
Usage example
H5::Attribute Foo = file.openAttribute("Foo");
std::vector<std::string> foos
Foo >> foos;
Here's the code
/// Reads a one-dimensional attribute of variable-length strings into `array`.
///
/// The dataspace and datatype are obtained through the C++ API so that their
/// handles are released automatically on every exit path, including when an
/// exception is thrown.
///
/// @param attr0  Attribute to read from.
/// @param array  Receives one std::string per attribute element; resized to
///               the attribute's extent.
/// @return       `attr0`, to allow chaining.
/// @throws PBException         if the attribute is not rank 1 or its element
///                             size is not that of a pointer.
/// @throws std::runtime_error  wrapping any H5::Exception raised by HDF5.
const H5::Attribute& operator>>(const H5::Attribute& attr0, std::vector<std::string>& array)
{
    H5::Exception::dontPrint();

    try
    {
        const H5::DataSpace aspace = attr0.getSpace();
        const H5::DataType atype = attr0.getDataType();

        const int rank = aspace.getSimpleExtentNdims();
        if (rank != 1) throw PBException("Attribute " + attr0.getName() + " is not a string array");

        hsize_t sdim[1];
        aspace.getSimpleExtentDims(sdim, NULL);

        // The elements are read into an array of char*, so the stored
        // element size must be that of a pointer.
        const size_t size = atype.getSize();
        if (size != sizeof(void*))
        {
            throw PBException("Internal inconsistency. Expected pointer size element");
        }

        // HDF5 only understands vector of char* :-(
        std::vector<char*> arr_c_str(sdim[0]);

        H5::StrType stringType(H5::PredType::C_S1, H5T_VARIABLE);
        attr0.read(stringType, arr_c_str.data());

        array.resize(arr_c_str.size());
        for (std::vector<char*>::size_type i = 0; i < arr_c_str.size(); ++i)
        {
            // Constructing a std::string from a null pointer is undefined,
            // so map a null element to an empty string.
            if (arr_c_str[i])
            {
                array[i] = arr_c_str[i];
            }
            else
            {
                array[i].clear();
            }
            // The buffers were allocated by HDF5 during read() and are owned
            // by the caller afterwards.
            // NOTE(review): free() matches HDF5's default allocator; confirm
            // no custom vlen memory manager is installed.
            free(arr_c_str[i]);
        }
    }
    catch (const H5::Exception& err)
    {
        throw std::runtime_error(std::string("HDF5 Error in " )
                                 + err.getFuncName()
                                 + ": "
                                 + err.getDetailMsg());
    }

    return attr0;
}
I am late to the party, but I've modified Leo Goodstadt's answer based on the comments regarding segfaults. I am on Linux, but I don't have such problems. I wrote two functions: one to write a vector of std::string to a dataset of a given name in an open H5File, and another to read the resulting dataset back into a vector of std::string. Note that there may be unnecessary copying between types in a few places that could be optimised further. Here is working code for writing and reading:
void write_varnames( const std::string& dsetname, const std::vector<std::string>& strings, H5::H5File& f)
{
H5::Exception::dontPrint();
try
{
// HDF5 only understands vector of char* :-(
std::vector<const char*> arr_c_str;
for (size_t ii = 0; ii < strings.size(); ++ii)
{
arr_c_str.push_back(strings[ii].c_str());
}
//
// one dimension
//
hsize_t str_dimsf[1] {arr_c_str.size()};
H5::DataSpace dataspace(1, str_dimsf);
// Variable length string
H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE);
H5::DataSet str_dataset = f.createDataSet(dsetname, datatype, dataspace);
str_dataset.write(arr_c_str.data(), datatype);
}
catch (H5::Exception& err)
{
throw std::runtime_error(std::string("HDF5 Error in ")
+ err.getFuncName()
+ ": "
+ err.getDetailMsg());
}
}
And to read:
/// Reads the one-dimensional variable-length string dataset `dsname` from
/// `f` and returns its elements as std::strings.
///
/// The character buffers HDF5 allocates during the read are handed back to
/// the library once their contents have been copied, so nothing is leaked.
/// Progress is reported on stdout.
///
/// @param dsname  Name of the dataset to open.
/// @param f       Open HDF5 file containing the dataset.
/// @return        One string per dataset element, in dataset order.
/// @throws H5::DataSetIException  if the dataset is not one-dimensional.
std::vector<std::string> read_string_dset( const std::string& dsname, H5::H5File& f )
{
    H5::DataSet cdataset = f.openDataSet( dsname );
    H5::DataSpace space = cdataset.getSpace();

    // getSimpleExtentDims fills one entry per dimension, so a higher-rank
    // dataset would overrun the single-element array below.
    const int rank = space.getSimpleExtentNdims();
    if (rank != 1)
    {
        throw H5::DataSetIException("read_string_dset",
                                    "Dataset " + dsname + " is not a one-dimensional string array");
    }

    hsize_t dims_out[1];
    space.getSimpleExtentDims( dims_out, NULL);

    const size_t length = dims_out[0];

    fprintf(stdout, "In read STRING dataset, got number of strings: [%zu]\n", length );

    std::vector<std::string> strs(length);
    if (length == 0)
    {
        return strs;
    }

    // Non-const pointers: HDF5 allocates these buffers and they are given
    // back to it below.
    std::vector<char*> tmpvect( length );

    H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE);
    cdataset.read( tmpvect.data(), datatype);

    for(size_t x=0; x<tmpvect.size(); ++x)
    {
        // Neither "%s" nor std::string assignment accepts a null pointer.
        const char* text = tmpvect[x] ? tmpvect[x] : "";
        fprintf(stdout, "GOT STRING [%s]\n", text );
        strs[x] = text;
    }

    // Release the variable-length buffers allocated by the read.
    H5::DataSet::vlenReclaim(tmpvect.data(), datatype, space);

    return strs;
}
I don't know about HDF5, but you can use
// Record holding a pointer to a string rather than a fixed-size buffer.
// NOTE(review): the struct itself never releases `string`; whoever assigns
// it is responsible for freeing the storage.
struct TempContainer {
char* string;
};
and then copy the strings this way:
TempContainer t;
// strdup returns a heap-allocated copy sized exactly to the string; the
// copy must later be released with free().
t.string = strdup(i->c_str());
tc.push_back (t);
This will allocate a string with the exact size, and also improves a lot when inserting or reading from the container (in your example there's an array copied, in this case only a pointer). You can also use std::vector:
std::vector<char *> tc;
...
tc.push_back(strdup(i->c_str());