如何从 C++ 中的 hdf5 文件中读取数据块?

Posted

技术标签:

【中文标题】如何从 C++ 中的 hdf5 文件中读取数据块?【英文标题】:How to read chunk of the data from a hdf5 file in c++? 【发布时间】:2021-12-09 20:24:15 【问题描述】:

我想读取一块数据,它只是存储在一个数据集中的许多帧中的一帧。整个数据集的形状为(10, 11214,3),10帧,每帧11214行4列。这是file。我要读取的块将具有形状 (11214,3)。我可以使用打印预定义的数组,但我不确定如何从 hdf5 文件中读取数据。这是我的代码,

#include <h5xx/h5xx.hpp>
#include <boost/multi_array.hpp>
#include <iostream>
#include <vector>
#include <cstdio>

typedef boost::multi_array<int, 2> array_2d_t;

const int NI=10;
const int NJ=NI;

void print_array(array_2d_t const& array)

    for (unsigned int j = 0; j < array.shape()[1]; j++)
    
        for (unsigned int i = 0; i < array.shape()[0]; i++)
        
            printf("%2d ", array[j][i]);
        
        printf("\n");
    

void write_int_data(std::string const& filename, array_2d_t const& array)

    h5xx::file file(filename, h5xx::file::trunc);
    std::string name;

    
        // --- create dataset and fill it with the default array data (positive values)
        name = "integer array";
        h5xx::create_dataset(file, name, array);
        h5xx::write_dataset(file, name, array);

        // --- create a slice object (aka hyperslab) to specify the location in the dataset to be overwritten
        std::vector<int> offset; int offset_raw[2] = 4,4; offset.assign(offset_raw, offset_raw + 2);
        std::vector<int> count;  int count_raw[2] = 2,2; count.assign(count_raw, count_raw + 2);
        h5xx::slice slice(offset, count);
    

void read_int_data(std::string const& filename)

    h5xx::file file(filename, h5xx::file::in);
    std::string name = "integer array";

    // read and print the full dataset
    
        array_2d_t array;
        // --- read the complete dataset into array, the array is resized and overwritten internally
        h5xx::read_dataset(file, name, array);
        printf("original integer array read from file, negative number patch was written using a slice\n");
        print_array(array);
        printf("\n");
    

int main(int argc, char** argv)

    std::string filename = argv[0];
    filename.append(".h5");

    // --- do a few demos/tests using integers
    
        array_2d_t array(boost::extents[NJ][NI]);
        
            const int nelem = NI*NJ;
            int data[nelem];
            for (int i = 0; i < nelem; i++)
                data[i] = i;
            array.assign(data, data + nelem);
        

        write_int_data(filename, array);

        read_int_data(filename);
    

    return 0;

我正在使用 h5xx — HDF5 库 link 和 boost 库的基于模板的 C++ 包装器。 数据集存储在粒子/脂质/盒子/位置路径中。数据集名称值包含帧。

【问题讨论】:

你正在读取 h5 文件中的什么对象?拥有“形状”还远远不够。帧是什么数据?脂质?职位?边缘?顺便说一下 HDFView 在我的机器上不合作:i.imgur.com/qbMhln3.png @sehe 我在问题中添加了数据集的路径。它存储在 /particles/lipids/box/positions/value 下。 属性名呢?输入? @sehe 数据集的名字叫做value 这是数据集的名称,但它似乎不是数组。文件中的类型是什么?对此进行逆向工程非常困难/乏味。如果您无法描述文件包含的内容,则很难判断如何阅读。 【参考方案1】:

    argv[0] 不是你想要的(参数从 1 开始,0 是程序名称)。考虑边界检查:

    std::vector<std::string> const args(argv, argv + argc);
    std::string const              filename = args.at(1) + ".h5";
    

    可以直接进行初始化,不需要临时数组(multi_array 是干什么用的,否则呢?)

    for (size_t i = 0; i < array.num_elements(); i++)
        array.data()[i] = i;
    

    或者确实,让它成为一种算法:

     std::iota(array.data(), array.data() + array.num_elements(), 0);
    

    与向量相同:

    std::vector<int> offset; int offset_raw[2] = 4,4; offset.assign(offset_raw, offset_raw + 2);
    std::vector<int> count;  int count_raw[2] = 2,2; count.assign(count_raw, count_raw + 2);
    

    除了格式混乱之外,还可以简单

    std::vector offset4,4, count2,2;
    h5xx::slice slice(offset, count);
    

真正的问题

代码与文件无关。完全没有。我创建了一些调试/跟踪代码来转储文件内容:

void dump(h5xx::group const& g, std::string indent = "") 
    auto dd = g.datasets();
    auto gg = g.groups();

    for (auto it = dd.begin(); it != dd.end(); ++it) 
        std::cout << indent << " ds:" << it.get_name() << "\n";
    

    for (auto it = gg.begin(); it != gg.end(); ++it) 
        dump(*it, indent + "/" + it.get_name());
    


int main()

    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);

    dump(xaa);

打印

/particles/lipids/box/edges ds:box_size
/particles/lipids/box/edges ds:step
/particles/lipids/box/edges ds:time
/particles/lipids/box/edges ds:value
/particles/lipids/box/positions ds:step
/particles/lipids/box/positions ds:time
/particles/lipids/box/positions ds:value

现在我们可以深入到数据集。让我们看看我们是否能找出正确的类型。当然不是array_2d_t

h5xx::dataset ds(xaa, "particles/lipids/box/positions/value");

array_2d_t a;
h5xx::datatype detect(a);
std::cout << "type:   " << std::hex << ds.get_type() << std::dec << "\n";
std::cout << "detect: " << std::hex << detect.get_type_id() << std::dec << "\n";

打印

type:   30000000000013b
detect: 30000000000000c

这是类型不匹配。我想我也必须学会阅读那些胡言乱语......

让我们添加一些诊断:

void diag_type(hid_t type)

    std::cout << " Class     " << ::H5Tget_class(type)       << std::endl;
    std::cout << " Size      " << ::H5Tget_size(type)        << std::endl;
    std::cout << " Sign      " << ::H5Tget_sign(type)        << std::endl;
    std::cout << " Order     " << ::H5Tget_order(type)       << std::endl;
    std::cout << " Precision " << ::H5Tget_precision(type)   << std::endl;
    std::cout << " NDims     " << ::H5Tget_array_ndims(type) << std::endl;
    std::cout << " NMembers  " << ::H5Tget_nmembers(type)    << std::endl;


int main()

    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);
    // dump(xaa);

    
        h5xx::group g(xaa, "particles/lipids/box/positions");
        h5xx::dataset ds(g, "value");
        std::cout << "dataset:   " << std::hex << ds.get_type() << std::dec << std::endl;
        diag_type(ds.get_type());
    

    
        array_2d_t     a(boost::extents[NJ][NI]);
        h5xx::datatype detect(a);
        std::cout << "detect: " << std::hex << detect.get_type_id() << std::dec << std::endl;
        diag_type(detect.get_type_id());
    

打印

dataset:   30000000000013b
 Class     1
 Size      4
 Sign      -1
 Order     0
 Precision 32
 NDims     -1
 NMembers  -1
detect: 30000000000000c
 Class     0
 Size      4
 Sign      1
 Order     0
 Precision 32
 NDims     -1
 NMembers  -1

至少我们知道HST_FLOAT (class 1) 是必需的。让我们修改array_2d_t

using array_2d_t = boost::multi_array<float, 2>;
array_2d_t a(boost::extents[11214][3]);

这至少使数据看起来相似。让我们……天真地尝试阅读:

h5xx::read_dataset(ds, a);

哎呀,可以预见的抛出

terminate called after throwing an instance of 'h5xx::error'
  what():  /home/sehe/Projects/***/deps/h5xx/h5xx/dataset/boost_multi_array.hpp:176:read_dataset(): dataset "/particles/lipi
ds/box/positions/value" and target array have mismatching dimensions

不用担心,我们可以猜测:

using array_3d_t = boost::multi_array<float, 3>;
array_3d_t     a(boost::extents[10][11214][3]);
h5xx::read_dataset(ds, a);

至少这确实有效。适配打印功能:

template <typename T> void print_array(T const& array) 
    for (auto const& row : array) 
        for (auto v : row) printf("%5f ", v);
        printf("\n");
    

现在我们可以打印第一帧了:

h5xx::read_dataset(ds, a);
print_array(*a.begin()); // print the first frame

打印出来:

80.480003 35.360001 4.250000
37.450001 3.920000 3.960000
18.530001 -9.690000 4.680000
55.389999 74.339996 4.600000
22.110001 68.709999 3.850000
-4.130000 24.040001 3.730000
40.160000 6.390000 4.730000
-5.400000 35.730000 4.850000
36.669998 22.450001 4.080000
-3.680000 -10.660000 4.180000
(...)

h5ls -r -d xaa.h5/particles/lipids/box/positions/value检查:

particles/lipids/box/positions/value Dataset 75/Inf, 11214, 3
    Data:
        (0,0,0) 80.48, 35.36, 4.25, 37.45, 3.92, 3.96, 18.53, -9.69, 4.68,
        (0,3,0) 55.39, 74.34, 4.6, 22.11, 68.71, 3.85, -4.13, 24.04, 3.73,
        (0,6,0) 40.16, 6.39, 4.73, -5.4, 35.73, 4.85, 36.67, 22.45, 4.08, -3.68,
        (0,9,1) -10.66, 4.18, 35.95, 36.43, 5.15, 57.17, 3.88, 5.08, -23.64,
        (0,12,1) 50.44, 4.32, 6.78, 8.24, 4.36, 21.34, 50.63, 5.21, 16.29,
        (0,15,1) -1.34, 5.28, 22.26, 71.25, 5.4, 19.76, 10.38, 5.34, 78.62,
        (0,18,1) 11.13, 5.69, 22.14, 59.7, 4.92, 15.65, 47.28, 5.22, 82.41,
        (0,21,1) 2.09, 5.24, 16.87, -11.68, 5.35, 15.54, -0.63, 5.2, 81.25,
(...)

首页延伸:添加切片

array_2d_t read_frame(int frame_no) 
    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);

    h5xx::group   g(xaa, "particles/lipids/box/positions");
    h5xx::dataset ds(g, "value");

    array_2d_t a(boost::extents[11214][3]);

    std::vector offsetsframe_no, 0, 0, counts1, 11214, 3;
    h5xx::slice slice(offsets, counts);

    h5xx::read_dataset(ds, a, slice);
    return a;

你有它。现在我们可以打印任何帧:

print_array(read_frame(0));

打印和以前一样。让我们试试最后一帧:

print_array(read_frame(9));

打印

79.040001 36.349998 3.990000
37.250000 3.470000 4.140000
18.600000 -9.270000 4.900000
55.669998 75.070000 5.370000
21.920000 67.709999 3.790000
-4.670000 24.770000 3.690000
40.000000 6.060000 5.240000
-5.340000 36.320000 5.410000
36.369999 22.490000 4.130000
-3.520000 -10.430000 4.280000
(...)

h5ls -r -d xaa.h5/particles/lipids/box/positions/value |&amp; grep '(9' | head再次检查确认:

(9,0,0) 79.04, 36.35, 3.99, 37.25, 3.47, 4.14, 18.6, -9.27, 4.9, 55.67,
(9,3,1) 75.07, 5.37, 21.92, 67.71, 3.79, -4.67, 24.77, 3.69, 40, 6.06,
(9,6,2) 5.24, -5.34, 36.32, 5.41, 36.37, 22.49, 4.13, -3.52, -10.43,
(9,9,2) 4.28, 35.8, 36.43, 4.99, 56.6, 4.09, 5.04, -23.37, 49.42, 3.81,
(9,13,0) 6.31, 8.83, 4.56, 22.01, 50.38, 5.43, 16.3, -2.92, 5.4, 22.02,
(9,16,1) 70.09, 5.36, 20.23, 11.12, 5.66, 78.48, 11.34, 6.09, 20.26,
(9,19,1) 61.45, 5.35, 14.25, 48.32, 5.35, 79.95, 1.71, 5.38, 17.56,
(9,22,1) -11.61, 5.39, 15.64, -0.19, 5.06, 80.43, 71.77, 5.29, 75.54,
(9,25,1) 35.14, 5.26, 22.45, 56.86, 5.56, 16.47, 52.97, 6.16, 20.62,
(9,28,1) 65.12, 5.26, 19.68, 71.2, 5.52, 23.39, 49.84, 5.28, 22.7,

完整列表

#include <boost/multi_array.hpp>
#include <h5xx/h5xx.hpp>
#include <iostream>

using array_2d_t = boost::multi_array<float, 2>;

template <typename T> void print_array(T const& array)

    for (auto const& row : array)  for (auto v : row)
            printf("%5f ", v);
        printf("\n");
    


void dump(h5xx::group const& g, std::string indent = "") 
    auto dd = g.datasets();
    auto gg = g.groups();

    for (auto it = dd.begin(); it != dd.end(); ++it) 
        std::cout << indent << " ds:" << it.get_name() << std::endl;
    

    for (auto it = gg.begin(); it != gg.end(); ++it) 
        dump(*it, indent + "/" + it.get_name());
    


array_2d_t read_frame(int frame_no) 
    h5xx::file xaa("xaa.h5", h5xx::file::mode::in);

    h5xx::group   g(xaa, "particles/lipids/box/positions");
    h5xx::dataset ds(g, "value");

    array_2d_t arr(boost::extents[11214][3]);

    std::vector offsetsframe_no, 0, 0, counts1, 11214, 3;
    h5xx::slice slice(offsets, counts);

    h5xx::read_dataset(ds, arr, slice);
    return arr;


int main()

    print_array(read_frame(9));

【讨论】:

以上是关于如何从 C++ 中的 hdf5 文件中读取数据块?的主要内容,如果未能解决你的问题,请参考以下文章

HDF5 简单读取数据集失败

是否可以从 Python 的 HDF5 文件中的复合数据集中读取字段名称?

使用 HDF5 保存要在 C++ 中读取的 MATLAB 结构

如何从 HDF5 文件中读取属性?

将浮点数组写入和附加到 C++ 中 hdf5 文件中的唯一数据集

是否可以直接重命名存储在 hdf5 文件中的 pandas 数据框的列?