使用 C API 访问 NumPy 数组的视图

Posted 2023-02-17

技术标签:

【中文标题】使用 C API 访问 NumPy 数组的视图【英文标题】：Accessing view of a NumPy array using the C API 【发布时间】：2016-01-25 19:19:15 【问题描述】：

在我用 C++ 编写的 Python 扩展模块中，我使用以下 sn-p 代码将 NumPy 数组转换为 Armadillo 数组，以用于代码的 C++ 部分：

static arma::mat convertPyArrayToArma(PyArrayObject* pyarr, int nrows, int ncols)

    // Check if the dimensions are what I expect.
    if (!checkPyArrayDimensions(pyarr, nrows, ncols)) throw WrongDimensions();
    
    const std::vector<int> dims = getPyArrayDimensions(pyarr);  // Gets the dimensions using the API
    
    PyArray_Descr* reqDescr = PyArray_DescrFromType(NPY_DOUBLE);
    if (reqDescr == NULL) throw std::bad_alloc();
    
    // Convert the array to Fortran-ordering as required by Armadillo
    PyArrayObject* cleanArr = (PyArrayObject*)PyArray_FromArray(pyarr, reqDescr, 
                                                                NPY_ARRAY_FARRAY);
    if (cleanArr == NULL) throw std::bad_alloc();
    reqDescr = NULL;  // The new reference from DescrFromType was stolen by FromArray

    double* dataPtr = static_cast<double*>(PyArray_DATA(cleanArr));
    arma::mat result (dataPtr, dims[0], dims[1], true);  // this copies the data from cleanArr
    Py_DECREF(cleanArr);
    return result;

问题在于，当我传递一个 NumPy 数组（即my_array[:, 3]）的 view 时，它似乎无法正确处理底层 C 数组的步幅。根据输出，函数接收的数组pyarr 似乎实际上是完整的基本数组，而不是视图（或者至少当我使用PyArray_DATA 访问数据时，我似乎得到了一个指向完整的基本数组）。如果我将视图的副本传递给此函数（即my_array[:, 3].copy()），它会按预期工作，但我不想每次都记住这样做。

那么，有没有办法让PyArray_FromArray 只复制我想要的矩阵切片？我尝试使用标志NPY_ARRAY_ENSURECOPY，但没有帮助。

编辑 1

按照 cmets 中的建议，这是一个完整的工作示例：

在文件example.cpp：

#define NPY_NO_DEPRECATED_API NPY_1_9_API_VERSION

extern "C" 
    #include <Python.h>
    #include <numpy/arrayobject.h>

#include <exception>
#include <cassert>
#include <string>
#include <type_traits>
#include <map>
#include <vector>
#include <armadillo>

class WrongDimensions : public std::exception

public:
    WrongDimensions() 
    const char* what() const noexcept  return msg.c_str(); 

private:
    std::string msg = "The dimensions were incorrect";
;

class NotImplemented : public std::exception

public:
    NotImplemented() 
    const char* what() const noexcept  return msg.c_str(); 

private:
    std::string msg = "Not implemented";
;

class BadArrayLayout : public std::exception

public:
    BadArrayLayout() 
    const char* what() const noexcept  return msg.c_str(); 

private:
    std::string msg = "The matrix was not contiguous";
;

static const std::vector<npy_intp> getPyArrayDimensions(PyArrayObject* pyarr)

    npy_intp ndims = PyArray_NDIM(pyarr);
    npy_intp* dims = PyArray_SHAPE(pyarr);
    std::vector<npy_intp> result;
    for (int i = 0; i < ndims; i++) 
        result.push_back(dims[i]);
    
    return result;


/* Checks the dimensions of the given array. Pass -1 for either dimension to say you don't
 * care what the size is in that dimension. Pass dimensions (X, 1) for a vector.
 */
static bool checkPyArrayDimensions(PyArrayObject* pyarr, const npy_intp dim0, const npy_intp dim1)

    const auto dims = getPyArrayDimensions(pyarr);
    assert(dims.size() <= 2 && dims.size() > 0);
    if (dims.size() == 1) 
        return (dims[0] == dim0 || dim0 == -1) && (dim1 == 1 || dim1 == -1);
    
    else 
        return (dims[0] == dim0 || dim0 == -1) && (dims[1] == dim1 || dim1 == -1);
    


template<typename outT>
static arma::Mat<outT> convertPyArrayToArma(PyArrayObject* pyarr, int nrows, int ncols)

    if (!checkPyArrayDimensions(pyarr, nrows, ncols)) throw WrongDimensions();

    int arrTypeCode;
    if (std::is_same<outT, uint16_t>::value) 
        arrTypeCode = NPY_UINT16;
    
    else if (std::is_same<outT, double>::value) 
        arrTypeCode = NPY_DOUBLE;
    
    else 
        throw NotImplemented();
    

    const auto dims = getPyArrayDimensions(pyarr);
    if (dims.size() == 1) 
        outT* dataPtr = static_cast<outT*>(PyArray_DATA(pyarr));
        return arma::Col<outT>(dataPtr, dims[0], true);
    
    else 
        PyArray_Descr* reqDescr = PyArray_DescrFromType(arrTypeCode);
        if (reqDescr == NULL) throw std::bad_alloc();
        PyArrayObject* cleanArr = (PyArrayObject*)PyArray_FromArray(pyarr, reqDescr, NPY_ARRAY_FARRAY);
        if (cleanArr == NULL) throw std::bad_alloc();
        reqDescr = NULL;  // The new reference from DescrFromType was stolen by FromArray

        outT* dataPtr = static_cast<outT*>(PyArray_DATA(cleanArr));
        arma::Mat<outT> result (dataPtr, dims[0], dims[1], true);  // this copies the data from cleanArr
        Py_DECREF(cleanArr);
        return result;
    


static PyObject* convertArmaToPyArray(const arma::mat& matrix)

    npy_intp ndim = matrix.is_colvec() ? 1 : 2;
    npy_intp nRows = static_cast<npy_intp>(matrix.n_rows);  // NOTE: This narrows the integer
    npy_intp nCols = static_cast<npy_intp>(matrix.n_cols);
    npy_intp dims[2] = nRows, nCols;

    PyObject* result = PyArray_SimpleNew(ndim, dims, NPY_DOUBLE);
    if (result == NULL) throw std::bad_alloc();

    double* resultDataPtr = static_cast<double*>(PyArray_DATA((PyArrayObject*)result));
    for (int i = 0; i < nRows; i++) 
        for (int j = 0; j < nCols; j++) 
            resultDataPtr[i * nCols + j] = matrix(i, j);
        
    

    return result;


extern "C" 
    // An example function that takes a NumPy array and converts it to
    // an arma::mat and back. This should return the array unchanged.
    static PyObject* example_testFunction(PyObject* self, PyObject* args)
    
        PyArrayObject* myArray = NULL;

        if (!PyArg_ParseTuple(args, "O!", &PyArray_Type, &myArray)) 
            return NULL;
        

        PyObject* output = NULL;
        try 
            arma::mat myMat = convertPyArrayToArma<double>(myArray, -1, -1);
            output = convertArmaToPyArray(myMat);
        
        catch (const std::bad_alloc&) 
            PyErr_NoMemory();
            Py_XDECREF(output);
            return NULL;
        
        catch (const std::exception& err) 
            PyErr_SetString(PyExc_RuntimeError, err.what());
            Py_XDECREF(output);
            return NULL;
        

        return output;
    

    static PyMethodDef example_methods[] =
    
        "test_function", example_testFunction, METH_VARARGS, "A test function",
        NULL, NULL, 0, NULL
    ;

    static struct PyModuleDef example_module = 
       PyModuleDef_HEAD_INIT,
       "example",   /* name of module */
       NULL, /* module documentation, may be NULL */
       -1,       /* size of per-interpreter state of the module,
                    or -1 if the module keeps state in global variables. */
       example_methods
    ;

    PyMODINIT_FUNC
    PyInit_example(void)
    
        import_array();

        PyObject* m = PyModule_Create(&example_module);
        if (m == NULL) return NULL;
        return m;

还有setup.py编译：

from setuptools import setup, Extension
import numpy as np

example_module = Extension(
    'example',
    include_dirs=[np.get_include(), '/usr/local/include'],
    libraries=['armadillo'],
    library_dirs=['/usr/local/lib'],
    sources=['example.cpp'],
    language='c++',
    extra_compile_args=['-std=c++11', '-mmacosx-version-min=10.10'],
    )

setup(name='example',
      ext_modules=[example_module],
      )

现在假设我们有示例数组

a = np.array([[ 1, 2, 3, 4, 5, 6], 
              [ 7, 8, 9,10,11,12], 
              [13,14,15,16,17,18]], dtype='float64')

该函数似乎适用于像a[:, :3] 这样的多维切片，并且它返回的矩阵未改变，正如我所期望的那样。但如果我给它一个单维切片，除非我复制一份，否则我会得到错误的组件：

>>> example.test_function(a[:, 3])
array([ 4.,  5.,  6.])

>>> example.test_function(a[:, 3].copy())
array([  4.,  10.,  16.])

【问题讨论】：

NPY_ARRAY_CARRAY 怎么样？我认为这会强制从非连续切片构造一个连续数组，但我只是在这里猜测 :-) 我已经在使用 NPY_ARRAY_FARRAY。那不应该做同样的事情吗（除了以列为主的顺序）？为了与使用 BLAS 函数的犰狳兼容，我需要它以列为主（类似 Fortran）的顺序。哦，对了。错过了。 . . 你能发布一些例子和getPyArrayDimensions()吗？好电话。我刚刚编辑了这个问题以提供一个完整的最小工作示例。 【参考方案1】：

数组的视图只是同一数据数组的另一个信息包装器。 Numpy 不会在此处复制任何数据。仅调整用于解释数据的信息，并在有用时移动指向数据的指针。

在您的代码中，您假设向量a[:, 3] 的数据表示为内存中的一个向量，对于NPY_ARRAY_CARRAY 和NPY_ARRAY_FARRAY 而言不会有所不同。但是这种表示只有在创建数组本身的（fortran 有序）副本后才能得到。

为了让它工作，我稍微修改了你的 convertPyArrayToArma() 函数来创建一个副本，即使它是一个向量：

template<typename outT>
static arma::Mat<outT> convertPyArrayToArma(PyArrayObject* pyarr, int nrows, int ncols)

    if (!checkPyArrayDimensions(pyarr, nrows, ncols)) throw WrongDimensions();

    int arrTypeCode;
    if (std::is_same<outT, uint16_t>::value) 
        arrTypeCode = NPY_UINT16;
    
    else if (std::is_same<outT, double>::value) 
        arrTypeCode = NPY_DOUBLE;
    
    else 
        throw NotImplemented();
    

    PyArray_Descr* reqDescr = PyArray_DescrFromType(arrTypeCode);
    if (reqDescr == NULL) throw std::bad_alloc();
    PyArrayObject* cleanArr = (PyArrayObject*)PyArray_FromArray(pyarr, reqDescr, NPY_ARRAY_FARRAY);
    if (cleanArr == NULL) throw std::bad_alloc();
    reqDescr = NULL;  // The new reference from DescrFromType was stolen by FromArray

    const auto dims = getPyArrayDimensions(pyarr);
    outT* dataPtr = static_cast<outT*>(PyArray_DATA(cleanArr));

    // this copies the data from cleanArr
    arma::Mat<outT> result;
    if (dims.size() == 1) 
        result = arma::Col<outT>(dataPtr, dims[0], true);
    
    else 
        result = arma::Mat<outT>(dataPtr, dims[0], dims[1], true);
    

    Py_DECREF(cleanArr);
    return result;

【讨论】：

啊，我忽略了我为 1D 编写的单独代码路径......但这解释了我看到的结果。谢谢！

以上是关于使用 C API 访问 NumPy 数组的视图的主要内容，如果未能解决你的问题，请参考以下文章

Numpy C API - 使用 PyArray_Descr 创建数组会导致段错误

从使用 numpy.save(...) 保存的文件中将 numpy 数组加载到 C 中

在tensorflow中使用`dataset.map()`访问张量numpy数组

numpy C API 中的 import_array 如何工作？

对不存在的数组使用 numpy 视图

是否可以通过 Numpy 的 C API 创建多项式？