#pragma once
|
|
#include <memory>
#include <unordered_map>

#include "caffe2/core/context.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/memonger.h"
#include "caffe2/core/net.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/scope_guard.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/python/pybind_state_dlpack.h"

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <Python.h>
|
|
#ifdef USE_NUMPY
|
|
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
|
#define PY_ARRAY_UNIQUE_SYMBOL caffe2_python_ARRAY_API
|
#include <numpy/arrayobject.h>
|
|
// Temporary solution for numpy < 1.7 versions: old macro, no promises.
|
// You're strongly advised to upgrade to >= 1.7.
|
#ifndef NPY_ARRAY_C_CONTIGUOUS
|
#define NPY_ARRAY_C_CONTIGUOUS NPY_C_CONTIGUOUS
|
#define PyArray_SetBaseObject(arr, x) (PyArray_BASE(arr) = (x))
|
#endif
|
|
#else
|
|
struct PyArrayObject; // Forward declaring PyArrayObject for safety
|
|
#endif // USE_NUMPY
|
|
namespace caffe2 {
|
namespace python {
|
|
namespace py = pybind11;

// Registers module-level methods common to both CPU and GPU builds on `m`.
void addGlobalMethods(pybind11::module& m);
// Registers the Workspace, Net and Blob binding classes on `m`.
void addObjectMethods(pybind11::module& m);

// Returns the workspace currently active in the Python bindings.
Workspace* GetCurrentWorkspace();
|
|
// Abstract base for converters that turn a Blob's contents into a Python
// object. Concrete fetchers are registered per blob type id in
// BlobFetcherRegistry (see REGISTER_BLOB_FETCHER below).
class C10_EXPORT BlobFetcherBase {
 public:
  // Result of a fetch: the Python object plus whether the underlying data
  // was copied into it (as opposed to aliased).
  struct FetchedBlob {
    pybind11::object obj;
    bool copied;
  };
  virtual ~BlobFetcherBase();
  // Converts `blob`'s contents into a Python object.
  virtual pybind11::object Fetch(const Blob& blob) = 0;
};
|
|
// Abstract base for converters that fill a Blob from a numpy array.
// Concrete feeders are registered per device type in BlobFeederRegistry
// (see REGISTER_BLOB_FEEDER below).
class BlobFeederBase {
 public:
  virtual ~BlobFeederBase();
  // Fills `blob` from `array`, placing the data on the device described by
  // `option`. When `in_place` is true, implementations reuse the blob's
  // existing tensor instead of replacing it (see TensorFeeder::Feed).
  virtual void Feed(
      const DeviceOption& option,
      PyArrayObject* array,
      Blob* blob,
      bool in_place = false) = 0;
};
|
|
// Registry of blob fetchers, keyed by the blob's TypeIdentifier.
C10_DECLARE_TYPED_REGISTRY(
    BlobFetcherRegistry,
    TypeIdentifier,
    BlobFetcherBase,
    std::unique_ptr);
// Registers a fetcher class for the given blob type id.
#define REGISTER_BLOB_FETCHER(id, ...) \
  C10_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__)
|
inline unique_ptr<BlobFetcherBase> CreateFetcher(TypeIdentifier id) {
|
return BlobFetcherRegistry()->Create(id);
|
}
|
|
// Registry of blob feeders, keyed by the target DeviceType.
C10_DECLARE_TYPED_REGISTRY(
    BlobFeederRegistry,
    DeviceType,
    BlobFeederBase,
    std::unique_ptr);
// Registers a feeder class for the given device type.
#define REGISTER_BLOB_FEEDER(device_type, ...) \
  C10_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__)
|
inline unique_ptr<BlobFeederBase> CreateFeeder(int device_type) {
|
return BlobFeederRegistry()->Create(
|
caffe2::ProtoToType(static_cast<DeviceTypeProto>(device_type)));
|
}
|
|
// The caffe2 <-> numpy type mapping below treats `int` and int32 as the
// same type, so their widths must match.
static_assert(
    sizeof(int) == sizeof(int32_t),
    "We make an assumption that int is always int32 for numpy "
    "type mapping.");

// Maps a caffe2 TypeMeta to the corresponding NPY_* type enum; returns -1
// for types with no numpy equivalent (callers check for -1).
int CaffeToNumpyType(const TypeMeta& dtype);
// Maps an NPY_* type enum back to a caffe2 TypeMeta; for unsupported numpy
// types the returned TypeMeta has an uninitialized id (callers check
// dtype.id() != TypeIdentifier::uninitialized()).
const TypeMeta& NumpyTypeToCaffe(int numpy_type);
|
|
// Fetcher for blobs holding a caffe2 Tensor: converts the tensor into a
// numpy ndarray (or throws if the build has no NumPy support).
class TensorFetcher : public BlobFetcherBase {
 public:
  // Fetches with force_copy=true, so the returned ndarray always owns its
  // data and is safe to use after the blob changes.
  pybind11::object Fetch(const Blob& blob) override {
    return FetchTensor(blob.Get<Tensor>(), true).obj;
  }

  // Checks whether the data with type `dtype` needs to be copied in the context
  // of `tensor`: non-CPU tensors always need a copy, and (with NumPy) string
  // tensors do too, since they are materialized as NPY_OBJECT arrays.
  bool NeedsCopy(const Tensor* tensor, const TypeMeta& dtype) const {
#ifdef USE_NUMPY
    return tensor->GetDeviceType() != CPU ||
        CaffeToNumpyType(dtype) == NPY_OBJECT;
#else
    return tensor->GetDeviceType() != CPU;
#endif // USE_NUMPY
  }

  // Converts `tensor` to a numpy ndarray. If `force_copy` is false and the
  // tensor is a CPU tensor of a plain numeric type, the ndarray aliases the
  // tensor's memory (no copy) — the caller must then keep the tensor alive
  // for as long as the array is used. `result.copied` reports which case
  // occurred.
  FetchedBlob FetchTensor(const Tensor& tensor, bool force_copy) {
#ifdef USE_NUMPY
    FetchedBlob result;
    CAFFE_ENFORCE_GE(tensor.numel(), 0, "Trying to fetch uninitialized tensor");
    const int numpy_type = CaffeToNumpyType(tensor.dtype());
    CAFFE_ENFORCE(
        numpy_type != -1,
        "This tensor's data type is not supported: ",
        tensor.dtype().name(),
        ".");
    // numpy expects dims as npy_intp.
    std::vector<npy_intp> npy_dims;
    for (const auto dim : tensor.sizes()) {
      npy_dims.push_back(dim);
    }
    result.copied = force_copy || NeedsCopy(&tensor, tensor.dtype());
    void* outPtr;
    if (result.copied) {
      // Allocate a fresh ndarray; data is filled in below.
      result.obj = py::reinterpret_steal<py::object>(
          PyArray_SimpleNew(tensor.dim(), npy_dims.data(), numpy_type));
      outPtr = static_cast<void*>(
          PyArray_DATA(reinterpret_cast<PyArrayObject*>(result.obj.ptr())));
    } else {
      // Zero-copy path: wrap the tensor's own buffer in an ndarray.
      outPtr = const_cast<Tensor&>(tensor).raw_mutable_data();
      result.obj = py::reinterpret_steal<py::object>(PyArray_SimpleNewFromData(
          tensor.dim(), npy_dims.data(), numpy_type, outPtr));
    }

    if (numpy_type == NPY_OBJECT) {
      // String tensor: fill the object array with bytes objects, one per
      // element.
      PyObject** outObj = reinterpret_cast<PyObject**>(outPtr);
      auto* str = tensor.template data<std::string>();
      for (int i = 0; i < tensor.numel(); ++i) {
        outObj[i] = PyBytes_FromStringAndSize(str->data(), str->size());
        str++;
        // cleanup on failure: drop the bytes objects created so far.
        if (outObj[i] == nullptr) {
          for (int j = 0; j < i; ++j) {
            Py_DECREF(outObj[j]);
          }
          CAFFE_THROW("Failed to allocate string for ndarray of strings.");
        }
      }
      return result;
    }

    if (result.copied) {
      // TODO: use CUDAGuard here instead of context and employ explicit sync
      // copy
      // Device-to-CPU (or CPU-to-CPU) copy into the freshly allocated array;
      // FinishDeviceComputation synchronizes so the data is valid on return.
      auto context = CreateContext(tensor.GetDeviceType());
      context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr);
      context->FinishDeviceComputation();
    }
    return result;
#else
    CAFFE_THROW("Caffe2 was compiled without NumPy support.");
#endif // USE_NUMPY
  }
};
|
|
template <class Context>
|
class TensorFeeder : public BlobFeederBase {
|
public:
|
Tensor FeedTensor(const DeviceOption& option, PyArrayObject* original_array) {
|
Tensor out;
|
FeedTensor(option, original_array, &out, false);
|
return out;
|
}
|
|
void FeedTensor(
|
const DeviceOption& option,
|
PyArrayObject* original_array,
|
Tensor* out,
|
bool in_place) {
|
#ifdef USE_NUMPY
|
PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array);
|
auto g = MakeGuard([&]() { Py_XDECREF(array); });
|
|
const auto npy_type = PyArray_TYPE(array);
|
const TypeMeta& dtype = NumpyTypeToCaffe(npy_type);
|
CAFFE_ENFORCE(
|
dtype.id() != TypeIdentifier::uninitialized(),
|
"This numpy data type is not supported: ",
|
PyArray_TYPE(array),
|
".");
|
Context context(option);
|
context.SwitchToDevice();
|
// numpy requires long int as its dims.
|
int ndim = PyArray_NDIM(array);
|
npy_intp* npy_dims = PyArray_DIMS(array);
|
std::vector<int64_t> dims;
|
for (int i = 0; i < ndim; ++i) {
|
dims.push_back(npy_dims[i]);
|
}
|
|
Tensor& tensor = *out;
|
if (in_place) {
|
tensor.Resize(dims);
|
}
|
// Now, copy the data to the tensor.
|
switch (npy_type) {
|
case NPY_OBJECT: {
|
PyObject** input = reinterpret_cast<PyObject**>(PyArray_DATA(array));
|
if (!in_place) {
|
tensor = caffe2::empty(
|
dims, at::dtype<std::string>().device(Context::GetDeviceType()));
|
}
|
auto* outPtr = tensor.template mutable_data<std::string>();
|
for (int i = 0; i < tensor.numel(); ++i) {
|
char* str;
|
Py_ssize_t strSize;
|
#if PY_MAJOR_VERSION > 2
|
if (PyBytes_Check(input[i])) {
|
CAFFE_ENFORCE(
|
PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1,
|
"Had a PyBytes object but cannot convert it to a string.");
|
} else if (PyUnicode_Check(input[i])) { // string
|
str =
|
const_cast<char*>(PyUnicode_AsUTF8AndSize(input[i], &strSize));
|
CAFFE_ENFORCE(
|
str,
|
"Had a PyUnicode object but cannot convert it to a string.");
|
} else {
|
CAFFE_THROW("Unsupported python object type passed into ndarray.");
|
}
|
#else
|
CAFFE_ENFORCE(
|
PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1,
|
"Unsupported python object type passed into ndarray.");
|
#endif // PY_MAJOR_VERSION > 2
|
outPtr[i] = std::string(str, strSize);
|
}
|
break;
|
}
|
case NPY_UNICODE:
|
CAFFE_THROW(
|
"You are feeding in a numpy array of unicode. Caffe2 C++ does not "
|
"support unicode yet. Please ensure that you are passing in bytes "
|
"instead of unicode strings.");
|
break;
|
default:
|
if (!in_place) {
|
tensor = caffe2::empty(
|
dims, at::dtype(dtype).device(Context::GetDeviceType()));
|
} else {
|
tensor.raw_mutable_data(dtype);
|
}
|
context.CopyBytesFromCPU(
|
tensor.numel() * dtype.itemsize(),
|
static_cast<void*>(PyArray_DATA(array)),
|
tensor.raw_mutable_data());
|
}
|
context.FinishDeviceComputation();
|
#else
|
CAFFE_THROW("Caffe2 compiled without NumPy support.");
|
#endif // USE_NUMPY
|
}
|
|
virtual void Feed(
|
const DeviceOption& option,
|
PyArrayObject* original_array,
|
Blob* blob,
|
bool in_place) {
|
if (in_place) {
|
FeedTensor(
|
option,
|
original_array,
|
BlobGetMutableTensor(blob, OptionToDevice(option).type()),
|
true);
|
} else {
|
blob->Reset<Tensor>(new Tensor(FeedTensor(option, original_array)));
|
}
|
}
|
};
|
|
namespace python_detail {
// A registered Python callable plus whether it expects the Workspace* as an
// extra argument (see PythonOpBase::RunOnDevice).
struct Func {
  py::object py_func;
  bool needs_workspace;
};

// Looks up the forward-op function registered under `token`.
const Func& getOpFunc(const std::string& token);

// Looks up the gradient function registered under `token`.
const Func& getGradientFunc(const std::string& token);

} // namespace python_detail
|
|
// TODO: Remove template?
|
template <class Context, bool use_dlpack>
|
class PythonOpBase : public Operator<Context> {
|
public:
|
USE_OPERATOR_CONTEXT_FUNCTIONS;
|
|
PythonOpBase(
|
const OperatorDef& operator_def,
|
Workspace* ws,
|
const std::string& pickled_builder_arg_name)
|
: Operator<Context>(operator_def, ws),
|
ws_(ws),
|
token_(OperatorBase::template GetSingleArgument<std::string>(
|
"token",
|
"")) {
|
using namespace python_detail;
|
auto pickled = OperatorBase::template GetSingleArgument<std::string>(
|
pickled_builder_arg_name, "");
|
CAFFE_ENFORCE(
|
!pickled.empty() || !token_.empty(),
|
"PythonOp requires either pickled_builder or token arg.");
|
if (!pickled.empty()) {
|
py::gil_scoped_acquire g;
|
try {
|
auto pickle =
|
py::reinterpret_steal<py::object>(PyImport_ImportModule("pickle"));
|
|
CAFFE_ENFORCE(pickle);
|
auto loads = pickle.attr("loads").cast<py::object>();
|
CAFFE_ENFORCE(loads);
|
py::tuple builder_call;
|
try {
|
builder_call = loads(py::bytes(pickled)).cast<py::tuple>();
|
} catch (const py::error_already_set& e) {
|
#if PY_MAJOR_VERSION >= 3
|
LOG(INFO) << "Cannot unpickle python operator: " << e.what();
|
LOG(INFO) << "Try latin1 encoding for python3 run";
|
// to use the `_a` literal for arguments
|
using namespace pybind11::literals;
|
builder_call = loads(py::bytes(pickled), "encoding"_a = "latin1")
|
.template cast<py::tuple>();
|
#else
|
// for py2, simply re-throw the exception, as there is no encoding
|
// argument for pickle.loads
|
throw;
|
#endif
|
}
|
CAFFE_ENFORCE(builder_call);
|
CAFFE_ENFORCE_EQ(py::len(builder_call), 3);
|
auto func = builder_call[0].cast<py::object>();
|
auto args = builder_call[1].cast<py::tuple>();
|
auto kwargs = builder_call[2].cast<py::dict>();
|
auto built_func = func(*args, **kwargs);
|
CAFFE_ENFORCE(built_func);
|
built_func_.reset(
|
new Func{built_func,
|
OperatorBase::template GetSingleArgument<bool>(
|
"pass_workspace", false)});
|
} catch (const py::error_already_set& e) {
|
std::stringstream error;
|
error << "Python exception encountered while creating PythonOp: "
|
<< e.what();
|
LOG(ERROR) << error.str();
|
CAFFE_THROW(error.str());
|
}
|
}
|
}
|
|
bool RunOnDevice() override final {
|
auto* pyFunc = built_func_ ? built_func_.get() : &getFunc(token_);
|
CAFFE_ENFORCE(pyFunc);
|
{
|
// Acquire GIL for call to Python runtime.
|
py::gil_scoped_acquire g;
|
|
DeviceOption cpu_option;
|
cpu_option.set_device_type(PROTO_CPU);
|
|
std::vector<py::object> inputs;
|
inputs.reserve(InputSize());
|
for (auto i = 0; i < InputSize(); ++i) {
|
const auto* blob = &InputBlob(i);
|
// Allow CPU tensors in addition to operator context's tensors
|
py::object py_obj;
|
CAFFE_ENFORCE(
|
BlobIsTensorType(*blob, CPU),
|
"We only allow input blob to be CPU Tensor");
|
if (use_dlpack) {
|
DLPackWrapper<CPUContext> wrapper(
|
const_cast<Tensor*>(&(BlobGetTensor(*blob, CPU))), cpu_option);
|
// copy wrapper
|
py_obj = py::cast(wrapper, py::return_value_policy::copy);
|
} else {
|
py_obj = py::cast(
|
&(BlobGetTensor(*blob, CPU)), py::return_value_policy::reference);
|
}
|
inputs.push_back(py_obj);
|
}
|
std::vector<py::object> outputs;
|
outputs.reserve(OutputSize());
|
for (auto i = 0; i < OutputSize(); ++i) {
|
auto* blob = OutputBlob(i);
|
|
// Python op is always used with CPUContext only and treats inputs and
|
// outputs as CPU tensors, CUDA version of PythonOp is implemented
|
// through GPUFallbackOp that copies input CUDA blobs to CPU and copies
|
// outputs from CUDA to CPU.
|
// GPUFallbackOp also allows keeping some of the output blobs on CPU
|
// by specifying their indices explicitly in template parameters.
|
|
// PythonDLPack op allows working CPU blobs only through DLPack tensors.
|
// We don't have use cases of CUDA version yet, but if there is such use
|
// case, we can use GPUFallbackOp to enable it.
|
|
py::object py_obj;
|
if (use_dlpack) {
|
DLPackWrapper<CPUContext> wrapper(
|
BlobGetMutableTensor(blob, CPU), cpu_option);
|
py_obj = py::cast(wrapper, py::return_value_policy::copy);
|
} else {
|
py_obj = py::cast(
|
BlobGetMutableTensor(blob, CPU),
|
py::return_value_policy::reference);
|
}
|
outputs.push_back(py_obj);
|
}
|
|
try {
|
if (pyFunc->needs_workspace) {
|
pyFunc->py_func(inputs, outputs, ws_);
|
} else {
|
pyFunc->py_func(inputs, outputs);
|
}
|
} catch (const py::error_already_set& e) {
|
std::stringstream error;
|
error << "Exception encountered running PythonOp function: "
|
<< e.what();
|
LOG(ERROR) << error.str();
|
CAFFE_THROW(error.str());
|
}
|
}
|
return true;
|
}
|
|
virtual ~PythonOpBase() {
|
if (built_func_) {
|
// since it may trigger python interpreter when refcount reaches zero
|
py::gil_scoped_acquire g;
|
built_func_.reset();
|
}
|
}
|
|
protected:
|
virtual const python_detail::Func& getFunc(const std::string& token) = 0;
|
Workspace* ws_;
|
|
private:
|
const std::string token_;
|
std::unique_ptr<python_detail::Func> built_func_;
|
};
|
|
template <class Context, bool use_dlpack>
|
class PythonOp : public PythonOpBase<Context, use_dlpack> {
|
public:
|
PythonOp(const OperatorDef& operator_def, Workspace* ws)
|
: PythonOpBase<Context, use_dlpack>(operator_def, ws, "pickled_builder") {
|
}
|
|
protected:
|
const python_detail::Func& getFunc(const std::string& token) override {
|
return python_detail::getOpFunc(token);
|
}
|
};
|
|
template <class Context, bool use_dlpack>
|
class PythonGradientOp : public PythonOpBase<Context, use_dlpack> {
|
public:
|
PythonGradientOp(const OperatorDef& operator_def, Workspace* ws)
|
: PythonOpBase<Context, use_dlpack>(
|
operator_def,
|
ws,
|
"pickled_grad_builder") {}
|
|
protected:
|
const python_detail::Func& getFunc(const std::string& token) override {
|
return python_detail::getGradientFunc(token);
|
}
|
};
|
|
} // namespace python
|
} // namespace caffe2
|