#pragma once

#include <ATen/core/jit_type.h>
#include <ATen/core/stack.h>
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/jit/ir.h>
#include <torch/csrc/utils/hash.h>

#include <iostream>
#include <vector>

namespace torch {
namespace jit {

// GraphExecutor creates specializations of Graphs for different
// dimensionalities and types of inputs.

inline static at::Device ConvertIntToCPUOrCUDA(int device) {
  return device < 0 ? at::kCPU : at::Device(at::DeviceType::CUDA, device);
}
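
// ArgumentInfo packs everything the executor needs to know about a single
// tensor input (defined, requires_grad, dim, device, scalar type) into a
// single 32-bit value, so an ArgumentSpec can be hashed and compared with
// plain integer operations / memcmp.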
struct ArgumentInfo {
  friend struct ArgumentSpec;
  using plain_data_type = uint32_t;

  bool defined() const {
    return defined_;
  }
  int device() const {
    return device_;
  }
  // XXX: It is guaranteed that this will return false when called on
  // non-tensor arguments
  bool requires_grad() const {
    return requires_grad_;
  }
  int dim() const {
    return dim_;
  }
  at::ScalarType type() const {
    return at::ScalarType(type_);
  }
  TypePtr toType() const {
    if (!defined())
      return TensorType::get();

    return TensorType::create(
        type(),
        ConvertIntToCPUOrCUDA(device()),
        c10::VaryingShape(dim()),
        c10::VaryingShape(dim()),
        requires_grad());
  }
  operator TypePtr() const {
    return toType();
  }

 private:
  unsigned defined_ : 1;
  unsigned requires_grad_ : 1;
  unsigned : 5;
  unsigned dim_ : 8;
  int device_ : 8; // NOTE: this needs to be signed because we use -1 to
                   // represent CPU
  unsigned type_ : 8;
};

static_assert(
    std::is_pod<ArgumentInfo>::value,
    "ArgumentInfo is expected to be a POD struct");
static_assert(
    sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type),
    "ArgumentInfo is expected to be a 32-bit struct");

struct ArgumentSpec {
  ArgumentSpec(size_t num_flat_tensor_inputs, size_t num_flat_optional_inputs) {
    hash_code = hash_combine(num_flat_tensor_inputs, num_flat_optional_inputs);
    tensor_args.reserve(num_flat_tensor_inputs);
    optional_presence.reserve(num_flat_optional_inputs);
  }

  void addOptional(const IValue& input) {
    bool is_present = !input.isNone();
    optional_presence.push_back(is_present);
    hash_code = hash_combine(hash_code, is_present);
  }

  void addTensor(const IValue& input, bool with_grad) {
    AT_ASSERT(input.isTensor(), "Expected Tensor but found ", input.tagKind());
    tensor_args.emplace_back();
    auto& arg = tensor_args.back();
    // Initialize all fields to 0. This is convenient, because e.g.
    // requires_grad() can be checked even on undefined tensors, AND it makes
    // the padding bits all 0s.
    std::memset(&arg, 0, sizeof(ArgumentInfo));

    // [argspec refcounting] Reinterpret the IValue to avoid having to refcount
    // the Tensor; microbenchmarks
    // (https://github.com/zdevito/pytorch/commit/21e7200a0a0fc456bea2f10e95b1781f83933d10)
    // show overhead from extra refcounting along this path.
    const at::Tensor* t = reinterpret_cast<const at::Tensor*>(&input);
    if ((arg.defined_ = t->defined())) {
      arg.requires_grad_ = with_grad && autograd::Variable(*t).requires_grad();
      arg.dim_ = t->dim();
      arg.device_ = t->is_cuda() ? t->get_device() : -1;
      arg.type_ = static_cast<unsigned>(t->scalar_type());
    }
    combineHash(arg);
  }

  void combineHash(const ArgumentInfo& arg) {
    ArgumentInfo::plain_data_type arg_data;
    std::memcpy(&arg_data, &arg, sizeof(ArgumentInfo));
    hash_code = hash_combine(hash_code, arg_data);
  }

  // equality is fast: check the optional presence bits and the number of
  // tensor args, then compare the raw ArgumentInfo array data; there are no
  // size/stride indirections.
  // hopefully std::vector<bool> has fast equality
  bool operator==(const ArgumentSpec& spec) const {
    if (optional_presence != spec.optional_presence) {
      return false;
    }
    if (tensor_args.size() != spec.tensor_args.size())
      return false;
    // NB: we need to break out early when there are no elements, because
    // passing a nullptr to memcmp is UB.
    if (tensor_args.size() == 0)
      return true;
    return std::memcmp(
               tensor_args.data(),
               spec.tensor_args.data(),
               tensor_args.size() * sizeof(ArgumentInfo)) == 0;
  }
  bool operator!=(const ArgumentSpec& spec) const {
    return !(*this == spec);
  }
  size_t numTensors() const {
    return tensor_args.size();
  }
  const ArgumentInfo& tensorAt(size_t i) const {
    return tensor_args[i];
  }
  size_t numOptionals() const {
    return optional_presence.size();
  }
  bool isPresent(size_t i) const {
    return optional_presence[i];
  }
  size_t hashCode() const {
    return hash_code;
  }

 private:
  size_t hash_code; // precomputed on construction
  std::vector<ArgumentInfo> tensor_args;
  std::vector<bool> optional_presence;
};
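
// A rough usage sketch (illustrative only, assuming every flattened input is
// a tensor): an ArgumentSpec is populated by one addTensor()/addOptional()
// call per flattened input, in input order, e.g.
//
//   ArgumentSpec spec(stack.size(), /*num_flat_optional_inputs=*/0);
//   for (const IValue& input : stack) {
//     spec.addTensor(input, /*with_grad=*/true);
//   }
//
// ArgumentSpecCreator (below) derives the appropriate call sequence from a
// graph's input types instead of hard-coding it like this.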

// ArgumentSpecCreator takes an initial graph and comes up with a set
// of simple instructions to compute the ArgumentSpec given a set of
// input tensors.
struct TORCH_API ArgumentSpecCreator {
  // instructions operate on a stack of lists of input IValues.
  // At the beginning, the stack contains a single list of the inputs to the
  // function; the ENTER_ instructions descend into subobjects and push new
  // lists onto the stack.
  enum Inst : char {
    ENTER_TUPLE, // consume a tuple ivalue from the top-most list, and push the
                 // list of its elements onto the stack as a new list
    ENTER_OBJECT, // same as ENTER_TUPLE, but the input is a class
    LEAVE, // pop the top-most list from the stack
    SKIP, // consume an element from the top-most list, and discard it
    SPECIALIZE_OPTIONAL_TENSOR, // consume an optional tensor from the top-most
                                // list, and add it to the ArgSpec key being
                                // created
    SPECIALIZE_TENSOR, // consume a tensor from the top-most
                       // list, and add it to the ArgSpec key being created
    SPECIALIZE_OPTIONAL,
    // consume a non-tensor optional from the top-most list,
    // and add it to the ArgSpec key being created
  };
  ArgumentSpecCreator(Graph& graph);
  ArgumentSpec create(bool with_grad, const Stack& stack) const;
  void specializeTypes(Graph& g, const ArgumentSpec& spec) const;
  void dump() const;
  using WrittenSlots = std::unordered_set<std::string>;

 private:
  static constexpr size_t DEPTH_LIMIT = 128;
  void scan(
      const TypePtr& typ,
      size_t depth,
      const WrittenSlots& written_slots);
  size_t num_inputs_;
  size_t num_tensors_ = 0;
  size_t num_optionals_ = 0;
  std::vector<Inst> instructions_;
};
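
// Typical flow (as used by the graph executor): build one ArgumentSpecCreator
// per graph, then on each invocation call create(with_grad, stack) to produce
// the ArgumentSpec cache key, and specializeTypes() to refine the graph's
// input types for the chosen specialization.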

// CompleteArgumentSpec represents one particular specialization.
// It is designed so that it can be created, hashed, and compared quickly
// since it is used along the hot-path of the JIT to check if the code
// we have created is valid for the given inputs.

// CompleteArgumentInfoPOD is only used internally in CompleteArgumentSpec;
// API users should use CompleteArgumentInfo instead.
struct CompleteArgumentInfoPOD {
  // total size is 64-bit
  unsigned is_tensor : 8; // all other fields are invalid if this is false
  unsigned type : 8; // scalar type
  unsigned defined : 1;
  unsigned requires_grad : 1;
  signed device : 14;
  uint32_t total_dims; // all CompleteArgumentInfoPODs are in
                       // CompleteArgumentSpec's tensor_info() array.
                       // total_dims is the total number of dimensions seen so
                       // far in all previous members of tensor_info(),
                       // including this tensor. 2*total_dims becomes the
                       // offset into the sizes_strides list for the _next_
                       // tensor in the tensor_info array; for tensor 0, the
                       // offset is always 0.
};

static_assert(
    sizeof(CompleteArgumentInfoPOD) == sizeof(int64_t),
    "CompleteArgumentInfoPOD must be 64-bit struct for CompleteArgumentSpec encoding to work");

struct CompleteArgumentInfo;

struct CompleteArgumentSpec {
  CompleteArgumentSpec(bool with_grad, at::ArrayRef<IValue> inputs)
      : hash_code(0), ninputs(inputs.size()) {
    int32_t all_dims = 0;
    const int32_t num_inputs = inputs.size();
    for (int32_t i = 0; i < num_inputs; i++) {
      if (!inputs[i].isTensor())
        continue;
      auto tensor = inputs[i].toTensor();
      all_dims += tensor.defined() ? tensor.ndimension() : 0;
    }
    // allocate enough room for all TensorPODs and dimensions
    data.resize(ninputs + all_dims * 2);

    // and reinterpret our data array as these structs
    auto* pods = reinterpret_cast<CompleteArgumentInfoPOD*>(data.data());
    int64_t* next_dim = sizes_strides();
    int32_t total_dims = 0;
    for (int32_t i = 0; i < num_inputs; i++) {
      auto& pod = pods[i];
      pod.is_tensor = static_cast<uint32_t>(inputs[i].isTensor());
      if (pod.is_tensor) {
        at::Tensor t = inputs[i].toTensor();
        pod.defined = t.defined();
        if (pod.defined) {
          pod.type = static_cast<int>(t.scalar_type());
          pod.device = (!t.is_cuda()) ? -1 : t.get_device();
          pod.requires_grad =
              with_grad && autograd::as_variable_ref(t).requires_grad();
          total_dims += t.ndimension();
          auto sizes = t.sizes();
          std::copy(sizes.begin(), sizes.end(), next_dim);
          next_dim += sizes.size();
          auto strides = t.strides();
          std::copy(strides.begin(), strides.end(), next_dim);
          next_dim += strides.size();
        }
      }
      // each POD has a running tally of all dimensions including its own
      pod.total_dims = total_dims;
    }
    // we precompute the hash_code to minimize the time inside of hash
    // table operations where we may need to hold a compiler cache lock.
    hash_code = hash_combine(0, ninputs);
    for (auto d : data) {
      hash_code = hash_combine(hash_code, d);
    }
  }

  // equality is fast: check ninputs, and then check the raw array data,
  // there are no size/stride indirections
  bool operator==(const CompleteArgumentSpec& spec) const {
    return ninputs == spec.ninputs && data == spec.data;
  }
  bool operator!=(const CompleteArgumentSpec& spec) const {
    return !(*this == spec);
  }
  friend struct CompleteArgumentInfo;
  CompleteArgumentInfo at(size_t i) const;
  size_t size() const {
    return ninputs;
  }
  size_t hashCode() const {
    return hash_code;
  }

 private:
  ArrayRef<CompleteArgumentInfoPOD> tensor_info() const {
    return ArrayRef<CompleteArgumentInfoPOD>(
        reinterpret_cast<const CompleteArgumentInfoPOD*>(data.data()), ninputs);
  }
  // the start of the sizes_strides information, which comes after the
  // CompleteArgumentInfoPOD list.
  const int64_t* sizes_strides() const {
    return data.data() + ninputs;
  }
  int64_t* sizes_strides() {
    return data.data() + ninputs;
  }
  size_t hash_code; // precomputed on construction
  int32_t ninputs;
  // layout is ninputs of TensorPOD (each 64-bit) followed by their size and
  // stride info, e.g. for 3 tensors:
  // [t0POD][t1POD][t2POD]...
  // [t0 sizes][t0 strides][t1 sizes][t1 strides][t2 sizes][t2 strides]
  std::vector<int64_t> data;
};
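
// For example, with two defined tensor inputs of 2 and 3 dimensions, data
// holds 2 PODs followed by t0's 2 sizes, t0's 2 strides, t1's 3 sizes, and
// t1's 3 strides: 2 + 2*(2 + 3) = 12 int64_t entries in total.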

// public view of compressed CompleteArgumentInfo
struct CompleteArgumentInfo {
  CompleteArgumentInfo(const CompleteArgumentSpec& spec, const int i)
      : spec(spec), i(i) {}
  bool isTensor() const {
    return pod(i).is_tensor;
  }
  at::ScalarType type() const {
    return at::ScalarType(pod(i).type);
  }
  bool defined() const {
    return pod(i).defined;
  }
  bool requires_grad() const {
    return pod(i).requires_grad;
  }
  int device() const {
    return pod(i).device;
  }
  int ndimension() const {
    // See [valid range]: it is always valid to ask for the offset of (i + 1)
    return (sizes_strides_offset(i + 1) - sizes_strides_offset(i)) / 2;
  }
  at::IntArrayRef sizes() const {
    return at::IntArrayRef(
        spec.sizes_strides() + sizes_strides_offset(i), ndimension());
  }
  at::IntArrayRef strides() const {
    int ndim = ndimension();
    return at::IntArrayRef(
        spec.sizes_strides() + sizes_strides_offset(i) + ndim, ndim);
  }
  operator TypePtr() const {
    if (!defined())
      return TensorType::get();
    return TensorType::create(
        type(), ConvertIntToCPUOrCUDA(device()), sizes(), strides());
  }

 private:
  // offset into the sizes_strides() array where the sizes start for tensor j
  // [valid range] the valid range is [0, ninputs]
  // (i.e. you can ask for the offset at ninputs, which would be the offset of
  // the next tensor if it existed)
  int sizes_strides_offset(int j) const {
    if (j == 0)
      return 0;
    return 2 * pod(j - 1).total_dims;
  }
  const CompleteArgumentInfoPOD& pod(int j) const {
    return spec.tensor_info().at(j);
  }
  const CompleteArgumentSpec& spec;
  const int i;
};

inline std::ostream& operator<<(std::ostream& out, const ArgumentInfo& info) {
  if (!info.defined()) {
    return out << "<undefined>";
  }
  out << "Tensor(device=" << info.device() << ", type=" << toString(info.type())
      << ", requires_grad=" << info.requires_grad() << ", dims=" << info.dim()
      << ")";
  return out;
}

inline std::ostream& operator<<(std::ostream& out, const ArgumentSpec& spec) {
  out << "{";
  for (size_t i = 0; i < spec.numTensors(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.tensorAt(i);
  }
  out << "; ";
  for (size_t i = 0; i < spec.numOptionals(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.isPresent(i);
  }
  out << "}";
  return out;
}

inline std::ostream& operator<<(
    std::ostream& out,
    const CompleteArgumentInfo& info) {
  if (!info.defined()) {
    return out << "<undefined>";
  }
  out << "Tensor(device=" << info.device() << ", type=" << toString(info.type())
      << ", requires_grad=" << info.requires_grad()
      << ", sizes=" << info.sizes() << ", strides=" << info.strides() << ")";
  return out;
}

inline std::ostream& operator<<(
    std::ostream& out,
    const CompleteArgumentSpec& spec) {
  out << "{";
  for (size_t i = 0; i < spec.size(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.at(i);
  }
  out << "}";
  return out;
}

inline CompleteArgumentInfo CompleteArgumentSpec::at(size_t i) const {
  return CompleteArgumentInfo(*this, i);
}
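
// Maps an optional ScalarType to an optional int8_t so it can be passed to
// torch::get_hash in the std::hash<c10::TensorType> specialization below.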
inline c10::optional<int8_t> convertOptional(
    c10::optional<c10::ScalarType> const& from) {
  return (from) ? c10::optional<int8_t>(static_cast<int8_t>(*from))
                : c10::optional<int8_t>{};
}

} // namespace jit
} // namespace torch
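
// These hash specializations let ArgumentSpec and CompleteArgumentSpec (and
// the tensor type components they are built from) be used directly as keys in
// std::unordered_map / std::unordered_set; the spec hashes simply forward to
// the hash codes precomputed at construction time.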
namespace std {

template <>
struct hash<c10::VaryingShape> {
  size_t operator()(const c10::VaryingShape& vs) const {
    return torch::get_hash(
        vs.size(),
        vs.size() ? vs.sizes().value() : std::vector<c10::optional<int64_t>>());
  }
};

template <>
struct hash<c10::TensorType> {
  size_t operator()(const c10::TensorType& ptt) const {
    return torch::get_hash<
        c10::optional<int8_t>,
        c10::VaryingShape,
        c10::VaryingShape,
        c10::optional<bool>>(
        torch::jit::convertOptional(ptt.scalarType()),
        ptt.sizes(),
        ptt.strides(),
        ptt.requiresGrad());
  }
};

template <>
struct hash<torch::jit::ArgumentSpec> {
  size_t operator()(const torch::jit::ArgumentSpec& spec) const {
    return spec.hashCode();
  }
};
template <>
struct hash<torch::jit::CompleteArgumentSpec> {
  size_t operator()(const torch::jit::CompleteArgumentSpec& spec) const {
    return spec.hashCode();
  }
};
} // namespace std