#pragma once

#include <ATen/core/jit_type.h>
#include <ATen/core/stack.h>
#include <c10/util/hash.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/jit/ir.h>
#include <torch/csrc/jit/variable_tensor_list.h>
#include <torch/csrc/utils/hash.h>

#include <cstring>
#include <iostream>
#include <vector>

namespace torch {
namespace jit {

// GraphExecutor creates specializations of Graphs for different
// dimensionalities and types of inputs.

inline static at::Device ConvertIntToCPUOrCUDA(int device) {
  return device < 0 ? at::kCPU : at::Device(at::DeviceType::CUDA, device);
}

struct ArgumentInfo {
  friend struct ArgumentSpec;
  using plain_data_type = uint32_t;

  bool defined() const {
    return defined_;
  }
  int device() const {
    return device_;
  }
  // XXX: It is guaranteed that this will return false when called on
  // non-tensor arguments
  bool requires_grad() const {
    return requires_grad_;
  }
  int dim() const {
    return dim_;
  }
  at::ScalarType type() const {
    return at::ScalarType(type_);
  }
  TypePtr toType() const {
    if (!defined())
      return TensorType::get();

    return TensorType::create(
        type(),
        ConvertIntToCPUOrCUDA(device()),
        c10::VaryingShape(dim()),
        c10::VaryingShape(dim()),
        requires_grad());
  }
  operator TypePtr() const {
    return toType();
  }

 private:
  unsigned defined_ : 1;
  unsigned requires_grad_ : 1;
  unsigned : 5;
  unsigned dim_ : 8;
  int device_ : 8; // NOTE: this needs to be signed because we use -1 to
                   // represent CPU
  unsigned type_ : 8;
};

static_assert(
    std::is_pod<ArgumentInfo>::value,
    "ArgumentInfo is to be a POD struct");
static_assert(
    sizeof(ArgumentInfo) == sizeof(ArgumentInfo::plain_data_type),
    "ArgumentInfo is expected to be a 32-bit struct");

struct ArgumentSpec {
  ArgumentSpec(
      size_t num_flat_tensor_inputs,
      size_t num_flat_optional_inputs) {
    hash_code = hash_combine(num_flat_tensor_inputs, num_flat_optional_inputs);
    tensor_args.reserve(num_flat_tensor_inputs);
    optional_presence.reserve(num_flat_optional_inputs);
  }

  void addOptional(const IValue& input) {
    bool is_present = !input.isNone();
    optional_presence.push_back(is_present);
    hash_code = hash_combine(hash_code, is_present);
  }

  void addTensor(const IValue& input, bool with_grad) {
    AT_ASSERT(input.isTensor(), "Expected Tensor but found ", input.tagKind());
    tensor_args.emplace_back();
    auto& arg = tensor_args.back();
    // Initialize all fields to 0. This is convenient, because e.g.
    // requires_grad() can be checked even on undefined tensors, AND it makes
    // the padding bits all 0s.
    std::memset(&arg, 0, sizeof(ArgumentInfo));

    // [argspec refcounting] reinterpret the IValue to avoid having to refcount
    // the Tensor; microbenchmarks
    // (https://github.com/zdevito/pytorch/commit/21e7200a0a0fc456bea2f10e95b1781f83933d10)
    // show overhead from the extra refcounting along this path.
    const at::Tensor* t = reinterpret_cast<const at::Tensor*>(&input);
    if ((arg.defined_ = t->defined())) {
      arg.requires_grad_ = with_grad && autograd::Variable(*t).requires_grad();
      arg.dim_ = t->dim();
      arg.device_ = t->is_cuda() ? t->get_device() : -1;
      arg.type_ = static_cast<unsigned>(t->scalar_type());
    }
    combineHash(arg);
  }

  void combineHash(const ArgumentInfo& arg) {
    ArgumentInfo::plain_data_type arg_data;
    std::memcpy(&arg_data, &arg, sizeof(ArgumentInfo));
    hash_code = hash_combine(hash_code, arg_data);
  }

  // equality is fast: check the number of inputs, and then check the raw
  // array data; there are no size/stride indirections
  // hopefully std::vector<bool> has fast equality
  bool operator==(const ArgumentSpec& spec) const {
    if (optional_presence != spec.optional_presence) {
      return false;
    }
    if (tensor_args.size() != spec.tensor_args.size())
      return false;
    // NB: we need to break out early when there are no elements, because
    // passing a nullptr to memcmp is UB.
    if (tensor_args.size() == 0)
      return true;
    return std::memcmp(
               tensor_args.data(),
               spec.tensor_args.data(),
               tensor_args.size() * sizeof(ArgumentInfo)) == 0;
  }
  bool operator!=(const ArgumentSpec& spec) const {
    return !(*this == spec);
  }
  size_t numTensors() const {
    return tensor_args.size();
  }
  const ArgumentInfo& tensorAt(size_t i) const {
    return tensor_args[i];
  }
  size_t numOptionals() const {
    return optional_presence.size();
  }
  bool isPresent(size_t i) const {
    return optional_presence[i];
  }
  size_t hashCode() const {
    return hash_code;
  }

 private:
  size_t hash_code; // precomputed on construction
  std::vector<ArgumentInfo> tensor_args;
  std::vector<bool> optional_presence;
};
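// Example (illustrative sketch, not part of the original header): building an
// ArgumentSpec by hand for a flattened input list with two tensor inputs and
// one optional input. The `stack` variable and the input counts below are
// hypothetical; in practice ArgumentSpecCreator::create() (declared further
// down) drives these calls.
//
//   Stack stack = ...; // two Tensors followed by an optional IValue
//   ArgumentSpec spec(/*num_flat_tensor_inputs=*/2,
//                     /*num_flat_optional_inputs=*/1);
//   spec.addTensor(stack[0], /*with_grad=*/true);
//   spec.addTensor(stack[1], /*with_grad=*/true);
//   spec.addOptional(stack[2]);
//   // spec can now be compared with operator== and hashed via hashCode(),
//   // e.g. as a key into a cache of specialized graphs.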
// ArgumentSpecCreator takes an initial graph and comes up with a set
// of simple instructions to compute the ArgumentSpec given a set of
// input tensors.
struct TORCH_API ArgumentSpecCreator {
  // instructions act on a stack of lists of input IValues.
  // At the beginning the stack contains a single list of the inputs to the
  // function; the ENTER_* instructions descend into subobjects and push new
  // lists onto the stack.
  enum Inst : char {
    ENTER_TUPLE, // consume a tuple ivalue from the top-most list, and push the
                 // list of its elements onto the stack as a new list
    ENTER_OBJECT, // same as ENTER_TUPLE, but the input is a class
    LEAVE, // pop the top-most list from the stack
    SKIP, // consume an element from the top-most list, and discard
    SPECIALIZE_OPTIONAL_TENSOR, // consume an optional tensor from the top-most
                                // list, and add it to the ArgSpec key being
                                // created
    SPECIALIZE_TENSOR, // consume a tensor from the top-most
                       // list, and add it to the ArgSpec key being created
    SPECIALIZE_OPTIONAL, // consume a non-tensor optional from the top-most
                         // list, and add it to the ArgSpec key being created
  };
  ArgumentSpecCreator(Graph& graph);
  ArgumentSpec create(bool with_grad, const Stack& stack) const;
  void specializeTypes(Graph& g, const ArgumentSpec& spec) const;
  void dump() const;
  using WrittenSlots = std::unordered_set<std::string>;

 private:
  static constexpr size_t DEPTH_LIMIT = 128;
  void scan(
      const TypePtr& typ,
      size_t depth,
      const WrittenSlots& written_slots);
  size_t num_inputs_;
  size_t num_tensors_ = 0;
  size_t num_optionals_ = 0;
  std::vector<Inst> instructions_;
};
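// Example (illustrative sketch, not part of the original header): for a
// hypothetical graph with inputs (Tensor a, (Tensor, int) b, Tensor? c), the
// scan over the input types could plausibly produce an instruction list like
//
//   SPECIALIZE_TENSOR,          // a
//   ENTER_TUPLE,                //   descend into b
//     SPECIALIZE_TENSOR,        //   b[0]
//     SKIP,                     //   b[1] is an int, not specialized
//   LEAVE,                      //   pop b's element list
//   SPECIALIZE_OPTIONAL_TENSOR, // c contributes presence + tensor info
//
// create() then interprets the instruction list against the runtime Stack to
// fill in an ArgumentSpec. The exact instructions emitted for a given graph
// depend on ArgumentSpecCreator's scan logic.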
// CompleteArgumentSpec represents one particular specialization.
// It is designed so that it can be created, hashed, and compared quickly
// since it is used along the hot-path of the JIT to check if the code
// we have created is valid for the given inputs.

// CompleteArgumentInfoPOD is only used internally in CompleteArgumentSpec;
// API users should use ArgumentInfo
struct CompleteArgumentInfoPOD {
  // total size is 64-bit
  unsigned is_tensor : 8; // all other fields are invalid if this is false
  unsigned type : 8; // scalar type
  unsigned defined : 1;
  unsigned requires_grad : 1;
  signed device : 14;
  uint32_t total_dims; // all TensorInfoPODs are in CompleteArgumentSpec's
                       // tensor_info() array. total_dims is the total number
                       // of dimensions seen so far in all previous members of
                       // tensor_info(), including this tensor. 2*total_dims
                       // becomes the offset into the sizes_strides list for
                       // the _next_ tensor in the tensor_info array. For
                       // tensor 0, the offset is always 0.
};

static_assert(
    sizeof(CompleteArgumentInfoPOD) == sizeof(int64_t),
    "CompleteArgumentInfoPOD must be 64-bit struct for CompleteArgumentSpec encoding to work");

struct CompleteArgumentInfo;

struct CompleteArgumentSpec {
  CompleteArgumentSpec(bool with_grad, at::ArrayRef<IValue> inputs)
      : hash_code(0), ninputs(inputs.size()) {
    int32_t all_dims = 0;
    const int32_t num_inputs = inputs.size();
    for (int32_t i = 0; i < num_inputs; i++) {
      if (!inputs[i].isTensor())
        continue;
      auto tensor = inputs[i].toTensor();
      all_dims += tensor.defined() ? tensor.ndimension() : 0;
    }
    // allocate enough room for all TensorPODs and dimensions
    data.resize(ninputs + all_dims * 2);

    // and reinterpret our data array as these structs
    auto* pods = reinterpret_cast<CompleteArgumentInfoPOD*>(data.data());
    int64_t* next_dim = sizes_strides();
    int32_t total_dims = 0;
    for (int32_t i = 0; i < num_inputs; i++) {
      auto& pod = pods[i];
      pod.is_tensor = static_cast<uint32_t>(inputs[i].isTensor());
      if (pod.is_tensor) {
        at::Tensor t = inputs[i].toTensor();
        pod.defined = t.defined();
        if (pod.defined) {
          pod.type = static_cast<int>(t.scalar_type());
          pod.device = (!t.is_cuda()) ? -1 : t.get_device();
          pod.requires_grad =
              with_grad && autograd::as_variable_ref(t).requires_grad();
          total_dims += t.ndimension();
          auto sizes = t.sizes();
          std::copy(sizes.begin(), sizes.end(), next_dim);
          next_dim += sizes.size();
          auto strides = t.strides();
          std::copy(strides.begin(), strides.end(), next_dim);
          next_dim += strides.size();
        }
      }
      // each POD has a running tally of all dimensions including its own
      pod.total_dims = total_dims;
    }
    // we precompute the hash_code to minimize the time inside of hash
    // table operations where we may need to hold a compiler cache lock.
    hash_code = hash_combine(0, ninputs);
    for (auto d : data) {
      hash_code = hash_combine(hash_code, d);
    }
  }

  // equality is fast: check ninputs, and then check the raw array data,
  // there are no size/stride indirections
  bool operator==(const CompleteArgumentSpec& spec) const {
    return ninputs == spec.ninputs && data == spec.data;
  }
  bool operator!=(const CompleteArgumentSpec& spec) const {
    return !(*this == spec);
  }
  friend struct CompleteArgumentInfo;
  CompleteArgumentInfo at(size_t i) const;
  size_t size() const {
    return ninputs;
  }
  size_t hashCode() const {
    return hash_code;
  }

 private:
  ArrayRef<CompleteArgumentInfoPOD> tensor_info() const {
    return ArrayRef<CompleteArgumentInfoPOD>(
        reinterpret_cast<const CompleteArgumentInfoPOD*>(data.data()),
        ninputs);
  }
  // the start of the sizes_strides information, which comes after the
  // CompleteArgumentInfoPOD list.
  const int64_t* sizes_strides() const {
    return data.data() + ninputs;
  }
  int64_t* sizes_strides() {
    return data.data() + ninputs;
  }
  size_t hash_code; // precomputed on construction
  int32_t ninputs;
  // layout is ninputs of TensorPOD (each 64-bit) followed by their size and
  // stride info, e.g. for 3 tensors:
  // [t0POD][t1POD][t2POD]...
  // [t0 sizes][t0 strides][t1 sizes][t1 strides][t2 sizes][t2 strides]
  std::vector<int64_t> data;
};
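// Worked example (illustrative, not part of the original header): for two
// defined tensor inputs t0 with sizes [2, 3] and t1 with sizes [4], the data
// vector laid out by the constructor above would contain
//
//   data = [ t0POD, t1POD,   // ninputs CompleteArgumentInfoPODs
//            2, 3,  3, 1,    // t0 sizes, t0 strides (assuming contiguous)
//            4,     1 ]      // t1 sizes, t1 strides
//
// with t0POD.total_dims == 2 and t1POD.total_dims == 3, so
// sizes_strides_offset(1) == 4 and sizes_strides_offset(2) == 6 in the
// CompleteArgumentInfo view defined below.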
// public view of compressed CompleteArgumentInfo
struct CompleteArgumentInfo {
  CompleteArgumentInfo(const CompleteArgumentSpec& spec, const int i)
      : spec(spec), i(i) {}
  bool isTensor() const {
    return pod(i).is_tensor;
  }
  at::ScalarType type() const {
    return at::ScalarType(pod(i).type);
  }
  bool defined() const {
    return pod(i).defined;
  }
  bool requires_grad() const {
    return pod(i).requires_grad;
  }
  int device() const {
    return pod(i).device;
  }
  int ndimension() const {
    // See [valid range], it is always valid to ask for the offset for (i + 1)
    return (sizes_strides_offset(i + 1) - sizes_strides_offset(i)) / 2;
  }
  at::IntArrayRef sizes() const {
    return at::IntArrayRef(
        spec.sizes_strides() + sizes_strides_offset(i), ndimension());
  }
  at::IntArrayRef strides() const {
    int ndim = ndimension();
    return at::IntArrayRef(
        spec.sizes_strides() + sizes_strides_offset(i) + ndim, ndim);
  }
  operator TypePtr() const {
    if (!defined())
      return TensorType::get();
    return TensorType::create(
        type(), ConvertIntToCPUOrCUDA(device()), sizes(), strides());
  }

 private:
  // offset into sizes_strides() array where the sizes start for tensor j
  // [valid range] valid range is [0, ninputs]
  // (i.e. you can ask for the offset at ninputs, which would be the offset of
  // the next tensor if it existed)
  int sizes_strides_offset(int j) const {
    if (j == 0)
      return 0;
    return 2 * pod(j - 1).total_dims;
  }
  const CompleteArgumentInfoPOD& pod(int j) const {
    return spec.tensor_info().at(j);
  }
  const CompleteArgumentSpec& spec;
  const int i;
};
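// Example (illustrative sketch, not part of the original header): reading the
// packed records back out through the CompleteArgumentInfo view. `inputs` is
// a hypothetical vector of IValues.
//
//   CompleteArgumentSpec spec(/*with_grad=*/false, inputs);
//   for (size_t i = 0; i < spec.size(); ++i) {
//     CompleteArgumentInfo info = spec.at(i);
//     if (info.isTensor() && info.defined()) {
//       // info.sizes() and info.strides() view directly into spec's
//       // sizes_strides storage; no per-tensor allocation is needed.
//     }
//   }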
inline std::ostream& operator<<(std::ostream& out, const ArgumentInfo& info) {
  if (!info.defined()) {
    return out << "<undefined>";
  }
  out << "Tensor(device=" << info.device()
      << ", type=" << toString(info.type())
      << ", requires_grad=" << info.requires_grad() << ", dims=" << info.dim()
      << ")";
  return out;
}

inline std::ostream& operator<<(std::ostream& out, const ArgumentSpec& spec) {
  out << "{";
  for (size_t i = 0; i < spec.numTensors(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.tensorAt(i);
  }
  out << "; ";
  for (size_t i = 0; i < spec.numOptionals(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.isPresent(i);
  }
  out << "}";
  return out;
}

inline std::ostream& operator<<(
    std::ostream& out,
    const CompleteArgumentInfo& info) {
  if (!info.defined()) {
    return out << "<undefined>";
  }
  out << "Tensor(device=" << info.device()
      << ", type=" << toString(info.type())
      << ", requires_grad=" << info.requires_grad()
      << ", sizes=" << info.sizes() << ", strides=" << info.strides() << ")";
  return out;
}

inline std::ostream& operator<<(
    std::ostream& out,
    const CompleteArgumentSpec& spec) {
  out << "{";
  for (size_t i = 0; i < spec.size(); ++i) {
    if (i > 0)
      out << ", ";
    out << spec.at(i);
  }
  out << "}";
  return out;
}

inline CompleteArgumentInfo CompleteArgumentSpec::at(size_t i) const {
  return CompleteArgumentInfo(*this, i);
}

inline c10::optional<int8_t> convertOptional(
    c10::optional<at::ScalarType> const& from) {
  return (from) ? c10::optional<int8_t>(static_cast<int8_t>(*from))
                : c10::optional<int8_t>{};
}

} // namespace jit
} // namespace torch

namespace std {

template <>
struct hash<c10::VaryingShape> {
  size_t operator()(const c10::VaryingShape& vs) const {
    return torch::get_hash(
        vs.size(),
        vs.size() ? vs.sizes().value()
                  : std::vector<c10::optional<int64_t>>());
  }
};

template <>
struct hash<c10::TensorType> {
  size_t operator()(const c10::TensorType& ptt) const {
    return torch::get_hash<
        c10::optional<int8_t>,
        c10::VaryingShape,
        c10::VaryingShape,
        c10::optional<bool>>(
        torch::jit::convertOptional(ptt.scalarType()),
        ptt.sizes(),
        ptt.strides(),
        ptt.requiresGrad());
  }
};

template <>
struct hash<torch::jit::ArgumentSpec> {
  size_t operator()(const torch::jit::ArgumentSpec& spec) const {
    return spec.hashCode();
  }
};

template <>
struct hash<torch::jit::CompleteArgumentSpec> {
  size_t operator()(const torch::jit::CompleteArgumentSpec& spec) const {
    return spec.hashCode();
  }
};

} // namespace std
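// Example (illustrative sketch, not part of the original header): the
// specializations above let either spec type key a hash map directly. The
// `ExecutionPlan` type and the `creator`/`stack` variables below are
// hypothetical placeholders for whatever a caller caches per specialization.
//
//   std::unordered_map<torch::jit::ArgumentSpec, ExecutionPlan> plan_cache;
//   torch::jit::ArgumentSpec spec =
//       creator.create(/*with_grad=*/true, stack);
//   auto it = plan_cache.find(spec); // uses std::hash<ArgumentSpec> above
//   if (it == plan_cache.end()) {
//     // compile a new plan for this specialization and insert it
//   }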