
Commit 4a820f8

[PyTorch] Store Tensor explicitly in IValue
Pull Request resolved: #48824

Enables the following diff, which will make toTensor() return `const Tensor&` and allow callers to avoid refcounting overhead.

ghstack-source-id: 117906329
Differential Revision: [D25324617](https://our.internmc.facebook.com/intern/diff/D25324617/)

1 parent bc2352e · commit 4a820f8
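To make the motivation concrete, here is a minimal illustrative sketch (not part of this commit) of the kind of call site the follow-up enables once toTensor() on an lvalue IValue can hand back `const Tensor&`: reading tensors out of IValues in a hot loop without paying an incref/decref per access. The helper name and loop are assumptions for illustration only.

    #include <ATen/core/ivalue.h>
    #include <cstdint>
    #include <vector>

    // Hypothetical helper: sums numel() over a stack of IValues.
    // Binding through a const reference avoids materializing a new
    // at::Tensor (and bumping its refcount) on every iteration once
    // toTensor() const& can return a reference to the stored tensor.
    int64_t totalNumel(const std::vector<c10::IValue>& stack) {
      int64_t n = 0;
      for (const c10::IValue& iv : stack) {
        const at::Tensor& t = iv.toTensor();
        n += t.numel();
      }
      return n;
    }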

File tree (4 files changed, +206 −32 lines):
  aten/src/ATen/core/ivalue.h
  aten/src/ATen/core/ivalue_inl.h
  aten/src/ATen/test/ivalue_test.cpp
  c10/macros/Macros.h

aten/src/ATen/core/ivalue.h

Lines changed: 119 additions & 29 deletions
@@ -163,23 +163,34 @@ struct CAFFE2_API IValue final {
       c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr);
     }
   }
-  IValue(IValue&& rhs) noexcept : IValue() {
-    swap(rhs);
+  IValue(IValue&& rhs) noexcept : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) {
+    moveFrom(std::move(rhs));
+    rhs.tag = Tag::None;
+    rhs.is_intrusive_ptr = false;
   }
+
   /// @private [doxygen private]
   ~IValue() {
-    if (is_intrusive_ptr) {
-      c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr);
-    }
+    destroy();
   }
-  IValue& operator=(IValue&& rhs) & noexcept {
-    IValue(std::move(rhs)).swap(*this); // this also sets rhs to None
+
+  // Always-inline for performance -- this gets called frequently
+  // inside the core of the static runtime.
+  C10_ALWAYS_INLINE IValue& operator=(IValue&& rhs) & noexcept {
+    if (&rhs == this) {
+      return *this;
+    }
+
+    destroy();
+    moveFrom(std::move(rhs));
     return *this;
   }
+
   IValue& operator=(IValue const& rhs) & {
     IValue(rhs).swap(*this);
     return *this;
   }
+
   void dump() const;

   /**
@@ -288,7 +299,27 @@ struct CAFFE2_API IValue final {

   /// @private [doxygen private]
   void swap(IValue& rhs) noexcept {
-    std::swap(payload, rhs.payload);
+    if (isTensor() && rhs.isTensor()) {
+      std::swap(payload.as_tensor, rhs.payload.as_tensor);
+    } else if (isTensor()) {
+      at::Tensor t = std::move(payload.as_tensor);
+      // As far as I can tell, omitting the usual explicit destructor call
+      // is not UB in and of itself, and it's a slight perf win. The
+      // destructor is a no-op, because the moved-from Tensor is
+      // effectively an intrusive_ptr in the null state, so we don't need
+      // the behavior for correctness reasons either. Leaving this
+      // explanatory comment, including commented-out destructor call, to
+      // make this abundantly clear.
+      //
+      // payload.as_tensor.~Tensor();
+      memcpy(&payload, &rhs.payload, sizeof(payload));
+      new (&rhs.payload.as_tensor) at::Tensor(std::move(t));
+    } else if (rhs.isTensor()) {
+      rhs.swap(*this);
+      return;
+    } else {
+      std::swap(reinterpret_cast<char(&)[sizeof(payload)]>(*&payload), reinterpret_cast<char(&)[sizeof(payload)]>(*&rhs.payload));
+    }
     std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr);
     std::swap(tag, rhs.tag);
   }
@@ -297,21 +328,16 @@ struct CAFFE2_API IValue final {
   // While some of these accessors could be generated through templates,
   // we prefer to write them manually for clarity

-  IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) {
-    // Note: the undefined tensor is not refcounted, so while it
-    // is tagged as a tensor, is_intrusive_ptr is set to false.
-    // This is not an optional optimization: our incref call
-    // *will not* do the right thing when called on an
-    // undefined tensor.
-    payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl();
+  IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(false) {
+    new (&payload.as_tensor) at::Tensor(std::move(t));
   }
   bool isTensor() const {
     return Tag::Tensor == tag;
   }
   at::Tensor toTensor() &&;
   at::Tensor toTensor() const&;
   at::TensorImpl* unsafeToTensorImpl() const {
-    return static_cast<at::TensorImpl*>(payload.as_intrusive_ptr);
+    return payload.as_tensor.unsafeGetTensorImpl();
   }

   const IValue& toIValue() const {
@@ -565,7 +591,7 @@ struct CAFFE2_API IValue final {
   c10::intrusive_ptr<ivalue::EnumHolder> toEnumHolder() const&;

   // None
-  IValue() : payload{0}, tag(Tag::None), is_intrusive_ptr(false) {}
+  IValue() : tag(Tag::None), is_intrusive_ptr(false) {}
   bool isNone() const {
     return Tag::None == tag;
   }
@@ -815,7 +841,35 @@ struct CAFFE2_API IValue final {
       class NullType = c10::detail::intrusive_target_default_null_type<T>>
   c10::intrusive_ptr<T, NullType> toIntrusivePtr() const;

-  void clearToNone() {
+  void destroy() {
+    if (isTensor()) {
+      payload.as_tensor.~Tensor();
+    } else if (is_intrusive_ptr) {
+      c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr);
+    }
+  }
+
+  C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept {
+    if (rhs.isTensor()) {
+      new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor));
+      // As far as I can tell, omitting the usual explicit destructor call
+      // is not UB in and of itself, and it's a slight perf win. The
+      // destructor is a no-op, because the moved-from Tensor is
+      // effectively an intrusive_ptr in the null state, so we don't need
+      // the behavior for correctness reasons either. Leaving this
+      // explanatory comment, including commented-out destructor call, to
+      // make this abundantly clear.
+      //
+      // rhs.payload.as_tensor.~Tensor();
+    } else {
+      memcpy(&payload, &rhs.payload, sizeof(payload));
+    }
+    tag = rhs.tag;
+    is_intrusive_ptr = rhs.is_intrusive_ptr;
+    rhs.clearToNone();
+  }
+
+  void clearToNone() noexcept {
     payload.as_int = 0;
     tag = Tag::None;
     is_intrusive_ptr = false;
@@ -826,13 +880,23 @@ struct CAFFE2_API IValue final {
     double as_double;
     bool as_bool;
     c10::intrusive_ptr_target* as_intrusive_ptr;
+    at::Tensor as_tensor;
     struct {
       DeviceType type;
       DeviceIndex index;
     } as_device;
+
+    Payload() : as_int(0) {}
+    ~Payload() {}
   };

-  IValue(Payload p, Tag t, bool i) : payload(p), tag(t), is_intrusive_ptr(i) {}
+  IValue(const Payload& p, Tag t, bool i) : tag(t), is_intrusive_ptr(i) {
+    if (isTensor()) {
+      new (&payload.as_tensor) at::Tensor(p.as_tensor);
+    } else {
+      memcpy(&payload, &p, sizeof(payload));
+    }
+  }

   Payload payload;
   Tag tag;
@@ -852,9 +916,14 @@ struct CAFFE2_API WeakIValue final {
     }
   }
   WeakIValue(const IValue& rhs)
-      : payload(rhs.payload),
-        tag(rhs.tag),
+      : tag(rhs.tag),
         is_intrusive_ptr(rhs.is_intrusive_ptr) {
+    if (rhs.isTensor()) {
+      payload.as_intrusive_ptr = rhs.unsafeToTensorImpl();
+    } else {
+      static_assert(sizeof(payload) == sizeof(rhs.payload), "IValue and WeakIValue payload sizes don't match!");
+      memcpy(&payload, &rhs.payload, sizeof(payload));
+    }
     if (is_intrusive_ptr) {
       c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr);
     }
@@ -888,17 +957,28 @@ struct CAFFE2_API WeakIValue final {

  IValue lock() const {
    if (!is_intrusive_ptr) {
-      return IValue(payload, tag, false);
+      IValue::Payload newPayload;
+      memcpy(&newPayload, &payload, sizeof(newPayload));
+      return IValue(newPayload, tag, false);
    }
    auto temp = c10::weak_intrusive_ptr<c10::intrusive_ptr_target>::reclaim(
        payload.as_intrusive_ptr);
-    IValue::Payload pl;
-    pl.as_intrusive_ptr = temp.lock().release();
-    temp.release();
-    if (!pl.as_intrusive_ptr) {
-      return IValue();
+    if (IValue::Tag::Tensor == tag) {
+      auto ip = temp.lock().release();
+      if (!ip) {
+        return IValue();
+      } else {
+        return IValue(std::move(ip));
+      }
    } else {
-      return IValue(pl, tag, true);
+      IValue::Payload pl;
+      pl.as_intrusive_ptr = temp.lock().release();
+      temp.release();
+      if (!pl.as_intrusive_ptr) {
+        return IValue();
+      } else {
+        return IValue(pl, tag, true);
+      }
    }
  }

@@ -928,7 +1008,17 @@ struct CAFFE2_API WeakIValue final {
  }

 private:
-  IValue::Payload payload;
+  union Payload {
+    int64_t as_int;
+    double as_double;
+    bool as_bool;
+    c10::intrusive_ptr_target* as_intrusive_ptr;
+    struct {
+      DeviceType type;
+      DeviceIndex index;
+    } as_device;
+  };
+  Payload payload;
  IValue::Tag tag;
  bool is_intrusive_ptr;
};
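Note on the union change above: because Payload now holds an at::Tensor (a non-trivial type), the union needs a user-provided no-op constructor and destructor, and IValue itself must begin and end the tensor member's lifetime with placement new and an explicit ~Tensor() call, dispatched on the tag. The following standalone sketch (illustrative only, not PyTorch code) shows the same pattern with std::string standing in for at::Tensor:

    #include <cstdint>
    #include <cstring>
    #include <new>
    #include <string>
    #include <utility>

    // Minimal tagged union holding either an int or a std::string.
    // Mirrors the IValue/Payload pattern: the union's ctor/dtor are no-ops,
    // and the owning class manages the active member's lifetime via the tag.
    class Value {
     public:
      Value() : tag_(Tag::Int) {}
      explicit Value(std::string s) : tag_(Tag::Str) {
        new (&payload_.as_str) std::string(std::move(s));  // start lifetime explicitly
      }
      Value(Value&& rhs) noexcept : tag_(rhs.tag_) {
        if (tag_ == Tag::Str) {
          new (&payload_.as_str) std::string(std::move(rhs.payload_.as_str));
          rhs.payload_.as_str.~Str();  // end lifetime of the moved-from string
        } else {
          std::memcpy(&payload_, &rhs.payload_, sizeof(payload_));
        }
        rhs.tag_ = Tag::Int;
        rhs.payload_.as_int = 0;
      }
      Value(const Value&) = delete;
      Value& operator=(const Value&) = delete;
      ~Value() {
        if (tag_ == Tag::Str) {
          payload_.as_str.~Str();  // only the active member is destroyed
        }
      }

     private:
      enum class Tag { Int, Str };
      using Str = std::string;
      union Payload {
        int64_t as_int;
        Str as_str;          // non-trivial member => union needs custom ctor/dtor
        Payload() : as_int(0) {}
        ~Payload() {}        // no-op: the enclosing class knows which member is alive
      };
      Payload payload_;
      Tag tag_;
    };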

aten/src/ATen/core/ivalue_inl.h

Lines changed: 13 additions & 3 deletions
@@ -130,12 +130,22 @@ inline c10::intrusive_ptr<ivalue::EnumHolder> IValue::toEnumHolder() const& {
 }
 inline at::Tensor IValue::toTensor() && {
   AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind());
-  return at::Tensor(
-      moveToIntrusivePtr<at::TensorImpl, at::UndefinedTensorImpl>());
+  auto result = std::move(payload.as_tensor);
+  // As far as I can tell, omitting the usual explicit destructor call
+  // is not UB in and of itself, and it's a slight perf win. The
+  // destructor is a no-op, because the moved-from Tensor is
+  // effectively an intrusive_ptr in the null state, so we don't need
+  // the behavior for correctness reasons either. Leaving this
+  // explanatory comment, including commented-out destructor call, to
+  // make this abundantly clear.
+  //
+  // payload.as_tensor.~Tensor();
+  clearToNone();
+  return result;
 }
 inline at::Tensor IValue::toTensor() const& {
   AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind());
-  return at::Tensor(toIntrusivePtr<at::TensorImpl, at::UndefinedTensorImpl>());
+  return payload.as_tensor;
 }
 inline c10::Stream IValue::toStream() && {
   return c10::Stream::unpack(payload.as_int);
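A brief usage note on the two toTensor() overloads above: the rvalue-qualified overload now moves the stored tensor out and resets the IValue to None via clearToNone(), while the const-lvalue overload simply copies payload.as_tensor. The snippet below is an illustrative sketch (not part of the diff; the function name is made up) of what that difference looks like at a call site, assuming the usual ATen headers:

    #include <ATen/ATen.h>
    #include <ATen/core/ivalue.h>
    #include <utility>

    // Hypothetical demo of the ref-qualified toTensor() overloads.
    void toTensorOverloads() {
      c10::IValue iv(at::rand({2, 2}));
      at::Tensor copied = iv.toTensor();            // const& overload: copies the stored tensor
      at::Tensor moved = std::move(iv).toTensor();  // && overload: moves it out; iv becomes None
      // After the move, iv.isNone() is true and `moved` owns the original storage.
    }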

aten/src/ATen/test/ivalue_test.cpp

Lines changed: 66 additions & 0 deletions
@@ -51,6 +51,72 @@ TEST(IValueTest, Basic) {
   ASSERT_EQ(tv.use_count(), 2);
 }

+TEST(IValueTest, Swap) {
+  at::Tensor tv1 = at::rand({3, 4});
+  IValue tensor(tv1), scalar(42);
+  // swap() has special cases depending on which side is tensor or
+  // not. Exercise all 4 combinations.
+  tensor.swap(scalar);
+  tensor.swap(scalar);
+  tensor.swap(tensor);
+  scalar.swap(scalar);
+}
+
+TEST(IValueTest, MoveConstruct) {
+  at::Tensor t1 = at::rand({3, 4});
+
+  {
+    IValue sourceTensor(t1);
+    IValue target(std::move(sourceTensor));
+    EXPECT_TRUE(target.toTensor().equal(t1));
+    EXPECT_TRUE(sourceTensor.isNone());
+  }
+
+  {
+    IValue sourceScalar(42);
+    IValue target(std::move(sourceScalar));
+    EXPECT_EQ(target, IValue(42));
+    EXPECT_TRUE(sourceScalar.isNone());
+  }
+}
+
+TEST(IValueTest, MoveAssign) {
+  at::Tensor tv1 = at::rand({3, 4});
+  at::Tensor tv2 = at::rand({3, 4});
+
+  // 1: tensor to tensor
+  {
+    IValue targetTensor(tv1), sourceTensor(tv2);
+    targetTensor = std::move(sourceTensor);
+    EXPECT_TRUE(targetTensor.toTensor().equal(tv2));
+    EXPECT_TRUE(sourceTensor.isNone());
+  }
+
+  // 2: tensor to scalar
+  {
+    IValue targetScalar(42), sourceTensor(tv1);
+    targetScalar = std::move(sourceTensor);
+    EXPECT_TRUE(targetScalar.toTensor().equal(tv1));
+    EXPECT_TRUE(sourceTensor.isNone());
+  }
+
+  // 3: scalar to tensor
+  {
+    IValue targetTensor(tv1), sourceScalar(42);
+    targetTensor = std::move(sourceScalar);
+    EXPECT_EQ(targetTensor, 42);
+    EXPECT_TRUE(sourceScalar.isNone());
+  }
+
+  // 4: scalar to scalar
+  {
+    IValue targetScalar(42), sourceScalar(43);
+    targetScalar = std::move(sourceScalar);
+    EXPECT_EQ(targetScalar, 43);
+    EXPECT_TRUE(sourceScalar.isNone());
+  }
+}
+
 TEST(IValueTest, Tuple) {
   std::tuple<int64_t, at::Tensor> t = std::make_tuple(123, at::randn({1}));
   auto iv = IValue(t);

c10/macros/Macros.h

Lines changed: 8 additions & 0 deletions
@@ -186,6 +186,14 @@ namespace at { namespace cuda { using namespace c10::hip; }}
 #define C10_NOINLINE
 #endif

+#if __has_attribute(always_inline) || defined(__GNUC__)
+#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline
+#elif defined(_MSC_VER)
+#define C10_ALWAYS_INLINE __forceinline
+#else
+#define C10_ALWAYS_INLINE inline
+#endif
+
 #include <sstream>
 #include <string>

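For reference, a minimal usage sketch of the new macro (illustrative only, not part of the diff; the helper below is made up): C10_ALWAYS_INLINE goes where `inline` would normally go and expands to the strongest portable always-inline hint available for the compiler in use.

    #include <c10/macros/Macros.h>

    // Hypothetical hot-path helper. Expands to
    // __attribute__((__always_inline__)) inline on GCC/Clang,
    // __forceinline on MSVC, and plain inline elsewhere.
    C10_ALWAYS_INLINE int addOne(int x) {
      return x + 1;
    }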