
[PyTorch] RFC: Add tuple inline storage #64066


Status: Closed · wants to merge 50 commits · Changes from 1 commit

Commits (50):
5f06e2f  [PyTorch] RFC: Add tuple inline storage (swolchok, Aug 26, 2021)
2e3fb4b  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Aug 27, 2021)
8496eb9  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Aug 27, 2021)
a2726bc  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Aug 27, 2021)
2451f9f  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Aug 27, 2021)
9dcf2ae  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Aug 30, 2021)
a7d23fd  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Aug 31, 2021)
ca08c5f  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Aug 31, 2021)
8fce745  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 3, 2021)
031bc4a  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 7, 2021)
8a7ef90  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 8, 2021)
e03440d  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 9, 2021)
757281a  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 9, 2021)
9a1cc85  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 9, 2021)
27eca8b  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 9, 2021)
b4c2fd4  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 13, 2021)
9e66764  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 14, 2021)
a5e5bfe  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 15, 2021)
1475ad1  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 16, 2021)
436ccc5  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 20, 2021)
305c8a1  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 20, 2021)
edad395  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 20, 2021)
a010999  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 21, 2021)
0719493  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 21, 2021)
5df6b5b  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 22, 2021)
8c71367  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 22, 2021)
23264d8  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 22, 2021)
51f9a65  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 22, 2021)
9232e2e  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 23, 2021)
5f4f9f8  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 23, 2021)
dcf699e  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 23, 2021)
121a192  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 24, 2021)
d897b89  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 27, 2021)
d82cd75  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 29, 2021)
09ee5d8  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 30, 2021)
4fddc8d  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 30, 2021)
0ce2d6d  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Sep 30, 2021)
59e3375  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 5, 2021)
7e17dd6  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 6, 2021)
cf1d072  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 8, 2021)
a66f9a9  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 8, 2021)
adfdb14  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 8, 2021)
161a240  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 11, 2021)
236cf76  give up on making TupleElements noncopyable on "[PyTorch] RFC: Add tu… (swolchok, Oct 11, 2021)
be384f7  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 12, 2021)
5b55485  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 12, 2021)
88f8ce9  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 13, 2021)
eadc443  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 14, 2021)
4d1ef84  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 14, 2021)
b58e12f  Update on "[PyTorch] RFC: Add tuple inline storage" (swolchok, Oct 15, 2021)
Update on "[PyTorch] RFC: Add tuple inline storage"
I noticed a bunch of time being spent heap-allocating Tuples
in the unpickler. 1-, 2-, and 3-element Tuples are apparently common
enough that they get their own bytecode instructions, so I decided to
try also giving them their own representation. We store up to 3
IValues inline in `Tuple` rather than doing a second heap allocation
for a `std::vector<IValue>`.
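As a rough sketch of the idea (not the actual `TupleElements` implementation from this PR — beyond `inlineSize_` and `elementsVector_`, the names here are invented, and the real class overlaps the two representations in a union rather than storing both side by side):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

struct IValue { int64_t payload = 0; };  // stand-in for c10::IValue

class SmallTupleElements {
  // inlineSize_ doubles as a tag: 0 means "elements live in the vector";
  // 1..3 means that many IValues are stored inline, with no second
  // heap allocation.
  uint32_t inlineSize_ = 0;
  IValue elementsInline_[3];
  std::vector<IValue> elementsVector_;

 public:
  SmallTupleElements() = default;
  explicit SmallTupleElements(std::vector<IValue> v)
      : elementsVector_(std::move(v)) {}
  SmallTupleElements(IValue a) : inlineSize_(1) { elementsInline_[0] = a; }
  SmallTupleElements(IValue a, IValue b) : inlineSize_(2) {
    elementsInline_[0] = a;
    elementsInline_[1] = b;
  }
  SmallTupleElements(IValue a, IValue b, IValue c) : inlineSize_(3) {
    elementsInline_[0] = a;
    elementsInline_[1] = b;
    elementsInline_[2] = c;
  }

  bool empty() const { return inlineSize_ ? false : elementsVector_.empty(); }
  size_t size() const {
    return inlineSize_ ? inlineSize_ : elementsVector_.size();
  }
  const IValue& operator[](size_t i) const {
    return inlineSize_ ? elementsInline_[i] : elementsVector_[i];
  }
};
```

Since the unpickler creates mostly 1- to 3-element tuples, this turns the common case into a single heap allocation (the `Tuple` itself) instead of two.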

Differential Revision: [D30592622](https://our.internmc.facebook.com/intern/diff/D30592622/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook-specific changes or comments; please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D30592622/)!

[ghstack-poisoned]
swolchok committed Sep 13, 2021
commit b4c2fd4f4f41c0fee3ef4fc18b2c21674eb35be4
4 changes: 4 additions & 0 deletions aten/src/ATen/core/ivalue_inl.h
@@ -387,6 +387,10 @@ struct TORCH_API TupleElements {
}
}

C10_NODISCARD bool empty() const {
return inlineSize_ ? false : elementsVector_.empty();
}

C10_NODISCARD size_t size() const {
return inlineSize_ ? inlineSize_ : elementsVector_.size();
}
1 change: 1 addition & 0 deletions aten/src/ATen/test/ivalue_test.cpp
@@ -749,6 +749,7 @@ using ivalue::TupleElements;

namespace {
void validateTupleElements(TupleElements& te, c10::ArrayRef<IValue> contents) {
EXPECT_EQ(te.empty(), contents.empty());
EXPECT_EQ(te.size(), contents.size());
for (const auto idx: c10::irange(contents.size())) {
EXPECT_IVALUE_EQ(te[idx], contents[idx]);
150 changes: 32 additions & 118 deletions torch/csrc/jit/mobile/import.cpp
@@ -87,20 +87,6 @@ using caffe2::serialize::ReadAdapterInterface;

OpCode parseOpCode(const char* str);

IValue expect_field(
c10::ivalue::TupleElements& elements,
const std::string& expected_name,
size_t entry) {
auto row = std::move(elements.at(entry)).toTuple();
TORCH_INTERNAL_ASSERT(
row->elements().at(0).toStringRef() == expected_name,
"Expected ",
expected_name,
" found ",
row->elements().at(0).toStringRef());
return std::move(row->elements().at(1));
}

std::string operator_str(
const std::string& name,
const std::string& overloadname) {
@@ -237,22 +223,6 @@ class BytecodeDeserializer final {
IValue* schemaTable,
const int64_t& model_version,
mobile::Function* function);
/**
* Loads operators by looking them up in the Dispatcher and returns
* the set of operator names (with overload) that are not supported
* by the current runtime.
*
* Accepts an operator_cache, which allows you to cache operator
* functions for the entire model. This is keyed on
* c10::OperatorName. The value may not be what you're looking for
* even if the key is the same. You need to call has_same_arg_num()
* on the value to ensure that the number of arguments are the same.
*/
std::unordered_set<std::string> load_and_find_unsupported_operator_names(
c10::ivalue::TupleElements&& ops_list,
mobile::Function* function,
int64_t model_version,
mobile::Function::OperatorCacheType& operator_cache) const;
std::shared_ptr<CompilationUnit> compilation_unit_;
std::unordered_set<std::string> imported_libs_;
std::unique_ptr<PyTorchStreamReader> reader_{};
@@ -267,12 +237,22 @@ BytecodeDeserializer::BytecodeDeserializer(
reader_(std::move(reader)),
module_load_options_(module_load_options) {}

std::unordered_set<std::string> BytecodeDeserializer::
load_and_find_unsupported_operator_names(
c10::ivalue::TupleElements&& ops_list,
mobile::Function* function,
int64_t model_version,
mobile::Function::OperatorCacheType& operator_cache) const {
/**
* Loads operators by looking them up in the Dispatcher and returns
* the set of operator names (with overload) that are not supported
* by the current runtime.
*
* Accepts an operator_cache, which allows you to cache operator
* functions for the entire model. This is keyed on
* c10::OperatorName. The value may not be what you're looking for
* even if the key is the same. You need to call has_same_arg_num()
* on the value to ensure that the number of arguments are the same.
*/
std::unordered_set<std::string> load_and_find_unsupported_operator_names(
c10::ivalue::TupleElements&& ops_list,
mobile::Function* function,
int64_t model_version,
mobile::Function::OperatorCacheType& operator_cache) {
std::unordered_set<std::string> unsupported_op_names;
// ops_list is the list of operator names that were read in from
// bytecode.pkl for the method that is currently being processed.
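As a hedged, self-contained sketch of the caching contract described in the comment above (only the `c10::OperatorName`-style keying and the `has_same_arg_num()` check come from the source; every other name here is invented for illustration):

```cpp
#include <string>
#include <unordered_map>

// CachedOp stands in for the cached operator function; arg_num models
// why a key match alone is not enough to reuse an entry.
struct CachedOp {
  int arg_num;
  bool has_same_arg_num(int n) const { return arg_num == n; }
};

// Keyed on the operator name (the real cache is keyed on c10::OperatorName).
using OperatorCache = std::unordered_map<std::string, CachedOp>;

CachedOp resolve_via_dispatcher(const std::string& /*name*/, int num_args) {
  return CachedOp{num_args};  // placeholder for the real Dispatcher lookup
}

CachedOp lookup(OperatorCache& cache, const std::string& name, int num_args) {
  auto it = cache.find(name);
  // A hit is only usable if the cached entry expects the same number of
  // arguments as this call site; otherwise fall through and re-resolve.
  if (it != cache.end() && it->second.has_same_arg_num(num_args)) {
    return it->second;
  }
  CachedOp op = resolve_via_dispatcher(name, num_args);
  cache[name] = op;
  return op;
}
```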
@@ -315,7 +295,7 @@ void BytecodeDeserializer::parseFunctionSchema(
mobile::Function* function) {
// function schema
if (schemaTable) { // (schema is optional for back compat)
auto parseArgList = [this](std::vector<IValue>&& argTables) {
auto parseArgList = [this](c10::ivalue::TupleElements&& argTables) {
std::vector<c10::Argument> args;
for (auto&& argTable : std::move(argTables)) {
auto argTableElements =
@@ -341,14 +321,14 @@ };
};
auto schemaTableElements =
std::move(*std::move(*schemaTable).toTuple()).elements();
std::vector<IValue> arg_list =
auto arg_list =
std::move(*expect_field(
schemaTableElements,
"arguments",
BYTECODE_INDEX_SCHEMA_ARGUMENTS)
.toTuple())
.elements();
std::vector<IValue> ret_list =
auto ret_list =
std::move(
*expect_field(
schemaTableElements, "returns", BYTECODE_INDEX_SCHEMA_RETURNS)
@@ -366,14 +346,14 @@ }
}

void parseOperators(
const std::vector<IValue>& ops_list,
c10::ivalue::TupleElements&& ops_list,
const int64_t& model_version,
const uint64_t& module_load_options,
mobile::Function* function,
mobile::Function::OperatorCacheType& operator_cache) {
std::unordered_set<std::string> unsupported_op_names =
load_and_find_unsupported_operator_names(
ops_list, function, model_version, operator_cache);
std::move(ops_list), function, model_version, operator_cache);
if ((module_load_options & MobileModuleLoadOptions::OPERATOR_CHECK) &&
!unsupported_op_names.empty()) {
print_unsupported_ops_and_throw(unsupported_op_names);
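The switch from `const std::vector<IValue>&` to `c10::ivalue::TupleElements&&` in these signatures is what lets the callee move each element out rather than copy it; a minimal self-contained sketch of the pattern (simplified element type — the diff's `parseArgList` uses the same `for (auto&& x : ...)` style on `TupleElements`):

```cpp
#include <string>
#include <utility>
#include <vector>

// The callee takes the container by rvalue reference and consumes it.
void consume_rows(std::vector<std::string>&& rows) {
  for (auto&& row : rows) {
    std::string taken = std::move(row);  // move, not copy
    // ... parse `taken` ...
  }
}

int main() {
  std::vector<std::string> rows = {"aten::add", "aten::mul"};
  consume_rows(std::move(rows));  // caller explicitly gives up ownership
}
```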
@@ -455,96 +435,30 @@ void BytecodeDeserializer::parseMethods(
codeTableElements, "register_size", BYTECODE_INDEX_REGISTER_SIZE)
.toInt();

std::vector<IValue> debug_handles_m_tuple;
c10::ivalue::TupleElements debug_handles_m_tuple;
if (debug_handles) {
debug_handles_m_tuple =
std::move(*std::move((*debug_handles)[i]).toTuple()).elements();
}

OpCodeCache opCodeCache;
for (const auto j : c10::irange(ins_list.size())) {
// Can't remove this, need to keep Tuple alive!
auto ins_tuple = std::move(ins_list[j]).toTuple();
c10::ArrayRef<IValue> ins_item = ins_tuple->elements();
TORCH_CHECK(
ins_item.size() == 3,
"There should be three parts in an instruction. The function name is ",
function_name);
OpCode op_code = opCodeCache.parse(*ins_item[0].toString());
int X = ins_item[1].toInt();
int N = ins_item[2].toInt();
if (debug_handles) {
int64_t debug_handle = debug_handles_list[j];
function->append_instruction(op_code, X, N, debug_handle);
} else {
function->append_instruction(op_code, X, N);
}
}
parseInstructions(
function_name, ins_list, debug_handles_m_tuple, function.get());

std::unordered_set<std::string> unsupported_op_names =
load_and_find_unsupported_operator_names(
std::move(ops_list), function.get(), model_version, operator_cache);
if ((module_load_options_ & MobileModuleLoadOptions::OPERATOR_CHECK) &&
!unsupported_op_names.empty()) {
print_unsupported_ops_and_throw(unsupported_op_names);
}
parseOperators(
std::move(ops_list),
model_version,
module_load_options_,
function.get(),
operator_cache);

parseConstants(consts_list, function.get());

parseTypes(types_list, function.get());

function->set_register_size(register_size);

// function schema
if (schemaTable) { // (schema is optional for back compat)
auto parseArgList = [this](c10::ivalue::TupleElements&& argTables) {
std::vector<c10::Argument> args;
for (auto&& argTable : std::move(argTables)) {
auto argTableElements =
std::move(*std::move(argTable).toTuple()).elements();
auto name =
expect_field(
argTableElements, "name", BYTECODE_INDEX_ARGUMENT_NAME)
.toStringRef();
c10::TypePtr type = resolveTypeName(
(expect_field(
argTableElements, "type", BYTECODE_INDEX_ARGUMENT_TYPE))
.toStringRef());
IValue default_value = expect_field(
argTableElements,
"default_value",
BYTECODE_INDEX_ARGUMENT_DEFAULT_VALUE);
args.emplace_back(
name,
std::move(type),
c10::nullopt /*N*/,
std::move(default_value));
}
return args;
};
auto schemaTableElements =
std::move(*std::move(*schemaTable).toTuple()).elements();
auto arg_list = std::move(*expect_field(
schemaTableElements,
"arguments",
BYTECODE_INDEX_SCHEMA_ARGUMENTS)
.toTuple())
.elements();
auto ret_list = std::move(*expect_field(
schemaTableElements,
"returns",
BYTECODE_INDEX_SCHEMA_RETURNS)
.toTuple())
.elements();
c10::FunctionSchema schema(
function_name,
"" /*overload_name*/,
parseArgList(std::move(arg_list)),
parseArgList(std::move(ret_list)),
false /*is_varargs*/,
false /*is_varret*/);
function->setSchema(std::move(schema));
}
parseFunctionSchema(
function_name, schemaTable, model_version, function.get());

mcu.register_function(std::move(function));
}
18 changes: 9 additions & 9 deletions torch/csrc/jit/mobile/parse_bytecode.cpp
@@ -12,7 +12,7 @@ OpCode parseOpCode(const char* str);
using c10::IValue;

IValue expect_field(
std::vector<IValue>& elements,
c10::ivalue::TupleElements& elements,
const std::string& expected_name,
size_t entry) {
auto row = std::move(elements.at(entry)).toTuple();
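For orientation, `expect_field` walks a table whose entries are `("name", value)` rows; this self-contained sketch (simplified types, invented values) shows the contract it enforces — entry `i` must carry the expected name, and the value is moved out to the caller:

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Simplified stand-ins; the real code uses IValue and TupleElements.
using Row = std::pair<std::string, int>;  // ("name", value)
using Table = std::vector<Row>;

int expect_field_sketch(Table& elements,
                        const std::string& expected_name,
                        size_t entry) {
  Row& row = elements.at(entry);
  assert(row.first == expected_name && "row name mismatch");
  return std::move(row.second);  // hand the value back to the caller
}

int main() {
  Table schema = {{"arguments", 2}, {"returns", 1}};
  assert(expect_field_sketch(schema, "arguments", 0) == 2);
  assert(expect_field_sketch(schema, "returns", 1) == 1);
}
```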
@@ -68,8 +68,8 @@ class OpCodeCache {

void parseInstructions(
const std::string& function_name,
const std::vector<IValue>& ins_list,
std::vector<IValue>& debug_handles_m_tuple,
const c10::ivalue::TupleElements& ins_list,
c10::ivalue::TupleElements& debug_handles_m_tuple,
mobile::Function* function) {
c10::List<int64_t> debug_handles_list;
if (!debug_handles_m_tuple.empty()) {
@@ -79,10 +79,10 @@
debug_info_function_name == function_name,
"The function names in the bytecode table and the debug info table do not match.");
IValue& debug_handles_table = debug_handles_m_tuple[1];
auto debugHandlesElements = std::move(*std::move(debug_handles_table).toTuple()).elements();
auto debugHandlesTableElements = std::move(*std::move(debug_handles_table).toTuple()).elements();
debug_handles_list =
(expect_field(
debugHandlesElements,
debugHandlesTableElements,
"function_debug_handles",
BYTECODE_INDEX_MODULE_DEBUG_HANDLES)
.toTuple()
@@ -99,8 +99,8 @@
// becomes an important use case.
OpCodeCache opCodeCache;
for (const auto j : c10::irange(ins_list.size())) {
std::vector<IValue> ins_item =
std::move(*std::move(ins_list[j]).toTuple()).elements();
auto ins_tuple = std::move(ins_list[j]).toTuple();
c10::ArrayRef<IValue> ins_item = ins_tuple->elements();
TORCH_CHECK(
ins_item.size() == 3,
"There should be three parts in an instruction. The function name is ",
@@ -118,15 +118,15 @@ }
}

void parseConstants(
const std::vector<IValue>& consts_list,
const c10::ivalue::TupleElements& consts_list,
mobile::Function* function) {
for (const auto& constant : consts_list) {
function->append_constant(constant);
}
}

void parseTypes(
const std::vector<IValue>& types_list,
const c10::ivalue::TupleElements& types_list,
mobile::Function* function) {
static const c10::QualifiedName classPrefix = "__torch__.torch.classes";
for (const auto& t : types_list) {
8 changes: 4 additions & 4 deletions torch/csrc/jit/mobile/parse_bytecode.h
@@ -7,14 +7,14 @@ namespace mobile {
using c10::IValue;
TORCH_API void parseInstructions(
const std::string& function_name,
const std::vector<IValue>& ins_list,
std::vector<IValue>& debug_handles_m_tuple,
const c10::ivalue::TupleElements& ins_list,
c10::ivalue::TupleElements& debug_handles_m_tuple,
mobile::Function* function);
TORCH_API void parseConstants(
const std::vector<IValue>& consts_list,
const c10::ivalue::TupleElements& consts_list,
mobile::Function* function);
TORCH_API void parseTypes(
const std::vector<IValue>& types_list,
const c10::ivalue::TupleElements& types_list,
mobile::Function* function);
TORCH_API void parseRegisterSize(size_t rsize, mobile::Function* function);
} // namespace mobile
2 changes: 1 addition & 1 deletion torch/csrc/jit/serialization/import_export_functions.h
@@ -6,7 +6,7 @@ namespace torch {
namespace jit {
using c10::IValue;
IValue expect_field(
std::vector<IValue>& elements,
c10::ivalue::TupleElements& elements,
const std::string& expected_name,
size_t entry);
std::string operator_str(
You are viewing a condensed version of this merge commit.