Bixia Zheng (bixia1), Google, California

Repositories:
- tensorflow/tensorrt (355): TensorFlow/TensorRT integration
- bixia1/tensorflow (0): An Open Source Machine Learning Framework for Everyone

Pull request review comment tensorflow/tensorflow

TensorRT profile generation mode

 def build(self, input_fn):
       raise RuntimeError("input_fn is None. Method build() needs input_fn "
                          "to be specified in order to build TensorRT engines")
+    def _rebuild_func():
+      # Rebuild function from graph_def.
+      reset_converted_func = wrap_function.function_from_graph_def(
+          self._converted_graph_def,
+          [tensor.name for tensor in self._converted_func.inputs],
+          [tensor.name for tensor in self._converted_func.outputs])
+      reset_converted_func.graph.structured_outputs = nest.pack_sequence_as(
+          self._converted_func.graph.structured_outputs,
+          reset_converted_func.graph.structured_outputs)

This code looks the same as the end of the "convert" function to me. Can we make this function return reset_converted_func so that "convert" can reuse it? The variable would need to be renamed in that case.

tfeher

comment created time in 11 hours

Pull request review comment tensorflow/tensorflow

TensorRT profile generation mode

 def build(self, input_fn):
       raise RuntimeError("input_fn is None. Method build() needs input_fn "
                          "to be specified in order to build TensorRT engines")
+    def _rebuild_func():
+      # Rebuild function from graph_def.
+      reset_converted_func = wrap_function.function_from_graph_def(
+          self._converted_graph_def,
+          [tensor.name for tensor in self._converted_func.inputs],
+          [tensor.name for tensor in self._converted_func.outputs])
+      reset_converted_func.graph.structured_outputs = nest.pack_sequence_as(
+          self._converted_func.graph.structured_outputs,
+          reset_converted_func.graph.structured_outputs)
+      self._converted_func = reset_converted_func
+
+    def _set_profile_generation_mode(value, node):
+      node.attr["_profile_generation_mode"].b = value
+
+    if self._need_trt_profiles:
+      # Enable profile generation.
+      self._for_each_trt_node(self._converted_graph_def,
+                              partial(_set_profile_generation_mode, True))
+      # Profile generation is enabled using the _profile_generation_mode
+      # attribute of the TRTEngineOps. We need to rebuild the function to
+      # change this attribute.
+      _rebuild_func()
+
+    # Use the first input in explicit batch mode to build TensorRT engines
+    # after generating all the profiles. The first input is used but any of
+    # the inputs can be used because the shape of this input does not
+    # determine the engine and instead the shapes collected in profiles
+    # determine the engine.
+    first_input = None
+    # Run inference:
+    #   Builds TRT engines if self._need_trt_profiles is False.
+    #   Builds TRT optimization profiles if self._need_trt_profiles is True.
     for inp in input_fn():
+      if not first_input:
+        first_input = inp
       self._converted_func(*map(ops.convert_to_tensor, inp))

I don't understand why we need to run the for loop with _converted_func when self._need_trt_profiles == True. Can you explain?

tfeher

comment created time in 11 hours

Pull request review comment tensorflow/tensorflow

TensorRT profile generation mode

 def __init__(self,
       raise ValueError("INT8 precision mode with calibration is not supported "
                        "with static TensorRT ops. Set is_dynamic_op to True.")
+    # rewriter_config is already validated
+    self._need_trt_profiles = None
+    for optimizer in self._rewriter_config.custom_optimizers:
+      if optimizer.name == "TensorRTOptimizer":
+        self._need_trt_profiles = not optimizer.parameter_map[
+            "use_implicit_batch"].b \
+            if "use_implicit_batch" in optimizer.parameter_map else False

Can we write this as a function of the converter class, converter.NeedTrtProfile(config), so that _NeedToBuild in the test can also use it?

tfeher

comment created time in 11 hours

Pull request review comment tensorflow/tensorflow

TensorRT profile generation mode

 def build(self, input_fn):
       raise RuntimeError("input_fn is None. Method build() needs input_fn "
                          "to be specified in order to build TensorRT engines")
+    def _rebuild_func():
+      # Rebuild function from graph_def.
+      reset_converted_func = wrap_function.function_from_graph_def(
+          self._converted_graph_def,
+          [tensor.name for tensor in self._converted_func.inputs],
+          [tensor.name for tensor in self._converted_func.outputs])
+      reset_converted_func.graph.structured_outputs = nest.pack_sequence_as(
+          self._converted_func.graph.structured_outputs,
+          reset_converted_func.graph.structured_outputs)
+      self._converted_func = reset_converted_func
+
+    def _set_profile_generation_mode(value, node):
+      node.attr["_profile_generation_mode"].b = value
+
+    if self._need_trt_profiles:
+      # Enable profile generation.
+      self._for_each_trt_node(self._converted_graph_def,
+                              partial(_set_profile_generation_mode, True))
+      # Profile generation is enabled using the _profile_generation_mode
+      # attribute of the TRTEngineOps. We need to rebuild the function to
+      # change this attribute.
+      _rebuild_func()
+
+    # Use the first input in explicit batch mode to build TensorRT engines
+    # after generating all the profiles. The first input is used but any of
+    # the inputs can be used because the shape of this input does not
+    # determine the engine and instead the shapes collected in profiles
+    # determine the engine.

This comment describes the if self._need_trt_profiles: block below. Shall we move it closer to that block?

tfeher

comment created time in 11 hours

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
   }
 
   const bool kRetry = true;
-  if (trt_context_idx >= 1) {
+  if (trt_context_idx >= engine_context->execution_context.size()) {
     LOG(ERROR) << "Requested engine context with index " << trt_context_idx
-               << ", but only 1 context is present.";
+               << ", but only " << engine_context->execution_context.size()
+               << "contexts are present.";

Can we combine this check and the engine_context->execution_context[trt_context_idx] access into a member function of engine_context?
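One possible shape for such a helper, just as a sketch (the method name and the use of Status here are my own suggestions, not existing TF-TRT API):

struct EngineContext {
  // ... existing constructors and members ...
  mutex mu;
  std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>> execution_context
      GUARDED_BY(mu);

  // Bundles the bounds check with the lookup so callers cannot forget either.
  Status GetExecutionContext(int idx, nvinfer1::IExecutionContext** exec_ctx)
      EXCLUSIVE_LOCKS_REQUIRED(mu) {
    if (idx >= execution_context.size()) {
      return errors::Internal("Requested engine context with index ", idx,
                              ", but only ", execution_context.size(),
                              " contexts are present.");
    }
    *exec_ctx = execution_context[idx].get();
    return Status::OK();
  }
};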

tfeher

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 string GetLinkedTensorRTVersion();
 
 // TensorRT library version information {Maj, Min, Patch}.
 string GetLoadedTensorRTVersion();
 
+// Return true if an engine built for cached_shapes can also run actual_shapes.

Returns

tfeher

comment created time in 6 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 class TRTEngineOp : public AsyncOpKernel {
   Status GetEngineCacheResource(OpKernelContext* ctx,
                                 TRTEngineCacheResource** cache_res);
 
-  // Get engine for the input shape
-  StatusOr<EngineContext*> GetEngine(
-      const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx,
-      TRTEngineCacheResource* cache_res);
+  // Return a pair of 1) An EngineContext object that is compatible with the

Returns

tfeher

comment created time in 6 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 struct EngineContext {
   EngineContext(
       TrtUniquePtrType<nvinfer1::ICudaEngine>&& input_cuda_engine,
       TrtUniquePtrType<nvinfer1::IExecutionContext>&& input_execution_context)
+      : cuda_engine(std::move(input_cuda_engine)) {
+    execution_context.push_back(std::move(input_execution_context));
+  }
+  EngineContext(TrtUniquePtrType<nvinfer1::ICudaEngine>&& input_cuda_engine,
+                std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>>&&
+                    input_execution_context)
       : cuda_engine(std::move(input_cuda_engine)),
         execution_context(std::move(input_execution_context)) {}
 
   mutex mu;
   TrtUniquePtrType<nvinfer1::ICudaEngine> cuda_engine;
-  TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context
+
+  // In explicit batch mode, we maintain a vector of contexts for each engine,
+  // where each context is created for a different profile.
+  std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>> execution_context

I understand that this is not new in this PR, but do you know why we guard this field with a mutex? Somewhere in the routine TRTEngineOp::ExecuteTrtEngine there is this comment: "nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex". Can we document the reason for GUARDED_BY(mu) here?
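One possible wording for that documentation, assuming the mutex exists solely to serialize use of the execution contexts (worth confirming with the original authors):

  mutex mu;
  // IExecutionContext is not thread safe: only one thread at a time may call
  // enqueue() on a given context, so mu serializes both access to this vector
  // and the enqueue() calls made through its elements.
  std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>> execution_context
      GUARDED_BY(mu);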

tfeher

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 StatusOr<EngineContext*> TRTEngineOp::GetEngine(
     // Swap with temporary empty string to deallocate the CPU memory.
     serialized_segment_.swap(tmp);
     if (use_implicit_batch_ && (max_batch_size < batch_size)) {
-      return &empty_context;
+      return std::pair<EngineContext*, int>(&empty_context, 0);
     }
-    return cache.at(engine_input_shapes).get();
+    return std::pair<EngineContext*, int>(cache.at(engine_input_shapes).get(),
+                                          0);
   }  // static_engine_
 
-  // Handle the dynamic engine case. See if there is a compatible engine cached.
-  std::vector<TensorShape> engine_input_shapes;
-  TF_RETURN_IF_ERROR(
-      GetEngineInputShapes(cache, input_concrete_shapes, &engine_input_shapes));
+  int profile_id = -1;
+  if (!use_implicit_batch_) {
+    profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes);
+    // Since all profiles are already created at this point,
+    // finding no compatible profiles results in falling back

Please avoid partially filled comment lines; rewrap the comment so each line is filled.

tfeher

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 StatusOr<EngineContext*> TRTEngineOp::GetEngine(
     // Swap with temporary empty string to deallocate the CPU memory.
     serialized_segment_.swap(tmp);
     if (use_implicit_batch_ && (max_batch_size < batch_size)) {
-      return &empty_context;
+      return std::pair<EngineContext*, int>(&empty_context, 0);
     }
-    return cache.at(engine_input_shapes).get();
+    return std::pair<EngineContext*, int>(cache.at(engine_input_shapes).get(),
+                                          0);
   }  // static_engine_
 
-  // Handle the dynamic engine case. See if there is a compatible engine cached.
-  std::vector<TensorShape> engine_input_shapes;
-  TF_RETURN_IF_ERROR(
-      GetEngineInputShapes(cache, input_concrete_shapes, &engine_input_shapes));
+  int profile_id = -1;
+  if (!use_implicit_batch_) {
+    profile_id = cache_res->profiles_.GetProfileNumber(input_concrete_shapes);
+    // Since all profiles are already created at this point,
+    // finding no compatible profiles results in falling back
+    // to native TF.
+    if (profile_id == -1) {
+      return std::pair<EngineContext*, int>(&empty_context, 0);
+    }
+  }
+
+  EngineContext* engine_contexts;
+  if (use_implicit_batch_) {
+    engine_contexts = cache_res->GetEngineContext(input_concrete_shapes);
+  } else {
+    engine_contexts = cache_res->GetEngineContext(profile_id);
+  }
 
-  // If matched, use that engine. Otherwise, we will look in cache for that
-  // exact shape and possibly create a new engine if it is not in cache.
-  if (!cache.count(engine_input_shapes)) {
+  // If cache does not have a compatible engine
+  // then create a new engine.

Please avoid partially filled comment lines; rewrap the comment so each line is filled.

tfeher

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
   }
 
   const bool kRetry = true;
-  if (trt_context_idx >= 1) {
+  if (trt_context_idx >= engine_context->execution_context.size()) {
     LOG(ERROR) << "Requested engine context with index " << trt_context_idx
-               << ", but only 1 context is present.";
+               << ", but only " << engine_context->execution_context.size()
+               << "contexts are present.";
     return kRetry;
   }
   const int num_binding = cuda_engine->getNbBindings();
   std::vector<void*> buffers(num_binding);
 
   mutex_lock lock(engine_context->mu);
-  auto& execution_context = engine_context->execution_context;
+  auto& execution_context = engine_context->execution_context[trt_context_idx];

engine_context->execution_context is guarded by a mutex, so we need to move the lock above the access to engine_context->execution_context.size().
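A sketch of the suggested reordering (take the lock first, then read the guarded vector; the added space in the log message is incidental):

  mutex_lock lock(engine_context->mu);
  if (trt_context_idx >= engine_context->execution_context.size()) {
    LOG(ERROR) << "Requested engine context with index " << trt_context_idx
               << ", but only " << engine_context->execution_context.size()
               << " contexts are present.";
    return kRetry;
  }
  const int num_binding = cuda_engine->getNbBindings();
  std::vector<void*> buffers(num_binding);
  auto& execution_context = engine_context->execution_context[trt_context_idx];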

tfeher

comment created time in 5 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 string DebugString(const std::vector<PartialTensorShape>& shapes) {
   return PartialTensorShapeUtils::PartialShapeListString(shapes);
 }
 
+bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes,
+                         const std::vector<TensorShape>& cached_shapes) {
+  auto match_shape = [](const TensorShape& actual_shape,
+                        const TensorShape& cached_shape) {
+    // Match the rank.
+    if (actual_shape.dims() != cached_shape.dims()) return false;
+    // Match the batch size.

Can you add a comment to document why it is OK for cached_shape.dim_size(0) to be greater than actual_shape.dim_size(0)?
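A possible wording, assuming the usual implicit-batch semantics where an engine built for a larger batch size can also execute smaller batches:

    // Match the batch size. An engine built for cached_shape's (larger) batch
    // dimension can still execute an input with a smaller batch size, so
    // cached_shape.dim_size(0) >= actual_shape.dim_size(0) is acceptable.
    if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false;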

tfeher

comment created time in 6 days

Pull request review comment tensorflow/tensorflow

Execution context management for TensorRT profiles

 string DebugString(const std::vector<PartialTensorShape>& shapes) {
   return PartialTensorShapeUtils::PartialShapeListString(shapes);
 }
 
+bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes,
+                         const std::vector<TensorShape>& cached_shapes) {
+  auto match_shape = [](const TensorShape& actual_shape,
+                        const TensorShape& cached_shape) {
+    // Match the rank.
+    if (actual_shape.dims() != cached_shape.dims()) return false;
+    // Match the batch size.
+    if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false;
+    // Match remaining dimensions.
+    for (int i = 1; i < actual_shape.dims(); ++i) {
+      if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false;
+    }
+    return true;
+  };
+  for (int i = 0; i < actual_shapes.size(); ++i) {
+    if (!match_shape(actual_shapes[i], cached_shapes[i])) {

In fact, all shapes in actual_shapes have the same batch size, and similarly for cached_shapes, right? The check here doesn't enforce this and can be misleading. Is there anything we can do to address this?
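One option, purely as a sketch (using DCHECK_EQ from tensorflow/core/platform/logging.h), is to make the assumption explicit so it is checked in debug builds rather than relied on silently:

bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes,
                         const std::vector<TensorShape>& cached_shapes) {
  auto match_shape = [](const TensorShape& actual_shape,
                        const TensorShape& cached_shape) {
    // Match the rank.
    if (actual_shape.dims() != cached_shape.dims()) return false;
    // Match the batch size.
    if (actual_shape.dim_size(0) > cached_shape.dim_size(0)) return false;
    // Match remaining dimensions.
    for (int i = 1; i < actual_shape.dims(); ++i) {
      if (actual_shape.dim_size(i) != cached_shape.dim_size(i)) return false;
    }
    return true;
  };
  for (int i = 0; i < actual_shapes.size(); ++i) {
    // All inputs of a single inference call share one batch size; make that
    // assumption visible instead of relying on it implicitly.
    DCHECK_EQ(actual_shapes[i].dim_size(0), actual_shapes[0].dim_size(0));
    DCHECK_EQ(cached_shapes[i].dim_size(0), cached_shapes[0].dim_size(0));
    if (!match_shape(actual_shapes[i], cached_shapes[i])) return false;
  }
  return true;
}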

tfeher

comment created time in 6 days

issue comment tensorflow/tensorflow

[TF2] TRT Engine Ops are not garbage collected, resulting in incorrect reuse

I am not sure that I understand this. While I see the TRT engine being created, I don't see the cache being accessed. That is, I don't see anything related to the message "trt_convert.py:1074] Could not find TRTEngineOp_0 in TF-TRT cache. This can happen if build() is not called, which means TensorRT engines will be built and cached at runtime" from the initial bug description.

MattConley

comment created time in 6 days

issue comment tensorflow/tensorflow

[TF2] TRT Engine Ops are not garbage collected, resulting in incorrect reuse

I ran the test without the forced garbage collection in tf_trt_integration_test_base.py, and did see TRT engines created for both FP16 and FP32. So the problem is gone in the current tree. My commands were similar to the following:

$ blaze run -c opt tensorflow/python/compiler/tensorrt:gpu_dynamic_input_shapes_test --define use_experimental_tensorrt=1 --config=cuda --test_env=TF_CPP_VMODULE=convert_nodes=2,trt_engine_op=1 --test_arg=--logtostderr >& t.log
$ grep -e "trt_engine_op" -e "DynamicInputShapesTest" t.log
[ RUN ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_FP16_NoCalibration
I0219 15:36:59.883159 143904 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,5,5,1]]
I0219 15:37:00.669233 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[10,5,5,1]]
I0219 15:37:01.309641 143903 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,3,1,1]]
I0219 15:37:01.961832 143903 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[2,9,9,1]]
I0219 15:37:02.754980 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,224,224,1]]
I0219 15:37:03.625179 143902 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,128,224,1]]
I0219 15:37:04.565040 143903 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,5,5,1]]
I0219 15:37:05.345535 143904 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[10,5,5,1]]
I0219 15:37:06.050820 143903 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,3,1,1]]
I0219 15:37:06.776347 143903 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[2,9,9,1]]
I0219 15:37:07.644002 143902 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,224,224,1]]
I0219 15:37:08.518280 143902 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,128,224,1]]
[ OK ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_FP16_NoCalibration
[ RUN ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_FP32_NoCalibration
I0219 15:37:09.430716 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,5,5,1]]
I0219 15:37:09.614939 143904 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[10,5,5,1]]
I0219 15:37:09.722950 143904 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,3,1,1]]
I0219 15:37:09.859450 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[2,9,9,1]]
I0219 15:37:10.052555 143903 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,224,224,1]]
I0219 15:37:10.234415 143902 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,128,224,1]]
I0219 15:37:10.486743 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,5,5,1]]
I0219 15:37:10.672606 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[10,5,5,1]]
I0219 15:37:10.777662 143902 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,3,1,1]]
I0219 15:37:10.913178 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[2,9,9,1]]
I0219 15:37:11.098235 143902 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,224,224,1]]
I0219 15:37:11.295327 143905 trt_engine_op.cc:984] Building a new TensorRT engine for TRTEngineOp_0 with input shapes: [[1,128,224,1]]
[ OK ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_FP32_NoCalibration
[ RUN ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_INT8_NoCalibration
[ OK ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_INT8_NoCalibration
[ RUN ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_INT8_UseCalibration
[ OK ] DynamicInputShapesTest.testTfTrt_OfflineConversion_DynamicEngine_INT8_UseCalibration
[ RUN ] DynamicInputShapesTest.testTfTrt_OfflineConversion_StaticEngine_FP16_NoCalibration
[ OK ] DynamicInputShapesTest.testTfTrt_OfflineConversion_StaticEngine_FP16_NoCalibration
[ RUN ] DynamicInputShapesTest.testTfTrt_OfflineConversion_StaticEngine_FP32_NoCalibration
[ OK ] DynamicInputShapesTest.testTfTrt_OfflineConversion_StaticEngine_FP32_NoCalibration
[ RUN ] DynamicInputShapesTest.testTfTrt_OfflineConversion_StaticEngine_INT8_NoCalibration
[ OK ] DynamicInputShapesTest.testTfTrt_OfflineConversion_StaticEngine_INT8_NoCalibration

MattConley

comment created time in 6 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

 string GetLoadedTensorRTVersion() {
   return absl::StrCat(major, ".", minor, ".", patch);
 }
 
+int GetNumberOfEngineInputs(

Why is this routine outside #if GOOGLE_CUDA && GOOGLE_TENSORRT? I am getting this compilation error when building the target tensorflow/compiler/tf2tensorrt:trt_shape_optimization_profiles_test without --config=cuda:

tensorflow/compiler/tf2tensorrt/convert/utils.cc:168:35: error: use of undeclared identifier 'nvinfer1'
int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) {
                                  ^
tensorflow/compiler/tf2tensorrt/convert/utils.cc:179:5: error: function-like macro 'IS_TRT_VERSION_GE' is not defined
#if IS_TRT_VERSION_GE(6, 0, 0, 0)
    ^
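One way to fix the build is to move the definition inside the existing guard. A simplified sketch, only to show where the #if goes (the body below just counts input bindings and ignores multiple optimization profiles, so it is not the exact implementation):

#if GOOGLE_CUDA && GOOGLE_TENSORRT
int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) {
  // Count the bindings that are inputs of the engine.
  int n_inputs = 0;
  for (int i = 0; i < engine->getNbBindings(); i++) {
    if (engine->bindingIsInput(i)) n_inputs++;
  }
  return n_inputs;
}
#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT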

tfeher

comment created time in 8 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"+#include <algorithm>+#include <functional>+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"++namespace tensorflow {+namespace tensorrt {++// Creates optimization profiles for a list of input shapes. The list of input+// shapes are stored in shapes_.+void TrtShapeOptimizationProfile::InitProfiles() {+  if (input_shapes_.size() == 0) {+    VLOG(1) << "Not creating profiles without input_shapes. "+               "You have to enable profile generation mode first (build).";+  } else {+    VLOG(1) << "Creating profiles with startegy of one profile "+            << "for each input (min=opt=max).";+  }+  for (auto& shape_vec : input_shapes_) {+    std::vector<nvinfer1::Dims> dimvec;+    for (auto& shape : shape_vec) {+      dimvec.push_back(TensorShapeToTrtDims(shape, false));+    }+    // We set min=opt=max.+    OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec};+    profiles_.push_back(std::move(profConfig));+    VLOG(1) << "Created profile " << profiles_.back().DebugString();+  }+}++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+Status TrtShapeOptimizationProfile::AddProfiles(+    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,+    const nvinfer1::INetworkDefinition* network) {+  // Create a vector of optimization profiles+  for (int i = 0; i < profiles_.size(); i++) {+    auto* optProfile = builder->createOptimizationProfile();+    Status status = profiles_[i].SetDimensions(network, optProfile);+    if (!status.ok()) {+      return status;+    }+    int idx = -1;+    if (optProfile->isValid()) {+      idx = config->addOptimizationProfile(optProfile);+    }+    if (idx >= 0) {+      if (i != idx) {+        return errors::Internal(+            "Profile index of engine config is different from resource profile "+            "index: ",+            i, " != ", idx);+      }+      VLOG(1) << "Added optimization profile " << profiles_[i].DebugString()+              << " to builder config.";+    } else {+      VLOG(ERROR) << "Failed to add optimization profile "

Does VLOG(ERROR) even compile? It should be LOG(ERROR) if that is what you want.

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include <string.h>++#include <vector>++#include "absl/memory/memory.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"+#include "tensorflow/core/framework/tensor.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/framework/types.h"+#include "tensorflow/core/platform/test.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++std::vector<TensorShape> DimVecToShapeVec(std::vector<nvinfer1::Dims3> dimvec) {+  std::vector<TensorShape> shapevec(dimvec.size());+  for (int i = 0; i < dimvec.size(); i++) {+    TensorShape shape;+    TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape);+    shapevec[i] = shape;+  }+  return shapevec;+}++bool DimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min,+                   const nvinfer1::Dims& max) {+  if (dim.nbDims != min.nbDims || dim.nbDims != max.nbDims) {+    return false;+  }+  for (int i = 0; i < dim.nbDims; i++) {+    if (dim.d[i] < min.d[i] || dim.d[i] > max.d[i]) {+      return false;+    }+  }+  return true;+}++bool DimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) {+  if (a.nbDims != b.nbDims) {+    return false;+  }+  for (int i = 0; i < a.nbDims; i++) {+    if (a.d[i] != b.d[i]) {+      return false;+    }+  }+  return true;+}++class TrtShapeOptimizationProfileTest : public ::testing::Test {+ protected:+  void SetUp() override {+    builder_ = TrtUniquePtrType<nvinfer1::IBuilder>(+        nvinfer1::createInferBuilder(logger_));+#if IS_TRT_VERSION_GE(6, 0, 0, 0)+    network_ = TrtUniquePtrType<nvinfer1::INetworkDefinition>(+        builder_->createNetworkV2(flags_));+    builder_config_ = TrtUniquePtrType<nvinfer1::IBuilderConfig>(+        builder_->createBuilderConfig());+    builder_config_->setMaxWorkspaceSize(1 << 10);+#else+    network_ = TrtUniquePtrType<nvinfer1::INetworkDefinition>(+        builder_->createNetwork());+    builder_->setMaxWorkspaceSize(1 << 10);+#endif+  }++  // Defines a simple network: output = input1 + input2.+  void DefineNetwork(nvinfer1::INetworkDefinition* network,+                     nvinfer1::Dims3& dims) {+    nvinfer1::ITensor* input1 =+        network->addInput("input1", nvinfer1::DataType::kFLOAT, dims);+    EXPECT_NE(nullptr, input1);++    nvinfer1::ITensor* input2 =+        network->addInput("input2", nvinfer1::DataType::kFLOAT, dims);+    EXPECT_NE(nullptr, input1);++    auto layer = network->addElementWise(*input1, *input2,+                                         nvinfer1::ElementWiseOperation::kSUM);+    EXPECT_NE(nullptr, layer);+    // Mark the output.+    nvinfer1::ITensor* output = layer->getOutput(0);+    output->setName("output");+    network->markOutput(*output);+  }++  Logger logger_;+  
TrtUniquePtrType<nvinfer1::IBuilder> builder_;+  TrtUniquePtrType<nvinfer1::INetworkDefinition> network_;+#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  TrtUniquePtrType<nvinfer1::IBuilderConfig> builder_config_;+#endif+  TrtUniquePtrType<nvinfer1::ICudaEngine> engine;+  std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>> exec_context_;+  // The order is important: exec_context_ must be destroyed first, and logger+  // at last.++  const uint32_t flags_ =+      1U << static_cast<int>(+          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

This doesn't work for TRT 5: NetworkDefinitionCreationFlag::kEXPLICIT_BATCH only exists in TRT 6 and later.
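A minimal sketch of one way to handle this, guarding the flag behind the existing TRT version check:

#if IS_TRT_VERSION_GE(6, 0, 0, 0)
  // kEXPLICIT_BATCH is needed to build networks with dynamic shapes.
  const uint32_t flags_ =
      1U << static_cast<int>(
          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
#endif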

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_

TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_

The naming convention here is path name + file name: TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_
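Following that convention, the guard for tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h would be:

#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_
#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_
// ...
#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_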

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"+#include <algorithm>+#include <functional>+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"++namespace tensorflow {+namespace tensorrt {++// Creates optimization profiles for a list of input shapes. The list of input+// shapes are stored in shapes_.+void TrtShapeOptimizationProfile::InitProfiles() {+  if (input_shapes_.size() == 0) {+    VLOG(1) << "Not creating profiles without input_shapes. "+               "You have to enable profile generation mode first (build).";+  } else {+    VLOG(1) << "Creating profiles with startegy of one profile "+            << "for each input (min=opt=max).";+  }+  for (auto& shape_vec : input_shapes_) {+    std::vector<nvinfer1::Dims> dimvec;+    for (auto& shape : shape_vec) {+      dimvec.push_back(TensorShapeToTrtDims(shape, false));+    }+    // We set min=opt=max.+    OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec};+    profiles_.push_back(std::move(profConfig));+    VLOG(1) << "Created profile " << profiles_.back().DebugString();+  }+}++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+Status TrtShapeOptimizationProfile::AddProfiles(+    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,+    const nvinfer1::INetworkDefinition* network) {+  // Create a vector of optimization profiles+  for (int i = 0; i < profiles_.size(); i++) {+    auto* optProfile = builder->createOptimizationProfile();+    Status status = profiles_[i].SetDimensions(network, optProfile);+    if (!status.ok()) {+      return status;+    }+    int idx = -1;+    if (optProfile->isValid()) {+      idx = config->addOptimizationProfile(optProfile);+    }+    if (idx >= 0) {+      if (i != idx) {+        return errors::Internal(+            "Profile index of engine config is different from resource profile "+            "index: ",+            i, " != ", idx);+      }+      VLOG(1) << "Added optimization profile " << profiles_[i].DebugString()+              << " to builder config.";+    } else {+      VLOG(ERROR) << "Failed to add optimization profile "+                  << profiles_[i].DebugString()+                  << ". 
This usually happens when profile is invalid.";+    }+  }+  if (config->getNbOptimizationProfiles() == 0) {+    return errors::Internal("Failure in adding an optimization profile.");+  }+  // if TRT_VERSION < 6, then we do not need to add+  return Status::OK();+}+#endif++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+Status TrtShapeOptimizationProfile::ConfigureBuilder(+    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,+    const nvinfer1::INetworkDefinition* network) {+  AddProfiles(builder, config, network);+  return Status::OK();+}+#endif++int TrtShapeOptimizationProfile::GetProfileNumber(+    std::vector<TensorShape> shapes) {+  for (int i = 0; i < profiles_.size(); i++) {+    if (profiles_[i].IncludesShapes(shapes)) {+      return i;+    }+  }+  VLOG(1) << "Profile not found for input shapes " << DebugString(shapes)+          << ".";+  return -1;+}++Status TrtShapeOptimizationProfile::CreateExecutionContexts(+    nvinfer1::ICudaEngine* engine,+    std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>>& exec_context) {+  int i = 0;+  // The following loop runs once if we have static shapes, to create a single+  // execution context without profiles. In dynamic mode we create one context+  // for each profile and set the corresponding optimization profile.+  do {+    VLOG(1) << "Creating execution context " << i;+    nvinfer1::IExecutionContext* ctx = engine->createExecutionContext();+    if (ctx == nullptr) {+      return errors::Internal("Failed to create execution context");+    }+    if (i > 0) {+      // This condition is needed for two reasons:+      // - using static shapes we do not have any profiles so we cannot call+      //   set optimizationprofiles.+      // - The 0th profile is set implicitly for the first execution context+      //   therefore we do not need to set.+#if IS_TRT_VERSION_GE(6, 0, 0, 0)+      bool stat = ctx->setOptimizationProfile(i);+      if (!stat) {+        ctx->destroy();+        return errors::Internal("Could not set TRT optimization profile.");+      }+#endif+    }+    exec_context.push_back(+        std::move(TrtUniquePtrType<nvinfer1::IExecutionContext>(ctx)));+    i++;+  } while (i < profiles_.size());++  return Status::OK();+}++Status TrtShapeOptimizationProfile::RestoreProfiles(+    const nvinfer1::ICudaEngine* engine) {+#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  if (!engine || engine->hasImplicitBatchDimension()) {

I see this compilation error: no member named 'hasImplicitBatchDimension' in 'nvinfer1::ICudaEngine'.

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include <string.h>++#include <vector>++#include "absl/memory/memory.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"+#include "tensorflow/core/framework/tensor.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/framework/types.h"+#include "tensorflow/core/platform/test.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++std::vector<TensorShape> DimVecToShapeVec(std::vector<nvinfer1::Dims3> dimvec) {+  std::vector<TensorShape> shapevec(dimvec.size());+  for (int i = 0; i < dimvec.size(); i++) {+    TensorShape shape;+    TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape);

This returns a Status; wrap it in TF_CHECK_OK(...).

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include <string.h>++#include <vector>++#include "absl/memory/memory.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"+#include "tensorflow/core/framework/tensor.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/framework/types.h"+#include "tensorflow/core/platform/test.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++std::vector<TensorShape> DimVecToShapeVec(std::vector<nvinfer1::Dims3> dimvec) {+  std::vector<TensorShape> shapevec(dimvec.size());+  for (int i = 0; i < dimvec.size(); i++) {+    TensorShape shape;+    TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape);+    shapevec[i] = shape;+  }+  return shapevec;+}++bool DimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min,+                   const nvinfer1::Dims& max) {+  if (dim.nbDims != min.nbDims || dim.nbDims != max.nbDims) {+    return false;+  }+  for (int i = 0; i < dim.nbDims; i++) {+    if (dim.d[i] < min.d[i] || dim.d[i] > max.d[i]) {+      return false;+    }+  }+  return true;+}++bool DimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) {+  if (a.nbDims != b.nbDims) {+    return false;+  }+  for (int i = 0; i < a.nbDims; i++) {+    if (a.d[i] != b.d[i]) {+      return false;+    }+  }+  return true;+}++class TrtShapeOptimizationProfileTest : public ::testing::Test {+ protected:+  void SetUp() override {+    builder_ = TrtUniquePtrType<nvinfer1::IBuilder>(+        nvinfer1::createInferBuilder(logger_));+#if IS_TRT_VERSION_GE(6, 0, 0, 0)+    network_ = TrtUniquePtrType<nvinfer1::INetworkDefinition>(+        builder_->createNetworkV2(flags_));+    builder_config_ = TrtUniquePtrType<nvinfer1::IBuilderConfig>(+        builder_->createBuilderConfig());+    builder_config_->setMaxWorkspaceSize(1 << 10);+#else+    network_ = TrtUniquePtrType<nvinfer1::INetworkDefinition>(+        builder_->createNetwork());+    builder_->setMaxWorkspaceSize(1 << 10);+#endif+  }++  // Defines a simple network: output = input1 + input2.+  void DefineNetwork(nvinfer1::INetworkDefinition* network,+                     nvinfer1::Dims3& dims) {+    nvinfer1::ITensor* input1 =+        network->addInput("input1", nvinfer1::DataType::kFLOAT, dims);+    EXPECT_NE(nullptr, input1);++    nvinfer1::ITensor* input2 =+        network->addInput("input2", nvinfer1::DataType::kFLOAT, dims);+    EXPECT_NE(nullptr, input1);++    auto layer = network->addElementWise(*input1, *input2,+                                         nvinfer1::ElementWiseOperation::kSUM);+    EXPECT_NE(nullptr, layer);+    // Mark the output.+    nvinfer1::ITensor* output = layer->getOutput(0);+    output->setName("output");+    network->markOutput(*output);+  }++  Logger logger_;+  
TrtUniquePtrType<nvinfer1::IBuilder> builder_;+  TrtUniquePtrType<nvinfer1::INetworkDefinition> network_;+#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  TrtUniquePtrType<nvinfer1::IBuilderConfig> builder_config_;+#endif+  TrtUniquePtrType<nvinfer1::ICudaEngine> engine;+  std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>> exec_context_;+  // The order is important: exec_context_ must be destroyed first, and logger+  // at last.++  const uint32_t flags_ =+      1U << static_cast<int>(+          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);+};++TEST_F(TrtShapeOptimizationProfileTest, Static) {+  // Network with static input shape+  nvinfer1::Dims3 dims(8, 8, 10);+  DefineNetwork(network_.get(), dims);++  TrtShapeOptimizationProfile profile;++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  // Configure and build engine - should be a no-op+  profile.ConfigureBuilder(builder_.get(), builder_config_.get(),+                           network_.get());++  engine = TrtUniquePtrType<nvinfer1::ICudaEngine>(+      builder_->buildEngineWithConfig(*network_, *builder_config_));+#else+  engine = TrtUniquePtrType<nvinfer1::ICudaEngine>(+      builder_->buildCudaEngine(*network_));+#endif+  EXPECT_NE(nullptr, engine);+  profile.CreateExecutionContexts(engine.get(), exec_context_);

This returns a Status; wrap it in TF_CHECK_OK(...).

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"+#include <algorithm>+#include <functional>+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"++namespace tensorflow {+namespace tensorrt {++// Creates optimization profiles for a list of input shapes. The list of input+// shapes are stored in shapes_.+void TrtShapeOptimizationProfile::InitProfiles() {+  if (input_shapes_.size() == 0) {+    VLOG(1) << "Not creating profiles without input_shapes. "+               "You have to enable profile generation mode first (build).";+  } else {+    VLOG(1) << "Creating profiles with startegy of one profile "+            << "for each input (min=opt=max).";+  }+  for (auto& shape_vec : input_shapes_) {+    std::vector<nvinfer1::Dims> dimvec;+    for (auto& shape : shape_vec) {+      dimvec.push_back(TensorShapeToTrtDims(shape, false));+    }+    // We set min=opt=max.+    OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec};+    profiles_.push_back(std::move(profConfig));+    VLOG(1) << "Created profile " << profiles_.back().DebugString();+  }+}++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+Status TrtShapeOptimizationProfile::AddProfiles(+    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,+    const nvinfer1::INetworkDefinition* network) {+  // Create a vector of optimization profiles+  for (int i = 0; i < profiles_.size(); i++) {+    auto* optProfile = builder->createOptimizationProfile();+    Status status = profiles_[i].SetDimensions(network, optProfile);+    if (!status.ok()) {+      return status;+    }+    int idx = -1;+    if (optProfile->isValid()) {+      idx = config->addOptimizationProfile(optProfile);+    }+    if (idx >= 0) {+      if (i != idx) {+        return errors::Internal(+            "Profile index of engine config is different from resource profile "+            "index: ",+            i, " != ", idx);+      }+      VLOG(1) << "Added optimization profile " << profiles_[i].DebugString()+              << " to builder config.";+    } else {+      VLOG(ERROR) << "Failed to add optimization profile "+                  << profiles_[i].DebugString()+                  << ". This usually happens when profile is invalid.";+    }+  }+  if (config->getNbOptimizationProfiles() == 0) {+    return errors::Internal("Failure in adding an optimization profile.");+  }+  // if TRT_VERSION < 6, then we do not need to add+  return Status::OK();+}+#endif++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+Status TrtShapeOptimizationProfile::ConfigureBuilder(+    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,+    const nvinfer1::INetworkDefinition* network) {+  AddProfiles(builder, config, network);

This returns a Status; add TF_RETURN_IF_ERROR(...).

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

 from unittest import SkipTest  # pylint: disable=g-importing-member
 
+from tensorflow.compiler.tf2tensorrt.wrap_py_utils import get_linked_tensorrt_version
 from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
-

Why is this line deleted? Two blank lines are required here.

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_++#include <list>+#include <string>+#include <unordered_set>+#include <vector>++#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/lib/core/errors.h"+#include "tensorflow/core/lib/core/status.h"+#include "tensorflow/core/lib/strings/str_util.h"+#include "tensorflow/core/lib/strings/strcat.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++// Stores optimization profile parameters (min/opt/max of each input shape).+//+// A TensorRT optimization profile describes the possible min/max values of+// each dynamic input shape along with an optimum value. These values are used+// by the TensorRT builder to select the best kernel for the optimum value among+// those kernels that are valid for all input tensors in the [min, max] range.+struct OptimizationProfileConfig {+  // Length of vector == num_inputs to engine+  std::vector<nvinfer1::Dims> min;+  std::vector<nvinfer1::Dims> opt;+  std::vector<nvinfer1::Dims> max;++  string DebugString() const {+    using absl::StrCat;+    return StrCat("[min: ", tensorflow::tensorrt::DebugString(min),+                  ", opt: : ", tensorflow::tensorrt::DebugString(opt),+                  ", max: ", tensorflow::tensorrt::DebugString(max), "]");+  }++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  // Sets the stored min/opt/max dimensions for profile.+  //+  // Parameters:+  // network - TensorRT network, used to enumerate all the input tensors+  // profile - on exit the profile information will be set for each input tensor+  Status SetDimensions(const nvinfer1::INetworkDefinition* network,+                       nvinfer1::IOptimizationProfile* profile) const {+    int n_inputs = network->getNbInputs();+    if (min.size() != n_inputs || opt.size() != n_inputs ||+        max.size() != n_inputs) {+      return errors::Internal("Incorrect number of profile config parameters");+    }+    for (int i = 0; i < n_inputs; i++) {+      const char* name = network->getInput(i)->getName();+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, min[i]);+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, opt[i]);+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, max[i]);+    }+    return Status::OK();+  }+#endif++  // Returns true if profile range completely includes the given shapes.+  bool IncludesShapes(const std::vector<TensorShape>& shapes) const {+    // min, max, and opt must have the same size which is already verified in+    // SetDimensions.+    if (min.size() != shapes.size()) {+      
return false;+    }+    for (int i = 0; i < shapes.size(); i++) {+      auto current_shape = shapes[i];+      // min, max, and opt must have the same nbDims, which is already verified+      // in SetDimensions.+      if (min[i].nbDims != current_shape.dims()) {+        return false;+      }+      // Check if range [min, max] includes current_shape.+      for (int dim = 0; dim < current_shape.dims(); dim++) {+        if ((min[i].d[dim] > current_shape.dim_size(dim)) ||+            (max[i].d[dim] < current_shape.dim_size(dim))) {+          return false;+        }+      }+    }+    return true;+  }+};++// Manages Optimization profiles during TRT Engine construction.+//+// An optimization profile describes a range of dimensions for each TRT network+// input, and the optimal dimensions that the auto-tuner should use for+// optimization.+//+// This class stores the list of input shapes that were seen during the+// build/profile_generation_mode phase, and using them it creates a set of+// OptimizationProfileConfigs. These configs will be added to IBuilderConfig+// before the engine is created.+class TrtShapeOptimizationProfile {+ public:+  TrtShapeOptimizationProfile(){};

Remove ;

tfeher

comment created time in 11 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"+#include <algorithm>+#include <functional>+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"++namespace tensorflow {+namespace tensorrt {++// Create optimization profiles for a list of input shapes. The list of input

Creates

tfeher

comment created time in 12 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

 class TRTEngineOp : public AsyncOpKernel {
 
   // Execute the tensorrt engine. Returns whether we need to retry by running

Executes

tfeher

comment created time in 12 days

Pull request review comment tensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_++#include <list>+#include <string>+#include <unordered_set>+#include <vector>++#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/lib/core/errors.h"+#include "tensorflow/core/lib/core/status.h"+#include "tensorflow/core/lib/strings/str_util.h"+#include "tensorflow/core/lib/strings/strcat.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++// Stores optimization profile parameters (min/opt/max of each input shape).+//+// A TensorRT optimization profile describes the possible min/max values of+// each dynamic input shape along with an optimum value. These values are used+// by the TensorRT builder to select the best kernel for the optimum value among+// those kernels that are valid for all input tensors in the [min, max] range.+struct OptimizationProfileConfig {+  // Length of vector == num_inputs to engine+  std::vector<nvinfer1::Dims> min;+  std::vector<nvinfer1::Dims> opt;+  std::vector<nvinfer1::Dims> max;++  string DebugString() const {+    using absl::StrCat;+    return StrCat("[min: ", tensorflow::tensorrt::DebugString(min),+                  ", opt: : ", tensorflow::tensorrt::DebugString(opt),+                  ", max: ", tensorflow::tensorrt::DebugString(max), "]");+  }++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  // Set the stored min/opt/max dimensions for profile.+  //+  // Parameters:+  // network - TensorRT network, used to enumerate all the input tensors+  // profile - on exit the profile information will be set for each input tensor+  Status SetDimensions(const nvinfer1::INetworkDefinition* network,+                       nvinfer1::IOptimizationProfile* profile) const {+    int n_inputs = network->getNbInputs();+    if (min.size() != n_inputs || opt.size() != n_inputs ||+        max.size() != n_inputs) {+      return errors::Internal("Incorrect number of profile config parameters");+    }+    for (int i = 0; i < n_inputs; i++) {+      const char* name = network->getInput(i)->getName();+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, min[i]);+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, opt[i]);+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, max[i]);+    }+    return Status::OK();+  }+#endif++  // Returns true if profile range completely includes the given shapes.+  bool IncludesShapes(const std::vector<TensorShape>& shapes) const {+    // min, max, and opt must have the same size which,+    // already verified in SetDimensions.+    if (min.size() != shapes.size()) {+      return 
false;+    }+    for (int i = 0; i < shapes.size(); i++) {+      auto current_shape = shapes[i];+      // min, max, and opt must have the same nbDims, which is+      // already verified in SetDimensions.+      if (min[i].nbDims != current_shape.dims()) {+        return false;+      }+      // Check if range [min, max] includes current_shape.+      for (int dim = 0; dim < current_shape.dims(); dim++) {+        if ((min[i].d[dim] > current_shape.dim_size(dim)) ||+            (max[i].d[dim] < current_shape.dim_size(dim))) {+          return false;+        }+      }+    }+    return true;+  }+};++// Manages Optimization profiles during TRT Engine construction.+//+// An optimization profile describes a range of dimensions for each TRT network+// input, and the optimal dimensions that the auto-tuner should use for+// optimization.+//+// This class stores the list of input shapes that were seen during the+// build/profile_generation_mode phase, and using them it creates a set of+// OptimizationProfileConfigs. These configs will be added to IBuilderConfig+// before the engine is created.+class TrtShapeOptimizationProfile {+ public:+  TrtShapeOptimizationProfile(){};++  // Stores input shape information during profile_generation_mode+  void AddShape(std::vector<TensorShape> shapes) {+    input_shapes_.insert(shapes);+    VLOG(1) << "Collected shape(s) " << DebugString(shapes) << " for profiles.";+  }++  void clear() { profiles_.clear(); }++  // Returns the profile number that should be used to execute the network with+  // the given input shapes. Returns -1 if none of cached profiles are+  // compatible with the given input shapes.+  int GetProfileNumber(std::vector<TensorShape> shapes);++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  // Creates optimization profiles and add them to the builder config.+  Status ConfigureBuilder(nvinfer1::IBuilder* builder,+                          nvinfer1::IBuilderConfig* config,+                          const nvinfer1::INetworkDefinition* network);+#endif++  // Creates execution contexts for each optimization profile.+  Status CreateExecutionContexts(+      nvinfer1::ICudaEngine* engine,+      std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>>& exec_context);++  /// Map input vector shapes to TRT Optimization profiles (min, max, opt)+  // i.e. maps input_shapes_ to profiles_+  void InitProfiles();++  // Returns number of created profiles.+  int GetNumProfiles() const;++  // Restore profiles from the engine (used after deserialization)+  Status RestoreProfiles(const nvinfer1::ICudaEngine* engine);++ private:+  // Set of input shape vetors that we collect during profile_generation_mode+  std::unordered_set<std::vector<TensorShape>, VectorTensorShapeHasher>+      input_shapes_;++  // The optimization profiles generated from input_shapes_+  std::vector<OptimizationProfileConfig> profiles_;++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  /// Add optimization profiles to the builder config

Adds

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_++#include <list>+#include <string>+#include <unordered_set>+#include <vector>++#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/lib/core/errors.h"+#include "tensorflow/core/lib/core/status.h"+#include "tensorflow/core/lib/strings/str_util.h"+#include "tensorflow/core/lib/strings/strcat.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++// Stores optimization profile parameters (min/opt/max of each input shape).+//+// A TensorRT optimization profile describes the possible min/max values of+// each dynamic input shape along with an optimum value. These values are used+// by the TensorRT builder to select the best kernel for the optimum value among+// those kernels that are valid for all input tensors in the [min, max] range.+struct OptimizationProfileConfig {+  // Length of vector == num_inputs to engine+  std::vector<nvinfer1::Dims> min;+  std::vector<nvinfer1::Dims> opt;+  std::vector<nvinfer1::Dims> max;++  string DebugString() const {+    using absl::StrCat;+    return StrCat("[min: ", tensorflow::tensorrt::DebugString(min),+                  ", opt: : ", tensorflow::tensorrt::DebugString(opt),+                  ", max: ", tensorflow::tensorrt::DebugString(max), "]");+  }++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  // Set the stored min/opt/max dimensions for profile.+  //+  // Parameters:+  // network - TensorRT network, used to enumerate all the input tensors+  // profile - on exit the profile information will be set for each input tensor+  Status SetDimensions(const nvinfer1::INetworkDefinition* network,+                       nvinfer1::IOptimizationProfile* profile) const {+    int n_inputs = network->getNbInputs();+    if (min.size() != n_inputs || opt.size() != n_inputs ||+        max.size() != n_inputs) {+      return errors::Internal("Incorrect number of profile config parameters");+    }+    for (int i = 0; i < n_inputs; i++) {+      const char* name = network->getInput(i)->getName();+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, min[i]);+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, opt[i]);+      profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, max[i]);+    }+    return Status::OK();+  }+#endif++  // Returns true if profile range completely includes the given shapes.+  bool IncludesShapes(const std::vector<TensorShape>& shapes) const {+    // min, max, and opt must have the same size which,+    // already verified in SetDimensions.

Avoid partial comment lines

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_++#include <list>+#include <string>+#include <unordered_set>+#include <vector>++#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/lib/core/errors.h"+#include "tensorflow/core/lib/core/status.h"+#include "tensorflow/core/lib/strings/str_util.h"+#include "tensorflow/core/lib/strings/strcat.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++// Stores optimization profile parameters (min/opt/max of each input shape).+//+// A TensorRT optimization profile describes the possible min/max values of+// each dynamic input shape along with an optimum value. These values are used+// by the TensorRT builder to select the best kernel for the optimum value among+// those kernels that are valid for all input tensors in the [min, max] range.+struct OptimizationProfileConfig {+  // Length of vector == num_inputs to engine+  std::vector<nvinfer1::Dims> min;+  std::vector<nvinfer1::Dims> opt;+  std::vector<nvinfer1::Dims> max;++  string DebugString() const {+    using absl::StrCat;+    return StrCat("[min: ", tensorflow::tensorrt::DebugString(min),+                  ", opt: : ", tensorflow::tensorrt::DebugString(opt),+                  ", max: ", tensorflow::tensorrt::DebugString(max), "]");+  }++#if IS_TRT_VERSION_GE(6, 0, 0, 0)+  // Set the stored min/opt/max dimensions for profile.

Sets

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

 string GetLinkedTensorRTVersion(); // TensorRT library version information {Maj, Min, Patch}. string GetLoadedTensorRTVersion(); +// Returns the number of inputs for the engine, which also correspends to the+// number of input tensors for the network. This can differ from the number of+// input bindings, because each profile has a set of bindings.

Shall we replace the second sentence here with this: The number of total input bindings equals the number of profiles times the number of engine inputs.
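
To make the relation concrete, here is a minimal sketch (not part of the PR; the helper name BindingsPerProfile is made up, and it assumes the TensorRT 6+ headers already included in the quoted file):

// Sketch: every binding (inputs and outputs alike) is replicated once per
// optimization profile, so the total binding count is
// num_profiles * bindings_per_profile.
int BindingsPerProfile(const nvinfer1::ICudaEngine& engine) {
  // getNbOptimizationProfiles() is at least 1 for engines built with TRT 6+.
  const int num_profiles = engine.getNbOptimizationProfiles();
  return engine.getNbBindings() / num_profiles;
}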

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.++Licensed under the Apache License, Version 2.0 (the "License");+you may not use this file except in compliance with the License.+You may obtain a copy of the License at++    http://www.apache.org/licenses/LICENSE-2.0++Unless required by applicable law or agreed to in writing, software+distributed under the License is distributed on an "AS IS" BASIS,+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+See the License for the specific language governing permissions and+limitations under the License.+==============================================================================*/++#include <string.h>++#include <vector>++#include "absl/memory/memory.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"+#include "tensorflow/core/framework/tensor.h"+#include "tensorflow/core/framework/tensor_shape.h"+#include "tensorflow/core/framework/types.h"+#include "tensorflow/core/platform/test.h"++#if GOOGLE_CUDA+#if GOOGLE_TENSORRT++#include "third_party/tensorrt/NvInfer.h"++namespace tensorflow {+namespace tensorrt {++std::vector<TensorShape> DimVecToShapeVec(std::vector<nvinfer1::Dims3> dimvec) {+  std::vector<TensorShape> shapevec(dimvec.size());+  for (int i = 0; i < dimvec.size(); i++) {+    TensorShape shape;+    TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape);+    shapevec[i] = shape;+  }+  return shapevec;+}++bool DimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min,+                   const nvinfer1::Dims& max) {+  if (dim.nbDims != min.nbDims || dim.nbDims != max.nbDims) {+    return false;+  }+  for (int i = 0; i < dim.nbDims; i++) {+    if (dim.d[i] < min.d[i] || dim.d[i] > max.d[i]) {+      return false;+    }+  }+  return true;+}++bool DimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) {+  if (a.nbDims != b.nbDims) {+    return false;+  }+  for (int i = 0; i < a.nbDims; i++) {+    if (a.d[i] != b.d[i]) {+      return false;+    }+  }+  return true;+}++class TrtShapeOptimizationProfileTest : public ::testing::Test {+ protected:+  void SetUp() override {+    builder_ = TrtUniquePtrType<nvinfer1::IBuilder>(+        nvinfer1::createInferBuilder(logger_));+#if IS_TRT_VERSION_GE(6, 0, 0, 0)+    network_ = TrtUniquePtrType<nvinfer1::INetworkDefinition>(+        builder_->createNetworkV2(flags_));+    builder_config_ = TrtUniquePtrType<nvinfer1::IBuilderConfig>(+        builder_->createBuilderConfig());+    builder_config_->setMaxWorkspaceSize(1 << 10);+#else+    network_ = TrtUniquePtrType<nvinfer1::INetworkDefinition>(+        builder_->createNetwork());+    builder_->setMaxWorkspaceSize(1 << 10);+#endif+  }++  // Define a simple network: output = input1 + input2.

Defines

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

 void GetInputProperties(const grappler::GraphProperties& graph_properties,   } } +// This function checks if a tensor is compatible with TRT.+//+// We check that the shape and datatype is compatible with TensorRT. We also+// return the corresponding trt_dtype, the trt_dims and the batch_size (latter+// is only needed in implicit batch mode).+//+// The return status indicates wether the tensor is compatible.+//+// If validation_only == false, then we make an additional check. In implicit

Can we simplify this paragraph? For implicit batch mode, when validation_only == false, we also check that all input dimensions besides the implicit batch dimension are known dimensions.

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

 void GetInputProperties(const grappler::GraphProperties& graph_properties,   } } +// This function checks if a tensor is compatible with TRT.+//+// We check that the shape and datatype is compatible with TensorRT. We also

Replace "is" with "are"?

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

+  // Returns true if profile range completely includes the given shapes.
+  bool IncludesShapes(const std::vector<TensorShape>& shapes) const {
+    // min, max, and opt must have the same size which,
+    // already verified in SetDimensions.
+    if (min.size() != shapes.size()) {
+      return false;
+    }
+    for (int i = 0; i < shapes.size(); i++) {
+      auto current_shape = shapes[i];
+      // min, max, and opt must have the same nbDims, which is

Avoid partial comment lines.

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

+  // Creates execution contexts for each optimization profile.
+  Status CreateExecutionContexts(
+      nvinfer1::ICudaEngine* engine,
+      std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>>& exec_context);
+
+  /// Map input vector shapes to TRT Optimization profiles (min, max, opt)

Maps

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

 class TRTEngineOp : public AsyncOpKernel {       LRUCache<std::vector<TensorShape>, std::unique_ptr<EngineContext>,                VectorTensorShapeHasher>; -  // Execute calibration+  // Execute calibration.

Executes

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,    OP_REQUIRES_OK_ASYNC(ctx, VerifyInputShapes(input_concrete_shapes), *helper); +  if (!use_implicit_batch_) {+    if (cache_res->profiles_.GetNumProfiles() == 0) {+      // Create a single profile from the current input shape.+      // In the future we will collect a set of input shapes during build mode

Please avoid partial lines in comments.

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

+  /// Map input vector shapes to TRT Optimization profiles (min, max, opt)
+  // i.e. maps input_shapes_ to profiles_
+  void InitProfiles();
+
+  // Returns number of created profiles.
+  int GetNumProfiles() const;
+
+  // Restore profiles from the engine (used after deserialization)

Restores

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

 class TRTEngineOp : public AsyncOpKernel {    // Execute the tensorrt engine. Returns whether we need to retry by running   // the native segment.-  bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context);+  bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context,+                        int trt_context_idx); -  // Allocate necessary resources for calibration+  // Allocate necessary resources for calibration.

Allocates

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Add TensorRT optimization profiles

 TEST_F(TRTEngineOpTestBase, DynamicShapes) {   TensorShape input_shape({1, 2});   TRTEngineOpTestBase::AddSimpleInput<float>(input_shape); -  // We expect that TensorRT engine creation fails: we would need to configure-  // the engine with optimization profiles to use dynamic input shapes, but that-  // feature is not yet implemented.-  //-  // Since TRT engine creation has failed, we fall back to native segment.-  // Calling the native segment fails for the same reason that is investigated-  // in https://github.com/tensorflow/tensorflow/pull/34919. This is irrelevant-  // for the current test, here we want to just check wether TRT engine creation-  // has failed.-  OpsTestBase::RunOpKernel();+  TF_ASSERT_OK(OpsTestBase::RunOpKernel());

Can you add an #ifdef and make sure that the test passes with all TRT versions?

tfeher

comment created time in 12 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 Status ValidateTensorProperties(const string& producer_node_type,   }    if (validation_only) return Status::OK();-  // Following are validations at runtime. -  for (int d = first_trt_dim; d < shape.dims(); ++d) {-    if (shape.dim_size(d) < 0) {-      return errors::InvalidArgument(-          "Input tensor with shape ", shape.DebugString(),-          " has an unknown non-batch dimension at dim ", d);+  // Following checks are only used during TRT engine creation time. In implicit

"Following checks are only used during TRT engine creation time." Do you mean this routine itself is called to validate tensors during TRT engine creation time? Can you move this sentence and make it part of the documentation for the routine?

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 Status Converter::SqueezeTensor(nvinfer1::ITensor* input,   }  #if IS_TRT_VERSION_GE(6, 0, 0, 0)-  // For dynamic input shapes, we need to use TRT ops to build the new shape.+  // If the remaining dimensions of squeeze operation have dynamic sizes, we

Add "a" between "of" and "squeeze operation"

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 class TRTEngineOp : public AsyncOpKernel {   // If true, create calibration graph for INT8 mode. Otherwise, we are using   // user-provided quantization ranges.   bool use_calibration_;++  // Array of all input shapes, collected from the input_shapes attribute when+  // constructing the TRTEngineOp. The input_shapes attribute is set during+  // graph conversion time. This data is used to retrive which input dimensions

retrieve, missing "e"

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 def ExpectedEnginesToBuild(self, run_params):     return ["TRTEngineOp_0"]  +class DynamicShapesTest(TrtModeTestBase):+  """ Test with dynamic input shape.

shapes

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 def setUp(self):     super(TfTrtIntegrationTestBase, self).setUp()     warnings.simplefilter("always") -  def BuildParams(self, graph_fn, dtype, input_shapes, output_shapes):-    """Build test parameters when not considering dynamic shapes."""+  def _GetTensorSpec(self, shape, mask, dtype, name):+    # Unset shape where mask[i] == None

Set dimension i to None if mask[i] == False

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes,   return true; } +// This routine finds the engines with input shapes compatible with the+// actual_input_shapes, and returns the input shapes of one of such engine that+// has the smallest batch size Status TRTEngineOp::GetEngineInputShapes(     const CacheType& cache, const std::vector<TensorShape>& actual_input_shapes,     std::vector<TensorShape>* engine_input_shapes) {   // VerifyInputShapes() already ensured that all input shapes have same-  // batch size, and are not scalars.+  // batch size, and are not scalars, if we are in implicit batch mode.+  //+  // In explicit batch mode we plan to have single engine in the cache, and we+  // return with its shape if it is compatible.

Change "return with" to "return".

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TEST_F(TRTEngineOpTestBase, DynamicShapes) {   EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, ExplicitBatch) {+  // Test inference in explicit batch mode with static input shapes. Static+  // shapes in this context means that the TensorRT knows all the input shapes+  // during engine creation time.+  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,+                                      /*shape=*/PartialTensorShape({1, 2}),+                                      /*use_implicit_batch=*/false);++  TensorShape input_shape({1, 2});+  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);+  TF_ASSERT_OK(OpsTestBase::RunOpKernel());++  // Get the engine cache.+  TRTEngineCacheResource* cache_resource = nullptr;+  TF_ASSERT_OK(+      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));+  core::ScopedUnref sc(cache_resource);++  // The cache should contain only one EngineContext, with a valid cuda_engine.+  auto cache = &cache_resource->cache_;+  EXPECT_EQ(1, cache->size());+  ASSERT_EQ(1, cache->count({input_shape}));+  EngineContext* ectx = cache->at({input_shape}).get();+  EXPECT_NE(ectx->cuda_engine, nullptr);+}++TEST_F(TRTEngineOpTestBase, DynamicShapes) {+  // Test inference in explicit batch mode with dynamic input shapes. Dynamic+  // shapes in this context means that some input shapes for TensorRT are+  // unknown during engine creation time. When we create the network, the+  // unknow shapes are repsesented as -1. Before we run inference, these shapes+  // have to be specified by calling setBindingDimensions.+  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,+                                      /*shape=*/PartialTensorShape({-1, -1}),+                                      /*use_implicit_batch=*/false);++  TensorShape input_shape({1, 2});+  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);++  // We expect that TensorRT engine creation fails: we would need to configure+  // the engine with optimization profiles to use dynamic input shapes, but that+  // feature is not yet implemented.+  //+  // Since TRT engine creation has failed, we fall back to native segment.+  // Calling the native segment fails for the same reason that is investigated+  // in https://github.com/tensorflow/tensorflow/pull/34919. This is irrelevant+  // for the current test, here we want to just check wether TRT engine creation+  // has failed.+  OpsTestBase::RunOpKernel();++  // Get the engine cache.+  TRTEngineCacheResource* cache_resource = nullptr;+  TF_ASSERT_OK(+      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));+  core::ScopedUnref sc(cache_resource);++  // The cache should contain only one EngineContext.+  auto cache = &cache_resource->cache_;+  EXPECT_EQ(1, cache->size());+  ASSERT_EQ(1, cache->count({input_shape}));+  EngineContext* ectx = cache->at({input_shape}).get();+  // Since engine creation failed, we expect to find nullptr. Finding a nullptr+  // indicates that the unknown shapes were used to define the TensorRT network.

Remove "the" in "the unknown shapes".

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes,   return true; } +// This routine finds the engines with input shapes compatible with the+// actual_input_shapes, and returns the input shapes of one of such engine that+// has the smallest batch size

Please add . to the end of the paragraph.

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 class TRTEngineOp : public AsyncOpKernel {   // If true, create calibration graph for INT8 mode. Otherwise, we are using   // user-provided quantization ranges.   bool use_calibration_;++  // Array of all input shapes, collected from the input_shapes attribute when+  // constructing the TRTEngineOp. The input_shapes attribute is set during+  // graph conversion time. This data used to retrive which input dimensions

Missing "is" between "This data" and "used".

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 def ExpectedEnginesToBuild(self, run_params):     return ["TRTEngineOp_0"]  +class DynamicShapesTest(TrtModeTestBase):+  """ Test with dynamic input shape.++  The difference compered to ExplicitBatchTest is that we specify an input and

There is a typo here ("compered"). Can we change the comment to: DynamicShapesTest is different from ExplicitBatchTest in that it uses input and output masks to change the input and output shapes to unknown shapes.

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 def setUp(self):     super(TfTrtIntegrationTestBase, self).setUp()     warnings.simplefilter("always") -  def BuildParams(self, graph_fn, dtype, input_shapes, output_shapes):-    """Build test parameters when not considering dynamic shapes."""+  def _GetTensorSpec(self, shape, mask, dtype, name):+    # Unset shape where mask[i] == None+    assert len(shape) == len(mask)+    new_shape = [None if m is False else s for s, m in zip(shape, mask)]+    return tensor_spec.TensorSpec(new_shape, dtype, name)++  def BuildParams(self, graph_fn, dtype, input_shapes, output_shapes,

The special handling for implicit batch mode makes BuildParams non-trivial to understand. Can we have two routines instead: BuildParamsForImplicitBatchMode and BuildParamsForMasks? The second one is similar to the current BuildParams, except that it doesn't allow default mask values. The first one constructs the masks for implicit batch mode and calls the second one.

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)     use_implicit_batch_ = true;   } #endif-  if (!use_implicit_batch_) {+  if (use_implicit_batch_) {+    if (input_partial_shapes_.size() == 0) {

Let's replace size() == 0 with empty().

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,   ExecuteNativeSegment(ctx, helper); } -Status TRTEngineOp::VerifyInputShapes(const std::vector<TensorShape>& shapes) {-  if (shapes.empty()) {+Status TRTEngineOp::VerifyInputShapes(+    const std::vector<TensorShape>& input_concrete_shapes) {+  if (input_concrete_shapes.empty()) {     return errors::InvalidArgument("Input shapes are empty, for ", name());   }-  if (shapes[0].dims() < 1) {-    return errors::InvalidArgument("Input shapes contain scalar, for ", name(),-                                   ": ",-                                   TensorShapeUtils::ShapeListString(shapes));++  if (input_partial_shapes_.size() == 0) {+    if (!use_implicit_batch_) {

Can you add a comment to state why it is OK for use_implicit_batch to have an empty input_partial_shapes_, and maybe link this to the error message emitted from TRTEngineOp::TRTEngineOp that says this is to support the creation of engines for TRTEngineOps created using an older version of TF-TRT?

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)     use_implicit_batch_ = true;   } #endif-  if (!use_implicit_batch_) {+  if (use_implicit_batch_) {+    if (input_partial_shapes_.size() == 0) {+      VLOG(1) << "Attribute input_shapes is not set. This happens probably "+              << "because you are using a model that is already converted "+              << "to TensorRT with a previous version of TF-TRT (i.e. includes "+              << "TRTEngineOp in graph). This is not an error. If you convert "+              << "the original model again to TensorRT, the attributes "+              << "input_shapes will be set automatically.";+    }+  } else {+    OP_REQUIRES(+        context, input_partial_shapes_.size() > 0,

Similarly, let's replace size() > 0 with !empty()

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,   ExecuteNativeSegment(ctx, helper); } -Status TRTEngineOp::VerifyInputShapes(const std::vector<TensorShape>& shapes) {-  if (shapes.empty()) {+Status TRTEngineOp::VerifyInputShapes(+    const std::vector<TensorShape>& input_concrete_shapes) {+  if (input_concrete_shapes.empty()) {     return errors::InvalidArgument("Input shapes are empty, for ", name());   }-  if (shapes[0].dims() < 1) {-    return errors::InvalidArgument("Input shapes contain scalar, for ", name(),-                                   ": ",-                                   TensorShapeUtils::ShapeListString(shapes));++  if (input_partial_shapes_.size() == 0) {

Replace size() == 0 with empty()

tfeher

comment created time in 15 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 StatusOr<EngineContext*> TRTEngineOp::GetEngine(     bool convert_successfully = false;     LOG(INFO) << "Building a new TensorRT engine for " << name()               << " with input shapes: "-              << TensorShapeUtils::ShapeListString(engine_input_shapes);+              << TensorShapeUtils::ShapeListString(input_concrete_shapes); -    // Convert to partial shapes-    std::vector<PartialTensorShape> partial_shapes(engine_input_shapes.begin(),-                                                   engine_input_shapes.end());+    // Use concrete shapes for implicit batch mode and+    // use partial shapes for explicit batch mode.

Avoid partial line comments

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 def setUp(self):     super(TfTrtIntegrationTestBase, self).setUp()     warnings.simplefilter("always") -  def BuildParams(self, graph_fn, dtype, input_shapes, output_shapes):-    """Build test parameters when not considering dynamic shapes."""+  def _GetTensorSpec(self, shape, mask, dtype, name):+    if mask is None:+      # Unset the batch dim of the specs to make sure TRT can tolerate changes+      # on that.+      new_shape = [None] + shape[1:]

I don't understand the logic here. We only want to set shape[0] to None (dynamic) when we are generating tests for implicit_batch_mode==true. Is this correct?

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)     use_implicit_batch_ = true;   } #endif-  if (!use_implicit_batch_) {+  if (use_implicit_batch_) {+    if (input_partial_shapes_.size() == 0) {+      VLOG(1) << "Attribute input_shapes it not set. This happens probably "

s/it/is/

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)     use_implicit_batch_ = true;   } #endif-  if (!use_implicit_batch_) {+  if (use_implicit_batch_) {+    if (input_partial_shapes_.size() == 0) {+      VLOG(1) << "Attribute input_shapes it not set. This happens probably "+              << "because you are using a model that is already converted "+              << "to TensorRT (i.e. includes TRTEngineOp in graph). If you "+              << "convert the original model again to TensorRT, the "+              << "attributes input_shapes will be set automatically.";+    }+  } else {+    OP_REQUIRES(+        context, input_partial_shapes_.size() > 0,+        errors::InvalidArgument(+            "Explicit batch mode requires attribute input_shapes "+            "to be set. If you are using a model that is already "+            "converted to TensorRT (i.e. includes TRTEngineOp in graph), "+            "then you need to convert the original model again to "+            "TensorRT in order to set the attribute input_shapes."));

I have a similar comment here as for the error message above.

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, }  StatusOr<EngineContext*> TRTEngineOp::GetEngine(-    const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx,+    const std::vector<TensorShape>& input_concrete_shapes, OpKernelContext* ctx,     TRTEngineCacheResource* cache_res) {   static EngineContext empty_context;    mutex_lock lock(engine_mutex_);   // Using first input to get batch size is reliable - VerifyInputShapes() has-  // verified that.-  const int batch_size = input_shapes[0].dim_size(0);+  // verified that. Although this is not needed for explicit batch mode, but+  // we still have to pass it to ConvertGraphDefToEngine until the requirement+  // is removed from that function.+  const int batch_size = input_concrete_shapes[0].dim_size(0);

Can you change the comment to this: VerifyInputShapes guarantees that the first input is not a scalar. As such, we can always use the first input to get the batch size for implicit batch mode. For explicit batch mode, this value is not used. Then add a TODO for removing the need to pass a batch size to ConvertGraphDefToEngine in explicit batch mode.
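
Roughly how the suggested comment and TODO could look in place (a sketch of the suggestion, not the PR's actual wording; the code line is taken from the quoted hunk above):

  // VerifyInputShapes() guarantees that the first input is not a scalar, so
  // it can always be used to read the batch size in implicit batch mode. In
  // explicit batch mode this value is not used.
  // TODO: remove the need to pass a batch size to ConvertGraphDefToEngine in
  // explicit batch mode.
  const int batch_size = input_concrete_shapes[0].dim_size(0);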

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 Status ValidateTensorProperties(const string& producer_node_type,   }    if (validation_only) return Status::OK();-  // Following are validations at runtime.+  // Following are validations for creating TRT network and engine.

This comment doesn't make much sense to me. Would you please rewrite it to state the reason why we want to check that all dimensions of the shape are static, and move the comment closer to the if-stmt below (which could be just deleting the blank line right after here)?

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 class TRTEngineOp : public AsyncOpKernel {   // If true, create calibration graph for INT8 mode. Otherwise, we are using   // user-provided quantization ranges.   bool use_calibration_;++  // Array of all input shapes during graph construction set in an attribute.

I don't understand the structure of this sentence. Maybe you mean this: Array of all input shapes, collected from the input_shapes attribute when constructing the TRTEngineOp.

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 Status TRTEngineOp::GetEngineInputShapes(     const CacheType& cache, const std::vector<TensorShape>& actual_input_shapes,     std::vector<TensorShape>* engine_input_shapes) {   // VerifyInputShapes() already ensured that all input shapes have same-  // batch size, and are not scalars.+  // batch size, and are not scalars, if we are in implicit batch mode.+  //+  // In explicit batch mode we plan to have single engine in the cache, and we+  // return with its shape if it is compatible.

This routine finds the engines with input shapes compatible with actual_input_shapes, and returns the input shapes of one such engine that has the smallest batch size. I don't feel the old and new comments here are very helpful.

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,   ExecuteNativeSegment(ctx, helper); } -Status TRTEngineOp::VerifyInputShapes(const std::vector<TensorShape>& shapes) {-  if (shapes.empty()) {+Status TRTEngineOp::VerifyInputShapes(+    const std::vector<TensorShape>& input_concrete_shapes) {+  if (input_concrete_shapes.empty()) {     return errors::InvalidArgument("Input shapes are empty, for ", name());   }-  if (shapes[0].dims() < 1) {-    return errors::InvalidArgument("Input shapes contain scalar, for ", name(),-                                   ": ",-                                   TensorShapeUtils::ShapeListString(shapes));++  if (input_partial_shapes_.size() == 0) {+    if (!use_implicit_batch_) {+      return errors::InvalidArgument(+          "Explicit batch mode requires input_partial_shapes_ ",+          "to contain the dynamic input shapes to TRTEngineOp");+    }+  } else {+    const string error_msg = StrCat(+        "Input shapes do not match input partial shapes stored in graph, for ",+        name(), ": ", DebugString(input_concrete_shapes),+        " != ", DebugString(input_partial_shapes_));+    if (input_concrete_shapes.size() != input_partial_shapes_.size()) {+      return errors::InvalidArgument(error_msg);+    }+    for (int i = 0; i < input_concrete_shapes.size(); i++) {+      if (input_concrete_shapes[i].dims() != input_partial_shapes_[i].dims()) {+        return errors::InvalidArgument(error_msg);+      }+    }+    for (int i = 0; i < input_concrete_shapes.size(); i++) {+      for (int d = 0; d < input_concrete_shapes[i].dims(); d++) {+        if (input_partial_shapes_[i].dim_size(d) != -1) {+          if (input_concrete_shapes[i].dim_size(d) !=+              input_partial_shapes_[i].dim_size(d)) {+            return errors::InvalidArgument(error_msg);+          }+        }+      }+    }+  }++  if (input_concrete_shapes[0].dims() < 1) {

Can you add a comment to state why we only need to check this for the first input shape? My understanding is that for implicit batch mode, the other input shapes will be checked in the loop below. How about explicit batch mode? Shall we check that all inputs are not scalars?
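
If we do want to reject scalars among all inputs regardless of batch mode, a minimal sketch could look like this (hypothetical alternative, reusing the names from the surrounding quoted code):

  // Reject scalar inputs for every input tensor, not only the first one.
  for (int i = 0; i < input_concrete_shapes.size(); i++) {
    if (input_concrete_shapes[i].dims() < 1) {
      return errors::InvalidArgument("Input shapes contain scalar, for ",
                                     name(), ": ",
                                     input_concrete_shapes[i].DebugString());
    }
  }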

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 Status CreateTRTNode(const ConversionParams& params,         out_types.at(conn.port_number) = conn.connection_type;       } else {         // Set the shapes and data types of input edge.+        tensorflow::TensorShapeProto in_shape;+        conn.outside_shape.AsProto(&in_shape);         if (input_shapes.size() <= conn.port_number) {+          input_shape_protos.resize(conn.port_number + 1);           input_shapes.resize(conn.port_number + 1);         }+        input_shape_protos.at(conn.port_number) = in_shape;

Can we use this to avoid the copy from in_shape: conn.outside_shape.AsProto(&input_shape_protos.at(conn.port_number));
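
In context, the suggestion would look roughly like this (a sketch of the proposed change built from the quoted hunk, not the PR as written):

        // Serialize directly into the vector element; no temporary
        // TensorShapeProto (in_shape) is needed.
        if (input_shapes.size() <= conn.port_number) {
          input_shape_protos.resize(conn.port_number + 1);
          input_shapes.resize(conn.port_number + 1);
        }
        conn.outside_shape.AsProto(&input_shape_protos.at(conn.port_number));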

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)     use_implicit_batch_ = true;   } #endif-  if (!use_implicit_batch_) {+  if (use_implicit_batch_) {+    if (input_partial_shapes_.size() == 0) {+      VLOG(1) << "Attribute input_shapes it not set. This happens probably "+              << "because you are using a model that is already converted "+              << "to TensorRT (i.e. includes TRTEngineOp in graph). If you "+              << "convert the original model again to TensorRT, the "+              << "attributes input_shapes will be set automatically.";

I don't quite understand this error message after its first sentence. Do you mean: You are likely trying to convert a model that is already converted to TensorRT. You should convert the original model instead.

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TEST_F(TRTEngineOpTestBase, DynamicShapes) {   EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, ExplicitBatch) {+  // Test inference in explicit batch mode with static input shapes. Static+  // shapes in this context means that the TensorRT knows all the input shapes+  // during engine creation time.+  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,+                                      /*shape=*/PartialTensorShape({1, 2}),+                                      /*use_implicit_batch=*/false);++  TensorShape input_shape({1, 2});+  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);+  TF_ASSERT_OK(OpsTestBase::RunOpKernel());++  // Get the engine cache.+  TRTEngineCacheResource* cache_resource = nullptr;+  TF_ASSERT_OK(+      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));+  core::ScopedUnref sc(cache_resource);++  // It should contain only one engine.+  auto cache = &cache_resource->cache_;+  EXPECT_EQ(1, cache->size());+  ASSERT_EQ(1, cache->count({input_shape}));+  EngineContext* ectx = cache->at({input_shape}).get();+  EXPECT_NE(ectx->cuda_engine, nullptr);+}++TEST_F(TRTEngineOpTestBase, DynamicShapes) {+  // Test inference in explicit batch mode with dynamic input shapes. Dynamic+  // shapes in this context means that some input shapes for TensorRT are+  // unknown during engine creation time. When we create the network, the+  // unknow shapes are repsesented as -1. Before we run inference, these shapes+  // have to be specified by calling setBindingDimensions.+  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,+                                      /*shape=*/PartialTensorShape({-1, -1}),+                                      /*use_implicit_batch=*/false);++  TensorShape input_shape({1, 2});+  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);++  // We expect that TensorRT engine creation fails: we would need to configure+  // the engine with optimization profiles to use dynamic input shapes, but that+  // feature is not yet implemented.+  //+  // Since TRT engine creation has failed, we fall back to native segment.+  // Calling the native segment fails for the same reason that is investigated+  // in https://github.com/tensorflow/tensorflow/pull/34919. This is irrelevant+  // for the current test, here we want to just check wether TRT engine creation+  // has failed.+  OpsTestBase::RunOpKernel();++  // Get the engine cache.+  TRTEngineCacheResource* cache_resource = nullptr;+  TF_ASSERT_OK(+      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));+  core::ScopedUnref sc(cache_resource);++  // It should contain only one EngineContext.+  auto cache = &cache_resource->cache_;+  EXPECT_EQ(1, cache->size());+  ASSERT_EQ(1, cache->count({input_shape}));+  EngineContext* ectx = cache->at({input_shape}).get();+  // Since engine creation failed, we expect to find nullptr.+  EXPECT_EQ(ectx->cuda_engine, nullptr);

Can you combine the comments before and after this line, because they are both explaining this line of code?

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TEST_F(TRTEngineOpTestBase, DynamicShapes) {   EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, ExplicitBatch) {+  // Test inference in explicit batch mode with static input shapes. Static+  // shapes in this context means that the TensorRT knows all the input shapes+  // during engine creation time.+  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,+                                      /*shape=*/PartialTensorShape({1, 2}),+                                      /*use_implicit_batch=*/false);++  TensorShape input_shape({1, 2});+  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);+  TF_ASSERT_OK(OpsTestBase::RunOpKernel());++  // Get the engine cache.+  TRTEngineCacheResource* cache_resource = nullptr;+  TF_ASSERT_OK(+      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));+  core::ScopedUnref sc(cache_resource);++  // It should contain only one engine.

Do you mean "one engine context" not "one engine"? The cache should contain only one EngineContext.

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 TEST_F(TRTEngineOpTestBase, DynamicShapes) {   EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, ExplicitBatch) {+  // Test inference in explicit batch mode with static input shapes. Static+  // shapes in this context means that the TensorRT knows all the input shapes+  // during engine creation time.+  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,+                                      /*shape=*/PartialTensorShape({1, 2}),+                                      /*use_implicit_batch=*/false);++  TensorShape input_shape({1, 2});+  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);+  TF_ASSERT_OK(OpsTestBase::RunOpKernel());++  // Get the engine cache.+  TRTEngineCacheResource* cache_resource = nullptr;+  TF_ASSERT_OK(+      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));+  core::ScopedUnref sc(cache_resource);++  // It should contain only one engine.+  auto cache = &cache_resource->cache_;+  EXPECT_EQ(1, cache->size());+  ASSERT_EQ(1, cache->count({input_shape}));+  EngineContext* ectx = cache->at({input_shape}).get();+  EXPECT_NE(ectx->cuda_engine, nullptr);+}++TEST_F(TRTEngineOpTestBase, DynamicShapes) {+  // Test inference in explicit batch mode with dynamic input shapes. Dynamic+  // shapes in this context means that some input shapes for TensorRT are+  // unknown during engine creation time. When we create the network, the+  // unknow shapes are repsesented as -1. Before we run inference, these shapes+  // have to be specified by calling setBindingDimensions.+  TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1,+                                      /*shape=*/PartialTensorShape({-1, -1}),+                                      /*use_implicit_batch=*/false);++  TensorShape input_shape({1, 2});+  TRTEngineOpTestBase::AddSimpleInput<float>(input_shape);++  // We expect that TensorRT engine creation fails: we would need to configure+  // the engine with optimization profiles to use dynamic input shapes, but that+  // feature is not yet implemented.+  //+  // Since TRT engine creation has failed, we fall back to native segment.+  // Calling the native segment fails for the same reason that is investigated+  // in https://github.com/tensorflow/tensorflow/pull/34919. This is irrelevant+  // for the current test, here we want to just check wether TRT engine creation+  // has failed.+  OpsTestBase::RunOpKernel();++  // Get the engine cache.+  TRTEngineCacheResource* cache_resource = nullptr;+  TF_ASSERT_OK(+      device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource));+  core::ScopedUnref sc(cache_resource);++  // It should contain only one EngineContext.

The cache should contain only one EngineContext.

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 Status Converter::SqueezeTensor(nvinfer1::ITensor* input,                                 nvinfer1::ITensor** output) {   const nvinfer1::Dims dims = input->getDimensions();   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);+  const bool is_dynamic =+      absl::c_any_of(input_dims, [](int i) { return i == -1; });   // Mark axes to remove by setting them to 0.   for (int axis : trt_axes) {     input_dims[axis] = 0;   }  #if IS_TRT_VERSION_GE(6, 0, 0, 0)   // For dynamic input shapes, we need to use TRT ops to build the new shape.

The check for is_dynamic is done after we set the squeezed dimensions to 0, so the comment here is no longer accurate. Shall we change it to something like this: "If the remaining dimensions of the squeeze operation have dynamic sizes, we need to use TRT ops to build the result shape for the squeeze operation."

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 def _Validate(shapes):
     _Validate(input_shapes)
     _Validate(output_shapes)

+    if input_mask is None:
+      input_mask = [None]*len(input_shapes)

This will make all dimensions dynamic, right? Shall we change it to input_mask = input_shapes?

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Define TensorRT network with dynamic shapes

 def setUp(self):     super(TfTrtIntegrationTestBase, self).setUp()     warnings.simplefilter("always") -  def BuildParams(self, graph_fn, dtype, input_shapes, output_shapes):-    """Build test parameters when not considering dynamic shapes."""+  def _GetTensorSpec(self, shape, mask, dtype, name):+    if mask is None:+      # Unset the batch dim of the specs to make sure TRT can tolerate changes+      # on that.+      new_shape = [None] + shape[1:]+    else:+      # Unset shape where mask[i] == None+      assert len(shape) == len(mask)+      new_shape = [None if m is None else s for s, m in zip(shape, mask)]++    return tensor_spec.TensorSpec(new_shape, dtype, name)++  def BuildParams(self, graph_fn, dtype, input_shapes, output_shapes,+                  input_mask=None, output_mask=None):+    """Build test parameters with static shapes (input_mask==None) or with+       dynamic shapes. To define the first two dimension with dynamic shapes+       use e.g. input_shapes=[[1,2,1,8]], input_mask=[[None, None, 1, 8]]

Shall we use True/False for the input_mask elements instead of input_mask=[[None, None, 1, 8]], where the values 1 and 8 are not used and are confusing?

tfeher

comment created time in 18 days

Pull request review commenttensorflow/tensorflow

Improve TensorRT binding index query

 bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,

   const bool kRetry = true;
   auto& execution_context = engine_context->execution_context;
-  const int num_binding = ctx->num_inputs() + ctx->num_outputs();
-
+  const int num_binding = cuda_engine->getNbBindings();
   std::vector<void*> buffers(num_binding);

   // Setup engine inputs.
   for (int i = 0; i < ctx->num_inputs(); i++) {
     const string input_name = StrCat(IONamePrefixes::kInputPHName, i);
-    const int binding_index = cuda_engine->getBindingIndex(input_name.c_str());
-    if (binding_index == -1) {
-      const string msg =
-          StrCat("Input node ", input_name, " not found, at ", name());
-      LOG(ERROR) << msg;
-      ctx->SetStatus(errors::NotFound(msg));
+    int binding_index = -1;

Do we have to initialize binding_index here?

tfeher

comment created time in 20 days

Pull request review commenttensorflow/tensorflow

Improve TensorRT binding index query

 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
 }

+// Get the binding index of a tensor in an engine.
+// The binding index is looked up using the tensor's name and the profile idx.
+// Profile idx should be set to zero, if we do not have optimization profiles.
+Status GetTrtBindingIndex(const char* tensor_name, int profile_idx,

Let's replace 'idx' with 'index' so that we use 'index' consistently.

tfeher

comment created time in 20 days

Pull request review commenttensorflow/tensorflow

Improve TensorRT binding index query

 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
 }

+// Get the binding index of a tensor in an engine.
+// The binding index is looked up using the tensor's name and the profile idx.
+// Profile idx should be set to zero, if we do not have optimization profiles.
+Status GetTrtBindingIndex(const char* tensor_name, int profile_idx,
+                          const nvinfer1::ICudaEngine* cuda_engine,
+                          int* binding_idx) {
+  *binding_idx = cuda_engine->getBindingIndex(tensor_name);
+  if (*binding_idx == -1) {
+    const string msg = StrCat("Input node ", tensor_name, " not found");
+    LOG(ERROR) << msg;
+    return errors::NotFound(msg);
+  }
+#if IS_TRT_VERSION_GE(6, 0, 0, 0)
+  int n_profiles = cuda_engine->getNbOptimizationProfiles();
+#else
+  int n_profiles = 1;
+#endif
+  // If we have more then one optimization profiles then the binding idx
+  // depends on the profile number

Can we replace this comment with this: binding_index_within_engine = binding_index_within_profile + profile_index * bindings_per_profile
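For illustration, here is a minimal sketch (not code from the PR) of the relation that comment describes, written as a hypothetical helper. The helper name and the n_profiles parameter are assumptions; the sketch relies on TensorRT replicating the engine bindings once per optimization profile.

// Hypothetical helper, only to illustrate the formula in the comment above.
// Assumes the engine replicates its bindings once per optimization profile.
int BindingIndexWithinEngine(int binding_index_within_profile,
                             int profile_index,
                             const nvinfer1::ICudaEngine* cuda_engine,
                             int n_profiles) {
  const int bindings_per_profile = cuda_engine->getNbBindings() / n_profiles;
  return binding_index_within_profile + profile_index * bindings_per_profile;
}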

tfeher

comment created time in 20 days

Pull request review commenttensorflow/tensorflow

Improve TensorRT binding index query

 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
 }

+// Get the binding index of a tensor in an engine.

Gets. Also please add a blank line after this line.

tfeher

comment created time in 20 days

Pull request review commenttensorflow/tensorflow

Improve TensorRT binding index query

 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
 }

+// Get the binding index of a tensor in an engine.
+// The binding index is looked up using the tensor's name and the profile idx.
+// Profile idx should be set to zero, if we do not have optimization profiles.
+Status GetTrtBindingIndex(const char* tensor_name, int profile_idx,
+                          const nvinfer1::ICudaEngine* cuda_engine,
+                          int* binding_idx) {
+  *binding_idx = cuda_engine->getBindingIndex(tensor_name);

Can we add this comment? // This is binding_index_within_profile.

tfeher

comment created time in 20 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.+#+# Licensed under the Apache License, Version 2.0 (the "License");+# you may not use this file except in compliance with the License.+# You may obtain a copy of the License at+#+#     http://www.apache.org/licenses/LICENSE-2.0+#+# Unless required by applicable law or agreed to in writing, software+# distributed under the License is distributed on an "AS IS" BASIS,+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+# See the License for the specific language governing permissions and+# limitations under the License.+# ==============================================================================+"""Model script to test TF-TensorRT integration."""++from __future__ import absolute_import+from __future__ import division+from __future__ import print_function++import numpy as np++from tensorflow.python.compiler.tensorrt import trt_convert+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test+from tensorflow.python.framework import constant_op+from tensorflow.python.framework import dtypes+from tensorflow.python.ops import array_ops+from tensorflow.python.ops import gen_array_ops+from tensorflow.python.ops import gen_math_ops+from tensorflow.python.ops import math_ops+from tensorflow.python.platform import test+from unittest import skip++@skip("TrtModeTestBase defines a common base class for other tests")+class TrtModeTestBase(trt_test.TfTrtIntegrationTestBase):

Nit: shall we call it BatchMode, not Mode? Similar for the file name.

tfeher

comment created time in 21 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 class Converter {
                                const bool validation_only,
                                nvinfer1::ITensor** tensor);

+  // Helper function to add a squeeze op to the network.

Either add an empty line after this or make this a "full line". This is because we only allow a partial line if it is the last line of a paragraph.

tfeher

comment created time in 21 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 Status ConvertExpandDims(OpConverterParams* params) {
   return Status::OK();
 }

+Status Converter::SqueezeTensor(nvinfer1::ITensor* input,
+                                const std::vector<int>& trt_axes,
+                                nvinfer1::ITensor** output) {
+  const nvinfer1::Dims dims = input->getDimensions();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Mark axes to remove by setting them to 0.
+  for (int axis : trt_axes) {
+    input_dims[axis] = 0;
+  }
+
+#if IS_TRT_VERSION_GE(6, 0, 0, 0)
+  // For dynamic input shapes, we need to use TRT ops to build the new shape.

Let me state my question in another way. Currently, you first use a loop to set input_dims[axis] = 0, then check that input_dims[...] doesn't contain -1. Can you switch the order, that is, move the loop that sets input_dims[axis] = 0 to after the check? It looks to me that you intentionally set input_dims[axis] = 0 before the check; if that is the case, can you document the reason for this?
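To make the question concrete, here is a rough sketch (not the PR's code) of the reordered body being asked about, inside the existing SqueezeTensor, where the dynamic-size check runs before any axis is overwritten with 0:

  // Check for dynamic (-1) sizes before the squeeze axes are zeroed out.
  const bool is_dynamic =
      absl::c_any_of(input_dims, [](int i) { return i == -1; });
  // Mark axes to remove by setting them to 0.
  for (int axis : trt_axes) {
    input_dims[axis] = 0;
  }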

tfeher

comment created time in 21 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 Status ConvertExpandDims(OpConverterParams* params) {
   return Status::OK();
 }

+Status Converter::SqueezeTensor(nvinfer1::ITensor* input,
+                                const std::vector<int>& trt_axes,
+                                nvinfer1::ITensor** output) {
+  const nvinfer1::Dims dims = input->getDimensions();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Mark axes to remove by setting them to 0.
+  for (int axis : trt_axes) {
+    input_dims[axis] = 0;
+  }
+
+#if IS_TRT_VERSION_GE(6, 0, 0, 0)
+  // For dynamic input shapes, we need to use TRT ops to build the new shape.

Can you fix the comment to make it clear that we can handle dynamic shapes IN the squeezed dimensions, but can't handle dynamic shapes OUTSIDE the squeezed dimensions? This is the reason why the for-loop above goes before the check here, right?

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 Status ConvertExpandDims(OpConverterParams* params) {
   return Status::OK();
 }

+Status Converter::SqueezeTensor(nvinfer1::ITensor* input,
+                                const std::vector<int>& trt_axes,
+                                nvinfer1::ITensor** output) {
+  const nvinfer1::Dims dims = input->getDimensions();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Mark axes to remove by setting them to 0.
+  for (int axis : trt_axes) {
+    input_dims[axis] = 0;
+  }
+
+#if IS_TRT_VERSION_GE(6, 0, 0, 0)
+  // For dynamic input shapes, we need to use TRT ops to build the new shape.
+  const bool is_dynamic =
+      std::count(input_dims.begin(), input_dims.end(), -1) > 0;
+  if (is_dynamic) {

You may want to simply use if (absl::c_any_of(input_dims, [](int i) { return i == -1; })) {

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 def __init__(self,
                        "with static TensorRT ops. Set is_dynamic_op to True.")

     self._converted = False
+    self._build_called_once = False

Are the changes in this file relevant? I don't see how self._build_called_once is set to true...

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 Status TRTEngineOp::VerifyInputShapes(const std::vector<TensorShape>& shapes) {
                                    TensorShapeUtils::ShapeListString(shapes));
   }

-  const int batch_size = shapes[0].dim_size(0);
-  for (const TensorShape& shape : shapes) {
-    if (shape.dims() < 1 || batch_size != shape.dim_size(0)) {
-      return errors::InvalidArgument(
-          "Input shapes are inconsistent on the batch dimension, for ", name(),
-          ": ", TensorShapeUtils::ShapeListString(shapes));
+  if (use_implicit_batch_) {
+    const int batch_size = shapes[0].dim_size(0);
+    for (const TensorShape& shape : shapes) {
+      if (shape.dims() < 1 || batch_size != shape.dim_size(0)) {

Can you explain the check for shape.dims() < 1? Why is it not hoisted outside the loop, which would be equivalent to checking the variable batch_size < 1?

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.+#+# Licensed under the Apache License, Version 2.0 (the "License");+# you may not use this file except in compliance with the License.+# You may obtain a copy of the License at+#+#     http://www.apache.org/licenses/LICENSE-2.0+#+# Unless required by applicable law or agreed to in writing, software+# distributed under the License is distributed on an "AS IS" BASIS,+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.+# See the License for the specific language governing permissions and+# limitations under the License.+# ==============================================================================+"""Model script to test TF-TensorRT integration."""++from __future__ import absolute_import+from __future__ import division+from __future__ import print_function++import numpy as np++from tensorflow.python.compiler.tensorrt import trt_convert+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test+from tensorflow.python.framework import constant_op+from tensorflow.python.framework import dtypes+from tensorflow.python.ops import array_ops+from tensorflow.python.ops import gen_array_ops+from tensorflow.python.ops import gen_math_ops+from tensorflow.python.ops import math_ops+from tensorflow.python.platform import test+++class ImplicitBatchTest(trt_test.TfTrtIntegrationTestBase):

It is very confusing that we later derive ExplicitBatchTest from ImplicitBatchTest. Let's make the following changes:
- Rename the test file to batch_mode_test.py.
- Define a base class BatchModeTestBase, and then derive ImplicitBatchTest and ExplicitBatchTest from it.
- GetConversionParams defined here is very similar to the one in the super class TfTrtIntegrationTestBase. We can call the one from the super class and then modify the batch mode.

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 class TRTEngineResourceOpsTest : public OpsTestBase {
   }

   TrtUniquePtrType<nvinfer1::ICudaEngine> CreateTRTEngine() {
-    Logger logger;
     TrtUniquePtrType<nvinfer1::IBuilder> builder(
-        nvinfer1::createInferBuilder(logger));
+        nvinfer1::createInferBuilder(logger_));

Can you explain this change? What is the problem we are trying to fix here?

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.

s/2018/2020

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 Status ConvertSqueeze(OpConverterParams* params) {   const TRT_TensorOrWeights& input_tensor = inputs.at(0);   const nvinfer1::Dims dims = input_tensor.GetTrtDims();   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);-  // Mark axes to remove by setting them to 0.   TFAttrs attrs(node_def);   auto squeeze_dims = attrs.get<std::vector<int64>>("squeeze_dims");   if (squeeze_dims.empty()) {     return errors::Unimplemented(         "Squeeze is only implemented for explicit dims, at ", node_def.name());   }+  std::vector<int> trt_axes;+  trt_axes.reserve(squeeze_dims.size());   for (int tf_axis : squeeze_dims) {     // Make sure axis is valid.     int trt_axis;     TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),-                                   /*use_implicit_batch=*/true, &trt_axis));+                                   params->use_implicit_batch, &trt_axis));     // Make sure target dimension is size 1.

Let's fix the comment: We can only squeeze a dimension of size 1 or of unknown size (-1).

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 string DebugString(const nvinfer1::ITensor& tensor) {
                 ", dims=", DebugString(tensor.getDimensions()), ")");
 }

+string DebugString(const std::vector<nvinfer1::Dims>& dimvec) {
+  string out = "[";
+  for (auto dims: dimvec) {
+    StrAppend(&out, DebugString(dims));

You can also use this overload of absl::StrJoin (similarly for the two routines below): std::string StrJoin(const Range& range, absl::string_view separator, Formatter&& fmt)
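As an illustration, a possible rewrite of the routine with that overload. This is only a sketch, assuming the existing DebugString(nvinfer1::Dims) overload and the StrCat/StrAppend aliases used in this file:

string DebugString(const std::vector<nvinfer1::Dims>& dimvec) {
  // StrJoin calls the formatter once per element and inserts "," in between.
  return StrCat("[",
                absl::StrJoin(dimvec, ",",
                              [](string* out, const nvinfer1::Dims& d) {
                                StrAppend(out, DebugString(d));
                              }),
                "]");
}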

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 class Converter {
                                const bool validation_only,
                                nvinfer1::ITensor** tensor);

+  Status SqueezeTensor(nvinfer1::ITensor* input,

Would you please add a description of the function to document it?

tfeher

comment created time in 22 days

Pull request review commenttensorflow/tensorflow

Enable TF-TRT explicit batch mode

 Status ConvertSqueeze(OpConverterParams* params) {
   const TRT_TensorOrWeights& input_tensor = inputs.at(0);
   const nvinfer1::Dims dims = input_tensor.GetTrtDims();
   std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
-  // Mark axes to remove by setting them to 0.
   TFAttrs attrs(node_def);
   auto squeeze_dims = attrs.get<std::vector<int64>>("squeeze_dims");
   if (squeeze_dims.empty()) {
     return errors::Unimplemented(
         "Squeeze is only implemented for explicit dims, at ", node_def.name());
   }
+  std::vector<int> trt_axes;
+  trt_axes.reserve(squeeze_dims.size());
   for (int tf_axis : squeeze_dims) {
     // Make sure axis is valid.

This is not part of your change, but let's fix the comment: If the TF axis is valid, convert it to a TRT axis.

tfeher

comment created time in 22 days

issue commenttensorflow/tensorflow

TensorRT native segment lookup error in calibration mode

Can you tell me the details of how you get the binary "python", as in "TF_CPP_VMODULE=trt_engine_op=2,convert_graph=7 python tensorflow/python/compiler/tensorrt/test/native_segment_test2.py"?

tfeher

comment created time in a month

issue commenttensorflow/tensorflow

TFTRT not converting dilated convolutions

First of all, I wonder why your test case doesn't work straightforwardly for me. I got this error:

ValueError: ('Unrecognized keyword arguments:', ['input'])

I had to modify "input=" to "shape=" to make it work. Why does the program work on your end?

The test works for me, and I see a TRTEngineOp with the convolution in the result. See the attachments. The .txt file is a pbtxt file; I had to rename it to please GitHub.

trt_github_issue_36109

trt_github_issue_36109_pbtxt.txt

fferroni

comment created time in a month

pull request commenttensorflow/tensorflow

[compiler] check batch size modified

Would you please provide a description of the PR, such as what problem it is trying to fix? The only difference this change would make is when input_dims[0] <= 0, IIUC.

gaurav1086

comment created time in a month

issue commenttensorflow/tensorflow

TensorRT native segment lookup error in calibration mode

Are you sure that https://github.com/tfeher/tensorflow/blob/native_segment_bug/tensorflow/python/compiler/tensorrt/test/native_segment_test2.py can be used to reproduce the problem?

If that is the case, I expect that after removing AddTest from the file, the test should pass.

But I got this error: ValueError: Fetch argument 'output_0:0' cannot be interpreted as a Tensor. ("The name 'output_0:0' refers to a Tensor which does not exist. The operation, 'output_0', does not exist in the graph.")

I think this is because the test has a problem in GraphFn. I fixed GraphFn as follows and added AddTest back, and the test passes:

def GraphFn(self, inp1):
  """Create a graph containing single segment."""
  dtype = inp1.dtype
  val = inp1 * inp1
  abs = math_ops.abs(val)
  return array_ops.identity(abs, name="output_0")

tfeher

comment created time in a month

fork bixia1/tensorflow

An Open Source Machine Learning Framework for Everyone

https://tensorflow.org

fork in a month

Pull request review commenttensorflow/tensorflow

Enable explicit batch mode with optimization profiles

 def ExpectedEnginesToBuild(self, run_params):
     return ["TRTEngineOp_0"]


+# Due to a leak in TF ResourceManager, where we store the TRT engines cache,
+# we sometimes observe test failures. We initially put the following test
+# in trt_convert_test.py and we observed those failures. The symptoms vary

"We initially put the following test in trt_convert_test.py and we observed those failures."

Now that you put the test in compiler/tensorrt/test/ not trt_convert_test.py, do you still observe the test failure? If not, do you know why putting the test here makes such a difference?

pooyadavoodi

comment created time in a month

pull request commenttensorflow/tensorflow

Enable preventing engine build at runtime

I still see the "bad_function_call" error when running the test that you added in the PR, here is the stack:

*** SIGABRT received by PID 2361 (TID 2361) from PID 2361; stack trace: ***
PC: @ 0x7f1fd1132602 (unknown) raise
@ 0x55b930fbd67b 1728 FailureSignalHandler()
@ 0x7f1fd12b09a0 1950672896 (unknown)
@ 0x55b92b3e2109 16 std::__u::__throw_bad_function_call()
@ 0x55b92f737cc9 16 std::__u::__function::__policy_invoker<>::__call_empty()
@ 0x55b92f730c0b 512 tensorflow::(anonymous namespace)::ExecutorImpl::RunAsync()
@ 0x55b92f74452d 560 tensorflow::FunctionLibraryRuntimeImpl::Run()
@ 0x55b92b3eccf3 336 tensorflow::tensorrt::TRTEngineOp::ExecuteNativeSegment()
@ 0x55b92b3ee2cd 192 tensorflow::tensorrt::TRTEngineOp::ComputeAsync()
@ 0x55b92fb4703d 80 tensorflow::AsyncOpKernel::Compute()
@ 0x55b92f6bab00 400 tensorflow::BaseGPUDevice::Compute()
@ 0x55b92b3e0814 208 tensorflow::OpsTestBase::RunOpKernel()
@ 0x55b92b3e09d8 160 tensorflow::tensorrt::TRTEngineOpTestBase_AllowBuildAtRuntime_Test::TestBody()
@ 0x55b92cda00c2 48 testing::Test::Run()
@ 0x55b92cda0d8c 96 testing::TestInfo::Run()
@ 0x55b92cda16c7 80 testing::TestSuite::Run()
@ 0x55b92cdaeb47 208 testing::internal::UnitTestImpl::RunAllTests()
@ 0x55b92cdae16f 64 testing::UnitTest::Run()
@ 0x55b92ca08f6e 32 main
@ 0x7f1fd111ebbd 208 __libc_start_main

pooyadavoodi

comment created time in a month

Pull request review commenttensorflow/tensorflow

[XLA] Enhancements to algebraic_simplifier

 bool IsAll(const HloInstruction* op, int8 value) {   } } +bool IsAnyOperandComplex(const HloInstruction* hlo) {+  for (auto operand : hlo->operands()) {+    if (ShapeUtil::ElementIsComplex(operand->shape())) {+      return true;+    }+  }+  return false;+}++bool IsPositive(const HloInstruction* hlo,+                const AlgebraicSimplifierOptions& options) {+  // Utility only handles real types.+  if (IsAnyOperandComplex(hlo)) {+    return false;+  }+  switch (hlo->opcode()) {+    case HloOpcode::kGetTupleElement: {+      const HloInstruction* gte_operand = hlo->operand(0);+      switch (gte_operand->opcode()) {+        case HloOpcode::kCustomCall: {+          const auto& target = gte_operand->custom_call_target();+          return target ==+                     options.get_cudnn_batchnorm_forward_training_metadata() &&+                 hlo->tuple_index() == 2;+        }+        default:+          return false;+      }+    }+    case HloOpcode::kPower:+    case HloOpcode::kAbs:+    case HloOpcode::kRsqrt:+    case HloOpcode::kSqrt:+      return IsPositive(hlo->operand(0), options);++    case HloOpcode::kMultiply: {+      return hlo->operand(0) == hlo->operand(1) &&+             IsPositive(hlo->operand(0), options);+    }+    default:+      return false;+  }+}++bool IsNonNegative(const HloInstruction* hlo,+                   const AlgebraicSimplifierOptions& options) {+  // Utility only handles real types.+  if (IsAnyOperandComplex(hlo)) {

I think so. You can check the implementation of ElementalIrEmitter::EmitComplexAbs.

AyanmoI

comment created time in 2 months
